[dpdk-dev] [PATCH v2 5/5] net/mlx5: add vectorized Rx/Tx burst for SSE4.1

Nélio Laranjeiro nelio.laranjeiro at 6wind.com
Tue Jul 4 10:58:52 CEST 2017


Yongseok, some comments on this huge and great piece of work,

On Fri, Jun 30, 2017 at 12:23:33PM -0700, Yongseok Koh wrote:
> To make vectorized burst routines enabled, it is required to run on x86_64
> architecture which can support at least SSE4.1. If all the conditions are
> met, the vectorized burst functions are enabled automatically. The decision
> is made individually on RX and TX. There's no PMD option to make a
> selection.
> 
> Signed-off-by: Yongseok Koh <yskoh at mellanox.com>
> ---
>  drivers/net/mlx5/Makefile            |   10 +
>  drivers/net/mlx5/mlx5_defs.h         |   18 +
>  drivers/net/mlx5/mlx5_ethdev.c       |   28 +-
>  drivers/net/mlx5/mlx5_rxq.c          |   55 +-
>  drivers/net/mlx5/mlx5_rxtx.c         |  339 ++------
>  drivers/net/mlx5/mlx5_rxtx.h         |  283 ++++++-
>  drivers/net/mlx5/mlx5_rxtx_vec_sse.c | 1451 ++++++++++++++++++++++++++++++++++
>  drivers/net/mlx5/mlx5_txq.c          |    2 +-
>  8 files changed, 1909 insertions(+), 277 deletions(-)
>  create mode 100644 drivers/net/mlx5/mlx5_rxtx_vec_sse.c
> 
> diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
> index 51e258a15..2d0894fcd 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.h
> +++ b/drivers/net/mlx5/mlx5_rxtx.h
>[...]
>  
> +#ifndef NDEBUG
> +
> +/**
> + * Verify or set magic value in CQE.
> + *
> + * @param cqe
> + *   Pointer to CQE.
> + *
> + * @return
> + *   0 the first time.
> + */
> +static inline int
> +check_cqe_seen(volatile struct mlx5_cqe *cqe)
>[...]
> +/**
> + * Check whether CQE is valid.
> + *
> + * @param cqe
> + *   Pointer to CQE.
> + * @param cqes_n
> + *   Size of completion queue.
> + * @param ci
> + *   Consumer index.
> + *
> + * @return
> + *   0 on success, 1 on failure.
> + */
> +static inline int
> +check_cqe(volatile struct mlx5_cqe *cqe,
> +	  unsigned int cqes_n, const uint16_t ci)
>[...]
> +
> +/**
> + * Return the address of the WQE.
> + *
> + * @param txq
> + *   Pointer to TX queue structure.
> + * @param  wqe_ci
> + *   WQE consumer index.
> + *
> + * @return
> + *   WQE address.
> + */
> +static inline uintptr_t *
> +tx_mlx5_wqe(struct txq *txq, uint16_t ci)
>[...]
> +
> +/**
> + * Manage TX completions.
> + *
> + * When sending a burst, mlx5_tx_burst() posts several WRs.
> + *
> + * @param txq
> + *   Pointer to TX queue structure.
> + */
> +static inline void
> +txq_complete(struct txq *txq)
>[...]
> +
> +/**
> + * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which
> + * the cloned mbuf is allocated is returned instead.
> + *
> + * @param buf
> + *   Pointer to mbuf.
> + *
> + * @return
> + *   Memory pool where data is located for given mbuf.
> + */
> +static struct rte_mempool *
> +txq_mb2mp(struct rte_mbuf *buf)
>[...]
> +
> +/**
> + * Get Memory Region (MR) <-> rte_mbuf association from txq->mp2mr[].
> + * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
> + * remove an entry first.
> + *
> + * @param txq
> + *   Pointer to TX queue structure.
> + * @param[in] mp
> + *   Memory Pool for which a Memory Region lkey must be returned.
> + *
> + * @return
> + *   mr->lkey on success, (uint32_t)-1 on failure.
> + */
> +static inline uint32_t
> +txq_mb2mr(struct txq *txq, struct rte_mbuf *mb)
>[...]

Most of the functions moved above should be prefixed, as some of them have
the same name in mlx4.  Static linkage will face issues when both PMDs are
compiled.
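
For instance (just a sketch, the exact names are only suggestions):

	static inline void
	mlx5_txq_complete(struct txq *txq);

	static inline uint32_t
	mlx5_txq_mb2mr(struct txq *txq, struct rte_mbuf *mb);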

> diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.c b/drivers/net/mlx5/mlx5_rxtx_vec_sse.c
> new file mode 100644
> index 000000000..566a60635
> --- /dev/null
> +++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.c
> @@ -0,0 +1,1451 @@
>[...]
> +
> +#include <assert.h>
> +#include <stdint.h>
> +#include <string.h>
> +#include <stdlib.h>
> +
> +/* Verbs header. */
> +/* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
> +#ifdef PEDANTIC
> +#pragma GCC diagnostic ignored "-Wpedantic"
> +#endif
> +#include <infiniband/verbs.h>
> +#include <infiniband/mlx5_hw.h>
> +#include <infiniband/arch.h>
> +#ifdef PEDANTIC
> +#pragma GCC diagnostic error "-Wpedantic"
> +#endif
> +
> +/* DPDK headers don't like -pedantic. */
> +#ifdef PEDANTIC
> +#pragma GCC diagnostic ignored "-Wpedantic"
> +#endif
> +#include <rte_mbuf.h>
> +#include <rte_mempool.h>
> +#include <rte_prefetch.h>
> +#ifdef PEDANTIC
> +#pragma GCC diagnostic error "-Wpedantic"
> +#endif

Pragmas are no longer necessary for DPDK headers.
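
i.e. the block above can simply be reduced to:

	#include <rte_mbuf.h>
	#include <rte_mempool.h>
	#include <rte_prefetch.h>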

> +
> +#include "mlx5.h"
> +#include "mlx5_utils.h"
> +#include "mlx5_rxtx.h"
> +#include "mlx5_autoconf.h"
> +#include "mlx5_defs.h"
> +#include "mlx5_prm.h"
> +
> +#include <smmintrin.h>

This include should be moved into the system includes block to follow the
hierarchy:

 System
 Verbs libs
 DPDK libs
 PMD header.
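
Something like (a sketch based on the includes already present in this file,
ordering inside each block is only a suggestion):

	/* System headers. */
	#include <assert.h>
	#include <stdint.h>
	#include <string.h>
	#include <stdlib.h>
	#include <smmintrin.h>

	/* Verbs headers. */
	#ifdef PEDANTIC
	#pragma GCC diagnostic ignored "-Wpedantic"
	#endif
	#include <infiniband/verbs.h>
	#include <infiniband/mlx5_hw.h>
	#include <infiniband/arch.h>
	#ifdef PEDANTIC
	#pragma GCC diagnostic error "-Wpedantic"
	#endif

	/* DPDK headers. */
	#include <rte_mbuf.h>
	#include <rte_mempool.h>
	#include <rte_prefetch.h>

	/* PMD headers. */
	#include "mlx5.h"
	#include "mlx5_utils.h"
	#include "mlx5_rxtx.h"
	#include "mlx5_autoconf.h"
	#include "mlx5_defs.h"
	#include "mlx5_prm.h"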

> +#ifndef __INTEL_COMPILER
> +#pragma GCC diagnostic ignored "-Wcast-qual"
> +#endif
> +
> +/**
> + * Fill in DSEGs in Tx descriptors (WQE).

Except for people who work on the Mellanox PMD, DSEG and WQE do not mean
anything.  You should avoid such acronyms in comments.

>[...]
> +/**
> + * Send multi-segmented packets until it encounters a single segment packet in
> + * the pkts list.
> + *
> + * @param txq
> + *   Pointer to TX queue structure.
> + * @param pkts
> + *   Pointer to array of packets to be sent.
> + * @param pkts_n
> + *   Number of packets to be sent.
> + *
> + * @return
> + *   Number of packets successfully transmitted (<= pkts_n).
> + */
> +static uint16_t
> +txq_scatter_v(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n)
> +{
> +	uint16_t elts_head = txq->elts_head;
> +	const uint16_t elts_n = 1 << txq->elts_n;
> +	const uint16_t elts_m = elts_n - 1;
> +	const uint16_t wq_n = 1 << txq->wqe_n;
> +	const uint16_t wq_mask = wq_n - 1;
> +	const unsigned int nb_dword_per_wqebb =
> +		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
> +	const unsigned int nb_dword_in_hdr =
> +		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
> +	unsigned int n;
> +	volatile struct mlx5_wqe *wqe = NULL;
> +
> +	assert(elts_n > pkts_n);
> +	txq_complete(txq);
> +	/* A CQE slot must always be available. */
> +	assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));

This assert should be moved into txq_complete(), or it should not be an
assert.
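
e.g. (a rough sketch, assuming txq_complete() has access to the same fields):

	static inline void
	txq_complete(struct txq *txq)
	{
		/* ... existing completion processing ... */
		/* A CQE slot must always be available once completions are freed. */
		assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
	}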

>[...]
> +/**
> + * Send burst of packets with Enhanced MPW. If it encounters a multi-seg packet,
> + * it returns to make it processed by txq_scatter_v(). All the packets in
> + * the pkts list should be single segment packets having same offload flags.
> + * This must be checked by txq_check_multiseg() and txq_calc_offload().
> + *
> + * @param txq
> + *   Pointer to TX queue structure.
> + * @param pkts
> + *   Pointer to array of packets to be sent.
> + * @param pkts_n
> + *   Number of packets to be sent.

A small comment here stating that pkts_n must be <= MLX5_VPMD_TX_MAX_BURST
would remove the need for the assert a few lines below.  The caller must
respect the contract.
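
e.g.:

	 * @param pkts_n
	 *   Number of packets to be sent (must be <= MLX5_VPMD_TX_MAX_BURST).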

> + * @param cs_flags
> + *   Checksum offload flags to be written in the descriptor.
> + *
> + * @return
> + *   Number of packets successfully transmitted (<= pkts_n).
> + */
> +static inline uint16_t
> +txq_burst_v(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
> +	    uint8_t cs_flags)
> +{
> +	struct rte_mbuf **elts;
> +	uint16_t elts_head = txq->elts_head;
> +	const uint16_t elts_n = 1 << txq->elts_n;
> +	const uint16_t elts_m = elts_n - 1;
> +	const unsigned int nb_dword_per_wqebb =
> +		MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE;
> +	const unsigned int nb_dword_in_hdr =
> +		sizeof(struct mlx5_wqe) / MLX5_WQE_DWORD_SIZE;
> +	unsigned int n = 0;
> +	unsigned int pos;
> +	uint16_t max_elts;
> +	uint16_t max_wqe;
> +	uint32_t comp_req = 0;
> +	const uint16_t wq_n = 1 << txq->wqe_n;
> +	const uint16_t wq_mask = wq_n - 1;
> +	uint16_t wq_idx = txq->wqe_ci & wq_mask;
> +	volatile struct mlx5_wqe64 *wq =
> +		&((volatile struct mlx5_wqe64 *)txq->wqes)[wq_idx];
> +	volatile struct mlx5_wqe *wqe = (volatile struct mlx5_wqe *)wq;
> +	const __m128i shuf_mask_ctrl =
> +		_mm_set_epi8(15, 14, 13, 12,
> +			      8,  9, 10, 11, /* bswap32 */
> +			      4,  5,  6,  7, /* bswap32 */
> +			      0,  1,  2,  3  /* bswap32 */);
> +	__m128i *t_wqe, *dseg;
> +	__m128i ctrl;
> +
> +	/* Make sure all packets can fit into a single WQE. */
> +	assert(pkts_n <= MLX5_VPMD_TX_MAX_BURST);
> +	assert(elts_n > pkts_n);
> +	txq_complete(txq);
> +	max_elts = (elts_n - (elts_head - txq->elts_tail));
> +	/* A CQE slot must always be available. */
> +	assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));

Same point here: this assert should be in txq_complete().

>[...] 
> +/**
> + * DPDK callback for vectorized TX with multi-seg packets and offload.
> + *
> + * @param dpdk_txq
> + *   Generic pointer to TX queue structure.
> + * @param[in] pkts
> + *   Packets to transmit.
> + * @param pkts_n
> + *   Number of packets in array.
> + *
> + * @return
> + *   Number of packets successfully transmitted (<= pkts_n).
> + */
> +uint16_t
> +mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
> +{
> +	struct txq *txq = (struct txq *)dpdk_txq;
> +	uint16_t nb_tx = 0;
> +
> +	while (pkts_n > nb_tx) {
> +		uint8_t cs_flags = 0;
> +		uint16_t n;
> +		uint16_t ret;
> +
> +		/* Transmit multi-seg packets in the head of pkts list. */
> +		if (!(txq->flags & ETH_TXQ_FLAGS_NOMULTSEGS) &&
> +		    NB_SEGS(pkts[nb_tx]) > 1)
> +			nb_tx += txq_scatter_v(
> +					txq, &pkts[nb_tx], pkts_n - nb_tx);

The indentation here does not respect the PMD code style.
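
One possibility (continuation lines aligned on the opening parenthesis):

		if (!(txq->flags & ETH_TXQ_FLAGS_NOMULTSEGS) &&
		    NB_SEGS(pkts[nb_tx]) > 1)
			nb_tx += txq_scatter_v(txq, &pkts[nb_tx],
					       pkts_n - nb_tx);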

> +		n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST);
> +		if (!(txq->flags & ETH_TXQ_FLAGS_NOMULTSEGS))
> +			n = txq_check_multiseg(&pkts[nb_tx], n);
> +		if (!(txq->flags & ETH_TXQ_FLAGS_NOOFFLOADS))
> +			n = txq_calc_offload(txq, &pkts[nb_tx], n, &cs_flags);
> +		ret = txq_burst_v(txq, &pkts[nb_tx], n, cs_flags);
> +		nb_tx += ret;
> +		if (!ret)
> +			break;
> +	}
> +	return nb_tx;
> +}
>[...]
> +/**
> + * Replenish buffers for RX in bulk.
> + *
> + * @param rxq
> + *   Pointer to RX queue structure.
> + */
> +static inline void
> +rxq_replenish_bulk_mbuf(struct rxq *rxq)
> +{
> +	const uint16_t q_n = 1 << rxq->elts_n;
> +	const uint16_t q_mask = q_n - 1;
> +	const uint16_t elts_idx = rxq->rq_ci & q_mask;
> +	struct rte_mbuf **elts = &(*rxq->elts)[elts_idx];
> +	volatile struct mlx5_wqe_data_seg *wq = &(*rxq->wqes)[elts_idx];
> +	uint16_t n = q_n - (rxq->rq_ci - rxq->rq_pi);

This computation is already performed in the caller; couldn't it also be
passed as a parameter to this function?
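
For instance (just a sketch, assuming the caller already has this value at
hand):

	static inline void
	rxq_replenish_bulk_mbuf(struct rxq *rxq, uint16_t n)
	{
		/* Same body as above, without recomputing n. */
	}

with the caller doing:

	rxq_replenish_bulk_mbuf(rxq, q_n - (rxq->rq_ci - rxq->rq_pi));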

> +	unsigned int i;
> +
> +	assert(n >= MLX5_VPMD_RXQ_RPLNSH_THRESH);
> +	assert(MLX5_VPMD_RXQ_RPLNSH_THRESH > MLX5_VPMD_DESCS_PER_LOOP);
> +	/* Not to cross queue end. */
> +	n = RTE_MIN(n - MLX5_VPMD_DESCS_PER_LOOP, q_n - elts_idx);
> +	if (rte_mempool_get_bulk(rxq->mp, (void *)elts, n) < 0) {
> +		rxq->stats.rx_nombuf += n;
> +		return;
> +	}
> +	for (i = 0; i < n; ++i)
> +		wq[i].addr = htonll(rte_pktmbuf_mtod(elts[i], uintptr_t));
> +	rxq->rq_ci += n;
> +	rte_wmb();
> +	*rxq->rq_db = htonl(rxq->rq_ci);
> +}
> +
>[...]
> +/*
> + * The index to the array should have:
> + * bit[1:0] = l3_hdr_type, bit[2] = tunneled, bit[3] = outer_l3_type
> + */
> +static uint32_t mlx5_ptype_table[] = {
> +	RTE_PTYPE_UNKNOWN,
> +	RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,               /* b0001 */
> +	RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,               /* b0010 */
> +	RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
> +	RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
> +		RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN, /* b0101 */
> +	RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
> +		RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN, /* b0110 */
> +	RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
> +	RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,               /* b1001 */
> +	RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,               /* b1010 */
> +	RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
> +	RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
> +		RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN, /* b1101 */
> +	RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
> +		RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN, /* b1110 */
> +	RTE_PTYPE_ALL_MASK			     /* b1111 */
> +};

This table could be shared with the regular Rx function.
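
e.g. (a sketch, exact placement and the const qualifier are up to you):

in mlx5_rxtx.h:

	extern const uint32_t mlx5_ptype_table[];

in mlx5_rxtx.c:

	const uint32_t mlx5_ptype_table[] = {
		RTE_PTYPE_UNKNOWN,
		RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,               /* b0001 */
		/* ... same content as above ... */
	};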

> +/**
> + * Calculate packet type and offload flag for mbuf and store it.
> + *
> + * @param rxq
> + *   Pointer to RX queue structure.
> + * @param cqes[4]
> + *   Array of four 16bytes CQEs extracted from the original CQE.
> + * @param op_err
> + *   Opcode vector having responder error status. Each field is 4B.
> + * @param pkts
> + *   Pointer to array of packets to be filled.
> + */
> +static inline void
> +rxq_cq_to_ptype_oflags_v(struct rxq *rxq, __m128i cqes[4], __m128i op_err,
> +			 struct rte_mbuf **pkts)
> +{
> +	__m128i pinfo0;
> +	__m128i pinfo1;
> +	__m128i pinfo, ptype;
> +	__m128i ol_flags = _mm_set1_epi32(
> +			rxq->rss_hash * PKT_RX_RSS_HASH);

These two can be on the same line.
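
i.e.:

	__m128i ol_flags = _mm_set1_epi32(rxq->rss_hash * PKT_RX_RSS_HASH);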

> +	__m128i cv_flags;
> +	const __m128i zero = _mm_setzero_si128();
> +	const __m128i ptype_mask =
> +		_mm_set_epi32(0xd06, 0xd06, 0xd06, 0xd06);
> +	const __m128i ptype_ol_mask =
> +		_mm_set_epi32(0x106, 0x106, 0x106, 0x106);
> +	const __m128i pinfo_mask =
> +		_mm_set_epi32(0x3, 0x3, 0x3, 0x3);
> +	const __m128i cv_flag_sel =
> +		_mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0,
> +			     (uint8_t)((PKT_RX_IP_CKSUM_GOOD |
> +					     PKT_RX_L4_CKSUM_GOOD) >> 1),

The alignment is wrong.
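
The wrapped operand should line up with the first one, i.e.:

			     (uint8_t)((PKT_RX_IP_CKSUM_GOOD |
					PKT_RX_L4_CKSUM_GOOD) >> 1),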

> +			     0,
> +			     (uint8_t)(PKT_RX_L4_CKSUM_GOOD >> 1),
> +			     0,
> +			     (uint8_t)(PKT_RX_IP_CKSUM_GOOD >> 1),
> +			     (uint8_t)(PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED),
> +			     0);
> +	const __m128i cv_mask =
> +		_mm_set_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
> +				PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED,
> +			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
> +				PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED,
> +			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
> +				PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED,
> +			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
> +				PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED);

Same here.

> +	const __m128i mbuf_init =
> +		_mm_loadl_epi64((__m128i *)&rxq->mbuf_initializer);
> +	__m128i rearm0, rearm1, rearm2, rearm3;
>[...]
> +/**
> + * Check a TX queue can support vectorized TX.
> + *
> + * @param txq
> + *   Pointer to TX queue.
> + *
> + * @return
> + *   1 if supported, negative errno value if not.
> + */
> +int __attribute__((cold))
> +txq_check_vec_tx_support(struct txq *txq)
> +{
> +	/* Currently unused, but for possible future use. */

This comment is not useful: the PMD code style makes a function's prefix
reflect its first parameter, so a function named txq_*() is expected to take
a txq as its first parameter even if it is not used (for now).

> +	(void)txq;
> +	if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE4_1))
> +		return -ENOTSUP;
> +	return 1;
> +}
> +
>[...]

Thanks,

-- 
Nélio Laranjeiro
6WIND

