[dpdk-dev] [PATCH v9 7/7] event/cnxk: add Tx event vector fastpath
Jerin Jacob
jerinjacobk at gmail.com
Fri Jul 16 14:19:34 CEST 2021
On Wed, Jul 14, 2021 at 2:33 PM <pbhagavatula at marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula at marvell.com>
>
> Add Tx event vector fastpath, integrate event vector Tx routine
> into Tx burst.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula at marvell.com>
Series Acked-by: Jerin Jacob <jerinj at marvell.com>
Series v9 applied to dpdk-next-net-eventdev/for-main. Thanks.
> ---
> drivers/common/cnxk/roc_sso.h | 23 ++++++
> drivers/event/cnxk/cn10k_eventdev.c | 3 +-
> drivers/event/cnxk/cn10k_worker.h | 104 +++++++++++++++++++++++++--
> drivers/event/cnxk/cn9k_worker.h | 4 +-
> drivers/event/cnxk/cnxk_worker.h | 22 ------
> drivers/net/cnxk/cn10k_tx.c | 2 +-
> drivers/net/cnxk/cn10k_tx.h | 52 +++++++++-----
> drivers/net/cnxk/cn10k_tx_mseg.c | 3 +-
> drivers/net/cnxk/cn10k_tx_vec.c | 2 +-
> drivers/net/cnxk/cn10k_tx_vec_mseg.c | 2 +-
> 10 files changed, 165 insertions(+), 52 deletions(-)
>
> diff --git a/drivers/common/cnxk/roc_sso.h b/drivers/common/cnxk/roc_sso.h
> index a6030e7d8a..b28f6089cc 100644
> --- a/drivers/common/cnxk/roc_sso.h
> +++ b/drivers/common/cnxk/roc_sso.h
> @@ -44,6 +44,29 @@ struct roc_sso {
> uint8_t reserved[ROC_SSO_MEM_SZ] __plt_cache_aligned;
> } __plt_cache_aligned;
>
> +static __plt_always_inline void
> +roc_sso_hws_head_wait(uintptr_t tag_op)
> +{
> +#ifdef RTE_ARCH_ARM64
> + uint64_t tag;
> +
> + asm volatile(PLT_CPU_FEATURE_PREAMBLE
> + " ldr %[tag], [%[tag_op]] \n"
> + " tbnz %[tag], 35, done%= \n"
> + " sevl \n"
> + "rty%=: wfe \n"
> + " ldr %[tag], [%[tag_op]] \n"
> + " tbz %[tag], 35, rty%= \n"
> + "done%=: \n"
> + : [tag] "=&r"(tag)
> + : [tag_op] "r"(tag_op));
> +#else
> + /* Wait for the HEAD to be set */
> + while (!(plt_read64(tag_op) & BIT_ULL(35)))
> + ;
> +#endif
> +}
> +
> /* SSO device initialization */
> int __roc_api roc_sso_dev_init(struct roc_sso *roc_sso);
> int __roc_api roc_sso_dev_fini(struct roc_sso *roc_sso);
> diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
> index e85fa4785d..6f37c5bd23 100644
> --- a/drivers/event/cnxk/cn10k_eventdev.c
> +++ b/drivers/event/cnxk/cn10k_eventdev.c
> @@ -782,7 +782,8 @@ cn10k_sso_tx_adapter_caps_get(const struct rte_eventdev *dev,
> if (ret)
> *caps = 0;
> else
> - *caps = RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT;
> + *caps = RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT |
> + RTE_EVENT_ETH_TX_ADAPTER_CAP_EVENT_VECTOR;
>
> return 0;
> }
> diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
> index 7a48a6b17d..9cc0992063 100644
> --- a/drivers/event/cnxk/cn10k_worker.h
> +++ b/drivers/event/cnxk/cn10k_worker.h
> @@ -308,29 +308,120 @@ uint16_t __rte_hot cn10k_sso_hws_enq_fwd_burst(void *port,
> NIX_RX_FASTPATH_MODES
> #undef R
>
> -static __rte_always_inline const struct cn10k_eth_txq *
> +static __rte_always_inline struct cn10k_eth_txq *
> cn10k_sso_hws_xtract_meta(struct rte_mbuf *m,
> const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT])
> {
> - return (const struct cn10k_eth_txq *)
> + return (struct cn10k_eth_txq *)
> txq_data[m->port][rte_event_eth_tx_adapter_txq_get(m)];
> }
>
> +static __rte_always_inline void
> +cn10k_sso_vwqe_split_tx(struct rte_mbuf **mbufs, uint16_t nb_mbufs,
> + uint64_t *cmd, uint16_t lmt_id, uintptr_t lmt_addr,
> + uint8_t sched_type, uintptr_t base,
> + const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
> + const uint32_t flags)
> +{
> + uint16_t port[4], queue[4];
> + struct cn10k_eth_txq *txq;
> + uint16_t i, j;
> + uintptr_t pa;
> +
> + for (i = 0; i < nb_mbufs; i += 4) {
> + port[0] = mbufs[i]->port;
> + port[1] = mbufs[i + 1]->port;
> + port[2] = mbufs[i + 2]->port;
> + port[3] = mbufs[i + 3]->port;
> +
> + queue[0] = rte_event_eth_tx_adapter_txq_get(mbufs[i]);
> + queue[1] = rte_event_eth_tx_adapter_txq_get(mbufs[i + 1]);
> + queue[2] = rte_event_eth_tx_adapter_txq_get(mbufs[i + 2]);
> + queue[3] = rte_event_eth_tx_adapter_txq_get(mbufs[i + 3]);
> +
> + if (((port[0] ^ port[1]) & (port[2] ^ port[3])) ||
> + ((queue[0] ^ queue[1]) & (queue[2] ^ queue[3]))) {
> +
> + for (j = 0; j < 4; j++) {
> + struct rte_mbuf *m = mbufs[i + j];
> +
> + txq = (struct cn10k_eth_txq *)
> + txq_data[port[j]][queue[j]];
> + cn10k_nix_tx_skeleton(txq, cmd, flags);
> + /* Perform header writes before barrier
> + * for TSO
> + */
> + if (flags & NIX_TX_OFFLOAD_TSO_F)
> + cn10k_nix_xmit_prepare_tso(m, flags);
> +
> + cn10k_nix_xmit_prepare(m, cmd, lmt_addr, flags,
> + txq->lso_tun_fmt);
> + if (flags & NIX_TX_MULTI_SEG_F) {
> + const uint16_t segdw =
> + cn10k_nix_prepare_mseg(
> + m, (uint64_t *)lmt_addr,
> + flags);
> + pa = txq->io_addr | ((segdw - 1) << 4);
> + } else {
> + pa = txq->io_addr |
> + (cn10k_nix_tx_ext_subs(flags) + 1)
> + << 4;
> + }
> + if (!sched_type)
> + roc_sso_hws_head_wait(base +
> + SSOW_LF_GWS_TAG);
> +
> + roc_lmt_submit_steorl(lmt_id, pa);
> + }
> + } else {
> + txq = (struct cn10k_eth_txq *)
> + txq_data[port[0]][queue[0]];
> + cn10k_nix_xmit_pkts_vector(txq, &mbufs[i], 4, cmd, base
> + + SSOW_LF_GWS_TAG,
> + flags | NIX_TX_VWQE_F);
> + }
> + }
> +}
> +
> static __rte_always_inline uint16_t
> cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
> uint64_t *cmd,
> const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
> const uint32_t flags)
> {
> - const struct cn10k_eth_txq *txq;
> - struct rte_mbuf *m = ev->mbuf;
> - uint16_t ref_cnt = m->refcnt;
> + struct cn10k_eth_txq *txq;
> + struct rte_mbuf *m;
> uintptr_t lmt_addr;
> + uint16_t ref_cnt;
> uint16_t lmt_id;
> uintptr_t pa;
>
> lmt_addr = ws->lmt_base;
> ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
> +
> + if (ev->event_type & RTE_EVENT_TYPE_VECTOR) {
> + struct rte_mbuf **mbufs = ev->vec->mbufs;
> + uint64_t meta = *(uint64_t *)ev->vec;
> +
> + if (meta & BIT(31)) {
> + txq = (struct cn10k_eth_txq *)
> + txq_data[meta >> 32][meta >> 48];
> +
> + cn10k_nix_xmit_pkts_vector(
> + txq, mbufs, meta & 0xFFFF, cmd,
> + ws->tx_base + SSOW_LF_GWS_TAG,
> + flags | NIX_TX_VWQE_F);
> + } else {
> + cn10k_sso_vwqe_split_tx(
> + mbufs, meta & 0xFFFF, cmd, lmt_id, lmt_addr,
> + ev->sched_type, ws->tx_base, txq_data, flags);
> + }
> + rte_mempool_put(rte_mempool_from_obj(ev->vec), ev->vec);
> + return (meta & 0xFFFF);
> + }
> +
> + m = ev->mbuf;
> + ref_cnt = m->refcnt;
> txq = cn10k_sso_hws_xtract_meta(m, txq_data);
> cn10k_nix_tx_skeleton(txq, cmd, flags);
> /* Perform header writes before barrier for TSO */
> @@ -346,7 +437,7 @@ cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
> pa = txq->io_addr | (cn10k_nix_tx_ext_subs(flags) + 1) << 4;
> }
> if (!ev->sched_type)
> - cnxk_sso_hws_head_wait(ws->tx_base + SSOW_LF_GWS_TAG);
> + roc_sso_hws_head_wait(ws->tx_base + SSOW_LF_GWS_TAG);
>
> roc_lmt_submit_steorl(lmt_id, pa);
>
> @@ -357,7 +448,6 @@ cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
>
> cnxk_sso_hws_swtag_flush(ws->tx_base + SSOW_LF_GWS_TAG,
> ws->tx_base + SSOW_LF_GWS_OP_SWTAG_FLUSH);
> -
> return 1;
> }
>
> diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
> index 3f9751211a..cc1e141957 100644
> --- a/drivers/event/cnxk/cn9k_worker.h
> +++ b/drivers/event/cnxk/cn9k_worker.h
> @@ -466,7 +466,7 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
> const uint16_t segdw = cn9k_nix_prepare_mseg(m, cmd, flags);
> if (!CNXK_TT_FROM_EVENT(ev->event)) {
> cn9k_nix_xmit_mseg_prep_lmt(cmd, txq->lmt_addr, segdw);
> - cnxk_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
> + roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
> cn9k_sso_txq_fc_wait(txq);
> if (cn9k_nix_xmit_submit_lmt(txq->io_addr) == 0)
> cn9k_nix_xmit_mseg_one(cmd, txq->lmt_addr,
> @@ -478,7 +478,7 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
> } else {
> if (!CNXK_TT_FROM_EVENT(ev->event)) {
> cn9k_nix_xmit_prep_lmt(cmd, txq->lmt_addr, flags);
> - cnxk_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
> + roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
> cn9k_sso_txq_fc_wait(txq);
> if (cn9k_nix_xmit_submit_lmt(txq->io_addr) == 0)
> cn9k_nix_xmit_one(cmd, txq->lmt_addr,
> diff --git a/drivers/event/cnxk/cnxk_worker.h b/drivers/event/cnxk/cnxk_worker.h
> index 7891b749df..9f9ceab8a1 100644
> --- a/drivers/event/cnxk/cnxk_worker.h
> +++ b/drivers/event/cnxk/cnxk_worker.h
> @@ -75,26 +75,4 @@ cnxk_sso_hws_swtag_wait(uintptr_t tag_op)
> #endif
> }
>
> -static __rte_always_inline void
> -cnxk_sso_hws_head_wait(uintptr_t tag_op)
> -{
> -#ifdef RTE_ARCH_ARM64
> - uint64_t tag;
> -
> - asm volatile(" ldr %[tag], [%[tag_op]] \n"
> - " tbnz %[tag], 35, done%= \n"
> - " sevl \n"
> - "rty%=: wfe \n"
> - " ldr %[tag], [%[tag_op]] \n"
> - " tbz %[tag], 35, rty%= \n"
> - "done%=: \n"
> - : [tag] "=&r"(tag)
> - : [tag_op] "r"(tag_op));
> -#else
> - /* Wait for the HEAD to be set */
> - while (!(plt_read64(tag_op) & BIT_ULL(35)))
> - ;
> -#endif
> -}
> -
> #endif
> diff --git a/drivers/net/cnxk/cn10k_tx.c b/drivers/net/cnxk/cn10k_tx.c
> index 1f30bab59a..0e1276c60b 100644
> --- a/drivers/net/cnxk/cn10k_tx.c
> +++ b/drivers/net/cnxk/cn10k_tx.c
> @@ -16,7 +16,7 @@
> !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F)) \
> return 0; \
> return cn10k_nix_xmit_pkts(tx_queue, tx_pkts, pkts, cmd, \
> - flags); \
> + 0, flags); \
> }
>
> NIX_TX_FASTPATH_MODES
> diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
> index eb148b8e77..f75cae07ae 100644
> --- a/drivers/net/cnxk/cn10k_tx.h
> +++ b/drivers/net/cnxk/cn10k_tx.h
> @@ -18,6 +18,7 @@
> * Defining it from backwards to denote its been
> * not used as offload flags to pick function
> */
> +#define NIX_TX_VWQE_F BIT(14)
> #define NIX_TX_MULTI_SEG_F BIT(15)
>
> #define NIX_TX_NEED_SEND_HDR_W1 \
> @@ -519,7 +520,7 @@ cn10k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
>
> static __rte_always_inline uint16_t
> cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
> - uint64_t *cmd, const uint16_t flags)
> + uint64_t *cmd, uintptr_t base, const uint16_t flags)
> {
> struct cn10k_eth_txq *txq = tx_queue;
> const rte_iova_t io_addr = txq->io_addr;
> @@ -528,14 +529,15 @@ cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
> uint64_t lso_tun_fmt;
> uint64_t data;
>
> - NIX_XMIT_FC_OR_RETURN(txq, pkts);
> + if (!(flags & NIX_TX_VWQE_F)) {
> + NIX_XMIT_FC_OR_RETURN(txq, pkts);
> + /* Reduce the cached count */
> + txq->fc_cache_pkts -= pkts;
> + }
>
> /* Get cmd skeleton */
> cn10k_nix_tx_skeleton(txq, cmd, flags);
>
> - /* Reduce the cached count */
> - txq->fc_cache_pkts -= pkts;
> -
> if (flags & NIX_TX_OFFLOAD_TSO_F)
> lso_tun_fmt = txq->lso_tun_fmt;
>
> @@ -558,6 +560,9 @@ cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
> lmt_addr += (1ULL << ROC_LMT_LINE_SIZE_LOG2);
> }
>
> + if (flags & NIX_TX_VWQE_F)
> + roc_sso_hws_head_wait(base);
> +
> /* Trigger LMTST */
> if (burst > 16) {
> data = cn10k_nix_tx_steor_data(flags);
> @@ -604,7 +609,8 @@ cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
>
> static __rte_always_inline uint16_t
> cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
> - uint16_t pkts, uint64_t *cmd, const uint16_t flags)
> + uint16_t pkts, uint64_t *cmd, uintptr_t base,
> + const uint16_t flags)
> {
> struct cn10k_eth_txq *txq = tx_queue;
> uintptr_t pa0, pa1, lmt_addr = txq->lmt_base;
> @@ -652,6 +658,9 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
> shft += 3;
> }
>
> + if (flags & NIX_TX_VWQE_F)
> + roc_sso_hws_head_wait(base);
> +
> data0 = (uint64_t)data128;
> data1 = (uint64_t)(data128 >> 64);
> /* Make data0 similar to data1 */
> @@ -984,7 +993,8 @@ cn10k_nix_prep_lmt_mseg_vector(struct rte_mbuf **mbufs, uint64x2_t *cmd0,
>
> static __rte_always_inline uint16_t
> cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
> - uint16_t pkts, uint64_t *cmd, const uint16_t flags)
> + uint16_t pkts, uint64_t *cmd, uintptr_t base,
> + const uint16_t flags)
> {
> uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
> uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
> @@ -1013,13 +1023,17 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
> uint64_t data[2];
> } wd;
>
> - NIX_XMIT_FC_OR_RETURN(txq, pkts);
> -
> - scalar = pkts & (NIX_DESCS_PER_LOOP - 1);
> - pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
> + if (!(flags & NIX_TX_VWQE_F)) {
> + NIX_XMIT_FC_OR_RETURN(txq, pkts);
> + scalar = pkts & (NIX_DESCS_PER_LOOP - 1);
> + pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
> + /* Reduce the cached count */
> + txq->fc_cache_pkts -= pkts;
> + } else {
> + scalar = pkts & (NIX_DESCS_PER_LOOP - 1);
> + pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
> + }
>
> - /* Reduce the cached count */
> - txq->fc_cache_pkts -= pkts;
> /* Perform header writes before barrier for TSO */
> if (flags & NIX_TX_OFFLOAD_TSO_F) {
> for (i = 0; i < pkts; i++)
> @@ -1973,6 +1987,9 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
> if (flags & NIX_TX_MULTI_SEG_F)
> wd.data[0] >>= 16;
>
> + if (flags & NIX_TX_VWQE_F)
> + roc_sso_hws_head_wait(base);
> +
> /* Trigger LMTST */
> if (lnum > 16) {
> if (!(flags & NIX_TX_MULTI_SEG_F))
> @@ -2029,10 +2046,11 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
> if (unlikely(scalar)) {
> if (flags & NIX_TX_MULTI_SEG_F)
> pkts += cn10k_nix_xmit_pkts_mseg(tx_queue, tx_pkts,
> - scalar, cmd, flags);
> + scalar, cmd, base,
> + flags);
> else
> pkts += cn10k_nix_xmit_pkts(tx_queue, tx_pkts, scalar,
> - cmd, flags);
> + cmd, base, flags);
> }
>
> return pkts;
> @@ -2041,13 +2059,15 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
> #else
> static __rte_always_inline uint16_t
> cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
> - uint16_t pkts, uint64_t *cmd, const uint16_t flags)
> + uint16_t pkts, uint64_t *cmd, uintptr_t base,
> + const uint16_t flags)
> {
> RTE_SET_USED(tx_queue);
> RTE_SET_USED(tx_pkts);
> RTE_SET_USED(pkts);
> RTE_SET_USED(cmd);
> RTE_SET_USED(flags);
> + RTE_SET_USED(base);
> return 0;
> }
> #endif
> diff --git a/drivers/net/cnxk/cn10k_tx_mseg.c b/drivers/net/cnxk/cn10k_tx_mseg.c
> index 33f6754722..4ea4c8a4e5 100644
> --- a/drivers/net/cnxk/cn10k_tx_mseg.c
> +++ b/drivers/net/cnxk/cn10k_tx_mseg.c
> @@ -18,7 +18,8 @@
> !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F)) \
> return 0; \
> return cn10k_nix_xmit_pkts_mseg(tx_queue, tx_pkts, pkts, cmd, \
> - (flags) | NIX_TX_MULTI_SEG_F); \
> + 0, (flags) \
> + | NIX_TX_MULTI_SEG_F); \
> }
>
> NIX_TX_FASTPATH_MODES
> diff --git a/drivers/net/cnxk/cn10k_tx_vec.c b/drivers/net/cnxk/cn10k_tx_vec.c
> index 34e3737501..a0350496ab 100644
> --- a/drivers/net/cnxk/cn10k_tx_vec.c
> +++ b/drivers/net/cnxk/cn10k_tx_vec.c
> @@ -18,7 +18,7 @@
> !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F)) \
> return 0; \
> return cn10k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd,\
> - (flags)); \
> + 0, (flags)); \
> }
>
> NIX_TX_FASTPATH_MODES
> diff --git a/drivers/net/cnxk/cn10k_tx_vec_mseg.c b/drivers/net/cnxk/cn10k_tx_vec_mseg.c
> index 1fad81dbad..7f98f79b97 100644
> --- a/drivers/net/cnxk/cn10k_tx_vec_mseg.c
> +++ b/drivers/net/cnxk/cn10k_tx_vec_mseg.c
> @@ -16,7 +16,7 @@
> !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F)) \
> return 0; \
> return cn10k_nix_xmit_pkts_vector( \
> - tx_queue, tx_pkts, pkts, cmd, \
> + tx_queue, tx_pkts, pkts, cmd, 0, \
> (flags) | NIX_TX_MULTI_SEG_F); \
> }
>
> --
> 2.17.1
>
More information about the dev mailing list