[dpdk-dev] [PATCH] net/mlx5: fix completion queue drain loop

Gavin Hu (Arm Technology China) Gavin.Hu at arm.com
Mon Sep 2 04:38:33 CEST 2019


Hi Viacheslav,

> -----Original Message-----
> From: dev <dev-bounces at dpdk.org> On Behalf Of Viacheslav Ovsiienko
> Sent: Sunday, August 4, 2019 9:57 PM
> To: dev at dpdk.org
> Cc: yskoh at mellanox.com; shahafs at mellanox.com
> Subject: [dpdk-dev] [PATCH] net/mlx5: fix completion queue drain loop
>
> The completion loop speed optimizations for error-free
> operations are done - no CQE field fetch on each loop
> iteration. Also, code size is oprimized - the flush
> buffers routine is invoked once.
>
> Fixes: 318ea4cfa1b1 ("net/mlx5: fix Tx completion descriptors fetching loop")
>
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo at mellanox.com>
> ---
>  drivers/net/mlx5/mlx5_rxtx.c | 98 +++++++++++++++++++++++++++++----------
> -----
>  drivers/net/mlx5/mlx5_rxtx.h |  8 ++--
>  2 files changed, 68 insertions(+), 38 deletions(-)
>
> diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
> index 1ec3793..a890f41 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.c
> +++ b/drivers/net/mlx5/mlx5_rxtx.c
> @@ -654,9 +654,10 @@ enum mlx5_txcmp_code {
>   *   Pointer to the error CQE.
>   *
>   * @return
> - *   The last Tx buffer element to free.
> + *   Negative value if queue recovery failed,
> + *   the last Tx buffer element to free otherwise.
>   */
> -uint16_t
> +int
>  mlx5_tx_error_cqe_handle(struct mlx5_txq_data *restrict txq,
>                        volatile struct mlx5_err_cqe *err_cqe)
>  {
> @@ -706,6 +707,7 @@ enum mlx5_txcmp_code {
>                       return txq->elts_head;
>               }
>               /* Recovering failed - try again later on the same WQE. */
> +             return -1;
>       } else {
>               txq->cq_ci++;
>       }
> @@ -2010,6 +2012,45 @@ enum mlx5_txcmp_code {
>  }
>
>  /**
> + * Update completion queue consuming index via doorbell
> + * and flush the completed data buffers.
> + *
> + * @param txq
> + *   Pointer to TX queue structure.
> + * @param valid CQE pointer
> + *   if not NULL update txq->wqe_pi and flush the buffers
> + * @param itail
> + *   if not negative - flush the buffers till this index.
> + * @param olx
> + *   Configured Tx offloads mask. It is fully defined at
> + *   compile time and may be used for optimization.
> + */
> +static __rte_always_inline void
> +mlx5_tx_comp_flush(struct mlx5_txq_data *restrict txq,
> +                volatile struct mlx5_cqe *last_cqe,
> +                int itail,
> +                unsigned int olx __rte_unused)
> +{
> +     uint16_t tail;
> +
> +     if (likely(last_cqe != NULL)) {
> +             txq->wqe_pi = rte_be_to_cpu_16(last_cqe->wqe_counter);
> +             tail = ((volatile struct mlx5_wqe_cseg *)
> +                     (txq->wqes + (txq->wqe_pi & txq->wqe_m)))->misc;
> +     } else if (itail >= 0) {
> +             tail = (uint16_t)itail;
> +     } else {
> +             return;
> +     }
> +     rte_compiler_barrier();
the rte_compiler_barrier is sufficient for x86 like strong memory ordering model, but insufficient for PPC/aarch64 weak memory ordering models.
It should be rte_cio_wmb here to force memory writes above(++txq->cq_ci?) to complete before letting HW knows that.
> +     *txq->cq_db = rte_cpu_to_be_32(txq->cq_ci);
> +     if (likely(tail != txq->elts_tail)) {
> +             mlx5_tx_free_elts(txq, tail, olx);
> +             assert(tail == txq->elts_tail);
> +     }
> +}
> +
> +/**
>   * Manage TX completions. This routine checks the CQ for
>   * arrived CQEs, deduces the last accomplished WQE in SQ,
>   * updates SQ producing index and frees all completed mbufs.
> @@ -2028,10 +2069,11 @@ enum mlx5_txcmp_code {
>                         unsigned int olx __rte_unused)
>  {
>       unsigned int count = MLX5_TX_COMP_MAX_CQE;
> -     bool update = false;
> -     uint16_t tail = txq->elts_tail;
> +     volatile struct mlx5_cqe *last_cqe = NULL;
>       int ret;
>
> +     static_assert(MLX5_CQE_STATUS_HW_OWN < 0, "Must be negative
> value");
> +     static_assert(MLX5_CQE_STATUS_SW_OWN < 0, "Must be negative
> value");
>       do {
>               volatile struct mlx5_cqe *cqe;
>
> @@ -2043,32 +2085,30 @@ enum mlx5_txcmp_code {
>                               assert(ret == MLX5_CQE_STATUS_HW_OWN);
>                               break;
>                       }
> -                     /* Some error occurred, try to restart. */
> +                     /*
> +                      * Some error occurred, try to restart.
> +                      * We have no barrier after WQE related Doorbell
> +                      * written, make sure all writes are completed
> +                      * here, before we might perform SQ reset.
> +                      */
>                       rte_wmb();
> -                     tail = mlx5_tx_error_cqe_handle
> +                     ret = mlx5_tx_error_cqe_handle
>                               (txq, (volatile struct mlx5_err_cqe *)cqe);
> -                     if (likely(tail != txq->elts_tail)) {
> -                             mlx5_tx_free_elts(txq, tail, olx);
> -                             assert(tail == txq->elts_tail);
> -                     }
> -                     /* Allow flushing all CQEs from the queue. */
> -                     count = txq->cqe_s;
> -             } else {
> -                     volatile struct mlx5_wqe_cseg *cseg;
> -
> -                     /* Normal transmit completion. */
> -                     ++txq->cq_ci;
> -                     rte_cio_rmb();
> -                     txq->wqe_pi = rte_be_to_cpu_16(cqe->wqe_counter);
> -                     cseg = (volatile struct mlx5_wqe_cseg *)
> -                             (txq->wqes + (txq->wqe_pi & txq->wqe_m));
> -                     tail = cseg->misc;
> +                     /*
> +                      * Flush buffers, update consuming index
> +                      * if recovery succeeded. Otherwise
> +                      * just try to recover later.
> +                      */
> +                     last_cqe = NULL;
> +                     break;
>               }
> +             /* Normal transmit completion. */
> +             ++txq->cq_ci;
> +             last_cqe = cqe;
>  #ifndef NDEBUG
>               if (txq->cq_pi)
>                       --txq->cq_pi;
>  #endif
> -             update = true;
>       /*
>        * We have to restrict the amount of processed CQEs
>        * in one tx_burst routine call. The CQ may be large
> @@ -2078,17 +2118,7 @@ enum mlx5_txcmp_code {
>        * latency.
>        */
>       } while (--count);
> -     if (likely(tail != txq->elts_tail)) {
> -             /* Free data buffers from elts. */
> -             mlx5_tx_free_elts(txq, tail, olx);
> -             assert(tail == txq->elts_tail);
> -     }
> -     if (likely(update)) {
> -             /* Update the consumer index. */
> -             rte_compiler_barrier();
> -             *txq->cq_db =
> -             rte_cpu_to_be_32(txq->cq_ci);
> -     }
> +     mlx5_tx_comp_flush(txq, last_cqe, ret, olx);
>  }
>
>  /**
> diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
> index c209d99..aaa02a2 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.h
> +++ b/drivers/net/mlx5/mlx5_rxtx.h
> @@ -400,7 +400,7 @@ struct mlx5_txq_ctrl *mlx5_txq_new(struct
> rte_eth_dev *dev, uint16_t idx,
>  void mlx5_set_ptype_table(void);
>  void mlx5_set_cksum_table(void);
>  void mlx5_set_swp_types_table(void);
> -__rte_noinline uint16_t mlx5_tx_error_cqe_handle
> +__rte_noinline int mlx5_tx_error_cqe_handle
>                               (struct mlx5_txq_data *restrict txq,
>                                volatile struct mlx5_err_cqe *err_cqe);
>  uint16_t mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t
> pkts_n);
> @@ -499,9 +499,9 @@ int mlx5_dma_unmap(struct rte_pci_device *pdev,
> void *addr, uint64_t iova,
>
>  /* CQE status. */
>  enum mlx5_cqe_status {
> -     MLX5_CQE_STATUS_SW_OWN,
> -     MLX5_CQE_STATUS_HW_OWN,
> -     MLX5_CQE_STATUS_ERR,
> +     MLX5_CQE_STATUS_SW_OWN = -1,
> +     MLX5_CQE_STATUS_HW_OWN = -2,
> +     MLX5_CQE_STATUS_ERR = -3,
>  };
>
>  /**
> --
> 1.8.3.1

IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.


More information about the dev mailing list