[PATCH v2 2/3] net/octeon_ep: use SSE instructions for Rx routine
Jerin Jacob
jerinjacobk at gmail.com
Wed Dec 6 13:17:16 CET 2023
On Sat, Nov 25, 2023 at 10:52 PM <pbhagavatula at marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula at marvell.com>
>
> Optimize Rx routine to use SSE instructions.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula at marvell.com>
> ---
> diff --git a/drivers/net/octeon_ep/cnxk_ep_rx_sse.c b/drivers/net/octeon_ep/cnxk_ep_rx_sse.c
> new file mode 100644
> index 0000000000..531f75a2e0
> --- /dev/null
> +++ b/drivers/net/octeon_ep/cnxk_ep_rx_sse.c
> @@ -0,0 +1,124 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2023 Marvell.
> + */
> +
> +#include "cnxk_ep_rx.h"
> +
> +static __rte_always_inline uint32_t
> +hadd(__m128i x)
> +{
> + __m128i hi64 = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2));
> + __m128i sum64 = _mm_add_epi32(hi64, x);
> + __m128i hi32 = _mm_shufflelo_epi16(sum64, _MM_SHUFFLE(1, 0, 3, 2));
> + __m128i sum32 = _mm_add_epi32(sum64, hi32);
> + return _mm_cvtsi128_si32(sum32);
> +}
> +
> +static __rte_always_inline void
> +cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)
> +{
> + struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
> + uint32_t bytes_rsvd = 0, read_idx = droq->read_idx;
> + uint32_t idx0, idx1, idx2, idx3;
> + struct rte_mbuf *m0, *m1, *m2, *m3;
> + uint16_t nb_desc = droq->nb_desc;
> + uint16_t pkts = 0;
> +
> + idx0 = read_idx;
> + while (pkts < new_pkts) {
> + const __m128i bswap_mask = _mm_set_epi8(0xFF, 0xFF, 12, 13, 0xFF, 0xFF, 8, 9, 0xFF,
> + 0xFF, 4, 5, 0xFF, 0xFF, 0, 1);
> + const __m128i cpy_mask = _mm_set_epi8(0xFF, 0xFF, 9, 8, 0xFF, 0xFF, 9, 8, 0xFF,
> + 0xFF, 1, 0, 0xFF, 0xFF, 1, 0);
> + __m128i s01, s23;
> +
> + idx1 = otx_ep_incr_index(idx0, 1, nb_desc);
> + idx2 = otx_ep_incr_index(idx1, 1, nb_desc);
> + idx3 = otx_ep_incr_index(idx2, 1, nb_desc);
> +
> + m0 = recv_buf_list[idx0];
> + m1 = recv_buf_list[idx1];
> + m2 = recv_buf_list[idx2];
> + m3 = recv_buf_list[idx3];
> +
Please add some comments explaining the SSE usage in this section.
> + s01 = _mm_set_epi32(rte_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48,
> + rte_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48,
> + rte_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48,
> + rte_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48);
> + s01 = _mm_shuffle_epi8(s01, bswap_mask);
> + bytes_rsvd += hadd(s01);
> + s23 = _mm_shuffle_epi32(s01, _MM_SHUFFLE(3, 3, 1, 1));
> + s01 = _mm_shuffle_epi8(s01, cpy_mask);
> + s23 = _mm_shuffle_epi8(s23, cpy_mask);
> diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.h b/drivers/net/octeon_ep/otx_ep_rxtx.h
> index b159c32cae..af657dba50 100644
> --- a/drivers/net/octeon_ep/otx_ep_rxtx.h
> +++ b/drivers/net/octeon_ep/otx_ep_rxtx.h
> @@ -48,12 +48,22 @@ cnxk_ep_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **pkts, uint16_t nb_pkts)
> uint16_t
> cnxk_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
>
> +#ifdef RTE_ARCH_X86
We can skip the #ifdef around the function declaration. The same comment applies to AVX.
> +uint16_t
> +cnxk_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
> +#endif
> +
> uint16_t
> cnxk_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
>
> uint16_t
> cn9k_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
>
> +#ifdef RTE_ARCH_X86
We can skip the #ifdef around the function declaration. The same comment applies to AVX.
> +uint16_t
> +cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
> +#endif
> +
> uint16_t
> cn9k_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
> #endif /* _OTX_EP_RXTX_H_ */
> --
> 2.25.1
>
More information about the dev mailing list