[dpdk-dev] [PATCH 1/2] net/mlx4: get back RX flow functionality
Vasily Philipov
vasilyf at mellanox.com
Thu Aug 3 10:49:14 CEST 2017
Work with the hardware directly on the Rx fast path instead of going
through Verbs calls. The number of scatter entries is now calculated
on the fly, according to the maximum expected packet size.
Signed-off-by: Vasily Philipov <vasilyf at mellanox.com>
---
The series depends on:
http://dpdk.org/dev/patchwork/patch/27313/
---
drivers/net/mlx4/mlx4.h | 3 +
drivers/net/mlx4/mlx4_prm.h | 405 ++++++++++++++++++++++++++++++++++++++++++
drivers/net/mlx4/mlx4_rxq.c | 205 ++++++++++-----------
drivers/net/mlx4/mlx4_rxtx.c | 266 ++++++++++++++++-----------
drivers/net/mlx4/mlx4_rxtx.h | 18 +-
drivers/net/mlx4/mlx4_utils.h | 20 +++
6 files changed, 688 insertions(+), 229 deletions(-)
create mode 100644 drivers/net/mlx4/mlx4_prm.h
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index 1cd4db3..4b7f98b 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -42,6 +42,7 @@
#pragma GCC diagnostic ignored "-Wpedantic"
#endif
#include <infiniband/verbs.h>
+#include "mlx4_prm.h"
#ifdef PEDANTIC
#pragma GCC diagnostic error "-Wpedantic"
#endif
@@ -57,6 +58,8 @@
/* Maximum size for inline data. */
#define MLX4_PMD_MAX_INLINE 0
+#include <rte_ethdev.h>
+
/*
* Maximum number of cached Memory Pools (MPs) per TX queue. Each RTE MP
* from which buffers are to be transmitted will have to be mapped by this
diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
new file mode 100644
index 0000000..03c1192
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_prm.h
@@ -0,0 +1,405 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright 2017 6WIND S.A.
+ * Copyright 2017 Mellanox.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of 6WIND S.A. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef RTE_PMD_MLX4_PRM_H_
+#define RTE_PMD_MLX4_PRM_H_
+
+#include <arpa/inet.h>
+
+#include <infiniband/arch.h>
+#include <infiniband/driver.h>
+#include <infiniband/verbs.h>
+
+#define MLX4_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+#if MLX4_GCC_VERSION >= 403
+# define __MLX4_ALGN_FUNC__ __attribute__((noinline, aligned(64)))
+# define __MLX4_ALGN_DATA__ __attribute__((aligned(64)))
+#else
+# define __MLX4_ALGN_FUNC__
+# define __MLX4_ALGN_DATA__
+#endif
+
+/* Maximum number of physical ports. */
+#define MLX4_PMD_MAX_PHYS_PORTS 2
+
+/* Generic macro to convert MLX4 to IBV flags. */
+#define MLX4_TRANSPOSE(val, from, to) \
+ (((from) >= (to)) ? \
+ (((val) & (from)) / ((from) / (to))) : \
+ (((val) & (from)) * ((to) / (from))))
+
+struct list_head {
+ struct list_head *next, *prev;
+};
+
+enum {
+ MLX4_INVALID_LKEY = 0x100,
+};
+
+enum {
+ MLX4_MAX_BFS_IN_PAGE = 8,
+ MLX4_BFS_STRIDE = 512,
+};
+
+enum {
+ MLX4_CQE_L2_TUNNEL_IPV4 = 1U << 25,
+ MLX4_CQE_L2_TUNNEL_L4_CSUM = 1U << 26,
+ MLX4_CQE_L2_TUNNEL = 1U << 27,
+ MLX4_CQE_VLAN_PRESENT_MASK = 1U << 29,
+ MLX4_CQE_L2_TUNNEL_IPOK = 1U << 31,
+ MLX4_CQE_QPN_MASK = 0xffffff,
+};
+
+enum {
+ MLX4_QP_TABLE_BITS = 8,
+ MLX4_QP_TABLE_SIZE = 1 << MLX4_QP_TABLE_BITS,
+ MLX4_QP_TABLE_MASK = MLX4_QP_TABLE_SIZE - 1
+};
+
+enum {
+ MLX4_XSRQ_TABLE_BITS = 8,
+ MLX4_XSRQ_TABLE_SIZE = 1 << MLX4_XSRQ_TABLE_BITS,
+ MLX4_XSRQ_TABLE_MASK = MLX4_XSRQ_TABLE_SIZE - 1
+};
+
+struct mlx4_wqe_data_seg {
+ uint32_t byte_count;
+ uint32_t lkey;
+ uint64_t addr;
+};
+
+struct mlx4_xsrq_table {
+ struct {
+ struct mlx4_srq **table;
+ int refcnt;
+ } xsrq_table[MLX4_XSRQ_TABLE_SIZE];
+ pthread_mutex_t mutex;
+ int num_xsrq;
+ int shift;
+ int mask;
+};
+
+enum qp_cap_cache {
+ MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP = 1 << 1,
+ MLX4_RX_VXLAN = 1 << 2
+};
+
+enum mlx4_db_type {
+ MLX4_DB_TYPE_CQ,
+ MLX4_DB_TYPE_RQ,
+ MLX4_NUM_DB_TYPE,
+};
+
+enum mlx4_lock_type {
+ MLX4_SPIN_LOCK = 0,
+ MLX4_MUTEX = 1,
+};
+
+enum mlx4_lock_state {
+ MLX4_USE_LOCK,
+ MLX4_LOCKED,
+ MLX4_UNLOCKED
+};
+
+struct mlx4_spinlock {
+ pthread_spinlock_t lock;
+ enum mlx4_lock_state state;
+};
+
+struct mlx4_lock {
+ pthread_mutex_t mutex;
+ pthread_spinlock_t slock;
+ enum mlx4_lock_state state;
+ enum mlx4_lock_type type;
+};
+
+/* struct for BF dedicated for one QP */
+struct mlx4_dedic_bf {
+ void *address;
+};
+
+/* struct for the common BF which may be shared by many QPs */
+struct mlx4_cmn_bf {
+ void *address;
+ /*
+ * Protect usage of BF address field including data written
+ * to the BF and the BF buffer toggling.
+ */
+ struct mlx4_lock lock;
+};
+
+union mlx4_bf {
+ struct mlx4_dedic_bf dedic;
+ struct mlx4_cmn_bf cmn;
+};
+
+struct mlx4_bfs_data {
+ struct mlx4_dedic_bf dedic_bf[MLX4_MAX_BFS_IN_PAGE - 1];
+ struct mlx4_cmn_bf cmn_bf;
+ uint8_t dedic_bf_used[MLX4_MAX_BFS_IN_PAGE - 1];
+ uint8_t dedic_bf_free;
+ /*
+ * protect dedicated BFs managing
+ * including dedic_bf_used and
+ * dedic_bf_free fields
+ */
+ struct mlx4_spinlock dedic_bf_lock;
+ void *page;
+ uint16_t buf_size;
+ uint8_t num_dedic_bfs;
+};
+
+struct mlx4_db_page;
+
+struct mlx4_context {
+ union {
+ struct ibv_context ibv_ctx;
+ };
+ /* protects send_db_list and send_db_num_uars */
+ struct mlx4_spinlock send_db_lock;
+ struct list_head send_db_list;
+ unsigned int send_db_num_uars;
+ void *uar;
+ struct mlx4_spinlock uar_lock;
+ struct mlx4_bfs_data bfs;
+ int bf_regs_per_page;
+ int max_ctx_res_domain;
+ struct {
+ struct mlx4_qp **table;
+ int refcnt;
+ } qp_table[MLX4_QP_TABLE_SIZE];
+ pthread_mutex_t qp_table_mutex;
+ int num_qps;
+ int qp_table_shift;
+ int qp_table_mask;
+ int max_qp_wr;
+ int max_sge;
+ int max_cqe;
+ uint64_t exp_device_cap_flags;
+ struct {
+ int offset;
+ int mult;
+ int shift;
+ uint64_t mask;
+ } core_clk;
+ void *hca_core_clock;
+ struct mlx4_xsrq_table xsrq_table;
+ struct mlx4_db_page *db_list[MLX4_NUM_DB_TYPE];
+ pthread_mutex_t db_list_mutex;
+ int cqe_size;
+ int prefer_bf;
+ struct mlx4_spinlock hugetlb_lock;
+ struct list_head hugetlb_list;
+ int stall_enable;
+ pthread_mutex_t task_mutex;
+ struct {
+ uint8_t valid;
+ uint8_t link_layer;
+ enum ibv_port_cap_flags caps;
+ } port_query_cache[MLX4_PMD_MAX_PHYS_PORTS];
+ pthread_mutex_t env_mtx;
+ int env_initialized;
+};
+
+struct mlx4_buf {
+ void *buf;
+ void *hmem;
+ size_t length;
+ int base;
+};
+
+struct mlx4_pd {
+ struct ibv_pd ibv_pd;
+ uint32_t pdn;
+};
+
+struct mlx4_cq {
+ struct ibv_cq ibv_cq __MLX4_ALGN_DATA__;
+ uint32_t pattern;
+ struct mlx4_buf buf;
+ struct mlx4_buf resize_buf;
+ struct mlx4_lock lock;
+ uint32_t cqn;
+ uint32_t cons_index;
+ uint32_t wait_index;
+ uint32_t wait_count;
+ uint32_t *set_ci_db;
+ uint32_t *arm_db;
+ int arm_sn;
+ int stall_next_poll;
+ int stall_enable;
+ int cqe_size;
+ int creation_flags;
+ struct mlx4_qp *last_qp;
+ uint32_t model_flags; /* use mlx4_cq_model_flags */
+};
+
+struct mlx4_wq {
+ uint64_t *wrid;
+ struct mlx4_lock lock;
+ int wqe_cnt;
+ unsigned max_post;
+ char *buf;
+ unsigned head;
+ unsigned tail;
+ int max_gs;
+ int wqe_shift;
+ unsigned head_en_index;
+ unsigned head_en_count;
+};
+
+struct mlx4_inlr_rbuff {
+ void *rbuff;
+ int rlen;
+};
+
+struct mlx4_inlr_sg_list {
+ struct mlx4_inlr_rbuff *sg_list;
+ int list_len;
+};
+
+struct mlx4_inlr_buff {
+ struct mlx4_inlr_sg_list *buff;
+ int len;
+};
+
+struct mlx4_qp {
+ struct verbs_qp verbs_qp;
+ uint32_t pattern;
+ int buf_size;
+ uint32_t model_flags; /* use mlx4_qp_model_flags */
+ /* hot post send data */
+ struct mlx4_wq sq __MLX4_ALGN_DATA__;
+ int (*post_send_one)(struct ibv_send_wr *wr,
+ struct mlx4_qp *qp,
+ void *wqe, int *total_size,
+ int *inl, unsigned int ind);
+ union mlx4_bf *bf;
+ uint32_t *sdb; /* send DB */
+ struct mlx4_buf buf;
+ unsigned last_db_head;
+ uint32_t doorbell_qpn;
+ uint32_t create_flags;
+ uint16_t max_inline_data;
+ uint16_t bf_buf_size;
+ uint16_t sq_spare_wqes;
+ uint8_t srcrb_flags_tbl[16];
+ uint8_t db_method;
+ uint8_t qp_type;
+ /* RAW_PACKET hot data */
+ uint8_t link_layer;
+ uint8_t is_masked_atomic;
+ /* post receive hot data */
+ struct mlx4_wq rq __MLX4_ALGN_DATA__;
+ uint32_t *db;
+ uint32_t max_inlr_sg;
+ int32_t cached_rx_csum_flags;
+ int32_t transposed_rx_csum_flags;
+ struct mlx4_inlr_buff inlr_buff;
+ uint8_t qp_cap_cache;
+};
+
+struct mlx4_cqe {
+ uint32_t vlan_my_qpn;
+ uint32_t immed_rss_invalid;
+ uint32_t g_mlpath_rqpn;
+ union {
+ struct {
+ union {
+ struct {
+ uint16_t sl_vid;
+ uint16_t rlid;
+ };
+ uint32_t timestamp_16_47;
+ };
+ uint16_t status;
+ uint8_t reserved2;
+ uint8_t badfcs_enc;
+ };
+ struct {
+ uint16_t reserved4;
+ uint8_t smac[6];
+ };
+ };
+ uint32_t byte_cnt;
+ uint16_t wqe_index;
+ uint16_t checksum;
+ uint8_t reserved5[1];
+ uint16_t timestamp_0_15;
+ uint8_t owner_sr_opcode;
+} __attribute__((packed));
+
+enum {
+ MLX4_CQE_OWNER_MASK = 0x80,
+ MLX4_CQE_IS_SEND_MASK = 0x40,
+ MLX4_CQE_INL_SCATTER_MASK = 0x20,
+ MLX4_CQE_OPCODE_MASK = 0x1f
+};
+
+enum {
+ MLX4_CQE_OPCODE_ERROR = 0x1e,
+ MLX4_CQE_OPCODE_RESIZE = 0x16,
+};
+
+enum {
+ MLX4_CQE_STATUS_L4_CSUM = 1 << 2,
+ MLX4_CQE_STATUS_IPV4 = 1 << 6,
+ MLX4_CQE_STATUS_IPV4F = 1 << 7,
+ MLX4_CQE_STATUS_IPV6 = 1 << 8,
+ MLX4_CQE_STATUS_IPV4OPT = 1 << 9,
+ MLX4_CQE_STATUS_TCP = 1 << 10,
+ MLX4_CQE_STATUS_UDP = 1 << 11,
+ MLX4_CQE_STATUS_IPOK = 1 << 12
+};
+
+#define to_mxxx(xxx, type) \
+ ((struct mlx4_##type *) \
+ ((uint8_t *)ib##xxx - offsetof(struct mlx4_##type, ibv_##xxx)))
+
+static inline struct mlx4_context *to_mctx(struct ibv_context *ibctx)
+{
+ return to_mxxx(ctx, context);
+}
+
+static inline struct mlx4_cq *to_mcq(struct ibv_cq *ibcq)
+{
+ return to_mxxx(cq, cq);
+}
+
+static inline struct mlx4_qp *to_mqp(struct ibv_qp *ibqp)
+{
+ return container_of(container_of(ibqp, struct verbs_qp, qp),
+ struct mlx4_qp, verbs_qp);
+}
+
+#endif /* RTE_PMD_MLX4_PRM_H_ */
diff --git a/drivers/net/mlx4/mlx4_rxq.c b/drivers/net/mlx4/mlx4_rxq.c
index 1456b5f..bbe9c89 100644
--- a/drivers/net/mlx4/mlx4_rxq.c
+++ b/drivers/net/mlx4/mlx4_rxq.c
@@ -78,103 +78,73 @@
*/
static int
mlx4_rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n,
- struct rte_mbuf **pool)
+ struct rte_mbuf *(*pool)[])
{
- unsigned int i;
- struct rxq_elt (*elts)[elts_n] =
- rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
- rxq->socket);
+ unsigned int i = 0;
+ const unsigned int sge_n = 1 << rxq->sge_n;
+ struct rte_mbuf *(*elts)[elts_n] =
+ rte_calloc_socket("RXQ", 1, sizeof(*elts), 0, rxq->socket);
if (elts == NULL) {
rte_errno = ENOMEM;
ERROR("%p: can't allocate packets array", (void *)rxq);
goto error;
}
- /* For each WR (packet). */
- for (i = 0; (i != elts_n); ++i) {
- struct rxq_elt *elt = &(*elts)[i];
- struct ibv_recv_wr *wr = &elt->wr;
- struct ibv_sge *sge = &(*elts)[i].sge;
+ rxq->elts = elts;
+ for (; i != elts_n; ++i) {
struct rte_mbuf *buf;
+ volatile struct mlx4_wqe_data_seg *scat =
+ &(*rxq->hw.wqes)[i];
if (pool != NULL) {
- buf = *(pool++);
+ buf = (*pool)[i];
assert(buf != NULL);
rte_pktmbuf_reset(buf);
- } else {
+ rte_pktmbuf_refcnt_update(buf, 1);
+ } else
buf = rte_pktmbuf_alloc(rxq->mp);
- }
if (buf == NULL) {
rte_errno = ENOMEM;
assert(pool == NULL);
ERROR("%p: empty mbuf pool", (void *)rxq);
goto error;
}
- /*
- * Configure WR. Work request ID contains its own index in
- * the elts array and the offset between SGE buffer header and
- * its data.
- */
- WR_ID(wr->wr_id).id = i;
- WR_ID(wr->wr_id).offset =
- (((uintptr_t)buf->buf_addr + RTE_PKTMBUF_HEADROOM) -
- (uintptr_t)buf);
- wr->next = &(*elts)[(i + 1)].wr;
- wr->sg_list = sge;
- wr->num_sge = 1;
/* Headroom is reserved by rte_pktmbuf_alloc(). */
assert(buf->data_off == RTE_PKTMBUF_HEADROOM);
/* Buffer is supposed to be empty. */
assert(rte_pktmbuf_data_len(buf) == 0);
assert(rte_pktmbuf_pkt_len(buf) == 0);
- /* sge->addr must be able to store a pointer. */
- assert(sizeof(sge->addr) >= sizeof(uintptr_t));
- /* SGE keeps its headroom. */
- sge->addr = (uintptr_t)
- ((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
- sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
- sge->lkey = rxq->mr->lkey;
- /* Redundant check for tailroom. */
- assert(sge->length == rte_pktmbuf_tailroom(buf));
- /*
- * Make sure elts index and SGE mbuf pointer can be deduced
- * from WR ID.
- */
- if ((WR_ID(wr->wr_id).id != i) ||
- ((void *)((uintptr_t)sge->addr -
- WR_ID(wr->wr_id).offset) != buf)) {
- rte_errno = EOVERFLOW;
- ERROR("%p: cannot store index and offset in WR ID",
- (void *)rxq);
- sge->addr = 0;
- rte_pktmbuf_free(buf);
- goto error;
- }
+ assert(!buf->next);
+ /* Only the first segment keeps headroom. */
+ if (i % sge_n)
+ buf->data_off = 0;
+ buf->port = rxq->port_id;
+ buf->data_len = rte_pktmbuf_tailroom(buf);
+ buf->pkt_len = rte_pktmbuf_tailroom(buf);
+ buf->nb_segs = 1;
+ /* scat->addr must be able to store a pointer. */
+ assert(sizeof(scat->addr) >= sizeof(uintptr_t));
+ *scat = (struct mlx4_wqe_data_seg){
+ .addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
+ .byte_count = htonl(buf->data_len),
+ .lkey = htonl(rxq->mr->lkey),
+ };
+ (*rxq->elts)[i] = buf;
}
- /* The last WR pointer must be NULL. */
- (*elts)[(i - 1)].wr.next = NULL;
- DEBUG("%p: allocated and configured %u single-segment WRs",
- (void *)rxq, elts_n);
- rxq->elts_n = elts_n;
- rxq->elts_head = 0;
- rxq->elts = elts;
+ DEBUG("%p: allocated and configured %u segments (max %u packets)",
+ (void *)rxq, elts_n, elts_n >> rxq->sge_n);
+ rxq->elts_n = log2above(elts_n);
return 0;
error:
- if (elts != NULL) {
- assert(pool == NULL);
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- struct rxq_elt *elt = &(*elts)[i];
- struct rte_mbuf *buf;
-
- if (elt->sge.addr == 0)
- continue;
- assert(WR_ID(elt->wr.wr_id).id == i);
- buf = (void *)((uintptr_t)elt->sge.addr -
- WR_ID(elt->wr.wr_id).offset);
- rte_pktmbuf_free_seg(buf);
- }
- rte_free(elts);
+ assert(pool == NULL);
+ elts_n = i;
+ for (i = 0; i != elts_n; ++i) {
+ if ((*rxq->elts)[i] != NULL)
+ rte_pktmbuf_free_seg((*rxq->elts)[i]);
+ (*rxq->elts)[i] = NULL;
}
+ rte_free(rxq->elts);
+ rxq->elts = NULL;
DEBUG("%p: failed, freed everything", (void *)rxq);
assert(rte_errno > 0);
return -rte_errno;
@@ -190,26 +160,17 @@
mlx4_rxq_free_elts(struct rxq *rxq)
{
unsigned int i;
- unsigned int elts_n = rxq->elts_n;
- struct rxq_elt (*elts)[elts_n] = rxq->elts;
DEBUG("%p: freeing WRs", (void *)rxq);
- rxq->elts_n = 0;
- rxq->elts = NULL;
- if (elts == NULL)
+ if (rxq->elts == NULL)
return;
- for (i = 0; (i != RTE_DIM(*elts)); ++i) {
- struct rxq_elt *elt = &(*elts)[i];
- struct rte_mbuf *buf;
- if (elt->sge.addr == 0)
- continue;
- assert(WR_ID(elt->wr.wr_id).id == i);
- buf = (void *)((uintptr_t)elt->sge.addr -
- WR_ID(elt->wr.wr_id).offset);
- rte_pktmbuf_free_seg(buf);
+ for (i = 0; i != (1u << rxq->elts_n); ++i) {
+ if ((*rxq->elts)[i] != NULL)
+ rte_pktmbuf_free_seg((*rxq->elts)[i]);
+ (*rxq->elts)[i] = NULL;
}
- rte_free(elts);
+ rte_free(rxq->elts);
}
/**
@@ -251,7 +212,8 @@
* QP pointer or NULL in case of error and rte_errno is set.
*/
static struct ibv_qp *
-mlx4_rxq_setup_qp(struct priv *priv, struct ibv_cq *cq, uint16_t desc)
+mlx4_rxq_setup_qp(struct priv *priv, struct ibv_cq *cq,
+ uint16_t desc, unsigned int sge_n)
{
struct ibv_qp *qp;
struct ibv_qp_init_attr attr = {
@@ -265,7 +227,7 @@
priv->device_attr.max_qp_wr :
desc),
/* Max number of scatter/gather elements in a WR. */
- .max_recv_sge = 1,
+ .max_recv_sge = sge_n,
},
.qp_type = IBV_QPT_RAW_PACKET,
};
@@ -307,26 +269,34 @@
.socket = socket
};
struct ibv_qp_attr mod;
- struct ibv_recv_wr *bad_wr;
unsigned int mb_len;
int ret;
(void)conf; /* Thresholds configuration (ignored). */
mb_len = rte_pktmbuf_data_room_size(mp);
- if (desc == 0) {
- rte_errno = EINVAL;
- ERROR("%p: invalid number of RX descriptors", (void *)dev);
- goto error;
- }
/* Enable scattered packets support for this queue if necessary. */
assert(mb_len >= RTE_PKTMBUF_HEADROOM);
if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
(mb_len - RTE_PKTMBUF_HEADROOM)) {
- ;
+ tmpl.sge_n = 0;
} else if (dev->data->dev_conf.rxmode.enable_scatter) {
- WARN("%p: scattered mode has been requested but is"
- " not supported, this may lead to packet loss",
- (void *)dev);
+ unsigned int sges_n;
+ unsigned int rx_pkt_len =
+ dev->data->dev_conf.rxmode.jumbo_frame ?
+ dev->data->dev_conf.rxmode.max_rx_pkt_len :
+ ETHER_MTU;
+
+ if (rx_pkt_len < ETHER_MTU)
+ rx_pkt_len = ETHER_MTU;
+ /* Only the first mbuf has headroom. */
+ rx_pkt_len = rx_pkt_len - mb_len + RTE_PKTMBUF_HEADROOM;
+ /*
+ * Determine the number of SGEs needed for a full packet
+ * and round it to the next power of two.
+ */
+ sges_n = (rx_pkt_len / mb_len) + !!(rx_pkt_len % mb_len) + 1;
+ tmpl.sge_n = log2above(sges_n);
+ desc >>= tmpl.sge_n;
} else {
WARN("%p: the requested maximum Rx packet size (%u) is"
" larger than a single mbuf (%u) and scattered"
@@ -335,6 +305,8 @@
dev->data->dev_conf.rxmode.max_rx_pkt_len,
mb_len - RTE_PKTMBUF_HEADROOM);
}
+ DEBUG("%p: number of sges %u (%u WRs)",
+ (void *)dev, 1 << tmpl.sge_n, desc);
/* Use the entire RX mempool as the memory region. */
tmpl.mr = mlx4_mp2mr(priv->pd, mp);
if (tmpl.mr == NULL) {
@@ -370,7 +342,7 @@
priv->device_attr.max_qp_wr);
DEBUG("priv->device_attr.max_sge is %d",
priv->device_attr.max_sge);
- tmpl.qp = mlx4_rxq_setup_qp(priv, tmpl.cq, desc);
+ tmpl.qp = mlx4_rxq_setup_qp(priv, tmpl.cq, desc, 1 << tmpl.sge_n);
if (tmpl.qp == NULL) {
ERROR("%p: QP creation failure: %s",
(void *)dev, strerror(rte_errno));
@@ -389,21 +361,6 @@
(void *)dev, strerror(rte_errno));
goto error;
}
- ret = mlx4_rxq_alloc_elts(&tmpl, desc, NULL);
- if (ret) {
- ERROR("%p: RXQ allocation failed: %s",
- (void *)dev, strerror(rte_errno));
- goto error;
- }
- ret = ibv_post_recv(tmpl.qp, &(*tmpl.elts)[0].wr, &bad_wr);
- if (ret) {
- rte_errno = ret;
- ERROR("%p: ibv_post_recv() failed for WR %p: %s",
- (void *)dev,
- (void *)bad_wr,
- strerror(rte_errno));
- goto error;
- }
mod = (struct ibv_qp_attr){
.qp_state = IBV_QPS_RTR
};
@@ -414,14 +371,32 @@
(void *)dev, strerror(rte_errno));
goto error;
}
+ /* Initialize HW-dependent fields. */
+ tmpl.hw.wqes =
+ (volatile struct mlx4_wqe_data_seg (*)[])
+ (uintptr_t)to_mqp(tmpl.qp)->rq.buf;
+ tmpl.hw.rq_db =
+ (volatile uint32_t *)
+ (uintptr_t)to_mqp(tmpl.qp)->db;
+ tmpl.hw.rq_ci = 0;
/* Save port ID. */
tmpl.port_id = dev->data->port_id;
DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id);
+ ret = mlx4_rxq_alloc_elts(&tmpl, desc << tmpl.sge_n, NULL);
+ if (ret) {
+ ERROR("%p: RXQ allocation failed: %s",
+ (void *)dev, strerror(rte_errno));
+ goto error;
+ }
/* Clean up rxq in case we're reinitializing it. */
DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq);
mlx4_rxq_cleanup(rxq);
*rxq = tmpl;
DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl);
+ /* Update doorbell counter. */
+ rxq->hw.rq_ci = desc;
+ rte_wmb();
+ *rxq->hw.rq_db = htonl(rxq->hw.rq_ci);
return 0;
error:
ret = rte_errno;
@@ -459,6 +434,12 @@
struct rxq *rxq = (*priv->rxqs)[idx];
int ret;
+ if (!rte_is_power_of_2(desc)) {
+ desc = 1 << log2above(desc);
+ WARN("%p: increased number of descriptors in RX queue %u"
+ " to the next power of two (%d)",
+ (void *)dev, idx, desc);
+ }
DEBUG("%p: configuring queue %u for %u descriptors",
(void *)dev, idx, desc);
if (idx >= priv->rxqs_n) {
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 944cf48..f11c84c 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -348,9 +348,73 @@
}
/**
- * DPDK callback for Rx.
+ * Get the next CQE from HW.
*
- * The following function doesn't manage scattered packets.
+ * @param cq
+ * Pointer to CQ structure.
+ *
+ * @return
+ * Pointer to the CQE, or NULL if none is available.
+ */
+static inline struct mlx4_cqe *
+mlx4_cq_get_next_cqe(struct mlx4_cq *cq)
+{
+ int cqe_off;
+ struct mlx4_cqe *cqe;
+ const int cqe_size = cq->cqe_size;
+
+ /* CQE offset is 32 bytes when cqe_size is 64. */
+ cqe_off = (cqe_size & 64) >> 1;
+ cqe = (struct mlx4_cqe *)
+ ((uint8_t *)cq->buf.buf +
+ (cq->cons_index & cq->ibv_cq.cqe) * cqe_size +
+ cqe_off);
+ /* Return NULL if HW hasn't produced a CQE yet. */
+ if (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
+ !!(cq->cons_index & (cq->ibv_cq.cqe + 1)))
+ return NULL;
+ return cqe;
+}
+
+/**
+ * Poll one CQE from CQ.
+ *
+ * @param rxq
+ * Pointer to Rx queue structure.
+ * @param[out] out
+ * The CQE that has just been polled.
+ *
+ * @return
+ * Byte count of the CQE, 0 if there is no completion,
+ * negative on failure.
+ */
+static int
+mlx4_cq_poll_one(struct rxq *rxq,
+ struct mlx4_cqe **out)
+{
+ int ret = 0;
+ struct mlx4_cqe *cqe;
+ struct mlx4_cq *cq = to_mcq(rxq->cq);
+
+ cqe = mlx4_cq_get_next_cqe(cq);
+ if (cqe) {
+ /*
+ * Make sure we read CQ entry contents after we've checked the
+ * ownership bit.
+ */
+ rte_rmb();
+ assert(!(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK));
+ assert((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) !=
+ MLX4_CQE_OPCODE_ERROR);
+ ret = ntohl(cqe->byte_cnt);
+ ++cq->cons_index;
+ }
+ *out = cqe;
+ return ret;
+}
+
+/**
+ * DPDK callback for Rx with scattered packets support.
*
* @param dpdk_rxq
* Generic pointer to Rx queue structure.
@@ -365,121 +429,109 @@
uint16_t
mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
- struct rxq *rxq = (struct rxq *)dpdk_rxq;
- struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
- const unsigned int elts_n = rxq->elts_n;
- unsigned int elts_head = rxq->elts_head;
- struct ibv_wc wcs[pkts_n];
- struct ibv_recv_wr *wr_head = NULL;
- struct ibv_recv_wr **wr_next = &wr_head;
- struct ibv_recv_wr *wr_bad = NULL;
- unsigned int i;
- unsigned int pkts_ret = 0;
- int ret;
+ struct rxq *rxq = dpdk_rxq;
+ const unsigned int wr_cnt = (1 << rxq->elts_n) - 1;
+ const unsigned int sge_n = rxq->sge_n;
+ struct rte_mbuf *pkt = NULL;
+ struct rte_mbuf *seg = NULL;
+ unsigned int i = 0;
+ unsigned int rq_ci = (rxq->hw.rq_ci << sge_n);
+ int len = 0;
- ret = ibv_poll_cq(rxq->cq, pkts_n, wcs);
- if (unlikely(ret == 0))
- return 0;
- if (unlikely(ret < 0)) {
- DEBUG("rxq=%p, ibv_poll_cq() failed (wc_n=%d)",
- (void *)rxq, ret);
- return 0;
- }
- assert(ret <= (int)pkts_n);
- /* For each work completion. */
- for (i = 0; i != (unsigned int)ret; ++i) {
- struct ibv_wc *wc = &wcs[i];
- struct rxq_elt *elt = &(*elts)[elts_head];
- struct ibv_recv_wr *wr = &elt->wr;
- uint64_t wr_id = wr->wr_id;
- uint32_t len = wc->byte_len;
- struct rte_mbuf *seg = (void *)((uintptr_t)elt->sge.addr -
- WR_ID(wr_id).offset);
- struct rte_mbuf *rep;
+ while (pkts_n) {
+ struct mlx4_cqe *cqe;
+ unsigned int idx = rq_ci & wr_cnt;
+ struct rte_mbuf *rep = (*rxq->elts)[idx];
+ volatile struct mlx4_wqe_data_seg *scat =
+ &(*rxq->hw.wqes)[idx];
- /* Sanity checks. */
- assert(WR_ID(wr_id).id < rxq->elts_n);
- assert(wr_id == wc->wr_id);
- assert(wr->sg_list == &elt->sge);
- assert(wr->num_sge == 1);
- assert(elts_head < rxq->elts_n);
- assert(rxq->elts_head < rxq->elts_n);
- /*
- * Fetch initial bytes of packet descriptor into a
- * cacheline while allocating rep.
- */
- rte_mbuf_prefetch_part1(seg);
- rte_mbuf_prefetch_part2(seg);
- /* Link completed WRs together for repost. */
- *wr_next = wr;
- wr_next = &wr->next;
- if (unlikely(wc->status != IBV_WC_SUCCESS)) {
- /* Whatever, just repost the offending WR. */
- DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work completion"
- " status (%d): %s",
- (void *)rxq, wr_id, wc->status,
- ibv_wc_status_str(wc->status));
- /* Increment dropped packets counter. */
- ++rxq->stats.idropped;
- goto repost;
- }
+ /* Update the 'next' pointer of the previous segment */
+ if (pkt)
+ seg->next = rep;
+ seg = rep;
+ rte_prefetch0(seg);
+ rte_prefetch0(scat);
rep = rte_mbuf_raw_alloc(rxq->mp);
if (unlikely(rep == NULL)) {
- /*
- * Unable to allocate a replacement mbuf,
- * repost WR.
- */
- DEBUG("rxq=%p, wr_id=%" PRIu32 ":"
- " can't allocate a new mbuf",
- (void *)rxq, WR_ID(wr_id).id);
- /* Increase out of memory counters. */
++rxq->stats.rx_nombuf;
- ++rxq->priv->dev->data->rx_mbuf_alloc_failed;
- goto repost;
+ if (!pkt) {
+ /*
+ * No buffers left and no packet in progress,
+ * bail out silently.
+ */
+ break;
+ }
+ while (pkt != seg) {
+ assert(pkt != (*rxq->elts)[idx]);
+ rep = pkt->next;
+ pkt->next = NULL;
+ pkt->nb_segs = 1;
+ rte_mbuf_raw_free(pkt);
+ pkt = rep;
+ }
+ break;
}
- /* Reconfigure sge to use rep instead of seg. */
- elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
- assert(elt->sge.lkey == rxq->mr->lkey);
- WR_ID(wr->wr_id).offset =
- (((uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM) -
- (uintptr_t)rep);
- assert(WR_ID(wr->wr_id).id == WR_ID(wr_id).id);
- /* Update seg information. */
- seg->data_off = RTE_PKTMBUF_HEADROOM;
- seg->nb_segs = 1;
- seg->port = rxq->port_id;
- seg->next = NULL;
- seg->pkt_len = len;
+ if (!pkt) {
+ /* Look for a new packet. */
+ len = mlx4_cq_poll_one(rxq, &cqe);
+ if (!len) {
+ rte_mbuf_raw_free(rep);
+ break;
+ }
+ if (unlikely(len < 0)) {
+ /* RX error, packet is likely too large. */
+ rte_mbuf_raw_free(rep);
+ ++rxq->stats.idropped;
+ goto skip;
+ }
+ pkt = seg;
+ pkt->packet_type = 0;
+ pkt->ol_flags = 0;
+ pkt->pkt_len = len;
+ }
+ rep->nb_segs = 1;
+ rep->port = rxq->port_id;
+ rep->data_len = seg->data_len;
+ rep->data_off = seg->data_off;
+ (*rxq->elts)[idx] = rep;
+ /*
+ * Fill NIC descriptor with the new buffer. The lkey and size
+ * of the buffers are already known, only the buffer address
+ * changes.
+ */
+ scat->addr = htonll(rte_pktmbuf_mtod(rep, uintptr_t));
+ if (len > seg->data_len) {
+ len -= seg->data_len;
+ ++pkt->nb_segs;
+ ++rq_ci;
+ continue;
+ }
+ /* The last segment. */
seg->data_len = len;
- seg->packet_type = 0;
- seg->ol_flags = 0;
+ /* Increment bytes counter. */
+ rxq->stats.ibytes += pkt->pkt_len;
/* Return packet. */
- *(pkts++) = seg;
- ++pkts_ret;
- /* Increase bytes counter. */
- rxq->stats.ibytes += len;
-repost:
- if (++elts_head >= elts_n)
- elts_head = 0;
- continue;
+ *(pkts++) = pkt;
+ pkt = NULL;
+ --pkts_n;
+ ++i;
+skip:
+ /* Align consumer index to the next stride. */
+ rq_ci >>= sge_n;
+ ++rq_ci;
+ rq_ci <<= sge_n;
}
- if (unlikely(i == 0))
+ if (unlikely((i == 0) && ((rq_ci >> sge_n) == rxq->hw.rq_ci)))
return 0;
- /* Repost WRs. */
- *wr_next = NULL;
- assert(wr_head);
- ret = ibv_post_recv(rxq->qp, wr_head, &wr_bad);
- if (unlikely(ret)) {
- /* Inability to repost WRs is fatal. */
- DEBUG("%p: recv_burst(): failed (ret=%d)",
- (void *)rxq->priv,
- ret);
- abort();
- }
- rxq->elts_head = elts_head;
- /* Increase packets counter. */
- rxq->stats.ipackets += pkts_ret;
- return pkts_ret;
+ /* Update the consumer index. */
+ rxq->hw.rq_ci = rq_ci >> sge_n;
+ rte_wmb();
+ *rxq->hw.rq_db = htonl(rxq->hw.rq_ci);
+ *to_mcq(rxq->cq)->set_ci_db =
+ htonl(to_mcq(rxq->cq)->cons_index & 0xffffff);
+ /* Increment packets counter. */
+ rxq->stats.ipackets += i;
+ return i;
}
/**
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index a3d972b..077fdd8 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -70,13 +70,6 @@ struct mlx4_rxq_stats {
uint64_t rx_nombuf; /**< Total of Rx mbuf allocation failures. */
};
-/** Rx element. */
-struct rxq_elt {
- struct ibv_recv_wr wr; /**< Work request. */
- struct ibv_sge sge; /**< Scatter/gather element. */
- /* mbuf pointer is derived from WR_ID(wr.wr_id).offset. */
-};
-
/** Rx queue descriptor. */
struct rxq {
struct priv *priv; /**< Back pointer to private data. */
@@ -86,9 +79,14 @@ struct rxq {
struct ibv_qp *qp; /**< Queue pair. */
struct ibv_comp_channel *channel; /**< Rx completion channel. */
unsigned int port_id; /**< Port ID for incoming packets. */
- unsigned int elts_n; /**< (*elts)[] length. */
- unsigned int elts_head; /**< Current index in (*elts)[]. */
- struct rxq_elt (*elts)[]; /**< Rx elements. */
+ unsigned int elts_n; /**< Log 2 of Mbufs. */
+ struct rte_mbuf *(*elts)[]; /**< Rx elements. */
+ struct {
+ volatile struct mlx4_wqe_data_seg(*wqes)[];
+ volatile uint32_t *rq_db;
+ uint16_t rq_ci;
+ } hw;
+ unsigned int sge_n; /**< Log 2 of SGEs number. */
struct mlx4_rxq_stats stats; /**< Rx queue counters. */
unsigned int socket; /**< CPU socket ID for allocations. */
};
diff --git a/drivers/net/mlx4/mlx4_utils.h b/drivers/net/mlx4/mlx4_utils.h
index e74b61b..a37a3e5 100644
--- a/drivers/net/mlx4/mlx4_utils.h
+++ b/drivers/net/mlx4/mlx4_utils.h
@@ -102,4 +102,24 @@
int mlx4_fd_set_non_blocking(int fd);
+/**
+ * Return the base-2 logarithm of the input value, rounded up.
+ *
+ * @param v
+ * Input value.
+ *
+ * @return
+ * Base-2 logarithm of the input value, rounded up.
+ */
+static inline unsigned int
+log2above(unsigned int v)
+{
+ unsigned int l;
+ unsigned int r;
+
+ for (l = 0, r = 0; (v >> 1); ++l, v >>= 1)
+ r |= (v & 1);
+ return l + r;
+}
+
#endif /* MLX4_UTILS_H_ */
--
1.8.3.1