[dpdk-dev] [PATCH v2 3/7] net/mlx5: split PCI from generic probing code
Adrien Mazarguil
adrien.mazarguil at 6wind.com
Thu Jun 14 10:34:54 CEST 2018
All the generic probing code needs is an IB device. While this device is
currently supplied by a PCI lookup, other methods will be added soon.
This patch divides the original function, which has become huge over time,
as follows:
1. PCI-specific (mlx5_pci_probe()).
2. All ports of a Verbs device (mlx5_dev_spawn()).
3. A given port of a Verbs device (mlx5_dev_spawn_one()).
(Patch based on prior work from Yuanhan Liu)
Signed-off-by: Adrien Mazarguil <adrien.mazarguil at 6wind.com>
--
v2 changes:
- Fixed device naming. A port suffix is now appended only if several IB
ports happen to be detected.
- Added separate message to distinguish missing kernel drivers from other
initialization errors, as it was confusing.
---
drivers/net/mlx5/mlx5.c | 340 ++++++++++++++++++++++++++-----------------
1 file changed, 209 insertions(+), 131 deletions(-)
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 1a5391e63..01dcf25b9 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -635,30 +635,34 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev)
}
/**
- * DPDK callback to register a PCI device.
- *
- * This function creates an Ethernet device for each port of a given
- * PCI device.
+ * Spawn an Ethernet device from Verbs information.
*
- * @param[in] pci_drv
- * PCI driver structure (mlx5_driver).
- * @param[in] pci_dev
- * PCI device information.
+ * @param dpdk_dev
+ * Backing DPDK device.
+ * @param ibv_dev
+ * Verbs device.
+ * @param vf
+ * If nonzero, enable VF-specific features.
+ * @param[in] attr
+ * Verbs device attributes.
+ * @param port
+ * Verbs port to use (indexed from 1).
*
* @return
- * 0 on success, a negative errno value otherwise and rte_errno is set.
+ * A valid Ethernet device object on success, NULL otherwise and rte_errno
+ * is set.
*/
-static int
-mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
- struct rte_pci_device *pci_dev)
+static struct rte_eth_dev *
+mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
+ struct ibv_device *ibv_dev,
+ int vf,
+ const struct ibv_device_attr_ex *attr,
+ unsigned int port)
{
- struct ibv_device **list = NULL;
- struct ibv_device *ibv_dev;
- struct ibv_context *ctx = NULL;
- struct ibv_device_attr_ex attr;
+ struct ibv_context *ctx;
struct mlx5dv_context dv_attr = { .comp_mask = 0 };
+ struct rte_eth_dev *eth_dev = NULL;
int err = 0;
- unsigned int vf = 0;
unsigned int mps;
unsigned int cqe_comp;
unsigned int tunnel_en = 0;
@@ -670,71 +674,18 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
unsigned int mprq_max_stride_size_n = 0;
unsigned int mprq_min_stride_num_n = 0;
unsigned int mprq_max_stride_num_n = 0;
- int i;
#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
struct ibv_counter_set_description cs_desc = { .counter_type = 0 };
#endif
/* Prepare shared data between primary and secondary process. */
mlx5_prepare_shared_data();
- assert(pci_drv == &mlx5_driver);
- list = mlx5_glue->get_device_list(&i);
- if (list == NULL) {
- assert(errno);
- err = errno;
- if (errno == ENOSYS)
- DRV_LOG(ERR,
- "cannot list devices, is ib_uverbs loaded?");
- goto error;
- }
- assert(i >= 0);
- /*
- * For each listed device, check related sysfs entry against
- * the provided PCI ID.
- */
- while (i != 0) {
- struct rte_pci_addr pci_addr;
-
- --i;
- DRV_LOG(DEBUG, "checking device \"%s\"", list[i]->name);
- if (mlx5_ibv_device_to_pci_addr(list[i], &pci_addr))
- continue;
- if ((pci_dev->addr.domain != pci_addr.domain) ||
- (pci_dev->addr.bus != pci_addr.bus) ||
- (pci_dev->addr.devid != pci_addr.devid) ||
- (pci_dev->addr.function != pci_addr.function))
- continue;
- DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
- list[i]->name);
- vf = ((pci_dev->id.device_id ==
- PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) ||
- (pci_dev->id.device_id ==
- PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) ||
- (pci_dev->id.device_id ==
- PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) ||
- (pci_dev->id.device_id ==
- PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF));
- ctx = mlx5_glue->open_device(list[i]);
- rte_errno = errno;
- err = rte_errno;
- break;
- }
- if (ctx == NULL) {
- switch (err) {
- case 0:
- DRV_LOG(ERR,
- "cannot access device, is mlx5_ib loaded?");
- err = ENODEV;
- break;
- case EINVAL:
- DRV_LOG(ERR,
- "cannot use device, are drivers up to date?");
- break;
- }
- goto error;
+ errno = 0;
+ ctx = mlx5_glue->open_device(ibv_dev);
+ if (!ctx) {
+ rte_errno = errno ? errno : ENODEV;
+ return NULL;
}
- ibv_dev = list[i];
- DRV_LOG(DEBUG, "device opened");
#ifdef HAVE_IBV_MLX5_MOD_SWP
dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
#endif
@@ -822,20 +773,11 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
" old OFED/rdma-core version or firmware configuration");
#endif
- err = mlx5_glue->query_device_ex(ctx, NULL, &attr);
- if (err) {
- DEBUG("ibv_query_device_ex() failed");
- goto error;
- }
- DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
- for (i = 0; i < attr.orig_attr.phys_port_cnt; i++) {
+ {
char name[RTE_ETH_NAME_MAX_LEN];
- int len;
- uint32_t port = i + 1; /* ports are indexed from one */
struct ibv_port_attr port_attr;
struct ibv_pd *pd = NULL;
struct priv *priv = NULL;
- struct rte_eth_dev *eth_dev = NULL;
struct ether_addr mac;
struct mlx5_dev_config config = {
.cqe_comp = cqe_comp,
@@ -859,11 +801,11 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
},
};
- len = snprintf(name, sizeof(name), PCI_PRI_FMT,
- pci_dev->addr.domain, pci_dev->addr.bus,
- pci_dev->addr.devid, pci_dev->addr.function);
- if (attr.orig_attr.phys_port_cnt > 1)
- snprintf(name + len, sizeof(name), " port %u", i);
+ if (attr->orig_attr.phys_port_cnt > 1)
+ snprintf(name, sizeof(name), "%s port %u",
+ dpdk_dev->name, port);
+ else
+ snprintf(name, sizeof(name), "%s", dpdk_dev->name);
if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
eth_dev = rte_eth_dev_attach_secondary(name);
if (eth_dev == NULL) {
@@ -872,7 +814,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
err = rte_errno;
goto error;
}
- eth_dev->device = &pci_dev->device;
+ eth_dev->device = dpdk_dev;
eth_dev->dev_ops = &mlx5_dev_sec_ops;
err = mlx5_uar_init_secondary(eth_dev);
if (err) {
@@ -900,16 +842,10 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
mlx5_select_rx_function(eth_dev);
eth_dev->tx_pkt_burst =
mlx5_select_tx_function(eth_dev);
- rte_eth_dev_probing_finish(eth_dev);
- continue;
+ mlx5_glue->close_device(ctx);
+ return eth_dev;
}
DRV_LOG(DEBUG, "using port %u", port);
- if (!ctx)
- ctx = mlx5_glue->open_device(ibv_dev);
- if (ctx == NULL) {
- err = ENODEV;
- goto port_error;
- }
/* Check port status. */
err = mlx5_glue->query_port(ctx, port, &port_attr);
if (err) {
@@ -947,23 +883,23 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
priv->ctx = ctx;
strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
sizeof(priv->ibdev_path));
- priv->device_attr = attr;
+ priv->device_attr = *attr;
priv->port = port;
priv->pd = pd;
priv->mtu = ETHER_MTU;
- err = mlx5_args(&config, pci_dev->device.devargs);
+ err = mlx5_args(&config, dpdk_dev->devargs);
if (err) {
err = rte_errno;
DRV_LOG(ERR, "failed to process device arguments: %s",
strerror(rte_errno));
goto port_error;
}
- config.hw_csum = !!(attr.device_cap_flags_ex &
+ config.hw_csum = !!(attr->device_cap_flags_ex &
IBV_DEVICE_RAW_IP_CSUM);
DRV_LOG(DEBUG, "checksum offloading is %ssupported",
(config.hw_csum ? "" : "not "));
#ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
- config.flow_counter_en = !!attr.max_counter_sets;
+ config.flow_counter_en = !!attr->max_counter_sets;
mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
DRV_LOG(DEBUG,
"counter type = %d, num of cs = %ld, attributes = %d",
@@ -971,7 +907,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
cs_desc.attributes);
#endif
config.ind_table_max_size =
- attr.rss_caps.max_rwq_indirection_table_size;
+ attr->rss_caps.max_rwq_indirection_table_size;
/* Remove this check once DPDK supports larger/variable
* indirection tables. */
if (config.ind_table_max_size >
@@ -979,28 +915,28 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
config.ind_table_max_size);
- config.hw_vlan_strip = !!(attr.raw_packet_caps &
+ config.hw_vlan_strip = !!(attr->raw_packet_caps &
IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
(config.hw_vlan_strip ? "" : "not "));
- config.hw_fcs_strip = !!(attr.raw_packet_caps &
+ config.hw_fcs_strip = !!(attr->raw_packet_caps &
IBV_RAW_PACKET_CAP_SCATTER_FCS);
DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
(config.hw_fcs_strip ? "" : "not "));
#ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
- config.hw_padding = !!attr.rx_pad_end_addr_align;
+ config.hw_padding = !!attr->rx_pad_end_addr_align;
#endif
DRV_LOG(DEBUG,
"hardware Rx end alignment padding is %ssupported",
(config.hw_padding ? "" : "not "));
config.vf = vf;
- config.tso = (attr.tso_caps.max_tso > 0 &&
- (attr.tso_caps.supported_qpts &
+ config.tso = (attr->tso_caps.max_tso > 0 &&
+ (attr->tso_caps.supported_qpts &
(1 << IBV_QPT_RAW_PACKET)));
if (config.tso)
- config.tso_max_payload_sz = attr.tso_caps.max_tso;
+ config.tso_max_payload_sz = attr->tso_caps.max_tso;
if (config.mps && !mps) {
DRV_LOG(ERR,
"multi-packet send not supported on this device"
@@ -1041,8 +977,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
eth_dev->data->dev_private = priv;
priv->dev_data = eth_dev->data;
eth_dev->data->mac_addrs = priv->mac;
- eth_dev->device = &pci_dev->device;
- rte_eth_copy_pci_info(eth_dev, pci_dev);
+ eth_dev->device = dpdk_dev;
eth_dev->device->driver = &mlx5_driver.driver;
err = mlx5_uar_init_primary(eth_dev);
if (err) {
@@ -1160,13 +1095,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
priv, mem_event_cb);
rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
rte_eth_dev_probing_finish(eth_dev);
- /*
- * Each eth_dev instance is assigned its own Verbs context,
- * since this one is consumed, let the next iteration open
- * another.
- */
- ctx = NULL;
- continue;
+ return eth_dev;
port_error:
if (priv)
rte_free(priv);
@@ -1174,24 +1103,173 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
claim_zero(mlx5_glue->dealloc_pd(pd));
if (eth_dev && rte_eal_process_type() == RTE_PROC_PRIMARY)
rte_eth_dev_release_port(eth_dev);
- break;
}
- /*
- * XXX if something went wrong in the loop above, there is a resource
- * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as
- * long as the dpdk does not provide a way to deallocate a ethdev and a
- * way to enumerate the registered ethdevs to free the previous ones.
- */
error:
if (ctx)
claim_zero(mlx5_glue->close_device(ctx));
- if (list)
- mlx5_glue->free_device_list(list);
- if (err) {
- rte_errno = err;
+ assert(err > 0);
+ rte_errno = err;
+ return NULL;
+}
+
+/**
+ * Spawn Ethernet devices from Verbs information, one per detected port.
+ *
+ * @param dpdk_dev
+ * Backing DPDK device, handed as is to mlx5_dev_spawn_one() for each port.
+ * @param ibv_dev
+ * Verbs device, opened here only long enough to query its attributes.
+ * @param vf
+ * If nonzero, enable VF-specific features.
+ *
+ * @return
+ * A NULL-terminated list of Ethernet device objects on success, NULL
+ * otherwise and rte_errno is set (ports spawned so far are rolled
+ * back). Caller is expected to release list memory through free().
+ */
+static struct rte_eth_dev **
+mlx5_dev_spawn(struct rte_device *dpdk_dev,
+ struct ibv_device *ibv_dev,
+ int vf)
+{
+ struct rte_eth_dev **eth_list = NULL;
+ struct ibv_context *ctx;
+ struct ibv_device_attr_ex attr;
+ unsigned int i;
+ int ret;
+
+ errno = 0; /* Lets a failure that leaves errno untouched be detected. */
+ ctx = mlx5_glue->open_device(ibv_dev);
+ if (!ctx) {
+ rte_errno = errno ? errno : ENODEV; /* ENODEV when errno is unset. */
+ if (rte_errno == ENODEV)
+ DRV_LOG(ERR,
+ "cannot access device, is mlx5_ib loaded?");
+ else
+ DRV_LOG(ERR,
+ "cannot use device, are drivers up to date?");
+ return NULL;
+ }
+ ret = mlx5_glue->query_device_ex(ctx, NULL, &attr);
+ mlx5_glue->close_device(ctx); /* Context only served the query above. */
+ if (ret) {
+ rte_errno = ret;
+ DRV_LOG(ERR, "unable to query device information: %s",
+ strerror(rte_errno));
+ return NULL;
+ }
+ DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
+ eth_list = malloc(sizeof(*eth_list) *
+ (attr.orig_attr.phys_port_cnt + 1)); /* +1 for NULL terminator. */
+ if (!eth_list) {
+ rte_errno = errno;
+ return NULL;
+ }
+ for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
+ eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
+ &attr, i + 1); /* Verbs ports are indexed from 1. */
+ if (eth_list[i])
+ continue;
+ /* Save rte_errno and roll back in case of failure. */
+ ret = rte_errno;
+ while (i--) { /* Visits entries i - 1 down to 0, newest first. */
+ mlx5_dev_close(eth_list[i]);
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ rte_free(eth_list[i]->data->dev_private);
+ claim_zero(rte_eth_dev_release_port(eth_list[i]));
+ }
+ free(eth_list);
+ rte_errno = ret; /* Restore the error saved before the rollback. */
+ return NULL;
+ }
+ eth_list[i] = NULL; /* Terminate the list for the caller. */
+ return eth_list;
+}
+
+/**
+ * DPDK callback to register a PCI device.
+ *
+ * Looks up the Verbs device whose PCI address matches pci_dev, then
+ * creates an Ethernet device for each of its ports.
+ *
+ * @param[in] pci_drv
+ * PCI driver structure (mlx5_driver).
+ * @param[in] pci_dev
+ * PCI device information.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
+ struct rte_pci_device *pci_dev)
+{
+ struct ibv_device **ibv_list;
+ struct rte_eth_dev **eth_list = NULL;
+ int vf;
+ int ret; /* Device count, then list index, finally the return value. */
+
+ assert(pci_drv == &mlx5_driver);
+ switch (pci_dev->id.device_id) {
+ case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
+ case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
+ case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
+ case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
+ vf = 1;
+ break;
+ default:
+ vf = 0;
+ }
+ errno = 0; /* Lets a failure that leaves errno untouched be detected. */
+ ibv_list = mlx5_glue->get_device_list(&ret);
+ if (!ibv_list) {
+ rte_errno = errno ? errno : ENOSYS;
+ DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
return -rte_errno;
}
- return 0;
+ while (ret-- > 0) { /* Ends with ret == -1 when nothing matched. */
+ struct rte_pci_addr pci_addr;
+
+ DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
+ if (mlx5_ibv_device_to_pci_addr(ibv_list[ret], &pci_addr))
+ continue;
+ if (pci_dev->addr.domain != pci_addr.domain ||
+ pci_dev->addr.bus != pci_addr.bus ||
+ pci_dev->addr.devid != pci_addr.devid ||
+ pci_dev->addr.function != pci_addr.function)
+ continue;
+ DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
+ ibv_list[ret]->name);
+ break;
+ }
+ if (ret >= 0) /* ret is the index of the matching Verbs device. */
+ eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_list[ret], vf);
+ mlx5_glue->free_device_list(ibv_list);
+ if (ret < 0) { /* No match; index 0 is a valid match, not a miss. */
+ DRV_LOG(WARNING,
+ "no Verbs device matches PCI device " PCI_PRI_FMT ","
+ " are kernel drivers loaded?",
+ pci_dev->addr.domain, pci_dev->addr.bus,
+ pci_dev->addr.devid, pci_dev->addr.function);
+ rte_errno = ENOENT;
+ ret = -rte_errno;
+ } else if (!eth_list || !*eth_list) {
+ DRV_LOG(ERR,
+ "probe of PCI device " PCI_PRI_FMT " aborted after"
+ " encountering an error: %s",
+ pci_dev->addr.domain, pci_dev->addr.bus,
+ pci_dev->addr.devid, pci_dev->addr.function,
+ strerror(rte_errno));
+ ret = -rte_errno;
+ } else {
+ for (ret = 0; eth_list[ret]; ++ret) { /* List is NULL-terminated. */
+ rte_eth_copy_pci_info(eth_list[ret], pci_dev);
+ rte_eth_dev_probing_finish(eth_list[ret]);
+ }
+ ret = 0;
+ }
+ free(eth_list); /* Frees the list only; free(NULL) is a no-op. */
+ return ret;
}
static const struct rte_pci_id mlx5_pci_id_map[] = {
--
2.11.0
More information about the dev
mailing list