[dpdk-dev] [PATCH 3/7] net/mlx5: split PCI from generic probing code
Xueming(Steven) Li
xuemingl at mellanox.com
Sun Jun 10 14:59:06 CEST 2018
Hi Adrien,
The logic looks much more clear now with the split.
> -----Original Message-----
> From: dev <dev-bounces at dpdk.org> On Behalf Of Adrien Mazarguil
> Sent: Saturday, May 26, 2018 12:35 AM
> To: Shahaf Shuler <shahafs at mellanox.com>
> Cc: dev at dpdk.org
> Subject: [dpdk-dev] [PATCH 3/7] net/mlx5: split PCI from generic probing code
>
> All the generic probing code needs is an IB device. While this device is currently supplied by a PCI
> lookup, other methods will be added soon.
>
> This patch divides the original function, which has become huge over time, as follows:
>
> 1. PCI-specific (mlx5_pci_probe()).
> 2. All ports of a Verbs device (mlx5_dev_spawn()).
> 3. A given port of a Verbs device (mlx5_dev_spawn_one()).
>
> (Patch based on prior work from Yuanhan Liu)
>
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil at 6wind.com>
> ---
> drivers/net/mlx5/mlx5.c | 332 ++++++++++++++++++++++++++-----------------
> 1 file changed, 201 insertions(+), 131 deletions(-)
>
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index 41a542ebc..7a812ef93 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -633,30 +633,34 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev) }
>
> /**
> - * DPDK callback to register a PCI device.
> - *
> - * This function creates an Ethernet device for each port of a given
> - * PCI device.
> + * Spawn an Ethernet device from Verbs information.
> *
> - * @param[in] pci_drv
> - * PCI driver structure (mlx5_driver).
> - * @param[in] pci_dev
> - * PCI device information.
> + * @param dpdk_dev
> + * Backing DPDK device.
> + * @param ibv_dev
> + * Verbs device.
> + * @param vf
> + * If nonzero, enable VF-specific features.
> + * @param[in] attr
> + * Verbs device attributes.
> + * @param port
> + * Verbs port to use (indexed from 1).
> *
> * @return
> - * 0 on success, a negative errno value otherwise and rte_errno is set.
> + * A valid Ethernet device object on success, NULL otherwise and rte_errno
> + * is set.
> */
> -static int
> -mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> - struct rte_pci_device *pci_dev)
> +static struct rte_eth_dev *
> +mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
> + struct ibv_device *ibv_dev,
> + int vf,
> + const struct ibv_device_attr_ex *attr,
> + unsigned int port)
> {
> - struct ibv_device **list = NULL;
> - struct ibv_device *ibv_dev;
> - struct ibv_context *ctx = NULL;
> - struct ibv_device_attr_ex attr;
> + struct ibv_context *ctx;
> struct mlx5dv_context dv_attr = { .comp_mask = 0 };
> + struct rte_eth_dev *eth_dev = NULL;
> int err = 0;
> - unsigned int vf = 0;
> unsigned int mps;
> unsigned int cqe_comp;
> unsigned int tunnel_en = 0;
> @@ -668,71 +672,18 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> unsigned int mprq_max_stride_size_n = 0;
> unsigned int mprq_min_stride_num_n = 0;
> unsigned int mprq_max_stride_num_n = 0;
> - int i;
> #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
> struct ibv_counter_set_description cs_desc; #endif
>
> /* Prepare shared data between primary and secondary process. */
> mlx5_prepare_shared_data();
> - assert(pci_drv == &mlx5_driver);
> - list = mlx5_glue->get_device_list(&i);
> - if (list == NULL) {
> - assert(errno);
> - err = errno;
> - if (errno == ENOSYS)
> - DRV_LOG(ERR,
> - "cannot list devices, is ib_uverbs loaded?");
> - goto error;
> - }
> - assert(i >= 0);
> - /*
> - * For each listed device, check related sysfs entry against
> - * the provided PCI ID.
> - */
> - while (i != 0) {
> - struct rte_pci_addr pci_addr;
> -
> - --i;
> - DRV_LOG(DEBUG, "checking device \"%s\"", list[i]->name);
> - if (mlx5_ibv_device_to_pci_addr(list[i], &pci_addr))
> - continue;
> - if ((pci_dev->addr.domain != pci_addr.domain) ||
> - (pci_dev->addr.bus != pci_addr.bus) ||
> - (pci_dev->addr.devid != pci_addr.devid) ||
> - (pci_dev->addr.function != pci_addr.function))
> - continue;
> - DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
> - list[i]->name);
> - vf = ((pci_dev->id.device_id ==
> - PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) ||
> - (pci_dev->id.device_id ==
> - PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) ||
> - (pci_dev->id.device_id ==
> - PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) ||
> - (pci_dev->id.device_id ==
> - PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF));
> - ctx = mlx5_glue->open_device(list[i]);
> - rte_errno = errno;
> - err = rte_errno;
> - break;
> - }
> - if (ctx == NULL) {
> - switch (err) {
> - case 0:
> - DRV_LOG(ERR,
> - "cannot access device, is mlx5_ib loaded?");
> - err = ENODEV;
> - break;
> - case EINVAL:
> - DRV_LOG(ERR,
> - "cannot use device, are drivers up to date?");
> - break;
> - }
> - goto error;
> + errno = 0;
> + ctx = mlx5_glue->open_device(ibv_dev);
> + if (!ctx) {
> + rte_errno = errno ? errno : ENODEV;
> + return NULL;
> }
> - ibv_dev = list[i];
> - DRV_LOG(DEBUG, "device opened");
> #ifdef HAVE_IBV_MLX5_MOD_SWP
> dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP; #endif @@ -820,20 +771,11 @@
> mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
> " old OFED/rdma-core version or firmware configuration"); #endif
> - err = mlx5_glue->query_device_ex(ctx, NULL, &attr);
> - if (err) {
> - DEBUG("ibv_query_device_ex() failed");
> - goto error;
> - }
> - DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
> - for (i = 0; i < attr.orig_attr.phys_port_cnt; i++) {
> + {
> char name[RTE_ETH_NAME_MAX_LEN];
> - int len;
> - uint32_t port = i + 1; /* ports are indexed from one */
> struct ibv_port_attr port_attr;
> struct ibv_pd *pd = NULL;
> struct priv *priv = NULL;
> - struct rte_eth_dev *eth_dev = NULL;
> struct ether_addr mac;
> struct mlx5_dev_config config = {
> .cqe_comp = cqe_comp,
> @@ -857,11 +799,11 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> },
> };
>
> - len = snprintf(name, sizeof(name), PCI_PRI_FMT,
> - pci_dev->addr.domain, pci_dev->addr.bus,
> - pci_dev->addr.devid, pci_dev->addr.function);
> - if (attr.orig_attr.phys_port_cnt > 1)
> - snprintf(name + len, sizeof(name), " port %u", i);
> + if (attr->orig_attr.phys_port_cnt > 1)
> + snprintf(name, sizeof(name), "%s", dpdk_dev->name);
> + else
> + snprintf(name, sizeof(name), "%s port %u",
> + dpdk_dev->name, port);
Name contains port only if phys_port_cnt > 1 in previous logic, are you sure?
> if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
> eth_dev = rte_eth_dev_attach_secondary(name);
> if (eth_dev == NULL) {
> @@ -870,7 +812,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> err = rte_errno;
> goto error;
> }
> - eth_dev->device = &pci_dev->device;
> + eth_dev->device = dpdk_dev;
> eth_dev->dev_ops = &mlx5_dev_sec_ops;
> err = mlx5_uar_init_secondary(eth_dev);
> if (err) {
> @@ -898,16 +840,10 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> mlx5_select_rx_function(eth_dev);
> eth_dev->tx_pkt_burst =
> mlx5_select_tx_function(eth_dev);
> - rte_eth_dev_probing_finish(eth_dev);
> - continue;
> + mlx5_glue->close_device(ctx);
> + return eth_dev;
> }
> DRV_LOG(DEBUG, "using port %u", port);
> - if (!ctx)
> - ctx = mlx5_glue->open_device(ibv_dev);
> - if (ctx == NULL) {
> - err = ENODEV;
> - goto port_error;
> - }
> /* Check port status. */
> err = mlx5_glue->query_port(ctx, port, &port_attr);
> if (err) {
> @@ -945,23 +881,23 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> priv->ctx = ctx;
> strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
> sizeof(priv->ibdev_path));
> - priv->device_attr = attr;
> + priv->device_attr = *attr;
> priv->port = port;
> priv->pd = pd;
> priv->mtu = ETHER_MTU;
> - err = mlx5_args(&config, pci_dev->device.devargs);
> + err = mlx5_args(&config, dpdk_dev->devargs);
> if (err) {
> err = rte_errno;
> DRV_LOG(ERR, "failed to process device arguments: %s",
> strerror(rte_errno));
> goto port_error;
> }
> - config.hw_csum = !!(attr.device_cap_flags_ex &
> + config.hw_csum = !!(attr->device_cap_flags_ex &
> IBV_DEVICE_RAW_IP_CSUM);
> DRV_LOG(DEBUG, "checksum offloading is %ssupported",
> (config.hw_csum ? "" : "not "));
> #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
> - config.flow_counter_en = !!attr.max_counter_sets;
> + config.flow_counter_en = !!attr->max_counter_sets;
> mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
> DRV_LOG(DEBUG,
> "counter type = %d, num of cs = %ld, attributes = %d", @@ -969,7 +905,7 @@
> mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> cs_desc.attributes);
> #endif
> config.ind_table_max_size =
> - attr.rss_caps.max_rwq_indirection_table_size;
> + attr->rss_caps.max_rwq_indirection_table_size;
> /* Remove this check once DPDK supports larger/variable
> * indirection tables. */
> if (config.ind_table_max_size >
> @@ -977,28 +913,28 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
> DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
> config.ind_table_max_size);
> - config.hw_vlan_strip = !!(attr.raw_packet_caps &
> + config.hw_vlan_strip = !!(attr->raw_packet_caps &
> IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
> DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
> (config.hw_vlan_strip ? "" : "not "));
>
> - config.hw_fcs_strip = !!(attr.raw_packet_caps &
> + config.hw_fcs_strip = !!(attr->raw_packet_caps &
> IBV_RAW_PACKET_CAP_SCATTER_FCS);
> DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
> (config.hw_fcs_strip ? "" : "not "));
>
> #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
> - config.hw_padding = !!attr.rx_pad_end_addr_align;
> + config.hw_padding = !!attr->rx_pad_end_addr_align;
> #endif
> DRV_LOG(DEBUG,
> "hardware Rx end alignment padding is %ssupported",
> (config.hw_padding ? "" : "not "));
> config.vf = vf;
> - config.tso = (attr.tso_caps.max_tso > 0 &&
> - (attr.tso_caps.supported_qpts &
> + config.tso = (attr->tso_caps.max_tso > 0 &&
> + (attr->tso_caps.supported_qpts &
> (1 << IBV_QPT_RAW_PACKET)));
> if (config.tso)
> - config.tso_max_payload_sz = attr.tso_caps.max_tso;
> + config.tso_max_payload_sz = attr->tso_caps.max_tso;
> if (config.mps && !mps) {
> DRV_LOG(ERR,
> "multi-packet send not supported on this device"
> @@ -1039,8 +975,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> eth_dev->data->dev_private = priv;
> priv->dev_data = eth_dev->data;
> eth_dev->data->mac_addrs = priv->mac;
> - eth_dev->device = &pci_dev->device;
> - rte_eth_copy_pci_info(eth_dev, pci_dev);
> + eth_dev->device = dpdk_dev;
> eth_dev->device->driver = &mlx5_driver.driver;
> err = mlx5_uar_init_primary(eth_dev);
> if (err) {
> @@ -1145,13 +1080,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> priv, mem_event_cb);
> rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
> rte_eth_dev_probing_finish(eth_dev);
> - /*
> - * Each eth_dev instance is assigned its own Verbs context,
> - * since this one is consumed, let the next iteration open
> - * another.
> - */
> - ctx = NULL;
> - continue;
> + return eth_dev;
> port_error:
> if (priv)
> rte_free(priv);
> @@ -1159,24 +1088,165 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> claim_zero(mlx5_glue->dealloc_pd(pd));
> if (eth_dev && rte_eal_process_type() == RTE_PROC_PRIMARY)
> rte_eth_dev_release_port(eth_dev);
> - break;
> }
> - /*
> - * XXX if something went wrong in the loop above, there is a resource
> - * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as
> - * long as the dpdk does not provide a way to deallocate a ethdev and a
> - * way to enumerate the registered ethdevs to free the previous ones.
> - */
> error:
> if (ctx)
> claim_zero(mlx5_glue->close_device(ctx));
> - if (list)
> - mlx5_glue->free_device_list(list);
> - if (err) {
> - rte_errno = err;
> + assert(err > 0);
> + rte_errno = err;
> + return NULL;
> +}
> +
> +/**
> + * Spawn Ethernet devices from Verbs information, one per detected port.
> + *
> + * @param dpdk_dev
> + * Backing DPDK device.
> + * @param ibv_dev
> + * Verbs device.
> + * @param vf
> + * If nonzero, enable VF-specific features.
> + *
> + * @return
> + * A NULL-terminated list of Ethernet device objects on success, NULL
> + * otherwise and rte_errno is set. Caller is expected to release list
> + * memory through free().
> + */
> +static struct rte_eth_dev **
> +mlx5_dev_spawn(struct rte_device *dpdk_dev,
> + struct ibv_device *ibv_dev,
> + int vf)
> +{
> + struct rte_eth_dev **eth_list = NULL;
> + struct ibv_context *ctx;
> + struct ibv_device_attr_ex attr;
> + unsigned int i;
> + int ret;
> +
> + errno = 0;
> + ctx = mlx5_glue->open_device(ibv_dev);
> + if (!ctx) {
> + rte_errno = errno ? errno : ENODEV;
> + if (rte_errno == ENODEV)
> + DRV_LOG(ERR,
> + "cannot access device, is mlx5_ib loaded?");
> + else
> + DRV_LOG(ERR,
> + "cannot use device, are drivers up to date?");
> + return NULL;
> + }
> + ret = mlx5_glue->query_device_ex(ctx, NULL, &attr);
> + mlx5_glue->close_device(ctx);
> + if (ret) {
> + rte_errno = ret;
> + DRV_LOG(ERR, "unable to query device information: %s",
> + strerror(rte_errno));
> + return NULL;
> + }
> + DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
> + eth_list = malloc(sizeof(*eth_list) *
> + (attr.orig_attr.phys_port_cnt + 1));
> + if (!eth_list) {
> + rte_errno = errno;
> + return NULL;
> + }
> + for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
> + eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
> + &attr, i + 1);
> + if (eth_list[i])
> + continue;
> + /* Save rte_errno and roll back in case of failure. */
> + ret = rte_errno;
> + while (i--) {
> + mlx5_dev_close(eth_list[i]);
> + if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> + rte_free(eth_list[i]->data->dev_private);
> + claim_zero(rte_eth_dev_release_port(eth_list[i]));
> + }
> + free(eth_list);
> + rte_errno = ret;
> + return NULL;
The code is correct, but I personally prefer to move complex error handling to
dedicate "error:" block to make the code clear.
> + }
> + eth_list[i] = NULL;
> + return eth_list;
> +}
> +
> +/**
> + * DPDK callback to register a PCI device.
> + *
> + * This function creates an Ethernet device for each port of a given
> + * PCI device.
> + *
> + * @param[in] pci_drv
> + * PCI driver structure (mlx5_driver).
> + * @param[in] pci_dev
> + * PCI device information.
> + *
> + * @return
> + * 0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> + struct rte_pci_device *pci_dev) {
> + struct ibv_device **ibv_list;
> + struct rte_eth_dev **eth_list = NULL;
> + int vf;
> + int ret;
> +
> + assert(pci_drv == &mlx5_driver);
> + switch (pci_dev->id.device_id) {
> + case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
> + case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
> + case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
> + case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
> + vf = 1;
> + break;
> + default:
> + vf = 0;
> + }
How about use a macro for vf detection and invoke in mlx5_dev_spawn_one().
Seems it not used in outer callers.
> + errno = 0;
> + ibv_list = mlx5_glue->get_device_list(&ret);
> + if (!ibv_list) {
> + rte_errno = errno ? errno : ENOSYS;
> + DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
> return -rte_errno;
> }
> - return 0;
> + while (ret-- > 0) {
> + struct rte_pci_addr pci_addr;
> +
> + DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
> + if (mlx5_ibv_device_to_pci_addr(ibv_list[ret], &pci_addr))
> + continue;
> + if (pci_dev->addr.domain != pci_addr.domain ||
> + pci_dev->addr.bus != pci_addr.bus ||
> + pci_dev->addr.devid != pci_addr.devid ||
> + pci_dev->addr.function != pci_addr.function)
> + continue;
> + DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
> + ibv_list[ret]->name);
> + break;
> + }
> + if (ret >= 0)
> + eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_list[ret], vf);
> + mlx5_glue->free_device_list(ibv_list);
> + if (!eth_list || !*eth_list) {
> + DRV_LOG(WARNING,
> + "no Verbs device matches PCI " PCI_PRI_FMT ","
> + " are kernel drivers loaded?",
> + pci_dev->addr.domain, pci_dev->addr.bus,
> + pci_dev->addr.devid, pci_dev->addr.function);
> + rte_errno = ENOENT;
> + ret = -rte_errno;
> + } else {
> + for (ret = 0; eth_list[ret]; ++ret) {
> + rte_eth_copy_pci_info(eth_list[ret], pci_dev);
> + rte_eth_dev_probing_finish(eth_list[ret]);
> + }
> + ret = 0;
> + }
> + free(eth_list);
> + return ret;
> }
>
> static const struct rte_pci_id mlx5_pci_id_map[] = {
> --
> 2.11.0
More information about the dev
mailing list