[Patch v6 01/18] net/mana: add basic driver, build environment and doc

lihuisong (C) lihuisong at huawei.com
Wed Aug 31 03:32:51 CEST 2022


在 2022/8/31 6:51, longli at linuxonhyperv.com 写道:
> From: Long Li <longli at microsoft.com>
>
> MANA is a PCI device. It uses IB verbs to access hardware through the
> kernel RDMA layer. This patch introduces build environment and basic
> device probe functions.
>
> Signed-off-by: Long Li <longli at microsoft.com>
> ---
> Change log:
> v2:
> Fix typos.
> Make the driver build only on x86-64 and Linux.
> Remove unused header files.
> Change port definition to uint16_t or uint8_t (for IB).
> Use getline() in place of fgets() to read and truncate a line.
> v3:
> Add meson build check for required functions from RDMA direct verb header file
> v4:
> Remove extra "\n" in logging code.
> Use "r" in place of "rb" in fopen() to read text files.
>
> [snip]
> +
> +static int mana_pci_probe_mac(struct rte_pci_driver *pci_drv __rte_unused,
> +			      struct rte_pci_device *pci_dev,
> +			      struct rte_ether_addr *mac_addr)
> +{
> +	struct ibv_device **ibv_list;
> +	int ibv_idx;
> +	struct ibv_context *ctx;
> +	struct ibv_device_attr_ex dev_attr;
> +	int num_devices;
> +	int ret = 0;
> +	uint8_t port;
> +	struct mana_priv *priv = NULL;
> +	struct rte_eth_dev *eth_dev = NULL;
> +	bool found_port;
> +
> +	ibv_list = ibv_get_device_list(&num_devices);
> +	for (ibv_idx = 0; ibv_idx < num_devices; ibv_idx++) {
> +		struct ibv_device *ibdev = ibv_list[ibv_idx];
> +		struct rte_pci_addr pci_addr;
> +
> +		DRV_LOG(INFO, "Probe device name %s dev_name %s ibdev_path %s",
> +			ibdev->name, ibdev->dev_name, ibdev->ibdev_path);
> +
> +		if (mana_ibv_device_to_pci_addr(ibdev, &pci_addr))
> +			continue;
> +
> +		/* Ignore if this IB device is not this PCI device */
> +		if (pci_dev->addr.domain != pci_addr.domain ||
> +		    pci_dev->addr.bus != pci_addr.bus ||
> +		    pci_dev->addr.devid != pci_addr.devid ||
> +		    pci_dev->addr.function != pci_addr.function)
> +			continue;
> +
> +		ctx = ibv_open_device(ibdev);
> +		if (!ctx) {
> +			DRV_LOG(ERR, "Failed to open IB device %s",
> +				ibdev->name);
> +			continue;
> +		}
> +
> +		ret = ibv_query_device_ex(ctx, NULL, &dev_attr);
> +		DRV_LOG(INFO, "dev_attr.orig_attr.phys_port_cnt %u",
> +			dev_attr.orig_attr.phys_port_cnt);
> +		found_port = false;
> +
> +		for (port = 1; port <= dev_attr.orig_attr.phys_port_cnt;
> +		     port++) {
> +			struct ibv_parent_domain_init_attr attr = {};
> +			struct rte_ether_addr addr;
> +			char address[64];
> +			char name[RTE_ETH_NAME_MAX_LEN];
> +
> +			ret = get_port_mac(ibdev, port, &addr);
> +			if (ret)
> +				continue;
> +
> +			if (mac_addr && !rte_is_same_ether_addr(&addr, mac_addr))
> +				continue;
> +
> +			rte_ether_format_addr(address, sizeof(address), &addr);
> +			DRV_LOG(INFO, "device located port %u address %s",
> +				port, address);
> +			found_port = true;
> +
> +			priv = rte_zmalloc_socket(NULL, sizeof(*priv),
> +						  RTE_CACHE_LINE_SIZE,
> +						  SOCKET_ID_ANY);
> +			if (!priv) {
> +				ret = -ENOMEM;
> +				goto failed;
> +			}
> +
> +			snprintf(name, sizeof(name), "%s_port%d",
> +				 pci_dev->device.name, port);
> +
> +			if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
> +				int fd;
> +
> +				eth_dev = rte_eth_dev_attach_secondary(name);
> +				if (!eth_dev) {
> +					DRV_LOG(ERR, "Can't attach to dev %s",
> +						name);
> +					ret = -ENOMEM;
> +					goto failed;
> +				}
> +
> +				eth_dev->device = &pci_dev->device;
> +				eth_dev->dev_ops = &mana_dev_sec_ops;
> +				ret = mana_proc_priv_init(eth_dev);
> +				if (ret)
> +					goto failed;
> +				priv->process_priv = eth_dev->process_private;
> +
> +				/* Get the IB FD from the primary process */
> +				fd = mana_mp_req_verbs_cmd_fd(eth_dev);
> +				if (fd < 0) {
> +					DRV_LOG(ERR, "Failed to get FD %d", fd);
> +					ret = -ENODEV;
> +					goto failed;
> +				}
> +
> +				ret = mana_map_doorbell_secondary(eth_dev, fd);
> +				if (ret) {
> +					DRV_LOG(ERR, "Failed secondary map %d",
> +						fd);
> +					goto failed;
> +				}
> +
> +				/* fd is no longer used after mapping doorbell */
> +				close(fd);
> +
> +				rte_spinlock_lock(&mana_shared_data->lock);
> +				mana_shared_data->secondary_cnt++;
> +				mana_local_data.secondary_cnt++;
> +				rte_spinlock_unlock(&mana_shared_data->lock);
> +
> +				rte_eth_copy_pci_info(eth_dev, pci_dev);
> +				rte_eth_dev_probing_finish(eth_dev);
> +
> +				/* Impossible to have more than one port
> +				 * matching a MAC address
> +				 */
> +				continue;
> +			}
> +
> +			eth_dev = rte_eth_dev_allocate(name);
> +			if (!eth_dev) {
> +				ret = -ENOMEM;
> +				goto failed;
> +			}
> +
> +			eth_dev->data->mac_addrs =
> +				rte_calloc("mana_mac", 1,
> +					   sizeof(struct rte_ether_addr), 0);
> +			if (!eth_dev->data->mac_addrs) {
> +				ret = -ENOMEM;
> +				goto failed;
> +			}
> +
> +			rte_ether_addr_copy(&addr, eth_dev->data->mac_addrs);
> +
> +			priv->ib_pd = ibv_alloc_pd(ctx);
> +			if (!priv->ib_pd) {
> +				DRV_LOG(ERR, "ibv_alloc_pd failed port %d", port);
> +				ret = -ENOMEM;
> +				goto failed;
> +			}
> +
> +			/* Create a parent domain with the port number */
> +			attr.pd = priv->ib_pd;
> +			attr.comp_mask = IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT;
> +			attr.pd_context = (void *)(uint64_t)port;
> +			priv->ib_parent_pd = ibv_alloc_parent_domain(ctx, &attr);
> +			if (!priv->ib_parent_pd) {
> +				DRV_LOG(ERR,
> +					"ibv_alloc_parent_domain failed port %d",
> +					port);
> +				ret = -ENOMEM;
> +				goto failed;
> +			}
> +
> +			priv->ib_ctx = ctx;
> +			priv->port_id = eth_dev->data->port_id;
> +			priv->dev_port = port;
> +			eth_dev->data->dev_private = priv;
> +			priv->dev_data = eth_dev->data;
> +
> +			priv->max_rx_queues = dev_attr.orig_attr.max_qp;
> +			priv->max_tx_queues = dev_attr.orig_attr.max_qp;
> +
> +			priv->max_rx_desc =
> +				RTE_MIN(dev_attr.orig_attr.max_qp_wr,
> +					dev_attr.orig_attr.max_cqe);
> +			priv->max_tx_desc =
> +				RTE_MIN(dev_attr.orig_attr.max_qp_wr,
> +					dev_attr.orig_attr.max_cqe);
> +
> +			priv->max_send_sge = dev_attr.orig_attr.max_sge;
> +			priv->max_recv_sge = dev_attr.orig_attr.max_sge;
> +
> +			priv->max_mr = dev_attr.orig_attr.max_mr;
> +			priv->max_mr_size = dev_attr.orig_attr.max_mr_size;
> +
> +			DRV_LOG(INFO, "dev %s max queues %d desc %d sge %d",
> +				name, priv->max_rx_queues, priv->max_rx_desc,
> +				priv->max_send_sge);
> +
> +			rte_spinlock_lock(&mana_shared_data->lock);
> +			mana_shared_data->primary_cnt++;
> +			rte_spinlock_unlock(&mana_shared_data->lock);
> +
> +			eth_dev->data->dev_flags |= RTE_ETH_DEV_INTR_RMV;
> +
> +			eth_dev->device = &pci_dev->device;
> +			eth_dev->data->dev_flags |=
> +				RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
> +

Please do not use this temporary flag (RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS). Please review this commit:

f30e69b41f94 ("ethdev: add device flag to bypass auto-filled queue xstats")

That commit requires the PMD to fill in the per-queue statistics itself in 
its .xstats_get() callback.

> +			DRV_LOG(INFO, "device %s at port %u",
> +				name, eth_dev->data->port_id);
> +
> +			eth_dev->rx_pkt_burst = mana_rx_burst_removed;
> +			eth_dev->tx_pkt_burst = mana_tx_burst_removed;
> +			eth_dev->dev_ops = &mana_dev_ops;
> +
> +			rte_eth_copy_pci_info(eth_dev, pci_dev);
> +			rte_eth_dev_probing_finish(eth_dev);
> +		}
> +
> +		/* Secondary process doesn't need an ibv_ctx. It maps the
> +		 * doorbell pages using the IB cmd_fd passed from the primary
> +		 * process and send messages to primary process for memory
> +		 * registrations.
> +		 */
> +		if (!found_port || rte_eal_process_type() == RTE_PROC_SECONDARY)
> +			ibv_close_device(ctx);
> +	}
> +
> +	ibv_free_device_list(ibv_list);
> +	return 0;
> +
> +failed:
> +	/* Free the resource for the port failed */
> +	if (priv) {
> +		if (priv->ib_parent_pd)
> +			ibv_dealloc_pd(priv->ib_parent_pd);
> +
> +		if (priv->ib_pd)
> +			ibv_dealloc_pd(priv->ib_pd);
> +	}
> +
> +	if (eth_dev)
> +		rte_eth_dev_release_port(eth_dev);
> +
> +	rte_free(priv);
> +
> +	ibv_close_device(ctx);
> +	ibv_free_device_list(ibv_list);
> +
> +	return ret;
> +}
> [snip]
>


More information about the dev mailing list