[PATCH] net/mlx5: use port index as representor index

Dariusz Sosnowski dsosnowski at nvidia.com
Fri May 22 12:19:33 CEST 2026


Since the offending commit, mlx5 driver supports probing
representors on BlueField DPUs with Socket Direct (SD).
Such a card can be connected to 2 different CPUs on the host system.
On the DPU, the user would see the following network devices:

- p0 and p1 - physical ports
- pf0hpf and pf2hpf - PF0 on CPU 0 and CPU 1 respectively
- pf1hpf and pf3hpf - PF1 on CPU 0 and CPU 1 respectively

mlx5 driver finds the relevant netdev by matching information
provided in representor devarg to phys_port_name
reported by Linux kernel.
For the above interfaces phys_port_name's would be reported
and probed as:

- p0 -> p0, no need for representor devarg
- p1 -> p1, with representor=pf1
- pf0hpf -> c1pf0, with representor=c1pf0vf65535
- pf1hpf -> c1pf1, with representor=c1pf1vf65535
- pf2hpf -> c2pf0, with representor=c2pf0vf65535
- pf3hpf -> c2pf1, with representor=c2pf1vf65535

Although hot-plugging all these representors is successful,
RTE_ETH_FOREACH_MATCHING_DEV() macro would not find the DPDK ports.
This is caused by missing information reported by mlx5 driver
through rte_eth_representor_info_get() API.
Specifically, mlx5 driver did not report controller index for all
representor ranges.

Until now mlx5 driver used static encoding for 16-bit representor_id:

- 2 bits for representor type
- 2 bits for PF index
- 12 bits for representor index (either VF or SF number)

Controller index was not encoded. This caused the mentioned issue
and, on top of that, the static encoding:

- limits the number of PFs
- limits the number of SFs

This patch changes the mlx5 driver logic for
rte_eth_representor_info_get().
Instead of static encoding:

- representor_id's will be dynamically assigned
  to each probed representor.
- rte_eth_representor_info_get() will report N ranges:
    - N == number of probed ports on single embedded switch
    - Each range will define single representor_id
      for given controller/PF/VF/SF.

Fixes: 2f7cdd821b1b ("net/mlx5: fix probing to allow BlueField Socket Direct")
Cc: stable at dpdk.org

Signed-off-by: Dariusz Sosnowski <dsosnowski at nvidia.com>
Acked-by: Bing Zhao <bingz at nvidia.com>
---
 drivers/net/mlx5/linux/mlx5_os.c |   6 +-
 drivers/net/mlx5/mlx5.h          |  19 +++
 drivers/net/mlx5/mlx5_ethdev.c   | 284 +++++++++++++++++++------------
 3 files changed, 199 insertions(+), 110 deletions(-)

diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index 0fc721592b..5305523c1b 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -1677,9 +1677,13 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 		err = ENOMEM;
 		goto error;
 	}
+	priv->port_info.type = spawn->info.name_type;
+	priv->port_info.ctrl_num = spawn->info.ctrl_num;
+	priv->port_info.pf_num = spawn->info.pf_num;
+	priv->port_info.port_num = spawn->info.port_name;
 	if (priv->representor) {
 		eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR;
-		eth_dev->data->representor_id = priv->representor_id;
+		eth_dev->data->representor_id = eth_dev->data->port_id;
 		MLX5_ETH_FOREACH_DEV(port_id, dpdk_dev) {
 			struct mlx5_priv *opriv =
 				rte_eth_devices[port_id].data->dev_private;
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 49a0c03544..23803b450b 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -1984,6 +1984,24 @@ struct mlx5_quota_ctx {
 	struct mlx5_indexed_pool *quota_ipool; /* Manage quota objects */
 };
 
+/* Stores info parsed from phys_port_name related to given DPDK port. */
+struct mlx5_representor_info {
+	enum mlx5_nl_phys_port_name_type type;
+	/* PCI controller index. 0 if no controller was reported in phys_port_name. */
+	int32_t ctrl_num;
+	/* PF index. */
+	int32_t pf_num;
+	/*
+	 * Representor number:
+	 *
+	 * - For VF/SF - VF/SF index.
+	 * - For PFHPF - -1.
+	 * - For uplink - physical port index.
+	 * - For others - VF representor is assumed, so VF index.
+	 */
+	int32_t port_num;
+};
+
 struct mlx5_nta_sample_ctx;
 struct mlx5_priv {
 	struct rte_eth_dev_data *dev_data;  /* Pointer to device data. */
@@ -2019,6 +2037,7 @@ struct mlx5_priv {
 	uint32_t vport_meta_tag; /* Used for vport index match ove VF LAG. */
 	uint32_t vport_meta_mask; /* Used for vport index field match mask. */
 	uint16_t representor_id; /* UINT16_MAX if not a representor. */
+	struct mlx5_representor_info port_info;
 	int32_t pf_bond; /* >=0, representor owner PF index in bonding. */
 	int32_t mpesw_owner; /* >=0, representor owner PF index in MPESW. */
 	int32_t mpesw_port; /* Related port index of MPESW device. < 0 - no MPESW. */
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index a29cdeeb50..e14b7f148b 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -345,6 +345,23 @@ mlx5_dev_get_max_wq_size(struct mlx5_dev_ctx_shared *sh)
 	return max_wqe;
 }
 
+/**
+ * Get switch port ID for given DPDK port.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @return
+ *   Switch port ID reported through rte_eth_dev_info_get().
+ */
+static uint16_t
+mlx5_dev_switch_info_port_id_get(struct rte_eth_dev *dev)
+{
+	if (rte_eth_dev_is_repr(dev))
+		return dev->data->port_id;
+
+	return UINT16_MAX;
+}
+
 /**
  * DPDK callback to get information about the device.
  *
@@ -401,7 +418,7 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
 		info->dev_capa |= RTE_ETH_DEV_CAPA_RXQ_SHARE;
 	info->switch_info.name = dev->data->name;
 	info->switch_info.domain_id = priv->domain_id;
-	info->switch_info.port_id = priv->representor_id;
+	info->switch_info.port_id = mlx5_dev_switch_info_port_id_get(dev);
 	info->switch_info.rx_domain = 0; /* No sub Rx domains. */
 	if (priv->representor) {
 		uint16_t port_id;
@@ -472,14 +489,162 @@ mlx5_representor_id_encode(const struct mlx5_switch_info *info,
 	return MLX5_REPRESENTOR_ID(pf, type, repr);
 }
 
+static unsigned int
+mlx5_representor_info_count_one(struct mlx5_priv *priv)
+{
+	switch (priv->port_info.type) {
+	case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
+		return 2;
+	case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
+		/* Only representor uplinks should be reported */
+		if (!priv->representor)
+			return 0;
+		return 1;
+	case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
+		/* FALLTHROUGH */
+	case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
+		/* FALLTHROUGH */
+	case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
+		/* FALLTHROUGH */
+	case MLX5_PHYS_PORT_NAME_TYPE_PFSF:
+		/* FALLTHROUGH */
+	case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
+		/* FALLTHROUGH */
+	default:
+		return 1;
+	}
+}
+
+static unsigned int
+mlx5_representor_info_count(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	uint16_t port_id;
+	unsigned int count = 0;
+
+	MLX5_ETH_FOREACH_DEV(port_id, dev->device) {
+		struct mlx5_priv *opriv = rte_eth_devices[port_id].data->dev_private;
+
+		if (!opriv ||
+		    opriv->sh != priv->sh ||
+		    opriv->domain_id != priv->domain_id)
+			continue;
+
+		count += mlx5_representor_info_count_one(opriv);
+	}
+
+	return count;
+}
+
+static void
+mlx5_representor_info_fill_one(struct mlx5_priv *priv,
+			       struct rte_eth_representor_info *info)
+{
+	struct rte_eth_representor_range *range;
+	unsigned int count;
+
+	count = mlx5_representor_info_count_one(priv);
+	if (count == 0)
+		return;
+
+	if (info->nb_ranges + count > info->nb_ranges_alloc) {
+		DRV_LOG(ERR, "port %u representor info already full", priv->dev_data->port_id);
+		return;
+	}
+
+	range = &info->ranges[info->nb_ranges];
+
+	switch (priv->port_info.type) {
+	case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
+		range->type = RTE_ETH_REPRESENTOR_PF;
+		range->controller = priv->port_info.ctrl_num;
+		range->pf = priv->port_info.port_num;
+		range->id_base = priv->dev_data->port_id;
+		range->id_end = range->id_base;
+		snprintf(range->name, sizeof(range->name), "pf%d", range->pf);
+		break;
+	case MLX5_PHYS_PORT_NAME_TYPE_PFSF:
+		/* Secondly, fill in SF variant. */
+		range->type = RTE_ETH_REPRESENTOR_SF;
+		range->controller = priv->port_info.ctrl_num;
+		range->pf = priv->port_info.pf_num;
+		range->sf = priv->port_info.port_num;
+		range->id_base = priv->dev_data->port_id;
+		range->id_end = range->id_base;
+		snprintf(range->name, sizeof(range->name), "pf%dsf", range->pf);
+		break;
+	case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
+		/*
+		 * Host PF can be probed either through VF(0xffff) or SF(0xffff).
+		 * Firstly fill in VF variant.
+		 */
+		range->type = RTE_ETH_REPRESENTOR_VF;
+		range->controller = priv->port_info.ctrl_num;
+		range->pf = priv->port_info.pf_num;
+		range->vf = UINT16_MAX;
+		range->id_base = priv->dev_data->port_id;
+		range->id_end = range->id_base;
+		snprintf(range->name, sizeof(range->name), "pf%dvf", range->pf);
+
+		/* Move to the SF variant. */
+		range++;
+
+		/* Fill in SF variant. */
+		range->type = RTE_ETH_REPRESENTOR_SF;
+		range->controller = priv->port_info.ctrl_num;
+		range->pf = priv->port_info.pf_num;
+		range->sf = UINT16_MAX;
+		range->id_base = priv->dev_data->port_id;
+		range->id_end = range->id_base;
+		snprintf(range->name, sizeof(range->name), "pf%dsf", range->pf);
+		break;
+	case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
+		/* FALLTHROUGH */
+	case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
+		/* FALLTHROUGH */
+	case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
+		/* FALLTHROUGH */
+	case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
+		range->type = RTE_ETH_REPRESENTOR_VF;
+		range->controller = priv->port_info.ctrl_num;
+		range->pf = priv->port_info.pf_num;
+		range->vf = priv->port_info.port_num;
+		range->id_base = priv->dev_data->port_id;
+		range->id_end = range->id_base;
+		snprintf(range->name, sizeof(range->name), "pf%dvf", range->pf);
+		break;
+	}
+
+	info->nb_ranges += count;
+}
+
+static unsigned int
+mlx5_representor_info_fill(struct rte_eth_dev *dev,
+			   struct rte_eth_representor_info *info)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	uint16_t port_id;
+
+	info->controller = priv->port_info.ctrl_num;
+	info->pf = RTE_DEV_TO_PCI(dev->device)->addr.function;
+
+	MLX5_ETH_FOREACH_DEV(port_id, dev->device) {
+		struct mlx5_priv *opriv = rte_eth_devices[port_id].data->dev_private;
+
+		if (!opriv ||
+		    opriv->sh != priv->sh ||
+		    opriv->domain_id != priv->domain_id)
+			continue;
+
+		mlx5_representor_info_fill_one(opriv, info);
+	}
+
+	return info->nb_ranges;
+}
+
 /**
  * DPDK callback to get information about representor.
  *
- * Representor ID bits definition:
- *   vf/sf: 12
- *   type: 2
- *   pf: 2
- *
  * @param dev
  *   Pointer to Ethernet device structure.
  * @param[out] info
@@ -492,110 +657,11 @@ int
 mlx5_representor_info_get(struct rte_eth_dev *dev,
 			  struct rte_eth_representor_info *info)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
-	/* Representor types: PF, VF, HPF at VF, SF and HPF at SF, total 5. */
-	int n_type = RTE_ETH_REPRESENTOR_PF + 2; /* Maximal type + 2 for HPFs. */
-	int n_pf = 8; /* Maximal number of PFs. */
-	int i = 0, pf;
-	int n_entries;
-
 	if (info == NULL)
-		goto out;
-
-	n_entries = n_type * n_pf;
-	if ((uint32_t)n_entries > info->nb_ranges_alloc)
-		n_entries = info->nb_ranges_alloc;
-
-	info->controller = 0;
-	info->pf = 0;
-	if (mlx5_is_port_on_mpesw_device(priv)) {
-		info->pf = priv->mpesw_port;
-		for (i = 0; i < n_pf; i++) {
-			/* PF range, both ports will show the same information. */
-			info->ranges[i].type = RTE_ETH_REPRESENTOR_PF;
-			info->ranges[i].controller = 0;
-			info->ranges[i].pf = priv->mpesw_owner + i + 1;
-			info->ranges[i].vf = 0;
-			/*
-			 * The representor indexes should be the values set of "priv->mpesw_port".
-			 * In the real case now, only 1 PF/UPLINK representor is supported.
-			 * The port index will always be the value of "owner + 1".
-			 */
-			info->ranges[i].id_base =
-				MLX5_REPRESENTOR_ID(priv->mpesw_owner,
-						    info->ranges[i].type,
-						    info->ranges[i].pf);
-			info->ranges[i].id_end =
-				MLX5_REPRESENTOR_ID(priv->mpesw_owner,
-						    info->ranges[i].type,
-						    info->ranges[i].pf);
-			snprintf(info->ranges[i].name,
-				 sizeof(info->ranges[i].name),
-				 "pf%d", info->ranges[i].pf);
-		}
-	} else if (priv->pf_bond >= 0)
-		info->pf = priv->pf_bond;
-	for (pf = 0; pf < n_pf; ++pf) {
-		/* VF range. */
-		info->ranges[i].type = RTE_ETH_REPRESENTOR_VF;
-		info->ranges[i].controller = 0;
-		info->ranges[i].pf = pf;
-		info->ranges[i].vf = 0;
-		info->ranges[i].id_base =
-			MLX5_REPRESENTOR_ID(pf, info->ranges[i].type, 0);
-		info->ranges[i].id_end =
-			MLX5_REPRESENTOR_ID(pf, info->ranges[i].type, -1);
-		snprintf(info->ranges[i].name,
-			 sizeof(info->ranges[i].name), "pf%dvf", pf);
-		i++;
-		if (i == n_entries)
-			break;
-		/* HPF range of VF type. */
-		info->ranges[i].type = RTE_ETH_REPRESENTOR_VF;
-		info->ranges[i].controller = 0;
-		info->ranges[i].pf = pf;
-		info->ranges[i].vf = UINT16_MAX;
-		info->ranges[i].id_base =
-			MLX5_REPRESENTOR_ID(pf, info->ranges[i].type, -1);
-		info->ranges[i].id_end =
-			MLX5_REPRESENTOR_ID(pf, info->ranges[i].type, -1);
-		snprintf(info->ranges[i].name,
-			 sizeof(info->ranges[i].name), "pf%dvf", pf);
-		i++;
-		if (i == n_entries)
-			break;
-		/* SF range. */
-		info->ranges[i].type = RTE_ETH_REPRESENTOR_SF;
-		info->ranges[i].controller = 0;
-		info->ranges[i].pf = pf;
-		info->ranges[i].vf = 0;
-		info->ranges[i].id_base =
-			MLX5_REPRESENTOR_ID(pf, info->ranges[i].type, 0);
-		info->ranges[i].id_end =
-			MLX5_REPRESENTOR_ID(pf, info->ranges[i].type, -1);
-		snprintf(info->ranges[i].name,
-			 sizeof(info->ranges[i].name), "pf%dsf", pf);
-		i++;
-		if (i == n_entries)
-			break;
-		/* HPF range of SF type. */
-		info->ranges[i].type = RTE_ETH_REPRESENTOR_SF;
-		info->ranges[i].controller = 0;
-		info->ranges[i].pf = pf;
-		info->ranges[i].vf = UINT16_MAX;
-		info->ranges[i].id_base =
-			MLX5_REPRESENTOR_ID(pf, info->ranges[i].type, -1);
-		info->ranges[i].id_end =
-			MLX5_REPRESENTOR_ID(pf, info->ranges[i].type, -1);
-		snprintf(info->ranges[i].name,
-			 sizeof(info->ranges[i].name), "pf%dsf", pf);
-		i++;
-		if (i == n_entries)
-			break;
-	}
-	info->nb_ranges = i;
-out:
-	return n_type * n_pf;
+		return mlx5_representor_info_count(dev);
+
+	return mlx5_representor_info_fill(dev, info);
+
 }
 
 /**
-- 
2.47.3



More information about the stable mailing list