[PATCH v2] net/mlx5: allow MTU mismatch for running shared Rx queues

Shani Peretz shperetz at nvidia.com
Thu Jan 29 09:13:23 CET 2026


Previously, mlx5_shared_rxq_match() rejected any MTU mismatch between
a port and the shared RX queue, even when the queue hardware was already
running. This caused failures when hot-adding representors after runtime
MTU changes (new port has default MTU=1500, but running queue was created
with MTU=9000). It also prevented proper port reconfiguration scenarios.

The fix allows MTU mismatches when rxq_ctrl->obj != NULL (queue is
running), since runtime MTU changes via rte_eth_dev_set_mtu() only update
software bookkeeping without recreating hardware resources. Stopped queues
still enforce MTU matching to trigger proper reconfiguration.

This patch also removes the redundant priv->mtu field and uses
dev->data->mtu consistently for MTU tracking throughout the driver.

Fixes: 4414eb800708 ("net/mlx5: store MTU at Rx queue allocation time")
Fixes: 09c2555303be ("net/mlx5: support shared Rx queue")
Cc: a.schollmeyer at syseleven.de
Cc: stable at dpdk.org

Signed-off-by: Shani Peretz <shperetz at nvidia.com>
Acked-by: Dariusz Sosnowski <dsosnowski at nvidia.com>
---
 doc/guides/nics/mlx5.rst           |  6 +++
 drivers/net/mlx5/linux/mlx5_os.c   |  6 +--
 drivers/net/mlx5/mlx5.h            |  1 -
 drivers/net/mlx5/mlx5_ethdev.c     |  3 --
 drivers/net/mlx5/mlx5_rxq.c        | 85 ++++++++++++++++++++++++------
 drivers/net/mlx5/windows/mlx5_os.c |  6 +--
 6 files changed, 78 insertions(+), 29 deletions(-)

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 904a0ac358..66323907a9 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -1827,6 +1827,12 @@ Limitations
 
 #. Counters of received packets and bytes of queues in the same group and queue ID are same.
 
+#. Each Rx queue in share group must have the same queue configuration.
+
+#. Ports in share group must have equal MTU at port start time.
+
+#. Reconfiguring a shared queue while it is in use (started or referenced by flows) is not allowed.
+
 
 .. _mlx5_rx_threshold:
 
diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index 7f73183bb1..e8c4fb62c8 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -1334,7 +1334,6 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	priv->sh = sh;
 	priv->dev_port = spawn->phys_port;
 	priv->pci_dev = spawn->pci_dev;
-	priv->mtu = RTE_ETHER_MTU;
 	/* Some internal functions rely on Netlink sockets, open them now. */
 	priv->nl_socket_rdma = nl_rdma;
 	priv->nl_socket_route =	mlx5_nl_init(NETLINK_ROUTE, 0);
@@ -1606,14 +1605,13 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	}
 #endif
 	/* Get actual MTU if possible. */
-	err = mlx5_get_mtu(eth_dev, &priv->mtu);
+	err = mlx5_get_mtu(eth_dev, &eth_dev->data->mtu);
 	if (err) {
 		err = rte_errno;
 		goto error;
 	}
-	eth_dev->data->mtu = priv->mtu;
 	DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id,
-		priv->mtu);
+		eth_dev->data->mtu);
 	/* Initialize burst functions to prevent crashes before link-up. */
 	eth_dev->rx_pkt_burst = rte_eth_pkt_burst_dummy;
 	eth_dev->tx_pkt_burst = rte_eth_pkt_burst_dummy;
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 966e802f5f..551f3b9d17 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -1983,7 +1983,6 @@ struct mlx5_priv {
 	uint16_t vlan_filter[MLX5_MAX_VLAN_IDS]; /* VLAN filters table. */
 	unsigned int vlan_filter_n; /* Number of configured VLAN filters. */
 	/* Device properties. */
-	uint16_t mtu; /* Configured MTU. */
 	uint16_t min_mtu; /* Minimum MTU allowed on the NIC. */
 	uint16_t max_mtu; /* Maximum MTU allowed on the NIC. */
 	unsigned int isolated:1; /* Whether isolated mode is enabled. */
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index c93a7ac4f2..00a1d444ef 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -679,7 +679,6 @@ mlx5_dev_supported_ptypes_get(struct rte_eth_dev *dev, size_t *no_of_elements)
 int
 mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
 	uint16_t kern_mtu = 0;
 	int ret;
 
@@ -688,7 +687,6 @@ mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
 		return ret;
 
 	if (kern_mtu == mtu) {
-		priv->mtu = mtu;
 		DRV_LOG(DEBUG, "port %u adapter MTU was already set to %u",
 			dev->data->port_id, mtu);
 		return 0;
@@ -702,7 +700,6 @@ mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
 	if (ret)
 		return ret;
 	if (kern_mtu == mtu) {
-		priv->mtu = mtu;
 		DRV_LOG(DEBUG, "port %u adapter MTU set to %u",
 			dev->data->port_id, mtu);
 		return 0;
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 9210a92c5f..c4ba746d47 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -659,8 +659,6 @@ mlx5_rx_queue_pre_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t *desc,
 			struct mlx5_rxq_ctrl **rxq_ctrl)
 {
 	struct mlx5_priv *priv = dev->data->dev_private;
-	struct mlx5_rxq_priv *rxq;
-	bool empty;
 
 	if (*desc > mlx5_dev_get_max_wq_size(priv->sh)) {
 		DRV_LOG(ERR,
@@ -699,14 +697,6 @@ mlx5_rx_queue_pre_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t *desc,
 		if ((*rxq_ctrl)->obj != NULL)
 			/* Some port using shared Rx queue has been started. */
 			return 0;
-		/* Release all owner RxQ to reconfigure Shared RxQ. */
-		do {
-			rxq = LIST_FIRST(&(*rxq_ctrl)->owners);
-			LIST_REMOVE(rxq, owner_entry);
-			empty = LIST_EMPTY(&(*rxq_ctrl)->owners);
-			mlx5_rxq_release(ETH_DEV(rxq->priv), rxq->idx);
-		} while (!empty);
-		*rxq_ctrl = NULL;
 	}
 	return 0;
 }
@@ -780,10 +770,21 @@ mlx5_shared_rxq_match(struct mlx5_rxq_ctrl *rxq_ctrl, struct rte_eth_dev *dev,
 			dev->data->port_id, idx);
 		return false;
 	}
-	if (priv->mtu != rxq_ctrl->mtu) {
-		DRV_LOG(ERR, "port %u queue index %u failed to join shared group: mtu mismatch",
-			dev->data->port_id, idx);
-		return false;
+	if (dev->data->mtu != rxq_ctrl->mtu) {
+		/*
+		 * MTU mismatch is only a problem when the queue hasn't been started yet.
+		 * If rxq_ctrl->obj is NULL, the queue hardware objects haven't been created,
+		 * meaning we're in the initial configuration phase where MTU must match.
+		 * If obj != NULL, the queue is already running with its hardware configured,
+		 * and runtime MTU changes are safe as they only update software bookkeeping
+		 * without recreating hardware resources.
+		 */
+		if (rxq_ctrl->obj == NULL) {
+			DRV_LOG(DEBUG, "port %u queue index %u: mtu mismatch with existing shared rxq_ctrl "
+					"(port mtu=%u rxq_ctrl mtu=%u), reconfiguration needed",
+					dev->data->port_id, idx, dev->data->mtu, rxq_ctrl->mtu);
+			return false;
+		}
 	}
 	if (priv->dev_data->dev_conf.intr_conf.rxq !=
 	    spriv->dev_data->dev_conf.intr_conf.rxq) {
@@ -926,8 +927,57 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		if (rxq_ctrl != NULL &&
 		    !mlx5_shared_rxq_match(rxq_ctrl, dev, idx, desc, socket,
 					   conf, mp)) {
-			rte_errno = EINVAL;
-			return -rte_errno;
+			struct mlx5_rxq_priv *rxq_tmp;
+			bool empty;
+
+			/*
+			 * Configuration mismatch detected with existing shared RXQ.
+			 * We need to reconfigure, but only if it's safe to do so.
+			 *
+			 * First check: If hardware objects are allocated, the shared queue has
+			 * been started. Reconfiguration would require destroying and recreating
+			 * hardware resources, which cannot be done while the queue is active.
+			 * Return EBUSY to force caller to stop the queue first.
+			 */
+			if (rxq_ctrl->obj != NULL) {
+				DRV_LOG(ERR, "port %u queue index %u: cannot reconfigure shared RXQ while started",
+					dev->data->port_id, idx);
+				rte_errno = EBUSY;
+				return -rte_errno;
+			}
+
+			/*
+			 * Second check: Even if hardware objects aren't allocated yet,
+			 * verify that no owner port is actively using this queue.
+			 * refcnt == 1 means the queue exists but is idle (only setup reference).
+			 * refcnt > 1 means the queue is being used by flows or other components.
+			 * This prevents releasing a queue that other ports depend on.
+			 */
+			LIST_FOREACH(rxq_tmp, &rxq_ctrl->owners, owner_entry) {
+				if (rxq_tmp->refcnt > 1) {
+					DRV_LOG(ERR, "port %u queue index %u: cannot reconfigure shared RXQ "
+						"while other ports are running",
+						dev->data->port_id, idx);
+					rte_errno = EBUSY;
+					return -rte_errno;
+				}
+			}
+
+			/*
+			 * Safe to reconfigure: hardware not started and no active users.
+			 * Release all owner ports from the existing shared rxq_ctrl.
+			 * This will decrement references and eventually free the old rxq_ctrl.
+			 * Setting rxq_ctrl to NULL triggers creation of a new one below with
+			 * the updated configuration.
+			 */
+			do {
+				rxq_tmp = LIST_FIRST(&rxq_ctrl->owners);
+				LIST_REMOVE(rxq_tmp, owner_entry);
+				empty = LIST_EMPTY(&rxq_ctrl->owners);
+				mlx5_rxq_release(ETH_DEV(rxq_tmp->priv), rxq_tmp->idx);
+			} while (!empty);
+
+			rxq_ctrl = NULL;
 		}
 	} else {
 		res = mlx5_rx_queue_pre_setup(dev, idx, &desc, &rxq_ctrl);
@@ -1813,7 +1863,8 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 	LIST_INIT(&tmpl->owners);
 	MLX5_ASSERT(n_seg && n_seg <= MLX5_MAX_RXQ_NSEG);
 	/*
-	 * Save the original MTU to check against for shared rx queues.
+	 * Save the current MTU to check against for shared rx queues.
+	 * Use dev->data->mtu which reflects the actual current MTU.
 	 */
 	tmpl->mtu = dev->data->mtu;
 	/*
diff --git a/drivers/net/mlx5/windows/mlx5_os.c b/drivers/net/mlx5/windows/mlx5_os.c
index 4eadc872a5..5dcabfbed2 100644
--- a/drivers/net/mlx5/windows/mlx5_os.c
+++ b/drivers/net/mlx5/windows/mlx5_os.c
@@ -396,7 +396,6 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	priv->sh = sh;
 	priv->dev_port = spawn->phys_port;
 	priv->pci_dev = spawn->pci_dev;
-	priv->mtu = RTE_ETHER_MTU;
 	priv->mp_id.port_id = port_id;
 	strlcpy(priv->mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
 	priv->representor = !!switch_info->representor;
@@ -504,14 +503,13 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	}
 #endif
 	/* Get actual MTU if possible. */
-	err = mlx5_get_mtu(eth_dev, &priv->mtu);
+	err = mlx5_get_mtu(eth_dev, &eth_dev->data->mtu);
 	if (err) {
 		err = rte_errno;
 		goto error;
 	}
-	eth_dev->data->mtu = priv->mtu;
 	DRV_LOG(DEBUG, "port %u MTU is %u.", eth_dev->data->port_id,
-		priv->mtu);
+		eth_dev->data->mtu);
 	/* Initialize burst functions to prevent crashes before link-up. */
 	eth_dev->rx_pkt_burst = rte_eth_pkt_burst_dummy;
 	eth_dev->tx_pkt_burst = rte_eth_pkt_burst_dummy;
-- 
2.43.0



More information about the stable mailing list