[dpdk-dev] [PATCH v1 38/38] vdpa/mlx5: disable ROCE

Matan Azrad matan at mellanox.com
Mon Jan 20 18:03:10 CET 2020


In order to support virtio queue creation by the FW, ROCE mode
should be disabled in the device.

Do it by netlink which is like the devlink tool commands:
	1. devlink dev param set pci/[pci] name enable_roce value false
	   cmode driverinit
    	2. devlink dev reload pci/[pci]
Or by sysfs which is like:
	echo 0 >  /sys/bus/pci/devices/[pci]/roce_enable

The IB device is matched again after ROCE disabling.

Signed-off-by: Matan Azrad <matan at mellanox.com>
---
 drivers/vdpa/mlx5/Makefile    |   2 +-
 drivers/vdpa/mlx5/meson.build |   2 +-
 drivers/vdpa/mlx5/mlx5_vdpa.c | 190 +++++++++++++++++++++++++++++++++++-------
 3 files changed, 160 insertions(+), 34 deletions(-)

diff --git a/drivers/vdpa/mlx5/Makefile b/drivers/vdpa/mlx5/Makefile
index 62938b8..3dee270 100644
--- a/drivers/vdpa/mlx5/Makefile
+++ b/drivers/vdpa/mlx5/Makefile
@@ -29,7 +29,7 @@ CFLAGS += -D_XOPEN_SOURCE=600
 CFLAGS += $(WERROR_FLAGS)
 CFLAGS += -Wno-strict-prototypes
 LDLIBS += -lrte_common_mlx5
-LDLIBS += -lrte_eal -lrte_vhost -lrte_kvargs -lrte_bus_pci -lrte_sched
+LDLIBS += -lrte_eal -lrte_vhost -lrte_kvargs -lrte_pci -lrte_bus_pci -lrte_sched
 
 # A few warnings cannot be avoided in external headers.
 CFLAGS += -Wno-error=cast-qual
diff --git a/drivers/vdpa/mlx5/meson.build b/drivers/vdpa/mlx5/meson.build
index 60cefd7..7e46e34 100644
--- a/drivers/vdpa/mlx5/meson.build
+++ b/drivers/vdpa/mlx5/meson.build
@@ -9,7 +9,7 @@ endif
 
 fmt_name = 'mlx5_vdpa'
 allow_experimental_apis = true
-deps += ['hash', 'common_mlx5', 'vhost', 'bus_pci', 'eal', 'sched']
+deps += ['hash', 'common_mlx5', 'vhost', 'pci', 'bus_pci', 'eal', 'sched']
 sources = files(
 	'mlx5_vdpa.c',
 	'mlx5_vdpa_mem.c',
diff --git a/drivers/vdpa/mlx5/mlx5_vdpa.c b/drivers/vdpa/mlx5/mlx5_vdpa.c
index 2ceef18..5ea5739 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa.c
+++ b/drivers/vdpa/mlx5/mlx5_vdpa.c
@@ -1,15 +1,19 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright 2019 Mellanox Technologies, Ltd
  */
+#include <unistd.h>
+
 #include <rte_malloc.h>
 #include <rte_log.h>
 #include <rte_errno.h>
 #include <rte_bus_pci.h>
+#include <rte_pci.h>
 
 #include <mlx5_glue.h>
 #include <mlx5_common.h>
 #include <mlx5_devx_cmds.h>
 #include <mlx5_prm.h>
+#include <mlx5_nl.h>
 
 #include "mlx5_vdpa_utils.h"
 #include "mlx5_vdpa.h"
@@ -228,6 +232,144 @@
 	.get_notify_area = NULL,
 };
 
+static struct ibv_device *
+mlx5_vdpa_get_ib_device_match(struct rte_pci_addr *addr)
+{
+	int n;
+	struct ibv_device **ibv_list = mlx5_glue->get_device_list(&n);
+	struct ibv_device *ibv_match = NULL;
+
+	if (!ibv_list) {
+		rte_errno = ENOSYS;
+		return NULL;
+	}
+	while (n-- > 0) {
+		struct rte_pci_addr pci_addr;
+
+		DRV_LOG(DEBUG, "Checking device \"%s\"..", ibv_list[n]->name);
+		if (mlx5_dev_to_pci_addr(ibv_list[n]->ibdev_path, &pci_addr))
+			continue;
+		if (memcmp(addr, &pci_addr, sizeof(pci_addr)))
+			continue;
+		ibv_match = ibv_list[n];
+		break;
+	}
+	if (!ibv_match)
+		rte_errno = ENOENT;
+	return ibv_match;
+}
+
+/* Try to disable ROCE by Netlink\Devlink. */
+static int
+mlx5_vdpa_nl_roce_disable(const char *addr)
+{
+	int nlsk_fd = mlx5_nl_init(NETLINK_GENERIC);
+	int devlink_id;
+	int enable;
+	int ret;
+
+	if (nlsk_fd < 0)
+		return nlsk_fd;
+	devlink_id = mlx5_nl_devlink_family_id_get(nlsk_fd);
+	if (devlink_id < 0) {
+		ret = devlink_id;
+		DRV_LOG(DEBUG, "Failed to get devlink id for ROCE operations by"
+			" Netlink.");
+		goto close;
+	}
+	ret = mlx5_nl_enable_roce_get(nlsk_fd, devlink_id, addr, &enable);
+	if (ret) {
+		DRV_LOG(DEBUG, "Failed to get ROCE enable by Netlink: %d.",
+			ret);
+		goto close;
+	} else if (!enable) {
+		DRV_LOG(INFO, "ROCE has already disabled(Netlink).");
+		goto close;
+	}
+	ret = mlx5_nl_enable_roce_set(nlsk_fd, devlink_id, addr, 0);
+	if (ret)
+		DRV_LOG(DEBUG, "Failed to disable ROCE by Netlink: %d.", ret);
+	else
+		DRV_LOG(INFO, "ROCE is disabled by Netlink successfully.");
+close:
+	close(nlsk_fd);
+	return ret;
+}
+
+/* Try to disable ROCE by sysfs. */
+static int
+mlx5_vdpa_sys_roce_disable(const char *addr)
+{
+	FILE *file_o;
+	int enable;
+	int ret;
+
+	MKSTR(file_p, "/sys/bus/pci/devices/%s/roce_enable", addr);
+	file_o = fopen(file_p, "rb");
+	if (!file_o) {
+		rte_errno = ENOTSUP;
+		return -ENOTSUP;
+	}
+	ret = fscanf(file_o, "%d", &enable);
+	if (ret != 1) {
+		rte_errno = EINVAL;
+		ret = EINVAL;
+		goto close;
+	} else if (!enable) {
+		ret = 0;
+		DRV_LOG(INFO, "ROCE has already disabled(sysfs).");
+		goto close;
+	}
+	fclose(file_o);
+	file_o = fopen(file_p, "wb");
+	if (!file_o) {
+		rte_errno = ENOTSUP;
+		return -ENOTSUP;
+	}
+	fprintf(file_o, "0\n");
+	ret = 0;
+close:
+	if (ret)
+		DRV_LOG(DEBUG, "Failed to disable ROCE by sysfs: %d.", ret);
+	else
+		DRV_LOG(INFO, "ROCE is disabled by sysfs successfully.");
+	fclose(file_o);
+	return ret;
+}
+
+#define MLX5_VDPA_MAX_RETRIES 20
+#define MLX5_VDPA_USEC 1000
+static int
+mlx5_vdpa_roce_disable(struct rte_pci_addr *addr, struct ibv_device **ibv)
+{
+	char addr_name[64] = {0};
+
+	rte_pci_device_name(addr, addr_name, sizeof(addr_name));
+	/* Firstly try to disable ROCE by Netlink and fallback to sysfs. */
+	if (mlx5_vdpa_nl_roce_disable(addr_name) == 0 ||
+	    mlx5_vdpa_sys_roce_disable(addr_name) == 0) {
+		/*
+		 * Succeed to disable ROCE, wait for the IB device to appear
+		 * again after reload.
+		 */
+		int r;
+		struct ibv_device *ibv_new;
+
+		for (r = MLX5_VDPA_MAX_RETRIES; r; r--) {
+			ibv_new = mlx5_vdpa_get_ib_device_match(addr);
+			if (ibv_new) {
+				*ibv = ibv_new;
+				return 0;
+			}
+			usleep(MLX5_VDPA_USEC);
+		}
+		DRV_LOG(ERR, "Cannot much device %s after ROCE disable, "
+			"retries exceed %d", addr_name, MLX5_VDPA_MAX_RETRIES);
+		rte_errno = EAGAIN;
+	}
+	return -rte_errno;
+}
+
 /**
  * DPDK callback to register a PCI device.
  *
@@ -245,8 +387,7 @@
 mlx5_vdpa_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		    struct rte_pci_device *pci_dev)
 {
-	struct ibv_device **ibv_list;
-	struct ibv_device *ibv_match = NULL;
+	struct ibv_device *ibv;
 	struct mlx5_vdpa_priv *priv = NULL;
 	struct ibv_context *ctx;
 	struct mlx5_hca_attr attr;
@@ -257,41 +398,26 @@
 			" devargs.");
 		return 1;
 	}
-	errno = 0;
-	ibv_list = mlx5_glue->get_device_list(&ret);
-	if (!ibv_list) {
-		rte_errno = errno;
-		DRV_LOG(ERR, "Failed to get device list, is ib_uverbs loaded?");
-		return -ENOSYS;
-	}
-	while (ret-- > 0) {
-		struct rte_pci_addr pci_addr;
-
-		DRV_LOG(DEBUG, "Checking device \"%s\"..", ibv_list[ret]->name);
-		if (mlx5_dev_to_pci_addr(ibv_list[ret]->ibdev_path, &pci_addr))
-			continue;
-		if (pci_dev->addr.domain != pci_addr.domain ||
-		    pci_dev->addr.bus != pci_addr.bus ||
-		    pci_dev->addr.devid != pci_addr.devid ||
-		    pci_dev->addr.function != pci_addr.function)
-			continue;
-		DRV_LOG(INFO, "PCI information matches for device \"%s\".",
-			ibv_list[ret]->name);
-		ibv_match = ibv_list[ret];
-		break;
-	}
-	if (!ibv_match) {
+	ibv = mlx5_vdpa_get_ib_device_match(&pci_dev->addr);
+	if (!ibv) {
 		DRV_LOG(ERR, "No matching IB device for PCI slot "
-			"%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 ".",
-			pci_dev->addr.domain, pci_dev->addr.bus,
-			pci_dev->addr.devid, pci_dev->addr.function);
+			PCI_PRI_FMT ".", pci_dev->addr.domain,
+			pci_dev->addr.bus, pci_dev->addr.devid,
+			pci_dev->addr.function);
 		rte_errno = ENOENT;
 		return -rte_errno;
+	} else {
+		DRV_LOG(INFO, "PCI information matches for device \"%s\".",
+			ibv->name);
+	}
+	if (mlx5_vdpa_roce_disable(&pci_dev->addr, &ibv) != 0) {
+		DRV_LOG(WARNING, "Failed to disable ROCE for \"%s\".",
+			ibv->name);
+		//return -rte_errno;
 	}
-	ctx = mlx5_glue->dv_open_device(ibv_match);
+	ctx = mlx5_glue->dv_open_device(ibv);
 	if (!ctx) {
-		DRV_LOG(ERR, "Failed to open IB device \"%s\".",
-			ibv_match->name);
+		DRV_LOG(ERR, "Failed to open IB device \"%s\".", ibv->name);
 		rte_errno = ENODEV;
 		return -rte_errno;
 	}
-- 
1.8.3.1



More information about the dev mailing list