[dpdk-dev] [PATCH v4 25/25] common/mlx5: support ROCE disable through Netlink

Matan Azrad matan at mellanox.com
Wed Jan 29 13:38:50 CET 2020


Add new 4 Netlink commands to support enable/disable ROCE:
        1. mlx5_nl_devlink_family_id_get to get the Devlink family ID of
           Netlink general command.
        2. mlx5_nl_enable_roce_get to get the ROCE current status.
        3. mlx5_nl_driver_reload - to reload the device kernel driver.
        4. mlx5_nl_enable_roce_set - to set the ROCE status.

When the user changes the ROCE status, the IB device may disappear and
appear again, so DPDK driver should wait for it and to restart itself.

Signed-off-by: Matan Azrad <matan at mellanox.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo at mellanox.com>
---
 drivers/common/mlx5/Makefile                    |   5 +
 drivers/common/mlx5/meson.build                 |   1 +
 drivers/common/mlx5/mlx5_nl.c                   | 366 +++++++++++++++++++++++-
 drivers/common/mlx5/mlx5_nl.h                   |   6 +
 drivers/common/mlx5/rte_common_mlx5_version.map |   4 +
 5 files changed, 380 insertions(+), 2 deletions(-)

diff --git a/drivers/common/mlx5/Makefile b/drivers/common/mlx5/Makefile
index 6a14b7d..9d4d81f 100644
--- a/drivers/common/mlx5/Makefile
+++ b/drivers/common/mlx5/Makefile
@@ -260,6 +260,11 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
 		enum IFLA_PHYS_PORT_NAME \
 		$(AUTOCONF_OUTPUT)
 	$Q sh -- '$<' '$@' \
+		HAVE_DEVLINK \
+		linux/devlink.h \
+		define DEVLINK_GENL_NAME \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
 		HAVE_SUPPORTED_40000baseKR4_Full \
 		/usr/include/linux/ethtool.h \
 		define SUPPORTED_40000baseKR4_Full \
diff --git a/drivers/common/mlx5/meson.build b/drivers/common/mlx5/meson.build
index 34cb7b9..fdd1e85 100644
--- a/drivers/common/mlx5/meson.build
+++ b/drivers/common/mlx5/meson.build
@@ -168,6 +168,7 @@ if build
 		'RDMA_NLDEV_ATTR_NDEV_INDEX' ],
 		[ 'HAVE_MLX5_DR_FLOW_DUMP', 'infiniband/mlx5dv.h',
 		'mlx5dv_dump_dr_domain'],
+		[ 'HAVE_DEVLINK', 'linux/devlink.h', 'DEVLINK_GENL_NAME' ],
 	]
 	config = configuration_data()
 	foreach arg:has_sym_args
diff --git a/drivers/common/mlx5/mlx5_nl.c b/drivers/common/mlx5/mlx5_nl.c
index b4fc053..0d1efd2 100644
--- a/drivers/common/mlx5/mlx5_nl.c
+++ b/drivers/common/mlx5/mlx5_nl.c
@@ -6,6 +6,7 @@
 #include <errno.h>
 #include <linux/if_link.h>
 #include <linux/rtnetlink.h>
+#include <linux/genetlink.h>
 #include <net/if.h>
 #include <rdma/rdma_netlink.h>
 #include <stdbool.h>
@@ -22,6 +23,10 @@
 
 #include "mlx5_nl.h"
 #include "mlx5_common_utils.h"
+#ifdef HAVE_DEVLINK
+#include <linux/devlink.h>
+#endif
+
 
 /* Size of the buffer to receive kernel messages */
 #define MLX5_NL_BUF_SIZE (32 * 1024)
@@ -90,6 +95,59 @@
 #define IFLA_PHYS_PORT_NAME 38
 #endif
 
+/*
+ * Some Devlink defines may be missed in old kernel versions,
+ * adjust used defines.
+ */
+#ifndef DEVLINK_GENL_NAME
+#define DEVLINK_GENL_NAME "devlink"
+#endif
+#ifndef DEVLINK_GENL_VERSION
+#define DEVLINK_GENL_VERSION 1
+#endif
+#ifndef DEVLINK_ATTR_BUS_NAME
+#define DEVLINK_ATTR_BUS_NAME 1
+#endif
+#ifndef DEVLINK_ATTR_DEV_NAME
+#define DEVLINK_ATTR_DEV_NAME 2
+#endif
+#ifndef DEVLINK_ATTR_PARAM
+#define DEVLINK_ATTR_PARAM 80
+#endif
+#ifndef DEVLINK_ATTR_PARAM_NAME
+#define DEVLINK_ATTR_PARAM_NAME 81
+#endif
+#ifndef DEVLINK_ATTR_PARAM_TYPE
+#define DEVLINK_ATTR_PARAM_TYPE 83
+#endif
+#ifndef DEVLINK_ATTR_PARAM_VALUES_LIST
+#define DEVLINK_ATTR_PARAM_VALUES_LIST 84
+#endif
+#ifndef DEVLINK_ATTR_PARAM_VALUE
+#define DEVLINK_ATTR_PARAM_VALUE 85
+#endif
+#ifndef DEVLINK_ATTR_PARAM_VALUE_DATA
+#define DEVLINK_ATTR_PARAM_VALUE_DATA 86
+#endif
+#ifndef DEVLINK_ATTR_PARAM_VALUE_CMODE
+#define DEVLINK_ATTR_PARAM_VALUE_CMODE 87
+#endif
+#ifndef DEVLINK_PARAM_CMODE_DRIVERINIT
+#define DEVLINK_PARAM_CMODE_DRIVERINIT 1
+#endif
+#ifndef DEVLINK_CMD_RELOAD
+#define DEVLINK_CMD_RELOAD 37
+#endif
+#ifndef DEVLINK_CMD_PARAM_GET
+#define DEVLINK_CMD_PARAM_GET 38
+#endif
+#ifndef DEVLINK_CMD_PARAM_SET
+#define DEVLINK_CMD_PARAM_SET 39
+#endif
+#ifndef NLA_FLAG
+#define NLA_FLAG 6
+#endif
+
 /* Add/remove MAC address through Netlink */
 struct mlx5_nl_mac_addr {
 	struct rte_ether_addr (*mac)[];
@@ -1241,8 +1299,8 @@ struct mlx5_nl_ifindex_data {
 	struct nlattr *nla = nl_msg_tail(nlh);
 
 	nla->nla_type = type;
-	nla->nla_len = NLMSG_ALIGN(sizeof(struct nlattr) + alen);
-	nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + nla->nla_len;
+	nla->nla_len = NLMSG_ALIGN(sizeof(struct nlattr)) + alen;
+	nlh->nlmsg_len += NLMSG_ALIGN(nla->nla_len);
 
 	if (alen)
 		memcpy((uint8_t *)nla + sizeof(struct nlattr), data, alen);
@@ -1335,3 +1393,307 @@ struct mlx5_nl_ifindex_data {
 	}
 	return ret;
 }
+
+/**
+ * Parse Netlink message to retrieve the general family ID.
+ *
+ * @param nh
+ *   Pointer to Netlink Message Header.
+ * @param arg
+ *   PMD data register with this callback.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_nl_family_id_cb(struct nlmsghdr *nh, void *arg)
+{
+
+	struct nlattr *tail = RTE_PTR_ADD(nh, nh->nlmsg_len);
+	struct nlattr *nla = RTE_PTR_ADD(nh, NLMSG_ALIGN(sizeof(*nh)) +
+					NLMSG_ALIGN(sizeof(struct genlmsghdr)));
+
+	for (; nla->nla_len && nla < tail;
+	     nla = RTE_PTR_ADD(nla, NLMSG_ALIGN(nla->nla_len))) {
+		if (nla->nla_type == CTRL_ATTR_FAMILY_ID) {
+			*(uint16_t *)arg = *(uint16_t *)(nla + 1);
+			return 0;
+		}
+	}
+	return -EINVAL;
+}
+
+#define MLX5_NL_MAX_ATTR_SIZE 100
+/**
+ * Get generic netlink family ID.
+ *
+ * @param[in] nlsk_fd
+ *   Netlink socket file descriptor.
+ * @param[in] name
+ *   The family name.
+ *
+ * @return
+ *   ID >= 0 on success and @p enable is updated, a negative errno value
+ *   otherwise and rte_errno is set.
+ */
+static int
+mlx5_nl_generic_family_id_get(int nlsk_fd, const char *name)
+{
+	struct nlmsghdr *nlh;
+	struct genlmsghdr *genl;
+	uint32_t sn = MLX5_NL_SN_GENERATE;
+	int name_size = strlen(name) + 1;
+	int ret;
+	uint16_t id = -1;
+	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
+		    NLMSG_ALIGN(sizeof(struct nlattr)) +
+		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE)];
+
+	memset(buf, 0, sizeof(buf));
+	nlh = (struct nlmsghdr *)buf;
+	nlh->nlmsg_len = sizeof(struct nlmsghdr);
+	nlh->nlmsg_type = GENL_ID_CTRL;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
+	nlh->nlmsg_len += sizeof(struct genlmsghdr);
+	genl->cmd = CTRL_CMD_GETFAMILY;
+	genl->version = 1;
+	nl_attr_put(nlh, CTRL_ATTR_FAMILY_NAME, name, name_size);
+	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
+	if (ret >= 0)
+		ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_family_id_cb, &id);
+	if (ret < 0) {
+		DRV_LOG(DEBUG, "Failed to get Netlink %s family ID: %d.", name,
+			ret);
+		return ret;
+	}
+	DRV_LOG(DEBUG, "Netlink \"%s\" family ID is %u.", name, id);
+	return (int)id;
+}
+
+/**
+ * Get Devlink family ID.
+ *
+ * @param[in] nlsk_fd
+ *   Netlink socket file descriptor.
+ *
+ * @return
+ *   ID >= 0 on success and @p enable is updated, a negative errno value
+ *   otherwise and rte_errno is set.
+ */
+
+int
+mlx5_nl_devlink_family_id_get(int nlsk_fd)
+{
+	return mlx5_nl_generic_family_id_get(nlsk_fd, DEVLINK_GENL_NAME);
+}
+
+/**
+ * Parse Netlink message to retrieve the ROCE enable status.
+ *
+ * @param nh
+ *   Pointer to Netlink Message Header.
+ * @param arg
+ *   PMD data register with this callback.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_nl_roce_cb(struct nlmsghdr *nh, void *arg)
+{
+
+	int ret = -EINVAL;
+	int *enable = arg;
+	struct nlattr *tail = RTE_PTR_ADD(nh, nh->nlmsg_len);
+	struct nlattr *nla = RTE_PTR_ADD(nh, NLMSG_ALIGN(sizeof(*nh)) +
+					NLMSG_ALIGN(sizeof(struct genlmsghdr)));
+
+	while (nla->nla_len && nla < tail) {
+		switch (nla->nla_type) {
+		/* Expected nested attributes case. */
+		case DEVLINK_ATTR_PARAM:
+		case DEVLINK_ATTR_PARAM_VALUES_LIST:
+		case DEVLINK_ATTR_PARAM_VALUE:
+			ret = 0;
+			nla += 1;
+			break;
+		case DEVLINK_ATTR_PARAM_VALUE_DATA:
+			*enable = 1;
+			return 0;
+		default:
+			nla = RTE_PTR_ADD(nla, NLMSG_ALIGN(nla->nla_len));
+		}
+	}
+	*enable = 0;
+	return ret;
+}
+
+/**
+ * Get ROCE enable status through Netlink.
+ *
+ * @param[in] nlsk_fd
+ *   Netlink socket file descriptor.
+ * @param[in] family_id
+ *   the Devlink family ID.
+ * @param pci_addr
+ *   The device PCI address.
+ * @param[out] enable
+ *   Where to store the enable status.
+ *
+ * @return
+ *   0 on success and @p enable is updated, a negative errno value otherwise
+ *   and rte_errno is set.
+ */
+int
+mlx5_nl_enable_roce_get(int nlsk_fd, int family_id, const char *pci_addr,
+			int *enable)
+{
+	struct nlmsghdr *nlh;
+	struct genlmsghdr *genl;
+	uint32_t sn = MLX5_NL_SN_GENERATE;
+	int ret;
+	int cur_en;
+	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
+		    NLMSG_ALIGN(sizeof(struct nlattr)) * 4 +
+		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 4];
+
+	memset(buf, 0, sizeof(buf));
+	nlh = (struct nlmsghdr *)buf;
+	nlh->nlmsg_len = sizeof(struct nlmsghdr);
+	nlh->nlmsg_type = family_id;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
+	nlh->nlmsg_len += sizeof(struct genlmsghdr);
+	genl->cmd = DEVLINK_CMD_PARAM_GET;
+	genl->version = DEVLINK_GENL_VERSION;
+	nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
+	nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
+	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME, "enable_roce", 12);
+	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
+	if (ret >= 0)
+		ret = mlx5_nl_recv(nlsk_fd, sn, mlx5_nl_roce_cb, &cur_en);
+	if (ret < 0) {
+		DRV_LOG(DEBUG, "Failed to get ROCE enable on device %s: %d.",
+			pci_addr, ret);
+		return ret;
+	}
+	*enable = cur_en;
+	DRV_LOG(DEBUG, "ROCE is %sabled for device \"%s\".",
+		cur_en ? "en" : "dis", pci_addr);
+	return ret;
+}
+
+/**
+ * Reload mlx5 device kernel driver through Netlink.
+ *
+ * @param[in] nlsk_fd
+ *   Netlink socket file descriptor.
+ * @param[in] family_id
+ *   the Devlink family ID.
+ * @param pci_addr
+ *   The device PCI address.
+ * @param[out] enable
+ *   The enable status to set.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_nl_driver_reload(int nlsk_fd, int family_id, const char *pci_addr)
+{
+	struct nlmsghdr *nlh;
+	struct genlmsghdr *genl;
+	uint32_t sn = MLX5_NL_SN_GENERATE;
+	int ret;
+	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
+		    NLMSG_ALIGN(sizeof(struct nlattr)) * 2 +
+		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 2];
+
+	memset(buf, 0, sizeof(buf));
+	nlh = (struct nlmsghdr *)buf;
+	nlh->nlmsg_len = sizeof(struct nlmsghdr);
+	nlh->nlmsg_type = family_id;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
+	nlh->nlmsg_len += sizeof(struct genlmsghdr);
+	genl->cmd = DEVLINK_CMD_RELOAD;
+	genl->version = DEVLINK_GENL_VERSION;
+	nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
+	nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
+	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
+	if (ret >= 0)
+		ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
+	if (ret < 0) {
+		DRV_LOG(DEBUG, "Failed to reload %s device by Netlink - %d",
+			pci_addr, ret);
+		return ret;
+	}
+	DRV_LOG(DEBUG, "Device \"%s\" was reloaded by Netlink successfully.",
+		pci_addr);
+	return 0;
+}
+
+/**
+ * Set ROCE enable status through Netlink.
+ *
+ * @param[in] nlsk_fd
+ *   Netlink socket file descriptor.
+ * @param[in] family_id
+ *   the Devlink family ID.
+ * @param pci_addr
+ *   The device PCI address.
+ * @param[out] enable
+ *   The enable status to set.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_nl_enable_roce_set(int nlsk_fd, int family_id, const char *pci_addr,
+			int enable)
+{
+	struct nlmsghdr *nlh;
+	struct genlmsghdr *genl;
+	uint32_t sn = MLX5_NL_SN_GENERATE;
+	int ret;
+	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+		    NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
+		    NLMSG_ALIGN(sizeof(struct nlattr)) * 6 +
+		    NLMSG_ALIGN(MLX5_NL_MAX_ATTR_SIZE) * 6];
+	uint8_t cmode = DEVLINK_PARAM_CMODE_DRIVERINIT;
+	uint8_t ptype = NLA_FLAG;
+;
+
+	memset(buf, 0, sizeof(buf));
+	nlh = (struct nlmsghdr *)buf;
+	nlh->nlmsg_len = sizeof(struct nlmsghdr);
+	nlh->nlmsg_type = family_id;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+	genl = (struct genlmsghdr *)nl_msg_tail(nlh);
+	nlh->nlmsg_len += sizeof(struct genlmsghdr);
+	genl->cmd = DEVLINK_CMD_PARAM_SET;
+	genl->version = DEVLINK_GENL_VERSION;
+	nl_attr_put(nlh, DEVLINK_ATTR_BUS_NAME, "pci", 4);
+	nl_attr_put(nlh, DEVLINK_ATTR_DEV_NAME, pci_addr, strlen(pci_addr) + 1);
+	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_NAME, "enable_roce", 12);
+	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_CMODE, &cmode, sizeof(cmode));
+	nl_attr_put(nlh, DEVLINK_ATTR_PARAM_TYPE, &ptype, sizeof(ptype));
+	if (enable)
+		nl_attr_put(nlh, DEVLINK_ATTR_PARAM_VALUE_DATA, NULL, 0);
+	ret = mlx5_nl_send(nlsk_fd, nlh, sn);
+	if (ret >= 0)
+		ret = mlx5_nl_recv(nlsk_fd, sn, NULL, NULL);
+	if (ret < 0) {
+		DRV_LOG(DEBUG, "Failed to %sable ROCE for device %s by Netlink:"
+			" %d.", enable ? "en" : "dis", pci_addr, ret);
+		return ret;
+	}
+	DRV_LOG(DEBUG, "Device %s ROCE was %sabled by Netlink successfully.",
+		pci_addr, enable ? "en" : "dis");
+	/* Now, need to reload the driver. */
+	return mlx5_nl_driver_reload(nlsk_fd, family_id, pci_addr);
+}
diff --git a/drivers/common/mlx5/mlx5_nl.h b/drivers/common/mlx5/mlx5_nl.h
index 8e66a98..2c3f837 100644
--- a/drivers/common/mlx5/mlx5_nl.h
+++ b/drivers/common/mlx5/mlx5_nl.h
@@ -53,5 +53,11 @@ void mlx5_nl_vlan_vmwa_delete(struct mlx5_nl_vlan_vmwa_context *vmwa,
 			      uint32_t ifindex);
 uint32_t mlx5_nl_vlan_vmwa_create(struct mlx5_nl_vlan_vmwa_context *vmwa,
 				  uint32_t ifindex, uint16_t tag);
+int mlx5_nl_devlink_family_id_get(int nlsk_fd);
+int mlx5_nl_enable_roce_get(int nlsk_fd, int family_id, const char *pci_addr,
+			    int *enable);
+int mlx5_nl_driver_reload(int nlsk_fd, int family_id, const char *pci_addr);
+int mlx5_nl_enable_roce_set(int nlsk_fd, int family_id, const char *pci_addr,
+			    int enable);
 
 #endif /* RTE_PMD_MLX5_NL_H_ */
diff --git a/drivers/common/mlx5/rte_common_mlx5_version.map b/drivers/common/mlx5/rte_common_mlx5_version.map
index f93f5cb..b1a2014 100644
--- a/drivers/common/mlx5/rte_common_mlx5_version.map
+++ b/drivers/common/mlx5/rte_common_mlx5_version.map
@@ -30,6 +30,10 @@ DPDK_20.02 {
 	mlx5_dev_to_pci_addr;
 
 	mlx5_nl_allmulti;
+	mlx5_nl_devlink_family_id_get;
+	mlx5_nl_driver_reload;
+	mlx5_nl_enable_roce_get;
+	mlx5_nl_enable_roce_set;
 	mlx5_nl_ifindex;
 	mlx5_nl_init;
 	mlx5_nl_mac_addr_add;
-- 
1.8.3.1



More information about the dev mailing list