[dpdk-dev] [PATCH 5/5] net/mlx5: e-switch VXLAN tunnel devices management

Slava Ovsiienko viacheslavo at mellanox.com
Tue Oct 2 08:30:42 CEST 2018


VXLAN interfaces are dynamically created for each local UDP port
of outer networks and then used as targets for TC "flower" filters
in order to perform encapsulation. These VXLAN interfaces are
system-wide: only one device with a given UDP port can exist
in the system (an attempt to create another device with the
same local UDP port returns EEXIST), so the PMD should maintain a
shared database of device instances for all PMD instances. These
implicitly created VXLAN devices are called VTEPs (Virtual Tunnel
End Points).

Creation of the VTEP occurs at the moment a rule is applied. The
link is set up and the root ingress qdisc is also initialized. One VTEP
is shared by all encapsulation rules in the DPDK application
instance. For decapsulation, one VTEP is created for every unique
local UDP port to accept tunnel traffic. The name of a created
VTEP consists of the prefix "vmlx_" and the UDP port number in
decimal digits without leading zeros (e.g. vmlx_4789). The VTEP
can be created in the system in advance, before the application
is launched; this allows UDP ports to be shared between primary
and secondary processes.

Suggested-by: Adrien Mazarguil <adrien.mazarguil at 6wind.com>
Signed-off-by: Viacheslav Ovsiienko <viacheslavo at mellanox.com>
---
 drivers/net/mlx5/mlx5_flow_tcf.c | 344 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 343 insertions(+), 1 deletion(-)

diff --git a/drivers/net/mlx5/mlx5_flow_tcf.c b/drivers/net/mlx5/mlx5_flow_tcf.c
index dfffc50..0e62fe9 100644
--- a/drivers/net/mlx5/mlx5_flow_tcf.c
+++ b/drivers/net/mlx5/mlx5_flow_tcf.c
@@ -1482,7 +1482,7 @@ struct flow_tcf_ptoi {
 		default:
 			return rte_flow_error_set(error, ENOTSUP,
 						  RTE_FLOW_ERROR_TYPE_ITEM,
-						  NULL, "item not supported");
+						  items, "item not supported");
 		}
 	}
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
@@ -2886,6 +2886,291 @@ struct flow_tcf_ptoi {
 	return 0;
 }
 
+/* VTEP list shared by PMD port instances, guarded by vtep_list_mutex. */
+static LIST_HEAD(, mlx5_flow_tcf_vtep)
+			vtep_list_vxlan = LIST_HEAD_INITIALIZER();
+static pthread_mutex_t vtep_list_mutex = PTHREAD_MUTEX_INITIALIZER;
+static struct mlx5_flow_tcf_vtep *vtep_encap; /* VTEP used by encap rules. */
+
+/**
+ * Removes a VTEP network device from the system via Netlink.
+ *
+ * @param[in] tcf
+ *   Context object initialized by mlx5_flow_tcf_socket_open().
+ * @param[in] vtep
+ *   VTEP descriptor holding the interface index of the device to delete.
+ */
+static void
+flow_tcf_delete_iface(struct mlx5_tcf_socket *tcf,
+		      struct mlx5_flow_tcf_vtep *vtep)
+{
+	struct ifinfomsg *ifm;
+	alignas(struct nlmsghdr)
+	uint8_t buf[mnl_nlmsg_size(MNL_ALIGN(sizeof(*ifm))) + 8];
+	struct nlmsghdr *nlh;
+
+	DRV_LOG(NOTICE, "VTEP delete (%d)", vtep->port);
+	/* Build an RTM_DELLINK request addressed by interface index. */
+	nlh = mnl_nlmsg_put_header(buf);
+	nlh->nlmsg_flags = NLM_F_REQUEST;
+	nlh->nlmsg_type = RTM_DELLINK;
+	ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
+	ifm->ifi_index = vtep->ifindex;
+	ifm->ifi_family = AF_UNSPEC;
+	/* Deletion is best effort: a failure is only logged. */
+	if (flow_tcf_nl_ack(tcf, nlh))
+		DRV_LOG(DEBUG, "error deleting VXLAN encap/decap ifindex %u",
+			ifm->ifi_index);
+}
+
+/**
+ * Creates VTEP network device, or reuses one that already exists.
+ *
+ * @param[in] tcf
+ *   Context object initialized by mlx5_flow_tcf_socket_open().
+ * @param[in] port
+ *   UDP port of created VTEP device.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   Pointer to created device structure on success, NULL otherwise
+ *   and rte_errno is set.
+ */
+static struct mlx5_flow_tcf_vtep*
+flow_tcf_create_iface(struct mlx5_tcf_socket *tcf, uint16_t port,
+		      struct rte_flow_error *error)
+{
+	struct mlx5_flow_tcf_vtep *vtep;
+	struct nlmsghdr *nlh;
+	struct ifinfomsg *ifm;
+	char name[sizeof(MLX5_VXLAN_DEVICE_PFX) + 24];
+	alignas(struct nlmsghdr)
+	uint8_t buf[mnl_nlmsg_size(sizeof(*ifm)) +
+		       SZ_NLATTR_DATA_OF(sizeof(name)) +
+		       SZ_NLATTR_NEST * 2 +
+		       SZ_NLATTR_STRZ_OF("vxlan") +
+		       SZ_NLATTR_TYPE_OF_UINT32 +
+		       SZ_NLATTR_TYPE_OF_UINT16 +
+		       SZ_NLATTR_TYPE_OF_UINT8 + 128];
+	struct nlattr *na_info;
+	struct nlattr *na_vxlan;
+	rte_be16_t vxlan_port = RTE_BE16(port);
+	int ret;
+
+	vtep = rte_zmalloc(__func__, sizeof(*vtep),
+			alignof(struct mlx5_flow_tcf_vtep));
+	if (!vtep) {
+		rte_flow_error_set
+			(error, ENOMEM, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+			 NULL, "unable to allocate memory for VTEP desc");
+		return NULL;
+	}
+	*vtep = (struct mlx5_flow_tcf_vtep){
+			.refcnt = 0,
+			.port = port,
+			.notcreated = 0,
+	};
+	memset(buf, 0, sizeof(buf));
+	nlh = mnl_nlmsg_put_header(buf);
+	nlh->nlmsg_type = RTM_NEWLINK;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE  | NLM_F_EXCL;
+	ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
+	ifm->ifi_family = AF_UNSPEC;
+	ifm->ifi_type = 0;
+	ifm->ifi_index = 0;
+	ifm->ifi_flags = IFF_UP;
+	ifm->ifi_change = 0xffffffff;
+	snprintf(name, sizeof(name), "%s%u", MLX5_VXLAN_DEVICE_PFX, port);
+	mnl_attr_put_strz(nlh, IFLA_IFNAME, name);
+	na_info = mnl_attr_nest_start(nlh, IFLA_LINKINFO);
+	assert(na_info);
+	mnl_attr_put_strz(nlh, IFLA_INFO_KIND, "vxlan");
+	na_vxlan = mnl_attr_nest_start(nlh, IFLA_INFO_DATA);
+	assert(na_vxlan);
+	mnl_attr_put_u8(nlh, IFLA_VXLAN_COLLECT_METADATA, 1);
+	mnl_attr_put_u8(nlh, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 1);
+	mnl_attr_put_u8(nlh, IFLA_VXLAN_LEARNING, 0);
+	mnl_attr_put_u16(nlh, IFLA_VXLAN_PORT, vxlan_port);
+	mnl_attr_nest_end(nlh, na_vxlan);
+	mnl_attr_nest_end(nlh, na_info);
+	assert(sizeof(buf) >= nlh->nlmsg_len);
+	ret = flow_tcf_nl_ack(tcf, nlh);
+	if (ret) {
+		DRV_LOG(WARNING, "VTEP %s create failure (%d)",
+			name, rte_errno);
+		vtep->notcreated = 1; /* Assume the device exists. */
+	}
+	ret = if_nametoindex(name);
+	if (ret) {
+		vtep->ifindex = ret;
+		memset(buf, 0, sizeof(buf));
+		nlh = mnl_nlmsg_put_header(buf);
+		nlh->nlmsg_type = RTM_NEWLINK;
+		nlh->nlmsg_flags = NLM_F_REQUEST;
+		ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
+		ifm->ifi_family = AF_UNSPEC;
+		ifm->ifi_type = 0;
+		ifm->ifi_index = vtep->ifindex;
+		ifm->ifi_flags = IFF_UP;
+		ifm->ifi_change = IFF_UP;
+		ret = flow_tcf_nl_ack(tcf, nlh);
+		if (ret) {
+			DRV_LOG(WARNING,
+			"VTEP %s set link up failure (%d)", name, rte_errno);
+			/* The common cleanup below releases the VTEP. */
+			rte_flow_error_set
+				(error, rte_errno,
+				 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+				 "netlink: failed to set VTEP link up");
+		} else {
+			ret = mlx5_flow_tcf_ifindex_init(tcf,
+							 vtep->ifindex, error);
+			if (ret)
+				DRV_LOG(WARNING,
+				"VTEP %s init failure (%d)", name, rte_errno);
+		}
+	} else {
+		ret = errno; /* Save errno before it can be overwritten. */
+		DRV_LOG(WARNING,
+			"VTEP %s failed to get index (%d)", name, ret);
+		rte_flow_error_set
+			(error, ret,
+			 RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+			 vtep->notcreated ? "netlink: failed to create VTEP" :
+			 "netlink: failed to retrieve VTEP ifindex");
+		ret = 1;
+	}
+	if (ret) {
+		/* Remove only a device that was created by this call. */
+		if (!vtep->notcreated && vtep->ifindex)
+			flow_tcf_delete_iface(tcf, vtep);
+		rte_free(vtep);
+		vtep = NULL;
+	}
+	DRV_LOG(NOTICE, "VTEP create (%d, %s)", port, vtep ? "OK" : "error");
+	return vtep;
+}
+
+/**
+ * Obtains the interface index of the VTEP serving a tunnel flow.
+ *
+ * The VTEP is looked up among the shared devices and created on demand;
+ * its reference counter is incremented on success. The shared list is
+ * accessed with vtep_list_mutex held.
+ *
+ * @param[in] tcf
+ *   Context object initialized by mlx5_flow_tcf_socket_open().
+ * @param[in] dev_flow
+ *   Flow tcf object with tunnel structure pointer set.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   Interface index on success, zero otherwise and rte_errno is set.
+ */
+static unsigned int
+flow_tcf_tunnel_vtep_create(struct mlx5_tcf_socket *tcf,
+			    struct mlx5_flow *dev_flow,
+			    struct rte_flow_error *error)
+{
+	struct mlx5_flow_tcf_vtep *vtep = NULL;
+	unsigned int ifindex = 0;
+
+	assert(dev_flow->tcf.tunnel);
+	pthread_mutex_lock(&vtep_list_mutex);
+	switch (dev_flow->tcf.tunnel->type) {
+	case MLX5_FLOW_TCF_TUNACT_VXLAN_ENCAP:
+		/* Every encapsulation rule shares the single encap VTEP. */
+		if (!vtep_encap) {
+			vtep_encap = flow_tcf_create_iface(tcf,
+				MLX5_VXLAN_DEFAULT_PORT, error);
+			if (vtep_encap)
+				LIST_INSERT_HEAD(&vtep_list_vxlan,
+						 vtep_encap, next);
+		}
+		vtep = vtep_encap;
+		break;
+	case MLX5_FLOW_TCF_TUNACT_VXLAN_DECAP: {
+		uint16_t port = dev_flow->tcf.vxlan_decap->udp_port;
+
+		/* Decapsulation uses one VTEP per local UDP port. */
+		LIST_FOREACH(vtep, &vtep_list_vxlan, next)
+			if (vtep->port == port)
+				break;
+		if (!vtep) {
+			vtep = flow_tcf_create_iface(tcf, port, error);
+			if (vtep)
+				LIST_INSERT_HEAD(&vtep_list_vxlan, vtep, next);
+		}
+		break;
+	}
+	default:
+		rte_flow_error_set(error, ENOTSUP,
+				RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+				"unsupported tunnel type");
+		break;
+	}
+	if (vtep) {
+		/* Account the new user of the device. */
+		vtep->refcnt++;
+		ifindex = vtep->ifindex;
+		assert(ifindex);
+	}
+	pthread_mutex_unlock(&vtep_list_mutex);
+	return ifindex;
+}
+
+/**
+ * Releases one reference to a tunneling VTEP and deletes it when unused.
+ *
+ * @param[in] tcf
+ *   Context object initialized by mlx5_flow_tcf_socket_open().
+ * @param[in] dev_flow
+ *   Flow tcf object with tunnel structure pointer set.
+ */
+static void
+flow_tcf_tunnel_vtep_delete(struct mlx5_tcf_socket *tcf,
+			    struct mlx5_flow *dev_flow)
+{
+	uint16_t port = MLX5_VXLAN_DEFAULT_PORT;
+	struct mlx5_flow_tcf_vtep *vtep;
+
+	assert(dev_flow->tcf.tunnel);
+	pthread_mutex_lock(&vtep_list_mutex);
+	switch (dev_flow->tcf.tunnel->type) {
+	case MLX5_FLOW_TCF_TUNACT_VXLAN_ENCAP:
+		break;
+	case MLX5_FLOW_TCF_TUNACT_VXLAN_DECAP:
+		port = dev_flow->tcf.vxlan_decap->udp_port;
+		break;
+	default:
+		assert(false);
+		DRV_LOG(WARNING, "Unsupported tunnel type");
+		goto unlock;
+	}
+	LIST_FOREACH(vtep, &vtep_list_vxlan, next)
+		if (vtep->port == port)
+			break;
+	if (!vtep) {
+		DRV_LOG(WARNING, "No VTEP device found in the list");
+		goto unlock;
+	}
+	assert(dev_flow->tcf.tunnel->ifindex_tun == vtep->ifindex);
+	assert(vtep->refcnt);
+	/* Keep the device while other rules still reference it. */
+	if (vtep->refcnt && --vtep->refcnt)
+		goto unlock;
+	if (!vtep->notcreated)
+		flow_tcf_delete_iface(tcf, vtep);
+	LIST_REMOVE(vtep, next);
+	if (vtep_encap == vtep)
+		vtep_encap = NULL;
+	rte_free(vtep);
+unlock:
+	pthread_mutex_unlock(&vtep_list_mutex);
+}
+
 /**
  * Apply flow to E-Switch by sending Netlink message.
  *
@@ -2917,12 +3202,45 @@ struct flow_tcf_ptoi {
 	nlh = dev_flow->tcf.nlh;
 	nlh->nlmsg_type = RTM_NEWTFILTER;
 	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
+	if (dev_flow->tcf.tunnel) {
+		/*
+		 * Replace the interface index, target for
+		 * encapsulation, source for decapsulation
+		 */
+		assert(!dev_flow->tcf.tunnel->ifindex_tun);
+		assert(dev_flow->tcf.tunnel->ifindex_ptr);
+		/* Create actual VTEP device when rule is being applied. */
+		dev_flow->tcf.tunnel->ifindex_tun
+			= flow_tcf_tunnel_vtep_create(&priv->tcf_socket,
+						      dev_flow, error);
+			DRV_LOG(INFO, "Replace ifindex: %d->%d",
+				dev_flow->tcf.tunnel->ifindex_tun,
+				*dev_flow->tcf.tunnel->ifindex_ptr);
+		if (!dev_flow->tcf.tunnel->ifindex_tun)
+			return -rte_errno;
+		dev_flow->tcf.tunnel->ifindex_org
+			= *dev_flow->tcf.tunnel->ifindex_ptr;
+		*dev_flow->tcf.tunnel->ifindex_ptr
+			= dev_flow->tcf.tunnel->ifindex_tun;
+	}
 	ret = flow_tcf_nl_ack(tcf, nlh);
+	if (dev_flow->tcf.tunnel) {
+		DRV_LOG(INFO, "Restore ifindex: %d->%d",
+				dev_flow->tcf.tunnel->ifindex_org,
+				*dev_flow->tcf.tunnel->ifindex_ptr);
+		*dev_flow->tcf.tunnel->ifindex_ptr
+			= dev_flow->tcf.tunnel->ifindex_org;
+		dev_flow->tcf.tunnel->ifindex_org = 0;
+	}
 	if (!ret) {
 		dev_flow->tcf.applied = 1;
 		return 0;
 	}
 	DRV_LOG(WARNING, "Failed to create TC rule (%d)", rte_errno);
+	if (dev_flow->tcf.tunnel->ifindex_tun) {
+		flow_tcf_tunnel_vtep_delete(&priv->tcf_socket, dev_flow);
+		dev_flow->tcf.tunnel->ifindex_tun = 0;
+	}
 	return rte_flow_error_set(error, rte_errno,
 				  RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
 				  "netlink: failed to create TC flow rule");
@@ -2951,10 +3269,34 @@ struct flow_tcf_ptoi {
 		return;
 	/* E-Switch flow can't be expanded. */
 	assert(!LIST_NEXT(dev_flow, next));
+	if (!dev_flow->tcf.applied)
+		return;
+	if (dev_flow->tcf.tunnel) {
+		/*
+		 * Replace the interface index, target for
+		 * encapsulation, source for decapsulation
+		 */
+		assert(dev_flow->tcf.tunnel->ifindex_tun);
+		assert(dev_flow->tcf.tunnel->ifindex_ptr);
+		dev_flow->tcf.tunnel->ifindex_org
+			= *dev_flow->tcf.tunnel->ifindex_ptr;
+		*dev_flow->tcf.tunnel->ifindex_ptr
+			= dev_flow->tcf.tunnel->ifindex_tun;
+	}
 	nlh = dev_flow->tcf.nlh;
 	nlh->nlmsg_type = RTM_DELTFILTER;
 	nlh->nlmsg_flags = NLM_F_REQUEST;
 	flow_tcf_nl_ack(tcf, nlh);
+	if (dev_flow->tcf.tunnel) {
+		*dev_flow->tcf.tunnel->ifindex_ptr
+			= dev_flow->tcf.tunnel->ifindex_org;
+		dev_flow->tcf.tunnel->ifindex_org = 0;
+		if (dev_flow->tcf.tunnel->ifindex_tun) {
+			flow_tcf_tunnel_vtep_delete(&priv->tcf_socket,
+						    dev_flow);
+			dev_flow->tcf.tunnel->ifindex_tun = 0;
+		}
+	}
 	dev_flow->tcf.applied = 0;
 }
 
-- 
1.8.3.1



More information about the dev mailing list