[dpdk-dev] [PATCH 1/5] net/mlx5: add VXLAN encap/decap support for e-switch

Slava Ovsiienko viacheslavo at mellanox.com
Tue Oct 2 08:30:33 CEST 2018


This patchset adds support for RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP and
RTE_FLOW_ACTION_TYPE_VXLAN_DECAP to the mlx5 PMD. This patch is a
refactored version of the proposal
20180831092038.23051-2-adrien.mazarguil at 6wind.com.

A typical use case is port representors in switchdev mode, with VXLAN
traffic encapsulation performed on traffic coming *from* a representor
and decapsulation on traffic going *to* that representor, in order
to transparently assign a given VXLAN to VF traffic.

Since these actions are supported at the switch level, the "transfer"
attribute must be set on such flow rules. They must also be combined
with a port redirection action to make sense.

Since only ingress is supported, encapsulation flow rules are normally
applied on a physical port and emit traffic to a port representor.
The opposite order is used for decapsulation.

Like other mlx5 switch flow rule actions, these are implemented through
Linux's TC flower API. Since the Linux interface for VXLAN encap/decap
involves virtual network devices (i.e. ip link add type vxlan [...]),
the PMD automatically spawns them on an as-needed basis through Netlink
calls.

VXLAN interfaces are dynamically created for each local UDP port of
outer networks and then used as targets for TC "flower" filters
in order to perform encapsulation. These VXLAN interfaces are
system-wide: only one device with a given local UDP port can exist
in the system (an attempt to create another device with the same
local UDP port returns EEXIST), so the PMD should maintain a database
of device instances shared between PMD instances. These implicitly
created VXLAN devices are called VTEPs (Virtual Tunnel End Points).

The first part of the patchset introduces the new data structures and
definitions needed to implement VXLAN support in the mlx5 PMD.

The history of the patch:

v1
Refactored the code of the initial experimental proposal
20180831092038.23051-2-adrien.mazarguil at 6wind.com. An unattached
VTEP is now used to resolve the problem of VTEP UDP port sharing
between several PMD ports.

Suggested-by: Adrien Mazarguil <adrien.mazarguil at 6wind.com>
Signed-off-by: Viacheslav Ovsiienko <viacheslavo at mellanox.com>
---
 app/test-pmd/config.c            |   3 +
 drivers/net/mlx5/Makefile        |  75 ++++++++++++++++++
 drivers/net/mlx5/mlx5_flow.h     |  11 +++
 drivers/net/mlx5/mlx5_flow_tcf.c | 167 +++++++++++++++++++++++++++++++++++++++
 drivers/net/mlx5/mlx5_nl.c       |  12 ++-
 5 files changed, 264 insertions(+), 4 deletions(-)

diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 794aa52..b088c9f 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1172,6 +1172,9 @@ enum item_spec_type {
 		       sizeof(struct rte_flow_action_of_pop_mpls)),
 	MK_FLOW_ACTION(OF_PUSH_MPLS,
 		       sizeof(struct rte_flow_action_of_push_mpls)),
+	MK_FLOW_ACTION(VXLAN_ENCAP,
+		       sizeof(struct rte_flow_action_vxlan_encap)),
+	MK_FLOW_ACTION(VXLAN_DECAP, 0),
 };
 
 /** Compute storage space needed by action configuration and copy it. */
diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index ca1de9f..63c7191 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -347,6 +347,81 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
 		enum TCA_VLAN_PUSH_VLAN_PRIORITY \
 		$(AUTOCONF_OUTPUT)
 	$Q sh -- '$<' '$@' \
+		HAVE_TCA_FLOWER_KEY_ENC_KEY_ID \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_ENC_KEY_ID \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_ENC_IPV4_SRC \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_ENC_IPV4_DST \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST_MASK \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_ENC_IPV4_DST_MASK \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_ENC_IPV6_SRC \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_ENC_IPV6_DST \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST_MASK \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_ENC_IPV6_DST_MASK \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_ENC_UDP_SRC_PORT \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_ENC_UDP_DST_PORT \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TC_ACT_TUNNEL_KEY \
+		linux/tc_act/tc_tunnel_key.h \
+		define TCA_ACT_TUNNEL_KEY \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT \
+		linux/tc_act/tc_tunnel_key.h \
+		enum TCA_TUNNEL_KEY_ENC_DST_PORT \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
 		HAVE_SUPPORTED_40000baseKR4_Full \
 		/usr/include/linux/ethtool.h \
 		define SUPPORTED_40000baseKR4_Full \
diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
index 10d700a..2d56ced 100644
--- a/drivers/net/mlx5/mlx5_flow.h
+++ b/drivers/net/mlx5/mlx5_flow.h
@@ -87,6 +87,8 @@
 #define MLX5_ACTION_OF_PUSH_VLAN (1u << 8)
 #define MLX5_ACTION_OF_SET_VLAN_VID (1u << 9)
 #define MLX5_ACTION_OF_SET_VLAN_PCP (1u << 10)
+#define MLX5_ACTION_VXLAN_ENCAP (1u << 11)
+#define MLX5_ACTION_VXLAN_DECAP (1u << 12)
 
 /* possible L3 layers protocols filtering. */
 #define MLX5_IP_PROTOCOL_TCP 6
@@ -178,8 +180,17 @@ struct mlx5_flow_dv {
 
 /** Linux TC flower driver for E-Switch flow. */
 struct mlx5_flow_tcf {
+	uint32_t nlsize; /**< Size of NL message buffer. */
+	uint32_t applied:1; /**< Whether rule is currently applied. */
+	uint64_t item_flags; /**< Item flags. */
+	uint64_t action_flags; /**< Action flags. */
 	struct nlmsghdr *nlh;
 	struct tcmsg *tcm;
+	union { /**< Tunnel encap/decap descriptor. */
+		struct mlx5_flow_tcf_tunnel_hdr *tunnel;
+		struct mlx5_flow_tcf_vxlan_decap *vxlan_decap;
+		struct mlx5_flow_tcf_vxlan_encap *vxlan_encap;
+	};
 };
 
 /* Verbs specification header. */
diff --git a/drivers/net/mlx5/mlx5_flow_tcf.c b/drivers/net/mlx5/mlx5_flow_tcf.c
index 1437618..5c93412 100644
--- a/drivers/net/mlx5/mlx5_flow_tcf.c
+++ b/drivers/net/mlx5/mlx5_flow_tcf.c
@@ -6,6 +6,29 @@
 #include <assert.h>
 #include <errno.h>
 #include <libmnl/libmnl.h>
+/*
+ * Older versions of linux/if.h do not have the required safeties to coexist
+ * with net/if.h. This causes a compilation failure due to symbol
+ * redefinitions even when including the latter first.
+ *
+ * One workaround is to prevent net/if.h from defining conflicting symbols
+ * by removing __USE_MISC, and maintaining it undefined while including
+ * linux/if.h.
+ *
+ * Alphabetical order cannot be preserved since net/if.h must always be
+ * included before linux/if.h regardless.
+ */
+#ifdef __USE_MISC
+#undef __USE_MISC
+#define RESTORE_USE_MISC
+#endif
+#include <net/if.h>
+#include <linux/if.h>
+#ifdef RESTORE_USE_MISC
+#undef RESTORE_USE_MISC
+#define __USE_MISC 1
+#endif
+#include <linux/if_arp.h>
 #include <linux/if_ether.h>
 #include <linux/netlink.h>
 #include <linux/pkt_cls.h>
@@ -53,6 +76,34 @@ struct tc_vlan {
 
 #endif /* HAVE_TC_ACT_VLAN */
 
+#ifdef HAVE_TC_ACT_TUNNEL_KEY
+
+#include <linux/tc_act/tc_tunnel_key.h>
+
+#ifndef HAVE_TCA_TUNNEL_KEY_ENC_DST_PORT
+#define TCA_TUNNEL_KEY_ENC_DST_PORT 9
+#endif
+
+#else /* HAVE_TC_ACT_TUNNEL_KEY */
+
+#define TCA_ACT_TUNNEL_KEY 17
+#define TCA_TUNNEL_KEY_ACT_SET 1
+#define TCA_TUNNEL_KEY_ACT_RELEASE 2
+#define TCA_TUNNEL_KEY_PARMS 2
+#define TCA_TUNNEL_KEY_ENC_IPV4_SRC 3
+#define TCA_TUNNEL_KEY_ENC_IPV4_DST 4
+#define TCA_TUNNEL_KEY_ENC_IPV6_SRC 5
+#define TCA_TUNNEL_KEY_ENC_IPV6_DST 6
+#define TCA_TUNNEL_KEY_ENC_KEY_ID 7
+#define TCA_TUNNEL_KEY_ENC_DST_PORT 9
+
+struct tc_tunnel_key {
+	tc_gen;
+	int t_action;
+};
+
+#endif /* HAVE_TC_ACT_TUNNEL_KEY */
+
 /* Normally found in linux/netlink.h. */
 #ifndef NETLINK_CAP_ACK
 #define NETLINK_CAP_ACK 10
@@ -148,11 +199,118 @@ struct tc_vlan {
 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE
 #define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
 #endif
+#ifndef HAVE_TCA_FLOWER_KEY_ENC_KEY_ID
+#define TCA_FLOWER_KEY_ENC_KEY_ID 26
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC
+#define TCA_FLOWER_KEY_ENC_IPV4_SRC 27
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK
+#define TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK 28
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST
+#define TCA_FLOWER_KEY_ENC_IPV4_DST 29
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST_MASK
+#define TCA_FLOWER_KEY_ENC_IPV4_DST_MASK 30
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC
+#define TCA_FLOWER_KEY_ENC_IPV6_SRC 31
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK
+#define TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK 32
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST
+#define TCA_FLOWER_KEY_ENC_IPV6_DST 33
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST_MASK
+#define TCA_FLOWER_KEY_ENC_IPV6_DST_MASK 34
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT
+#define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT 43
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK
+#define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK 44
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT
+#define TCA_FLOWER_KEY_ENC_UDP_DST_PORT 45
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK
+#define TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK 46
+#endif
 
 #ifndef IPV6_ADDR_LEN
 #define IPV6_ADDR_LEN 16
 #endif
 
+#define MLX5_VXLAN_DEFAULT_PORT	4789
+#define MLX5_VXLAN_DEVICE_PFX "vmlx_"
+
+/** Tunnel action type, used for @p type in header structure. */
+enum mlx5_flow_tcf_tunact_type {
+	MLX5_FLOW_TCF_TUNACT_VXLAN_ENCAP,
+	MLX5_FLOW_TCF_TUNACT_VXLAN_DECAP,
+};
+
+/** Flags used for @p mask in tunnel action encap descriptors. */
+#define	MLX5_FLOW_TCF_ENCAP_ETH_SRC	(1u << 0)
+#define	MLX5_FLOW_TCF_ENCAP_ETH_DST	(1u << 1)
+#define	MLX5_FLOW_TCF_ENCAP_IPV4_SRC	(1u << 2)
+#define	MLX5_FLOW_TCF_ENCAP_IPV4_DST	(1u << 3)
+#define	MLX5_FLOW_TCF_ENCAP_IPV6_SRC	(1u << 4)
+#define	MLX5_FLOW_TCF_ENCAP_IPV6_DST	(1u << 5)
+#define	MLX5_FLOW_TCF_ENCAP_UDP_SRC	(1u << 6)
+#define	MLX5_FLOW_TCF_ENCAP_UDP_DST	(1u << 7)
+#define	MLX5_FLOW_TCF_ENCAP_VXLAN_VNI	(1u << 8)
+
+/** VXLAN virtual netdev. */
+struct mlx5_flow_tcf_vtep {
+	LIST_ENTRY(mlx5_flow_tcf_vtep) next;
+	uint32_t refcnt;
+	unsigned int ifindex;
+	uint16_t port;
+	uint8_t notcreated;
+};
+
+/** Tunnel descriptor header, common for all tunnel types. */
+struct mlx5_flow_tcf_tunnel_hdr {
+	uint32_t type; /**< Tunnel action type. */
+	unsigned int ifindex_tun; /**< Tunnel endpoint interface. */
+	unsigned int ifindex_org; /**< Original dst/src interface */
+	unsigned int *ifindex_ptr; /**< Interface ptr in message. */
+};
+
+struct mlx5_flow_tcf_vxlan_decap {
+	struct mlx5_flow_tcf_tunnel_hdr hdr;
+	uint16_t udp_port;
+};
+
+struct mlx5_flow_tcf_vxlan_encap {
+	struct mlx5_flow_tcf_tunnel_hdr hdr;
+	uint32_t mask;
+	struct {
+		struct ether_addr dst;
+		struct ether_addr src;
+	} eth;
+	union {
+		struct {
+			rte_be32_t dst;
+			rte_be32_t src;
+		} ipv4;
+		struct {
+			uint8_t dst[16];
+			uint8_t src[16];
+		} ipv6;
+	};
+	struct {
+		rte_be16_t src;
+		rte_be16_t dst;
+	} udp;
+	struct {
+		uint8_t vni[3];
+	} vxlan;
+};
+
 /** Empty masks for known item types. */
 static const union {
 	struct rte_flow_item_port_id port_id;
@@ -162,6 +320,7 @@ struct tc_vlan {
 	struct rte_flow_item_ipv6 ipv6;
 	struct rte_flow_item_tcp tcp;
 	struct rte_flow_item_udp udp;
+	struct rte_flow_item_vxlan vxlan;
 } flow_tcf_mask_empty;
 
 /** Supported masks for known item types. */
@@ -173,6 +332,7 @@ struct tc_vlan {
 	struct rte_flow_item_ipv6 ipv6;
 	struct rte_flow_item_tcp tcp;
 	struct rte_flow_item_udp udp;
+	struct rte_flow_item_vxlan vxlan;
 } flow_tcf_mask_supported = {
 	.port_id = {
 		.id = 0xffffffff,
@@ -209,6 +369,9 @@ struct tc_vlan {
 		.src_port = RTE_BE16(0xffff),
 		.dst_port = RTE_BE16(0xffff),
 	},
+	.vxlan = {
+	       .vni = "\xff\xff\xff",
+	},
 };
 
 #define SZ_NLATTR_HDR MNL_ALIGN(sizeof(struct nlattr))
@@ -216,6 +379,10 @@ struct tc_vlan {
 #define SZ_NLATTR_DATA_OF(len) MNL_ALIGN(SZ_NLATTR_HDR + (len))
 #define SZ_NLATTR_TYPE_OF(typ) SZ_NLATTR_DATA_OF(sizeof(typ))
 #define SZ_NLATTR_STRZ_OF(str) SZ_NLATTR_DATA_OF(strlen(str) + 1)
+#define SZ_NLATTR_TYPE_OF_UINT8 SZ_NLATTR_TYPE_OF(uint8_t)
+#define SZ_NLATTR_TYPE_OF_UINT16 SZ_NLATTR_TYPE_OF(uint16_t)
+#define SZ_NLATTR_TYPE_OF_UINT32 SZ_NLATTR_TYPE_OF(uint32_t)
+#define SZ_NLATTR_TYPE_OF_STRUCT(typ) SZ_NLATTR_TYPE_OF(struct typ)
 
 #define PTOI_TABLE_SZ_MAX(dev) (mlx5_dev_to_port_id((dev)->device, NULL, 0) + 2)
 
diff --git a/drivers/net/mlx5/mlx5_nl.c b/drivers/net/mlx5/mlx5_nl.c
index d61826a..88e8e15 100644
--- a/drivers/net/mlx5/mlx5_nl.c
+++ b/drivers/net/mlx5/mlx5_nl.c
@@ -385,8 +385,10 @@ struct mlx5_nl_ifindex_data {
 	int ret;
 	uint32_t sn = priv->nl_sn++;
 
-	if (priv->nl_socket_route == -1)
-		return 0;
+	if (priv->nl_socket_route < 0) {
+		rte_errno = ENOENT;
+		goto error;
+	}
 	fd = priv->nl_socket_route;
 	ret = mlx5_nl_request(fd, &req.hdr, sn, &req.ifm,
 			      sizeof(struct ifinfomsg));
@@ -449,8 +451,10 @@ struct mlx5_nl_ifindex_data {
 	int ret;
 	uint32_t sn = priv->nl_sn++;
 
-	if (priv->nl_socket_route == -1)
-		return 0;
+	if (priv->nl_socket_route < 0) {
+		rte_errno = ENOENT;
+		goto error;
+	}
 	fd = priv->nl_socket_route;
 	memcpy(RTA_DATA(&req.rta), mac, ETHER_ADDR_LEN);
 	req.hdr.nlmsg_len = NLMSG_ALIGN(req.hdr.nlmsg_len) +
-- 
1.8.3.1



More information about the dev mailing list