[dpdk-dev] [PATCH 8/8] net/mlx5: add VXLAN decap support to switch flow rules

Adrien Mazarguil adrien.mazarguil at 6wind.com
Fri Aug 31 11:57:42 CEST 2018


This provides support for the VXLAN_DECAP action. Outer tunnel properties
are specified as the initial part of the flow rule pattern (up to and
including the VXLAN item), optionally followed by inner traffic properties.

Testpmd examples:

- Creating a flow rule on port ID 1 that performs VXLAN decapsulation and
  directs the result to port ID 2 without checking inner properties:

  flow create 1 ingress transfer pattern eth src is 66:77:88:99:aa:bb
     dst is 00:11:22:33:44:55 / ipv4 src is 2.2.2.2 dst is 1.1.1.1 /
     udp src is 4242 dst is 4789 / vxlan vni is 0x112233 / end
     actions vxlan_decap / port_id id 2 / end

- Same as above, except only inner TCPv6 packets with destination port 42
  are let through:

  flow create 1 ingress transfer pattern eth src is 66:77:88:99:aa:bb
     dst is 00:11:22:33:44:55 / ipv4 src is 2.2.2.2 dst is 1.1.1.1 /
     udp src is 4242 dst is 4789 / vxlan vni is 0x112233 /
     eth / ipv6 / tcp dst is 42 / end
     actions vxlan_decap / port_id id 2 / end

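For reference, the first rule above can be expressed through the rte_flow
C API roughly as follows (a minimal sketch: the spec/mask structures are
assumed to be filled with the addresses, ports and VNI from the testpmd
example, and error handling is omitted):

  #include <rte_flow.h>

  /* Assumed filled with the values shown in the first example above. */
  struct rte_flow_item_eth eth_spec, eth_mask;
  struct rte_flow_item_ipv4 ipv4_spec, ipv4_mask;
  struct rte_flow_item_udp udp_spec, udp_mask;
  struct rte_flow_item_vxlan vxlan_spec, vxlan_mask;

  struct rte_flow_attr attr = { .ingress = 1, .transfer = 1 };
  struct rte_flow_item pattern[] = {
      { .type = RTE_FLOW_ITEM_TYPE_ETH,
        .spec = &eth_spec, .mask = &eth_mask },
      { .type = RTE_FLOW_ITEM_TYPE_IPV4,
        .spec = &ipv4_spec, .mask = &ipv4_mask },
      { .type = RTE_FLOW_ITEM_TYPE_UDP,
        .spec = &udp_spec, .mask = &udp_mask },
      { .type = RTE_FLOW_ITEM_TYPE_VXLAN,
        .spec = &vxlan_spec, .mask = &vxlan_mask },
      { .type = RTE_FLOW_ITEM_TYPE_END },
  };
  struct rte_flow_action_port_id port_id_conf = { .id = 2 };
  struct rte_flow_action actions[] = {
      { .type = RTE_FLOW_ACTION_TYPE_VXLAN_DECAP },
      { .type = RTE_FLOW_ACTION_TYPE_PORT_ID, .conf = &port_id_conf },
      { .type = RTE_FLOW_ACTION_TYPE_END },
  };
  struct rte_flow_error flow_error;
  struct rte_flow *flow = rte_flow_create(1, &attr, pattern, actions,
                                          &flow_error);
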
Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
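
Note to reviewers: vxlan_vni_as_be32() used below comes from the previous
patch in this series. It is assumed to pack the 24-bit VNI into the
low-order bits of a big-endian 32-bit key, as expected by
TCA_FLOWER_KEY_ENC_KEY_ID, along these lines:

 static rte_be32_t
 vxlan_vni_as_be32(const uint8_t vni[3])
 {
 	/* Leading byte is zero, the VNI fills the remaining 24 bits. */
 	return (union { uint8_t u8[4]; rte_be32_t u32; })
 		{ { 0, vni[0], vni[1], vni[2] } }.u32;
 }
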
 drivers/net/mlx5/Makefile       |  65 +++++++
 drivers/net/mlx5/mlx5_nl_flow.c | 344 ++++++++++++++++++++++++++++++++---
 2 files changed, 379 insertions(+), 30 deletions(-)

diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 1ba4ce612..85672abd6 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -335,6 +335,71 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
 		enum TCA_FLOWER_KEY_VLAN_ETH_TYPE \
 		$(AUTOCONF_OUTPUT)
 	$Q sh -- '$<' '$@' \
+		HAVE_TCA_FLOWER_KEY_ENC_KEY_ID \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_ENC_KEY_ID \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_ENC_IPV4_SRC \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_ENC_IPV4_DST \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST_MASK \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_ENC_IPV4_DST_MASK \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_ENC_IPV6_SRC \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_ENC_IPV6_DST \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST_MASK \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_ENC_IPV6_DST_MASK \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_ENC_UDP_SRC_PORT \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_ENC_UDP_DST_PORT \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
 		HAVE_TC_ACT_VLAN \
 		linux/tc_act/tc_vlan.h \
 		enum TCA_VLAN_PUSH_VLAN_PRIORITY \
diff --git a/drivers/net/mlx5/mlx5_nl_flow.c b/drivers/net/mlx5/mlx5_nl_flow.c
index 672f92863..12802796a 100644
--- a/drivers/net/mlx5/mlx5_nl_flow.c
+++ b/drivers/net/mlx5/mlx5_nl_flow.c
@@ -201,6 +201,45 @@ struct tc_tunnel_key {
 #ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE
 #define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
 #endif
+#ifndef HAVE_TCA_FLOWER_KEY_ENC_KEY_ID
+#define TCA_FLOWER_KEY_ENC_KEY_ID 26
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC
+#define TCA_FLOWER_KEY_ENC_IPV4_SRC 27
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK
+#define TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK 28
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST
+#define TCA_FLOWER_KEY_ENC_IPV4_DST 29
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV4_DST_MASK
+#define TCA_FLOWER_KEY_ENC_IPV4_DST_MASK 30
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC
+#define TCA_FLOWER_KEY_ENC_IPV6_SRC 31
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK
+#define TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK 32
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST
+#define TCA_FLOWER_KEY_ENC_IPV6_DST 33
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ENC_IPV6_DST_MASK
+#define TCA_FLOWER_KEY_ENC_IPV6_DST_MASK 34
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT
+#define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT 43
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK
+#define TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK 44
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT
+#define TCA_FLOWER_KEY_ENC_UDP_DST_PORT 45
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK
+#define TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK 46
+#endif
 
 #define BIT(b) (1 << (b))
 #define BIT_ENCAP(e) BIT(MLX5_NL_FLOW_ENCAP_ ## e)
@@ -278,6 +317,7 @@ struct mlx5_nl_flow_ctx {
 struct mlx5_nl_flow {
 	uint32_t size; /**< Size of this object. */
 	uint32_t applied:1; /**< Whether rule is currently applied. */
+	uint32_t decap:1; /**< Decapsulate @p encap. */
 	unsigned int encap_ifindex; /**< Interface to use with @p encap. */
 	unsigned int *ifindex_src; /**< Source interface. */
 	unsigned int *ifindex_dst; /**< Destination interface. */
@@ -301,6 +341,11 @@ enum mlx5_nl_flow_trans {
 	ITEM_TCP,
 	ITEM_UDP,
 	ITEM_VXLAN,
+	ITEM_VXLAN_END,
+	ITEM_TUN_ETH,
+	ITEM_TUN_IPV4,
+	ITEM_TUN_IPV6,
+	ITEM_TUN_UDP,
 	ACTIONS,
 	ACTION_VOID,
 	ACTION_PORT_ID,
@@ -339,7 +384,12 @@ static const enum mlx5_nl_flow_trans *const mlx5_nl_flow_trans[] = {
 	[ITEM_IPV6] = TRANS(ITEM_TCP, ITEM_UDP, PATTERN_COMMON),
 	[ITEM_TCP] = TRANS(PATTERN_COMMON),
 	[ITEM_UDP] = TRANS(ITEM_VXLAN, PATTERN_COMMON),
-	[ITEM_VXLAN] = TRANS(PATTERN_COMMON),
+	[ITEM_VXLAN] = TRANS(ITEM_TUN_ETH, PATTERN_COMMON),
+	[ITEM_VXLAN_END] = TRANS(ITEM_ETH, PATTERN_COMMON),
+	[ITEM_TUN_ETH] = TRANS(ITEM_TUN_IPV4, ITEM_TUN_IPV6, PATTERN_COMMON),
+	[ITEM_TUN_IPV4] = TRANS(ITEM_TUN_UDP, PATTERN_COMMON),
+	[ITEM_TUN_IPV6] = TRANS(ITEM_TUN_UDP, PATTERN_COMMON),
+	[ITEM_TUN_UDP] = TRANS(ITEM_VXLAN_END, PATTERN_COMMON),
 	[ACTIONS] = TRANS(ACTIONS_FATE, ACTIONS_COMMON),
 	[ACTION_VOID] = TRANS(BACK),
 	[ACTION_PORT_ID] = TRANS(ACTION_VOID, END),
@@ -805,6 +855,7 @@ mlx5_nl_flow_transpose(struct mlx5_nl_flow *nl_flow,
 	bool vlan_present;
 	bool vlan_eth_type_set;
 	bool ip_proto_set;
+	bool vxlan_decap;
 	struct mlx5_nl_flow_encap encap;
 	struct nlattr *na_flower;
 	struct nlattr *na_flower_act;
@@ -819,6 +870,7 @@ mlx5_nl_flow_transpose(struct mlx5_nl_flow *nl_flow,
 		goto error_nobufs;
 	nl_flow->size = offsetof(struct mlx5_nl_flow, msg);
 	nl_flow->applied = 0;
+	nl_flow->decap = 0;
 	nl_flow->encap_ifindex = 0;
 	nl_flow->ifindex_src = NULL;
 	nl_flow->ifindex_dst = NULL;
@@ -833,6 +885,7 @@ mlx5_nl_flow_transpose(struct mlx5_nl_flow *nl_flow,
 	vlan_present = false;
 	vlan_eth_type_set = false;
 	ip_proto_set = false;
+	vxlan_decap = false;
 	memset(&encap, 0, sizeof(encap));
 	na_flower = NULL;
 	na_flower_act = NULL;
@@ -850,6 +903,7 @@ mlx5_nl_flow_transpose(struct mlx5_nl_flow *nl_flow,
 			const struct rte_flow_item_ipv6 *ipv6;
 			const struct rte_flow_item_tcp *tcp;
 			const struct rte_flow_item_udp *udp;
+			const struct rte_flow_item_vxlan *vxlan;
 		} spec, mask;
 		union {
 			const struct rte_flow_action_port_id *port_id;
@@ -943,9 +997,6 @@ mlx5_nl_flow_transpose(struct mlx5_nl_flow *nl_flow,
 		na_flower = mnl_attr_nest_start_check(buf, size, TCA_OPTIONS);
 		if (!na_flower)
 			goto error_nobufs;
-		if (!mnl_attr_put_u32_check(buf, size, TCA_FLOWER_FLAGS,
-					    TCA_CLS_FLAGS_SKIP_SW))
-			goto error_nobufs;
 		break;
 	case ITEM_VOID:
 		if (item->type != RTE_FLOW_ITEM_TYPE_VOID)
@@ -1286,16 +1337,215 @@ mlx5_nl_flow_transpose(struct mlx5_nl_flow *nl_flow,
 		++item;
 		break;
 	case ITEM_VXLAN:
+	case ITEM_VXLAN_END:
 		if (item->type != RTE_FLOW_ITEM_TYPE_VXLAN)
 			goto trans;
-		return rte_flow_error_set
-			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM, item,
-			 "VXLAN header matching is not supported yet");
+		if (vxlan_decap) {
+			/* Done with outer, continue with inner. */
+			++item;
+			break;
+		}
+		if (encap.mask)
+			return rte_flow_error_set
+				(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+				 item, "no support for stacked encapsulation");
+		mask.vxlan = mlx5_nl_flow_item_mask
+			(item, &rte_flow_item_vxlan_mask,
+			 &mlx5_nl_flow_encap_mask_supported.vxlan,
+			 &mlx5_nl_flow_mask_empty.vxlan,
+			 sizeof(rte_flow_item_vxlan_mask), error);
+		if (!mask.vxlan)
+			return -rte_errno;
+		spec.vxlan = item->spec;
+		/*
+		 * No TCA_FLOWER_* to match VXLAN traffic. This can only be
+		 * done indirectly through ACTION_VXLAN_DECAP.
+		 *
+		 * Since tunnel encapsulation information must be collected
+		 * from the previous pattern items, the message built so far
+		 * must be discarded, inner traffic will be matched by
+		 * subsequent pattern items.
+		 *
+		 * Reset inner context and process pattern again through a
+		 * different path.
+		 */
+		eth_type_set = false;
+		vlan_present = false;
+		vlan_eth_type_set = false;
+		ip_proto_set = false;
+		nlh = buf;
+		mnl_attr_nest_cancel(nlh, na_flower);
+		na_flower = mnl_attr_nest_start_check(buf, size, TCA_OPTIONS);
+		if (!na_flower)
+			goto error_nobufs;
+		if (memcmp(mask.vxlan->vni, VXLAN_VNI_MASK, 3))
+			return rte_flow_error_set
+				(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM_MASK,
+				 mask.vxlan,
+				 "VXLAN VNI is either incomplete or missing");
+		if (!mnl_attr_put_u32_check(buf, size,
+					    TCA_FLOWER_KEY_ENC_KEY_ID,
+					    vxlan_vni_as_be32(spec.vxlan->vni)))
+			goto error_nobufs;
+		encap.vxlan.vni = vxlan_vni_as_be32(spec.vxlan->vni);
+		encap.mask |= BIT_ENCAP(VXLAN_VNI);
+		vxlan_decap = true;
+		item = pattern;
+		break;
+	case ITEM_TUN_ETH:
+		if (item->type != RTE_FLOW_ITEM_TYPE_ETH)
+			goto trans;
+		mask.eth = mlx5_nl_flow_item_mask
+			(item, &rte_flow_item_eth_mask,
+			 &mlx5_nl_flow_encap_mask_supported.eth,
+			 &mlx5_nl_flow_mask_empty.eth,
+			 sizeof(rte_flow_item_eth_mask), error);
+		if (!mask.eth)
+			return -rte_errno;
+		spec.eth = item->spec;
+		if ((!is_zero_ether_addr(&mask.eth->dst) ||
+		     !is_zero_ether_addr(&mask.eth->src)) &&
+		    nl_flow != (void *)buf_tmp)
+			DRV_LOG(WARNING,
+				"Ethernet source/destination addresses cannot"
+				" be matched along with VXLAN traffic;"
+				" parameters ignored");
+		/* Source and destination are swapped for decap. */
+		if (is_broadcast_ether_addr(&mask.eth->dst)) {
+			encap.eth.src = spec.eth->dst;
+			encap.mask |= BIT_ENCAP(ETH_SRC);
+		}
+		if (is_broadcast_ether_addr(&mask.eth->src)) {
+			encap.eth.dst = spec.eth->src;
+			encap.mask |= BIT_ENCAP(ETH_DST);
+		}
+		++item;
+		break;
+	case ITEM_TUN_IPV4:
+		if (item->type != RTE_FLOW_ITEM_TYPE_IPV4)
+			goto trans;
+		mask.ipv4 = mlx5_nl_flow_item_mask
+			(item, &rte_flow_item_ipv4_mask,
+			 &mlx5_nl_flow_encap_mask_supported.ipv4,
+			 &mlx5_nl_flow_mask_empty.ipv4,
+			 sizeof(rte_flow_item_ipv4_mask), error);
+		if (!mask.ipv4)
+			return -rte_errno;
+		spec.ipv4 = item->spec;
+		if ((mask.ipv4->hdr.src_addr &&
+		     (!mnl_attr_put_u32_check(buf, size,
+					      TCA_FLOWER_KEY_ENC_IPV4_SRC,
+					      spec.ipv4->hdr.src_addr) ||
+		      !mnl_attr_put_u32_check(buf, size,
+					      TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
+					      mask.ipv4->hdr.src_addr))) ||
+		    (mask.ipv4->hdr.dst_addr &&
+		     (!mnl_attr_put_u32_check(buf, size,
+					      TCA_FLOWER_KEY_ENC_IPV4_DST,
+					      spec.ipv4->hdr.dst_addr) ||
+		      !mnl_attr_put_u32_check(buf, size,
+					      TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,
+					      mask.ipv4->hdr.dst_addr))))
+			goto error_nobufs;
+		/* Source and destination are swapped for decap. */
+		if (mask.ipv4->hdr.src_addr == IN_ADDR_MASK) {
+			encap.ip.dst.v4.s_addr = spec.ipv4->hdr.src_addr;
+			encap.mask |= BIT_ENCAP(IPV4_DST);
+		}
+		if (mask.ipv4->hdr.dst_addr == IN_ADDR_MASK) {
+			encap.ip.src.v4.s_addr = spec.ipv4->hdr.dst_addr;
+			encap.mask |= BIT_ENCAP(IPV4_SRC);
+		}
+		++item;
+		break;
+	case ITEM_TUN_IPV6:
+		if (item->type != RTE_FLOW_ITEM_TYPE_IPV6)
+			goto trans;
+		mask.ipv6 = mlx5_nl_flow_item_mask
+			(item, &rte_flow_item_ipv6_mask,
+			 &mlx5_nl_flow_encap_mask_supported.ipv6,
+			 &mlx5_nl_flow_mask_empty.ipv6,
+			 sizeof(rte_flow_item_ipv6_mask), error);
+		if (!mask.ipv6)
+			return -rte_errno;
+		spec.ipv6 = item->spec;
+		if ((!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.src_addr) &&
+		     (!mnl_attr_put_check(buf, size,
+					  TCA_FLOWER_KEY_ENC_IPV6_SRC,
+					  sizeof(spec.ipv6->hdr.src_addr),
+					  spec.ipv6->hdr.src_addr) ||
+		      !mnl_attr_put_check(buf, size,
+					  TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
+					  sizeof(mask.ipv6->hdr.src_addr),
+					  mask.ipv6->hdr.src_addr))) ||
+		    (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.dst_addr) &&
+		     (!mnl_attr_put_check(buf, size,
+					  TCA_FLOWER_KEY_ENC_IPV6_DST,
+					  sizeof(spec.ipv6->hdr.dst_addr),
+					  spec.ipv6->hdr.dst_addr) ||
+		      !mnl_attr_put_check(buf, size,
+					  TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,
+					  sizeof(mask.ipv6->hdr.dst_addr),
+					  mask.ipv6->hdr.dst_addr))))
+			goto error_nobufs;
+		/* Source and destination are swapped for decap. */
+		if (!memcmp(mask.ipv6->hdr.src_addr, IN6_ADDR_MASK, 16)) {
+			encap.ip.dst.v6 =
+				*(struct in6_addr *)&spec.ipv6->hdr.src_addr;
+			encap.mask |= BIT_ENCAP(IPV6_DST);
+		}
+		if (!memcmp(mask.ipv6->hdr.dst_addr, IN6_ADDR_MASK, 16)) {
+			encap.ip.src.v6 =
+				*(struct in6_addr *)&spec.ipv6->hdr.dst_addr;
+			encap.mask |= BIT_ENCAP(IPV6_SRC);
+		}
+		++item;
+		break;
+	case ITEM_TUN_UDP:
+		if (item->type != RTE_FLOW_ITEM_TYPE_UDP)
+			goto trans;
+		mask.udp = mlx5_nl_flow_item_mask
+			(item, &rte_flow_item_udp_mask,
+			 &mlx5_nl_flow_encap_mask_supported.udp,
+			 &mlx5_nl_flow_mask_empty.udp,
+			 sizeof(rte_flow_item_udp_mask), error);
+		if (!mask.udp)
+			return -rte_errno;
+		spec.udp = item->spec;
+		if ((mask.udp->hdr.src_port &&
+		     (!mnl_attr_put_u16_check(buf, size,
+					      TCA_FLOWER_KEY_ENC_UDP_SRC_PORT,
+					      spec.udp->hdr.src_port) ||
+		      !mnl_attr_put_u16_check
+			(buf, size, TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK,
+			 mask.udp->hdr.src_port))) ||
+		    (mask.udp->hdr.dst_port &&
+		     (!mnl_attr_put_u16_check(buf, size,
+					      TCA_FLOWER_KEY_ENC_UDP_DST_PORT,
+					      spec.udp->hdr.dst_port) ||
+		      !mnl_attr_put_u16_check
+			(buf, size, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK,
+			 mask.udp->hdr.dst_port))))
+			goto error_nobufs;
+		/* Source and destination are swapped for decap. */
+		if (mask.udp->hdr.src_port == BE16_MASK) {
+			encap.udp.dst = spec.udp->hdr.src_port;
+			encap.mask |= BIT_ENCAP(UDP_DST);
+		}
+		if (mask.udp->hdr.dst_port == BE16_MASK) {
+			encap.udp.src = spec.udp->hdr.dst_port;
+			encap.mask |= BIT_ENCAP(UDP_SRC);
+		}
+		++item;
+		break;
 	case ACTIONS:
 		if (item->type != RTE_FLOW_ITEM_TYPE_END)
 			goto trans;
 		assert(na_flower);
 		assert(!na_flower_act);
+		if (!mnl_attr_put_u32_check(buf, size, TCA_FLOWER_FLAGS,
+					    TCA_CLS_FLAGS_SKIP_SW))
+			goto error_nobufs;
 		na_flower_act =
 			mnl_attr_nest_start_check(buf, size, TCA_FLOWER_ACT);
 		if (!na_flower_act)
@@ -1446,14 +1696,35 @@ mlx5_nl_flow_transpose(struct mlx5_nl_flow *nl_flow,
 		}
 		++action;
 		break;
+	case ACTION_VXLAN_DECAP:
+		if (action->type != RTE_FLOW_ACTION_TYPE_VXLAN_DECAP)
+			goto trans;
+		if (!vxlan_decap)
+			return rte_flow_error_set
+				(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
+				 action,
+				 "VXLAN decapsulation is only supported after"
+				 " matching VXLAN traffic explicitly first");
+		i = TCA_TUNNEL_KEY_ACT_RELEASE;
+		nl_flow->decap = 1;
+		conf.vxlan_encap = NULL;
+		goto vxlan_encap;
 	case ACTION_VXLAN_ENCAP:
 		if (action->type != RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP)
 			goto trans;
+		if (vxlan_decap)
+			return rte_flow_error_set
+				(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
+				 action,
+				 "cannot combine VXLAN header matching with"
+				 " encapsulation");
 		conf.vxlan_encap = action->conf;
 		if (mlx5_nl_flow_encap_reap(&encap,
 					    conf.vxlan_encap->definition,
 					    error))
 			return -rte_errno;
+		i = TCA_TUNNEL_KEY_ACT_SET;
+vxlan_encap:
 		act_index =
 			mnl_attr_nest_start_check(buf, size, act_index_cur++);
 		if (!act_index ||
@@ -1467,10 +1738,11 @@ mlx5_nl_flow_transpose(struct mlx5_nl_flow *nl_flow,
 					sizeof(struct tc_tunnel_key),
 					&(struct tc_tunnel_key){
 						.action = TC_ACT_PIPE,
-						.t_action =
-							TCA_TUNNEL_KEY_ACT_SET,
+						.t_action = i,
 					}))
 			goto error_nobufs;
+		if (!conf.vxlan_encap)
+			goto vxlan_encap_end;
 		if (encap.mask & BIT_ENCAP(IPV4_SRC) &&
 		    !mnl_attr_put_u32_check
 		    (buf, size, TCA_TUNNEL_KEY_ENC_IPV4_SRC,
@@ -1507,16 +1779,11 @@ mlx5_nl_flow_transpose(struct mlx5_nl_flow *nl_flow,
 		if (!mnl_attr_put_u32_check
 		    (buf, size, TCA_TUNNEL_KEY_ENC_KEY_ID, encap.vxlan.vni))
 			goto error_nobufs;
+vxlan_encap_end:
 		mnl_attr_nest_end(buf, act);
 		mnl_attr_nest_end(buf, act_index);
 		++action;
 		break;
-	case ACTION_VXLAN_DECAP:
-		if (action->type != RTE_FLOW_ACTION_TYPE_VXLAN_DECAP)
-			goto trans;
-		return rte_flow_error_set
-			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION, action,
-			 "VXLAN decap is not supported yet");
 	case END:
 		if (item->type != RTE_FLOW_ITEM_TYPE_END ||
 		    action->type != RTE_FLOW_ACTION_TYPE_END)
@@ -1844,15 +2111,26 @@ mlx5_nl_flow_ifindex_vxlan(struct mlx5_nl_flow_ctx *ctx, unsigned int ifindex,
 	 * cannot be worked around by picking a random value here and using
 	 * a different one when creating flow rules later.
 	 *
-	 * Therefore request a hopefully unique VNI based on the interface
-	 * index in order to work around EEXIST. VNI will be overridden
-	 * later on a flow rule basis thanks to IFLA_VXLAN_COLLECT_METADATA.
+	 * There is another way to work around EEXIST by assigning a unique
+	 * VNI to the VXLAN interface (e.g. by emitting IFLA_VXLAN_ID based
+	 * on underlying ifindex), however doing so breaks decap as it
+	 * prevents the kernel from matching VNI when looking for a VXLAN
+	 * interface in that direction. Note that iproute2 doesn't allow
+	 * this combination either.
+	 *
+	 * Creating non-external VXLAN interfaces with fixed outer
+	 * properties was also considered. Problem is that not only it won't
+	 * scale to large numbers, it appears that only interfaces with
+	 * dynamic properties (external) can be offloaded to hardware.
+	 *
+	 * Hence the following limitation: as long as VXLAN encap/decap flow
+	 * rules exist on a given DPDK port, the local UDP port they rely on
+	 * can only be used by flow rules on that port. They will fail with
+	 * EEXIST on others.
 	 */
 	if (!mnl_attr_put_u16_check(nlh, sizeof(buf), IFLA_VXLAN_PORT,
 				    vxlan_port))
 		goto exit;
-	if (!mnl_attr_put_u32_check(nlh, sizeof(buf), IFLA_VXLAN_ID, ifindex))
-		goto exit;
 	mnl_attr_nest_end(nlh, na_vxlan);
 	mnl_attr_nest_end(nlh, na_info);
 	ret = mlx5_nl_flow_chat(ctx, nlh, NULL, NULL);
@@ -2022,8 +2300,9 @@ mlx5_nl_flow_encap_neigh(struct mlx5_nl_flow_ctx *ctx,
 		goto error_nobufs;
 	if (encap->mask & BIT_ENCAP(ETH_SRC) && enable)
 		DRV_LOG(WARNING,
-			"Ethernet source address cannot be forced"
-			" for VXLAN encap; parameter ignored");
+			"Ethernet source address (encap) or destination"
+			" address (decap) cannot be forced for VXLAN"
+			" encap/decap; parameter ignored");
 	if (encap->mask & BIT_ENCAP(ETH_DST) &&
 	    !mnl_attr_put_check(nlh, sizeof(buf), NDA_LLADDR,
 				sizeof(encap->eth.dst), &encap->eth.dst))
@@ -2325,9 +2604,12 @@ mlx5_nl_flow_create(struct mlx5_nl_flow_ctx *ctx, struct mlx5_nl_flow *nl_flow,
 {
 	struct nlmsghdr *nlh = (void *)nl_flow->msg;
 	struct mlx5_nl_flow_encap *encap =
-		nl_flow->encap && nl_flow->ifindex_dst ?
+		nl_flow->encap && nl_flow->ifindex_dst && nl_flow->ifindex_src ?
 		nl_flow->encap : NULL;
-	unsigned int ifindex = encap ? *nl_flow->ifindex_dst : 0;
+	unsigned int *ifindex_target =
+		nl_flow->decap ?
+		nl_flow->ifindex_src : nl_flow->ifindex_dst;
+	unsigned int ifindex = encap ? *ifindex_target : 0;
 	int ret;
 
 	if (nl_flow->applied)
@@ -2339,11 +2621,11 @@ mlx5_nl_flow_create(struct mlx5_nl_flow_ctx *ctx, struct mlx5_nl_flow *nl_flow,
 			(ctx, encap, ifindex, true, error);
 		if (!nl_flow->encap_ifindex)
 			return -rte_errno;
-		*nl_flow->ifindex_dst = nl_flow->encap_ifindex;
+		*ifindex_target = nl_flow->encap_ifindex;
 	}
 	ret = mlx5_nl_flow_chat(ctx, nlh, NULL, NULL);
 	if (encap)
-		*nl_flow->ifindex_dst = ifindex;
+		*ifindex_target = ifindex;
 	if (!ret) {
 		nl_flow->applied = 1;
 		return 0;
@@ -2378,9 +2660,11 @@ mlx5_nl_flow_destroy(struct mlx5_nl_flow_ctx *ctx, struct mlx5_nl_flow *nl_flow,
 {
 	struct nlmsghdr *nlh = (void *)nl_flow->msg;
 	struct mlx5_nl_flow_encap *encap =
-		nl_flow->encap && nl_flow->ifindex_dst ?
+		nl_flow->encap && nl_flow->ifindex_dst && nl_flow->ifindex_src ?
 		nl_flow->encap : NULL;
-	unsigned int ifindex = encap ? *nl_flow->ifindex_dst : 0;
+	unsigned int *ifindex_target =
+		nl_flow->decap ? nl_flow->ifindex_src : nl_flow->ifindex_dst;
+	unsigned int ifindex = encap ? *ifindex_target : 0;
 	int err = 0;
 	int ret;
 
@@ -2392,11 +2676,11 @@ mlx5_nl_flow_destroy(struct mlx5_nl_flow_ctx *ctx, struct mlx5_nl_flow *nl_flow,
 		if (!mlx5_nl_flow_encap_ifindex
 		    (ctx, encap, ifindex, false, error))
 			err = rte_errno;
-		*nl_flow->ifindex_dst = nl_flow->encap_ifindex;
+		*ifindex_target = nl_flow->encap_ifindex;
 	}
 	ret = mlx5_nl_flow_chat(ctx, nlh, NULL, NULL);
 	if (encap)
-		*nl_flow->ifindex_dst = ifindex;
+		*ifindex_target = ifindex;
 	nl_flow->applied = 0;
 	if (err) {
 		rte_errno = err;
-- 
2.11.0

