[dpdk-dev] [PATCH v2 3/3] net/mlx5: add Linux TC flower driver for	E-Switch flow
    Yongseok Koh 
    yskoh at mellanox.com
       
    Mon Sep 24 21:55:17 CEST 2018
    
    
  
Flows having 'transfer' attribute have to be inserted to E-Switch on the
NIC and the control path uses Linux TC flower interface via Netlink socket.
This patch adds the flow driver on top of the new flow engine.
Signed-off-by: Yongseok Koh <yskoh at mellanox.com>
---
 drivers/net/mlx5/Makefile        |    1 +
 drivers/net/mlx5/meson.build     |    1 +
 drivers/net/mlx5/mlx5.c          |   33 +
 drivers/net/mlx5/mlx5_flow.c     |    6 +-
 drivers/net/mlx5/mlx5_flow.h     |   20 +
 drivers/net/mlx5/mlx5_flow_tcf.c | 1608 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 1668 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/mlx5/mlx5_flow_tcf.c
diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 9c1044808..ca1de9f21 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -32,6 +32,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_rss.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mr.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow_dv.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow_tcf.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow_verbs.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_socket.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_nl.c
diff --git a/drivers/net/mlx5/meson.build b/drivers/net/mlx5/meson.build
index e5376291c..fd93ac162 100644
--- a/drivers/net/mlx5/meson.build
+++ b/drivers/net/mlx5/meson.build
@@ -32,6 +32,7 @@ if build
 		'mlx5_ethdev.c',
 		'mlx5_flow.c',
 		'mlx5_flow_dv.c',
+		'mlx5_flow_tcf.c',
 		'mlx5_flow_verbs.c',
 		'mlx5_mac.c',
 		'mlx5_mr.c',
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index bb9a63fba..4be6a1cc9 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -44,6 +44,7 @@
 #include "mlx5_rxtx.h"
 #include "mlx5_autoconf.h"
 #include "mlx5_defs.h"
+#include "mlx5_flow.h"
 #include "mlx5_glue.h"
 #include "mlx5_mr.h"
 #include "mlx5_flow.h"
@@ -286,6 +287,8 @@ mlx5_dev_close(struct rte_eth_dev *dev)
 		close(priv->nl_socket_route);
 	if (priv->nl_socket_rdma >= 0)
 		close(priv->nl_socket_rdma);
+	if (priv->mnl_socket)
+		mlx5_flow_tcf_socket_destroy(priv->mnl_socket);
 	ret = mlx5_hrxq_ibv_verify(dev);
 	if (ret)
 		DRV_LOG(WARNING, "port %u some hash Rx queue still remain",
@@ -1135,6 +1138,34 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 	claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
 	if (vf && config.vf_nl_en)
 		mlx5_nl_mac_addr_sync(eth_dev);
+	priv->mnl_socket = mlx5_flow_tcf_socket_create();
+	if (!priv->mnl_socket) {
+		err = -rte_errno;
+		DRV_LOG(WARNING,
+			"flow rules relying on switch offloads will not be"
+			" supported: cannot open libmnl socket: %s",
+			strerror(rte_errno));
+	} else {
+		struct rte_flow_error error;
+		unsigned int ifindex = mlx5_ifindex(eth_dev);
+
+		if (!ifindex) {
+			err = -rte_errno;
+			error.message =
+				"cannot retrieve network interface index";
+		} else {
+			err = mlx5_flow_tcf_init(priv->mnl_socket, ifindex,
+						&error);
+		}
+		if (err) {
+			DRV_LOG(WARNING,
+				"flow rules relying on switch offloads will"
+				" not be supported: %s: %s",
+				error.message, strerror(rte_errno));
+			mlx5_flow_tcf_socket_destroy(priv->mnl_socket);
+			priv->mnl_socket = NULL;
+		}
+	}
 	TAILQ_INIT(&priv->flows);
 	TAILQ_INIT(&priv->ctrl_flows);
 	/* Hint libmlx5 to use PMD allocator for data plane resources */
@@ -1187,6 +1218,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev,
 			close(priv->nl_socket_route);
 		if (priv->nl_socket_rdma >= 0)
 			close(priv->nl_socket_rdma);
+		if (priv->mnl_socket)
+			mlx5_flow_tcf_socket_destroy(priv->mnl_socket);
 		if (own_domain_id)
 			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
 		rte_free(priv);
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 54008afa4..7660bee30 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -41,6 +41,7 @@ extern const struct eth_dev_ops mlx5_dev_ops_isolate;
 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
 extern const struct mlx5_flow_driver_ops mlx5_flow_dv_drv_ops;
 #endif
+extern const struct mlx5_flow_driver_ops mlx5_flow_tcf_drv_ops;
 extern const struct mlx5_flow_driver_ops mlx5_flow_verbs_drv_ops;
 
 const struct mlx5_flow_driver_ops mlx5_flow_null_drv_ops;
@@ -50,6 +51,7 @@ const struct mlx5_flow_driver_ops *flow_drv_ops[] = {
 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
 	[MLX5_FLOW_TYPE_DV] = &mlx5_flow_dv_drv_ops,
 #endif
+	[MLX5_FLOW_TYPE_TCF] = &mlx5_flow_tcf_drv_ops,
 	[MLX5_FLOW_TYPE_VERBS] = &mlx5_flow_verbs_drv_ops,
 	[MLX5_FLOW_TYPE_MAX] = &mlx5_flow_null_drv_ops
 };
@@ -1609,7 +1611,9 @@ flow_get_drv_type(struct rte_eth_dev *dev __rte_unused,
 	struct priv *priv __rte_unused = dev->data->dev_private;
 	enum mlx5_flow_drv_type type = MLX5_FLOW_TYPE_MAX;
 
-	if (!attr->transfer) {
+	if (attr->transfer) {
+		type = MLX5_FLOW_TYPE_TCF;
+	} else {
 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
 		type = priv->config.dv_flow_en ?  MLX5_FLOW_TYPE_DV :
 						  MLX5_FLOW_TYPE_VERBS;
diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
index 2bc3bee8c..10d700a7f 100644
--- a/drivers/net/mlx5/mlx5_flow.h
+++ b/drivers/net/mlx5/mlx5_flow.h
@@ -82,6 +82,11 @@
 #define MLX5_ACTION_FLAG (1u << 3)
 #define MLX5_ACTION_MARK (1u << 4)
 #define MLX5_ACTION_COUNT (1u << 5)
+#define MLX5_ACTION_PORT_ID (1u << 6)
+#define MLX5_ACTION_OF_POP_VLAN (1u << 7)
+#define MLX5_ACTION_OF_PUSH_VLAN (1u << 8)
+#define MLX5_ACTION_OF_SET_VLAN_VID (1u << 9)
+#define MLX5_ACTION_OF_SET_VLAN_PCP (1u << 10)
 
 /* possible L3 layers protocols filtering. */
 #define MLX5_IP_PROTOCOL_TCP 6
@@ -131,6 +136,7 @@
 enum mlx5_flow_drv_type {
 	MLX5_FLOW_TYPE_MIN,
 	MLX5_FLOW_TYPE_DV,
+	MLX5_FLOW_TYPE_TCF,
 	MLX5_FLOW_TYPE_VERBS,
 	MLX5_FLOW_TYPE_MAX,
 };
@@ -170,6 +176,12 @@ struct mlx5_flow_dv {
 	int actions_n; /**< number of actions. */
 };
 
+/** Linux TC flower driver for E-Switch flow. */
+struct mlx5_flow_tcf {
+	struct nlmsghdr *nlh;
+	struct tcmsg *tcm;
+};
+
 /* Verbs specification header. */
 struct ibv_spec_header {
 	enum ibv_flow_spec_type type;
@@ -199,6 +211,7 @@ struct mlx5_flow {
 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
 		struct mlx5_flow_dv dv;
 #endif
+		struct mlx5_flow_tcf tcf;
 		struct mlx5_flow_verbs verbs;
 	};
 };
@@ -322,4 +335,11 @@ int mlx5_flow_validate_item_vxlan_gpe(const struct rte_flow_item *item,
 				      struct rte_eth_dev *dev,
 				      struct rte_flow_error *error);
 
+/* mlx5_flow_tcf.c */
+
+int mlx5_flow_tcf_init(struct mnl_socket *nl, unsigned int ifindex,
+		       struct rte_flow_error *error);
+struct mnl_socket *mlx5_flow_tcf_socket_create(void);
+void mlx5_flow_tcf_socket_destroy(struct mnl_socket *nl);
+
 #endif /* RTE_PMD_MLX5_FLOW_H_ */
diff --git a/drivers/net/mlx5/mlx5_flow_tcf.c b/drivers/net/mlx5/mlx5_flow_tcf.c
new file mode 100644
index 000000000..14376188e
--- /dev/null
+++ b/drivers/net/mlx5/mlx5_flow_tcf.c
@@ -0,0 +1,1608 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2018 6WIND S.A.
+ * Copyright 2018 Mellanox Technologies, Ltd
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <libmnl/libmnl.h>
+#include <linux/if_ether.h>
+#include <linux/netlink.h>
+#include <linux/pkt_cls.h>
+#include <linux/pkt_sched.h>
+#include <linux/rtnetlink.h>
+#include <linux/tc_act/tc_gact.h>
+#include <linux/tc_act/tc_mirred.h>
+#include <netinet/in.h>
+#include <stdalign.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+
+#include <rte_byteorder.h>
+#include <rte_errno.h>
+#include <rte_ether.h>
+#include <rte_flow.h>
+#include <rte_malloc.h>
+
+#include "mlx5.h"
+#include "mlx5_flow.h"
+#include "mlx5_autoconf.h"
+
+#ifdef HAVE_TC_ACT_VLAN
+
+#include <linux/tc_act/tc_vlan.h>
+
+#else /* HAVE_TC_ACT_VLAN */
+
+#define TCA_VLAN_ACT_POP 1
+#define TCA_VLAN_ACT_PUSH 2
+#define TCA_VLAN_ACT_MODIFY 3
+#define TCA_VLAN_PARMS 2
+#define TCA_VLAN_PUSH_VLAN_ID 3
+#define TCA_VLAN_PUSH_VLAN_PROTOCOL 4
+#define TCA_VLAN_PAD 5
+#define TCA_VLAN_PUSH_VLAN_PRIORITY 6
+
+struct tc_vlan {
+	tc_gen;
+	int v_action;
+};
+
+#endif /* HAVE_TC_ACT_VLAN */
+
+/* Normally found in linux/netlink.h. */
+#ifndef NETLINK_CAP_ACK
+#define NETLINK_CAP_ACK 10
+#endif
+
+/* Normally found in linux/pkt_sched.h. */
+#ifndef TC_H_MIN_INGRESS
+#define TC_H_MIN_INGRESS 0xfff2u
+#endif
+
+/* Normally found in linux/pkt_cls.h. */
+#ifndef TCA_CLS_FLAGS_SKIP_SW
+#define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
+#endif
+#ifndef HAVE_TCA_FLOWER_ACT
+#define TCA_FLOWER_ACT 3
+#endif
+#ifndef HAVE_TCA_FLOWER_FLAGS
+#define TCA_FLOWER_FLAGS 22
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ETH_TYPE
+#define TCA_FLOWER_KEY_ETH_TYPE 8
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST
+#define TCA_FLOWER_KEY_ETH_DST 4
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ETH_DST_MASK
+#define TCA_FLOWER_KEY_ETH_DST_MASK 5
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC
+#define TCA_FLOWER_KEY_ETH_SRC 6
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_ETH_SRC_MASK
+#define TCA_FLOWER_KEY_ETH_SRC_MASK 7
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_IP_PROTO
+#define TCA_FLOWER_KEY_IP_PROTO 9
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC
+#define TCA_FLOWER_KEY_IPV4_SRC 10
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_IPV4_SRC_MASK
+#define TCA_FLOWER_KEY_IPV4_SRC_MASK 11
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST
+#define TCA_FLOWER_KEY_IPV4_DST 12
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_IPV4_DST_MASK
+#define TCA_FLOWER_KEY_IPV4_DST_MASK 13
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC
+#define TCA_FLOWER_KEY_IPV6_SRC 14
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_IPV6_SRC_MASK
+#define TCA_FLOWER_KEY_IPV6_SRC_MASK 15
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST
+#define TCA_FLOWER_KEY_IPV6_DST 16
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_IPV6_DST_MASK
+#define TCA_FLOWER_KEY_IPV6_DST_MASK 17
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC
+#define TCA_FLOWER_KEY_TCP_SRC 18
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_TCP_SRC_MASK
+#define TCA_FLOWER_KEY_TCP_SRC_MASK 35
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST
+#define TCA_FLOWER_KEY_TCP_DST 19
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_TCP_DST_MASK
+#define TCA_FLOWER_KEY_TCP_DST_MASK 36
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC
+#define TCA_FLOWER_KEY_UDP_SRC 20
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_UDP_SRC_MASK
+#define TCA_FLOWER_KEY_UDP_SRC_MASK 37
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST
+#define TCA_FLOWER_KEY_UDP_DST 21
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_UDP_DST_MASK
+#define TCA_FLOWER_KEY_UDP_DST_MASK 38
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ID
+#define TCA_FLOWER_KEY_VLAN_ID 23
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_VLAN_PRIO
+#define TCA_FLOWER_KEY_VLAN_PRIO 24
+#endif
+#ifndef HAVE_TCA_FLOWER_KEY_VLAN_ETH_TYPE
+#define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
+#endif
+
+#ifndef IPV6_ADDR_LEN
+#define IPV6_ADDR_LEN 16
+#endif
+
+/** Empty masks for known item types. */
+static const union {
+	struct rte_flow_item_port_id port_id;
+	struct rte_flow_item_eth eth;
+	struct rte_flow_item_vlan vlan;
+	struct rte_flow_item_ipv4 ipv4;
+	struct rte_flow_item_ipv6 ipv6;
+	struct rte_flow_item_tcp tcp;
+	struct rte_flow_item_udp udp;
+} flow_tcf_mask_empty;
+
+/** Supported masks for known item types. */
+static const struct {
+	struct rte_flow_item_port_id port_id;
+	struct rte_flow_item_eth eth;
+	struct rte_flow_item_vlan vlan;
+	struct rte_flow_item_ipv4 ipv4;
+	struct rte_flow_item_ipv6 ipv6;
+	struct rte_flow_item_tcp tcp;
+	struct rte_flow_item_udp udp;
+} flow_tcf_mask_supported = {
+	.port_id = {
+		.id = 0xffffffff,
+	},
+	.eth = {
+		.type = RTE_BE16(0xffff),
+		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+		.src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+	},
+	.vlan = {
+		/* PCP and VID only, no DEI. */
+		.tci = RTE_BE16(0xefff),
+		.inner_type = RTE_BE16(0xffff),
+	},
+	.ipv4.hdr = {
+		.next_proto_id = 0xff,
+		.src_addr = RTE_BE32(0xffffffff),
+		.dst_addr = RTE_BE32(0xffffffff),
+	},
+	.ipv6.hdr = {
+		.proto = 0xff,
+		.src_addr =
+			"\xff\xff\xff\xff\xff\xff\xff\xff"
+			"\xff\xff\xff\xff\xff\xff\xff\xff",
+		.dst_addr =
+			"\xff\xff\xff\xff\xff\xff\xff\xff"
+			"\xff\xff\xff\xff\xff\xff\xff\xff",
+	},
+	.tcp.hdr = {
+		.src_port = RTE_BE16(0xffff),
+		.dst_port = RTE_BE16(0xffff),
+	},
+	.udp.hdr = {
+		.src_port = RTE_BE16(0xffff),
+		.dst_port = RTE_BE16(0xffff),
+	},
+};
+
+#define SZ_NLATTR_HDR MNL_ALIGN(sizeof(struct nlattr))
+#define SZ_NLATTR_NEST SZ_NLATTR_HDR
+#define SZ_NLATTR_DATA_OF(len) MNL_ALIGN(SZ_NLATTR_HDR + (len))
+#define SZ_NLATTR_TYPE_OF(typ) SZ_NLATTR_DATA_OF(sizeof(typ))
+#define SZ_NLATTR_STRZ_OF(str) SZ_NLATTR_DATA_OF(strlen(str) + 1)
+
+#define PTOI_TABLE_SZ_MAX(dev) (mlx5_dev_to_port_id((dev)->device, NULL, 0) + 2)
+
+/** DPDK port to network interface index (ifindex) conversion. */
+struct flow_tcf_ptoi {
+	uint16_t port_id; /**< DPDK port ID. */
+	unsigned int ifindex; /**< Network interface index. */
+};
+
+#define MLX5_TCF_FATE_ACTIONS (MLX5_ACTION_DROP | MLX5_ACTION_PORT_ID)
+
+/**
+ * Retrieve mask for pattern item.
+ *
+ * This function does basic sanity checks on a pattern item in order to
+ * return the most appropriate mask for it.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in] mask_default
+ *   Default mask for pattern item as specified by the flow API.
+ * @param[in] mask_supported
+ *   Mask fields supported by the implementation.
+ * @param[in] mask_empty
+ *   Empty mask to return when there is no specification.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   Either @p item->mask or one of the mask parameters on success, NULL
+ *   otherwise and rte_errno is set.
+ */
+static const void *
+flow_tcf_item_mask(const struct rte_flow_item *item, const void *mask_default,
+		   const void *mask_supported, const void *mask_empty,
+		   size_t mask_size, struct rte_flow_error *error)
+{
+	const uint8_t *mask;
+	size_t i;
+
+	/* item->last and item->mask cannot exist without item->spec. */
+	if (!item->spec && (item->mask || item->last)) {
+		rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ITEM, item,
+				   "\"mask\" or \"last\" field provided without"
+				   " a corresponding \"spec\"");
+		return NULL;
+	}
+	/* No spec, no mask, no problem. */
+	if (!item->spec)
+		return mask_empty;
+	mask = item->mask ? item->mask : mask_default;
+	assert(mask);
+	/*
+	 * Single-pass check to make sure that:
+	 * - Mask is supported, no bits are set outside mask_supported.
+	 * - Both item->spec and item->last are included in mask.
+	 */
+	for (i = 0; i != mask_size; ++i) {
+		if (!mask[i])
+			continue;
+		if ((mask[i] | ((const uint8_t *)mask_supported)[i]) !=
+		    ((const uint8_t *)mask_supported)[i]) {
+			rte_flow_error_set(error, ENOTSUP,
+					   RTE_FLOW_ERROR_TYPE_ITEM_MASK, mask,
+					   "unsupported field found"
+					   " in \"mask\"");
+			return NULL;
+		}
+		if (item->last &&
+		    (((const uint8_t *)item->spec)[i] & mask[i]) !=
+		    (((const uint8_t *)item->last)[i] & mask[i])) {
+			rte_flow_error_set(error, ENOTSUP,
+					   RTE_FLOW_ERROR_TYPE_ITEM_LAST,
+					   item->last,
+					   "range between \"spec\" and \"last\""
+					   " not comprised in \"mask\"");
+			return NULL;
+		}
+	}
+	return mask;
+}
+
+/**
+ * Build a conversion table between port ID and ifindex.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet device.
+ * @param[out] ptoi
+ *   Pointer to ptoi table.
+ * @param[in] len
+ *   Size of ptoi table provided.
+ *
+ * @return
+ *   Size of ptoi table filled.
+ */
+static unsigned int
+flow_tcf_build_ptoi_table(struct rte_eth_dev *dev, struct flow_tcf_ptoi *ptoi,
+			  unsigned int len)
+{
+	unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0);
+	uint16_t port_id[n + 1];
+	unsigned int i;
+	unsigned int own = 0;
+
+	/* At least one port is needed when no switch domain is present. */
+	if (!n) {
+		n = 1;
+		port_id[0] = dev->data->port_id;
+	} else {
+		n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n);
+	}
+	if (n > len)
+		return 0;
+	for (i = 0; i != n; ++i) {
+		struct rte_eth_dev_info dev_info;
+
+		rte_eth_dev_info_get(port_id[i], &dev_info);
+		if (port_id[i] == dev->data->port_id)
+			own = i;
+		ptoi[i].port_id = port_id[i];
+		ptoi[i].ifindex = dev_info.if_index;
+	}
+	/* Ensure first entry of ptoi[] is the current device. */
+	if (own) {
+		ptoi[n] = ptoi[0];
+		ptoi[0] = ptoi[own];
+		ptoi[own] = ptoi[n];
+	}
+	/* An entry with zero ifindex terminates ptoi[]. */
+	ptoi[n].port_id = 0;
+	ptoi[n].ifindex = 0;
+	return n;
+}
+
+/**
+ * Verify the @p attr will be correctly understood by the E-switch.
+ *
+ * @param[in] attr
+ *   Pointer to flow attributes
+ * @param[out] error
+ *   Pointer to error structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_tcf_validate_attributes(const struct rte_flow_attr *attr,
+			     struct rte_flow_error *error)
+{
+	/*
+	 * Supported attributes: no groups, some priorities and ingress only.
+	 * Don't care about transfer as it is the caller's problem.
+	 */
+	if (attr->group)
+		return rte_flow_error_set(error, ENOTSUP,
+					  RTE_FLOW_ERROR_TYPE_ATTR_GROUP, attr,
+					  "groups are not supported");
+	if (attr->priority > 0xfffe)
+		return rte_flow_error_set(error, ENOTSUP,
+					  RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
+					  attr,
+					  "lowest priority level is 0xfffe");
+	if (!attr->ingress)
+		return rte_flow_error_set(error, ENOTSUP,
+					  RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
+					  attr, "only ingress is supported");
+	if (attr->egress)
+		return rte_flow_error_set(error, ENOTSUP,
+					  RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
+					  attr, "egress is not supported");
+	return 0;
+}
+
+/**
+ * Validate flow for E-Switch.
+ *
+ * @param[in] priv
+ *   Pointer to the priv structure.
+ * @param[in] attr
+ *   Pointer to the flow attributes.
+ * @param[in] items
+ *   Pointer to the list of items.
+ * @param[in] actions
+ *   Pointer to the list of actions.
+ * @param[out] error
+ *   Pointer to the error structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_ernno is set.
+ */
+static int
+flow_tcf_validate(struct rte_eth_dev *dev,
+		  const struct rte_flow_attr *attr,
+		  const struct rte_flow_item items[],
+		  const struct rte_flow_action actions[],
+		  struct rte_flow_error *error)
+{
+	union {
+		const struct rte_flow_item_port_id *port_id;
+		const struct rte_flow_item_eth *eth;
+		const struct rte_flow_item_vlan *vlan;
+		const struct rte_flow_item_ipv4 *ipv4;
+		const struct rte_flow_item_ipv6 *ipv6;
+		const struct rte_flow_item_tcp *tcp;
+		const struct rte_flow_item_udp *udp;
+	} spec, mask;
+	union {
+		const struct rte_flow_action_port_id *port_id;
+		const struct rte_flow_action_of_push_vlan *of_push_vlan;
+		const struct rte_flow_action_of_set_vlan_vid *
+			of_set_vlan_vid;
+		const struct rte_flow_action_of_set_vlan_pcp *
+			of_set_vlan_pcp;
+	} conf;
+	uint32_t item_flags = 0;
+	uint32_t action_flags = 0;
+	uint8_t next_protocol = -1;
+	unsigned int tcm_ifindex = 0;
+	struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
+	bool in_port_id_set;
+	int ret;
+
+	claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
+						PTOI_TABLE_SZ_MAX(dev)));
+	ret = flow_tcf_validate_attributes(attr, error);
+	if (ret < 0)
+		return ret;
+	for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
+		unsigned int i;
+
+		switch (items->type) {
+		case RTE_FLOW_ITEM_TYPE_VOID:
+			break;
+		case RTE_FLOW_ITEM_TYPE_PORT_ID:
+			mask.port_id = flow_tcf_item_mask
+				(items, &rte_flow_item_port_id_mask,
+				 &flow_tcf_mask_supported.port_id,
+				 &flow_tcf_mask_empty.port_id,
+				 sizeof(flow_tcf_mask_supported.port_id),
+				 error);
+			if (!mask.port_id)
+				return -rte_errno;
+			if (mask.port_id == &flow_tcf_mask_empty.port_id) {
+				in_port_id_set = 1;
+				break;
+			}
+			spec.port_id = items->spec;
+			if (mask.port_id->id && mask.port_id->id != 0xffffffff)
+				return rte_flow_error_set
+					(error, ENOTSUP,
+					 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
+					 mask.port_id,
+					 "no support for partial mask on"
+					 " \"id\" field");
+			if (!mask.port_id->id)
+				i = 0;
+			else
+				for (i = 0; ptoi[i].ifindex; ++i)
+					if (ptoi[i].port_id == spec.port_id->id)
+						break;
+			if (!ptoi[i].ifindex)
+				return rte_flow_error_set
+					(error, ENODEV,
+					 RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
+					 spec.port_id,
+					 "missing data to convert port ID to"
+					 " ifindex");
+			if (in_port_id_set && ptoi[i].ifindex != tcm_ifindex)
+				return rte_flow_error_set
+					(error, ENOTSUP,
+					 RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
+					 spec.port_id,
+					 "cannot match traffic for"
+					 " several port IDs through"
+					 " a single flow rule");
+			tcm_ifindex = ptoi[i].ifindex;
+			in_port_id_set = 1;
+			break;
+		case RTE_FLOW_ITEM_TYPE_ETH:
+			ret = mlx5_flow_validate_item_eth(items, item_flags,
+							  error);
+			if (ret < 0)
+				return ret;
+			item_flags |= MLX5_FLOW_LAYER_OUTER_L2;
+			/* TODO:
+			 * Redundant check due to different supported mask.
+			 * Same for the rest of items.
+			 */
+			mask.eth = flow_tcf_item_mask
+				(items, &rte_flow_item_eth_mask,
+				 &flow_tcf_mask_supported.eth,
+				 &flow_tcf_mask_empty.eth,
+				 sizeof(flow_tcf_mask_supported.eth),
+				 error);
+			if (!mask.eth)
+				return -rte_errno;
+			if (mask.eth->type && mask.eth->type !=
+			    RTE_BE16(0xffff))
+				return rte_flow_error_set
+					(error, ENOTSUP,
+					 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
+					 mask.eth,
+					 "no support for partial mask on"
+					 " \"type\" field");
+			break;
+		case RTE_FLOW_ITEM_TYPE_VLAN:
+			ret = mlx5_flow_validate_item_vlan(items, item_flags,
+							   error);
+			if (ret < 0)
+				return ret;
+			item_flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
+			mask.vlan = flow_tcf_item_mask
+				(items, &rte_flow_item_vlan_mask,
+				 &flow_tcf_mask_supported.vlan,
+				 &flow_tcf_mask_empty.vlan,
+				 sizeof(flow_tcf_mask_supported.vlan),
+				 error);
+			if (!mask.vlan)
+				return -rte_errno;
+			if ((mask.vlan->tci & RTE_BE16(0xe000) &&
+			     (mask.vlan->tci & RTE_BE16(0xe000)) !=
+			      RTE_BE16(0xe000)) ||
+			    (mask.vlan->tci & RTE_BE16(0x0fff) &&
+			     (mask.vlan->tci & RTE_BE16(0x0fff)) !=
+			      RTE_BE16(0x0fff)) ||
+			    (mask.vlan->inner_type &&
+			     mask.vlan->inner_type != RTE_BE16(0xffff)))
+				return rte_flow_error_set
+					(error, ENOTSUP,
+					 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
+					 mask.vlan,
+					 "no support for partial masks on"
+					 " \"tci\" (PCP and VID parts) and"
+					 " \"inner_type\" fields");
+			break;
+		case RTE_FLOW_ITEM_TYPE_IPV4:
+			ret = mlx5_flow_validate_item_ipv4(items, item_flags,
+							   error);
+			if (ret < 0)
+				return ret;
+			item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
+			mask.ipv4 = flow_tcf_item_mask
+				(items, &rte_flow_item_ipv4_mask,
+				 &flow_tcf_mask_supported.ipv4,
+				 &flow_tcf_mask_empty.ipv4,
+				 sizeof(flow_tcf_mask_supported.ipv4),
+				 error);
+			if (!mask.ipv4)
+				return -rte_errno;
+			if (mask.ipv4->hdr.next_proto_id &&
+			    mask.ipv4->hdr.next_proto_id != 0xff)
+				return rte_flow_error_set
+					(error, ENOTSUP,
+					 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
+					 mask.ipv4,
+					 "no support for partial mask on"
+					 " \"hdr.next_proto_id\" field");
+			else if (mask.ipv4->hdr.next_proto_id)
+				next_protocol =
+					((const struct rte_flow_item_ipv4 *)
+					 (items->spec))->hdr.next_proto_id;
+			break;
+		case RTE_FLOW_ITEM_TYPE_IPV6:
+			ret = mlx5_flow_validate_item_ipv6(items, item_flags,
+							   error);
+			if (ret < 0)
+				return ret;
+			item_flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
+			mask.ipv6 = flow_tcf_item_mask
+				(items, &rte_flow_item_ipv6_mask,
+				 &flow_tcf_mask_supported.ipv6,
+				 &flow_tcf_mask_empty.ipv6,
+				 sizeof(flow_tcf_mask_supported.ipv6),
+				 error);
+			if (!mask.ipv6)
+				return -rte_errno;
+			if (mask.ipv6->hdr.proto &&
+			    mask.ipv6->hdr.proto != 0xff)
+				return rte_flow_error_set
+					(error, ENOTSUP,
+					 RTE_FLOW_ERROR_TYPE_ITEM_MASK,
+					 mask.ipv6,
+					 "no support for partial mask on"
+					 " \"hdr.proto\" field");
+			else if (mask.ipv6->hdr.proto)
+				next_protocol =
+					((const struct rte_flow_item_ipv6 *)
+					 (items->spec))->hdr.proto;
+			break;
+		case RTE_FLOW_ITEM_TYPE_UDP:
+			ret = mlx5_flow_validate_item_udp(items, item_flags,
+							  next_protocol, error);
+			if (ret < 0)
+				return ret;
+			item_flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
+			mask.udp = flow_tcf_item_mask
+				(items, &rte_flow_item_udp_mask,
+				 &flow_tcf_mask_supported.udp,
+				 &flow_tcf_mask_empty.udp,
+				 sizeof(flow_tcf_mask_supported.udp),
+				 error);
+			if (!mask.udp)
+				return -rte_errno;
+			break;
+		case RTE_FLOW_ITEM_TYPE_TCP:
+			ret = mlx5_flow_validate_item_tcp(items, item_flags,
+							  next_protocol, error);
+			if (ret < 0)
+				return ret;
+			item_flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
+			mask.tcp = flow_tcf_item_mask
+				(items, &rte_flow_item_tcp_mask,
+				 &flow_tcf_mask_supported.tcp,
+				 &flow_tcf_mask_empty.tcp,
+				 sizeof(flow_tcf_mask_supported.tcp),
+				 error);
+			if (!mask.tcp)
+				return -rte_errno;
+			break;
+		default:
+			return rte_flow_error_set(error, ENOTSUP,
+						  RTE_FLOW_ERROR_TYPE_ITEM,
+						  NULL, "item not supported");
+		}
+	}
+	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
+		unsigned int i;
+
+		switch (actions->type) {
+		case RTE_FLOW_ACTION_TYPE_VOID:
+			break;
+		case RTE_FLOW_ACTION_TYPE_PORT_ID:
+			if (action_flags & MLX5_TCF_FATE_ACTIONS)
+				return rte_flow_error_set
+					(error, ENOTSUP,
+					 RTE_FLOW_ERROR_TYPE_ACTION, actions,
+					 "can't have multiple fate actions");
+			conf.port_id = actions->conf;
+			if (conf.port_id->original)
+				i = 0;
+			else
+				for (i = 0; ptoi[i].ifindex; ++i)
+					if (ptoi[i].port_id == conf.port_id->id)
+						break;
+			if (!ptoi[i].ifindex)
+				return rte_flow_error_set
+					(error, ENODEV,
+					 RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+					 conf.port_id,
+					 "missing data to convert port ID to"
+					 " ifindex");
+			action_flags |= MLX5_ACTION_PORT_ID;
+			break;
+		case RTE_FLOW_ACTION_TYPE_DROP:
+			if (action_flags & MLX5_TCF_FATE_ACTIONS)
+				return rte_flow_error_set
+					(error, ENOTSUP,
+					 RTE_FLOW_ERROR_TYPE_ACTION, actions,
+					 "can't have multiple fate actions");
+			action_flags |= MLX5_ACTION_DROP;
+			break;
+		case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
+			action_flags |= MLX5_ACTION_OF_POP_VLAN;
+			break;
+		case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
+			action_flags |= MLX5_ACTION_OF_PUSH_VLAN;
+			break;
+		case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
+			action_flags |= MLX5_ACTION_OF_SET_VLAN_VID;
+			break;
+		case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
+			action_flags |= MLX5_ACTION_OF_SET_VLAN_PCP;
+			break;
+		default:
+			return rte_flow_error_set(error, ENOTSUP,
+						  RTE_FLOW_ERROR_TYPE_ACTION,
+						  actions,
+						  "action not supported");
+		}
+	}
+	return 0;
+}
+
+/**
+ * Calculate maximum size of memory for flow items of Linux TC flower and
+ * extract specified items.
+ *
+ * @param[in] items
+ *   Pointer to the list of items.
+ * @param[out] item_flags
+ *   Pointer to the detected items.
+ *
+ * @return
+ *   Maximum size of memory for items.
+ */
+static int
+flow_tcf_get_items_and_size(const struct rte_flow_item items[],
+			    uint64_t *item_flags)
+{
+	int size = 0;
+	uint64_t flags = 0;
+
+	size += SZ_NLATTR_STRZ_OF("flower") +
+		SZ_NLATTR_NEST + /* TCA_OPTIONS. */
+		SZ_NLATTR_TYPE_OF(uint32_t); /* TCA_CLS_FLAGS_SKIP_SW. */
+	for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
+		switch (items->type) {
+		case RTE_FLOW_ITEM_TYPE_VOID:
+			break;
+		case RTE_FLOW_ITEM_TYPE_PORT_ID:
+			break;
+		case RTE_FLOW_ITEM_TYPE_ETH:
+			size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
+				SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) * 4;
+				/* dst/src MAC addr and mask. */
+			flags |= MLX5_FLOW_LAYER_OUTER_L2;
+			break;
+		case RTE_FLOW_ITEM_TYPE_VLAN:
+			size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
+				SZ_NLATTR_TYPE_OF(uint16_t) +
+				/* VLAN Ether type. */
+				SZ_NLATTR_TYPE_OF(uint8_t) + /* VLAN prio. */
+				SZ_NLATTR_TYPE_OF(uint16_t); /* VLAN ID. */
+			flags |= MLX5_FLOW_LAYER_OUTER_VLAN;
+			break;
+		case RTE_FLOW_ITEM_TYPE_IPV4:
+			size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
+				SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
+				SZ_NLATTR_TYPE_OF(uint32_t) * 4;
+				/* dst/src IP addr and mask. */
+			flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
+			break;
+		case RTE_FLOW_ITEM_TYPE_IPV6:
+			size += SZ_NLATTR_TYPE_OF(uint16_t) + /* Ether type. */
+				SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
+				SZ_NLATTR_TYPE_OF(IPV6_ADDR_LEN) * 4;
+				/* dst/src IP addr and mask. */
+			flags |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
+			break;
+		case RTE_FLOW_ITEM_TYPE_UDP:
+			size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
+				SZ_NLATTR_TYPE_OF(uint16_t) * 4;
+				/* dst/src port and mask. */
+			flags |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
+			break;
+		case RTE_FLOW_ITEM_TYPE_TCP:
+			size += SZ_NLATTR_TYPE_OF(uint8_t) + /* IP proto. */
+				SZ_NLATTR_TYPE_OF(uint16_t) * 4;
+				/* dst/src port and mask. */
+			flags |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
+			break;
+		default:
+			DRV_LOG(WARNING,
+				"unsupported item %p type %d,"
+				" items must be validated before flow creation",
+				(const void *)items, items->type);
+			break;
+		}
+	}
+	*item_flags = flags;
+	return size;
+}
+
+/**
+ * Calculate maximum size of memory for flow actions of Linux TC flower and
+ * extract specified actions.
+ *
+ * @param[in] actions
+ *   Pointer to the list of actions.
+ * @param[out] action_flags
+ *   Pointer to the detected actions.
+ *
+ * @return
+ *   Maximum size of memory for actions.
+ */
+static int
+flow_tcf_get_actions_and_size(const struct rte_flow_action actions[],
+			      uint64_t *action_flags)
+{
+	int size = 0;
+	uint64_t flags = 0;
+
+	size += SZ_NLATTR_NEST; /* TCA_FLOWER_ACT. */
+	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
+		switch (actions->type) {
+		case RTE_FLOW_ACTION_TYPE_VOID:
+			break;
+		case RTE_FLOW_ACTION_TYPE_PORT_ID:
+			size += SZ_NLATTR_NEST + /* na_act_index. */
+				SZ_NLATTR_STRZ_OF("mirred") +
+				SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
+				SZ_NLATTR_TYPE_OF(struct tc_mirred);
+			flags |= MLX5_ACTION_PORT_ID;
+			break;
+		case RTE_FLOW_ACTION_TYPE_DROP:
+			size += SZ_NLATTR_NEST + /* na_act_index. */
+				SZ_NLATTR_STRZ_OF("gact") +
+				SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
+				SZ_NLATTR_TYPE_OF(struct tc_gact);
+			flags |= MLX5_ACTION_DROP;
+			break;
+		case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
+			flags |= MLX5_ACTION_OF_POP_VLAN;
+			goto action_of_vlan;
+		case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
+			flags |= MLX5_ACTION_OF_PUSH_VLAN;
+			goto action_of_vlan;
+		case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
+			flags |= MLX5_ACTION_OF_SET_VLAN_VID;
+			goto action_of_vlan;
+		case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
+			flags |= MLX5_ACTION_OF_SET_VLAN_PCP;
+			goto action_of_vlan;
+action_of_vlan:
+			size += SZ_NLATTR_NEST + /* na_act_index. */
+				SZ_NLATTR_STRZ_OF("vlan") +
+				SZ_NLATTR_NEST + /* TCA_ACT_OPTIONS. */
+				SZ_NLATTR_TYPE_OF(struct tc_vlan) +
+				SZ_NLATTR_TYPE_OF(uint16_t) +
+				/* VLAN protocol. */
+				SZ_NLATTR_TYPE_OF(uint16_t) + /* VLAN ID. */
+				SZ_NLATTR_TYPE_OF(uint8_t); /* VLAN prio. */
+			break;
+		default:
+			DRV_LOG(WARNING,
+				"unsupported action %p type %d,"
+				" items must be validated before flow creation",
+				(const void *)actions, actions->type);
+			break;
+		}
+	}
+	*action_flags = flags;
+	return size;
+}
+
+/**
+ * Brand rtnetlink buffer with unique handle.
+ *
+ * This handle should be unique for a given network interface to avoid
+ * collisions.
+ *
+ * @param nlh
+ *   Pointer to Netlink message.
+ * @param handle
+ *   Unique 32-bit handle to use.
+ */
+static void
+flow_tcf_nl_brand(struct nlmsghdr *nlh, uint32_t handle)
+{
+	struct tcmsg *tcm = mnl_nlmsg_get_payload(nlh);
+
+	tcm->tcm_handle = handle;
+	DRV_LOG(DEBUG, "Netlink msg %p is branded with handle %x",
+		(void *)nlh, handle);
+}
+
+/**
+ * Prepare a flow object for Linux TC flower. It calculates the maximum size of
+ * memory required, allocates the memory, initializes Netlink message headers
+ * and set unique TC message handle.
+ *
+ * @param[in] attr
+ *   Pointer to the flow attributes.
+ * @param[in] items
+ *   Pointer to the list of items.
+ * @param[in] actions
+ *   Pointer to the list of actions.
+ * @param[out] item_flags
+ *   Pointer to bit mask of all items detected.
+ * @param[out] action_flags
+ *   Pointer to bit mask of all actions detected.
+ * @param[out] error
+ *   Pointer to the error structure.
+ *
+ * @return
+ *   Pointer to mlx5_flow object on success,
+ *   otherwise NULL and rte_ernno is set.
+ */
+static struct mlx5_flow *
+flow_tcf_prepare(const struct rte_flow_attr *attr __rte_unused,
+		 const struct rte_flow_item items[],
+		 const struct rte_flow_action actions[],
+		 uint64_t *item_flags, uint64_t *action_flags,
+		 struct rte_flow_error *error)
+{
+	size_t size = sizeof(struct mlx5_flow) +
+		      MNL_ALIGN(sizeof(struct nlmsghdr)) +
+		      MNL_ALIGN(sizeof(struct tcmsg));
+	struct mlx5_flow *dev_flow;
+	struct nlmsghdr *nlh;
+	struct tcmsg *tcm;
+
+	size += flow_tcf_get_items_and_size(items, item_flags);
+	size += flow_tcf_get_actions_and_size(actions, action_flags);
+	dev_flow = rte_zmalloc(__func__, size, MNL_ALIGNTO);
+	if (!dev_flow) {
+		rte_flow_error_set(error, ENOMEM,
+				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+				   "not enough memory to create E-Switch flow");
+		return NULL;
+	}
+	nlh = mnl_nlmsg_put_header((void *)(dev_flow + 1));
+	tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
+	*dev_flow = (struct mlx5_flow){
+		.tcf = (struct mlx5_flow_tcf){
+			.nlh = nlh,
+			.tcm = tcm,
+		},
+	};
+	/*
+	 * Generate a reasonably unique handle based on the address of the
+	 * target buffer.
+	 *
+	 * This is straightforward on 32-bit systems where the flow pointer can
+	 * be used directly. Otherwise, its least significant part is taken
+	 * after shifting it by the previous power of two of the pointed buffer
+	 * size.
+	 */
+	if (sizeof(dev_flow) <= 4)
+		flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow);
+	else
+		flow_tcf_nl_brand(nlh, (uintptr_t)dev_flow >>
+				       rte_log2_u32(rte_align32prevpow2(size)));
+	return dev_flow;
+}
+
+/**
+ * Translate flow for Linux TC flower and construct Netlink message.
+ *
+ * @param[in] priv
+ *   Pointer to the priv structure.
+ * @param[in, out] flow
+ *   Pointer to the sub flow.
+ * @param[in] attr
+ *   Pointer to the flow attributes.
+ * @param[in] items
+ *   Pointer to the list of items.
+ * @param[in] actions
+ *   Pointer to the list of actions.
+ * @param[out] error
+ *   Pointer to the error structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_ernno is set.
+ */
+static int
+flow_tcf_translate(struct rte_eth_dev *dev, struct mlx5_flow *dev_flow,
+		   const struct rte_flow_attr *attr,
+		   const struct rte_flow_item items[],
+		   const struct rte_flow_action actions[],
+		   struct rte_flow_error *error)
+{
+	union {
+		const struct rte_flow_item_port_id *port_id;
+		const struct rte_flow_item_eth *eth;
+		const struct rte_flow_item_vlan *vlan;
+		const struct rte_flow_item_ipv4 *ipv4;
+		const struct rte_flow_item_ipv6 *ipv6;
+		const struct rte_flow_item_tcp *tcp;
+		const struct rte_flow_item_udp *udp;
+	} spec, mask;
+	union {
+		const struct rte_flow_action_port_id *port_id;
+		const struct rte_flow_action_of_push_vlan *of_push_vlan;
+		const struct rte_flow_action_of_set_vlan_vid *
+			of_set_vlan_vid;
+		const struct rte_flow_action_of_set_vlan_pcp *
+			of_set_vlan_pcp;
+	} conf;
+	struct flow_tcf_ptoi ptoi[PTOI_TABLE_SZ_MAX(dev)];
+	struct nlmsghdr *nlh = dev_flow->tcf.nlh;
+	struct tcmsg *tcm = dev_flow->tcf.tcm;
+	uint32_t na_act_index_cur;
+	bool eth_type_set = 0;
+	bool vlan_present = 0;
+	bool vlan_eth_type_set = 0;
+	bool ip_proto_set = 0;
+	struct nlattr *na_flower;
+	struct nlattr *na_flower_act;
+	struct nlattr *na_vlan_id = NULL;
+	struct nlattr *na_vlan_priority = NULL;
+
+	claim_nonzero(flow_tcf_build_ptoi_table(dev, ptoi,
+						PTOI_TABLE_SZ_MAX(dev)));
+	nlh = dev_flow->tcf.nlh;
+	tcm = dev_flow->tcf.tcm;
+	/* Prepare API must have been called beforehand. */
+	assert(nlh != NULL && tcm != NULL);
+	tcm->tcm_family = AF_UNSPEC;
+	tcm->tcm_ifindex = ptoi[0].ifindex;
+	tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
+	/*
+	 * Priority cannot be zero to prevent the kernel from picking one
+	 * automatically.
+	 */
+	tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
+				  RTE_BE16(ETH_P_ALL));
+	mnl_attr_put_strz(nlh, TCA_KIND, "flower");
+	na_flower = mnl_attr_nest_start(nlh, TCA_OPTIONS);
+	mnl_attr_put_u32(nlh, TCA_FLOWER_FLAGS, TCA_CLS_FLAGS_SKIP_SW);
+	for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
+		unsigned int i;
+
+		switch (items->type) {
+		case RTE_FLOW_ITEM_TYPE_VOID:
+			break;
+		case RTE_FLOW_ITEM_TYPE_PORT_ID:
+			mask.port_id = flow_tcf_item_mask
+				(items, &rte_flow_item_port_id_mask,
+				 &flow_tcf_mask_supported.port_id,
+				 &flow_tcf_mask_empty.port_id,
+				 sizeof(flow_tcf_mask_supported.port_id),
+				 error);
+			assert(mask.port_id);
+			if (mask.port_id == &flow_tcf_mask_empty.port_id)
+				break;
+			spec.port_id = items->spec;
+			if (!mask.port_id->id)
+				i = 0;
+			else
+				for (i = 0; ptoi[i].ifindex; ++i)
+					if (ptoi[i].port_id == spec.port_id->id)
+						break;
+			assert(ptoi[i].ifindex);
+			tcm->tcm_ifindex = ptoi[i].ifindex;
+			break;
+		case RTE_FLOW_ITEM_TYPE_ETH:
+			mask.eth = flow_tcf_item_mask
+				(items, &rte_flow_item_eth_mask,
+				 &flow_tcf_mask_supported.eth,
+				 &flow_tcf_mask_empty.eth,
+				 sizeof(flow_tcf_mask_supported.eth),
+				 error);
+			assert(mask.eth);
+			if (mask.eth == &flow_tcf_mask_empty.eth)
+				break;
+			spec.eth = items->spec;
+			if (mask.eth->type) {
+				mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_ETH_TYPE,
+						 spec.eth->type);
+				eth_type_set = 1;
+			}
+			if (!is_zero_ether_addr(&mask.eth->dst)) {
+				mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST,
+					     ETHER_ADDR_LEN,
+					     spec.eth->dst.addr_bytes);
+				mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_DST_MASK,
+					     ETHER_ADDR_LEN,
+					     mask.eth->dst.addr_bytes);
+			}
+			if (!is_zero_ether_addr(&mask.eth->src)) {
+				mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC,
+					     ETHER_ADDR_LEN,
+					     spec.eth->src.addr_bytes);
+				mnl_attr_put(nlh, TCA_FLOWER_KEY_ETH_SRC_MASK,
+					     ETHER_ADDR_LEN,
+					     mask.eth->src.addr_bytes);
+			}
+			break;
+		case RTE_FLOW_ITEM_TYPE_VLAN:
+			mask.vlan = flow_tcf_item_mask
+				(items, &rte_flow_item_vlan_mask,
+				 &flow_tcf_mask_supported.vlan,
+				 &flow_tcf_mask_empty.vlan,
+				 sizeof(flow_tcf_mask_supported.vlan),
+				 error);
+			assert(mask.vlan);
+			if (!eth_type_set)
+				mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_ETH_TYPE,
+						 RTE_BE16(ETH_P_8021Q));
+			eth_type_set = 1;
+			vlan_present = 1;
+			if (mask.vlan == &flow_tcf_mask_empty.vlan)
+				break;
+			spec.vlan = items->spec;
+			if (mask.vlan->inner_type) {
+				mnl_attr_put_u16(nlh,
+						 TCA_FLOWER_KEY_VLAN_ETH_TYPE,
+						 spec.vlan->inner_type);
+				vlan_eth_type_set = 1;
+			}
+			if (mask.vlan->tci & RTE_BE16(0xe000))
+				mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_VLAN_PRIO,
+						(rte_be_to_cpu_16
+						 (spec.vlan->tci) >> 13) & 0x7);
+			if (mask.vlan->tci & RTE_BE16(0x0fff))
+				mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_VLAN_ID,
+						 rte_be_to_cpu_16
+						 (spec.vlan->tci &
+						  RTE_BE16(0x0fff)));
+			break;
+		case RTE_FLOW_ITEM_TYPE_IPV4:
+			mask.ipv4 = flow_tcf_item_mask
+				(items, &rte_flow_item_ipv4_mask,
+				 &flow_tcf_mask_supported.ipv4,
+				 &flow_tcf_mask_empty.ipv4,
+				 sizeof(flow_tcf_mask_supported.ipv4),
+				 error);
+			assert(mask.ipv4);
+			if (!eth_type_set || !vlan_eth_type_set)
+				mnl_attr_put_u16(nlh,
+						 vlan_present ?
+						 TCA_FLOWER_KEY_VLAN_ETH_TYPE :
+						 TCA_FLOWER_KEY_ETH_TYPE,
+						 RTE_BE16(ETH_P_IP));
+			eth_type_set = 1;
+			vlan_eth_type_set = 1;
+			if (mask.ipv4 == &flow_tcf_mask_empty.ipv4)
+				break;
+			spec.ipv4 = items->spec;
+			if (mask.ipv4->hdr.next_proto_id) {
+				mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
+						spec.ipv4->hdr.next_proto_id);
+				ip_proto_set = 1;
+			}
+			if (mask.ipv4->hdr.src_addr) {
+				mnl_attr_put_u32(nlh, TCA_FLOWER_KEY_IPV4_SRC,
+						 spec.ipv4->hdr.src_addr);
+				mnl_attr_put_u32(nlh,
+						 TCA_FLOWER_KEY_IPV4_SRC_MASK,
+						 mask.ipv4->hdr.src_addr);
+			}
+			if (mask.ipv4->hdr.dst_addr) {
+				mnl_attr_put_u32(nlh, TCA_FLOWER_KEY_IPV4_DST,
+						 spec.ipv4->hdr.dst_addr);
+				mnl_attr_put_u32(nlh,
+						 TCA_FLOWER_KEY_IPV4_DST_MASK,
+						 mask.ipv4->hdr.dst_addr);
+			}
+			break;
+		case RTE_FLOW_ITEM_TYPE_IPV6:
+			mask.ipv6 = flow_tcf_item_mask
+				(items, &rte_flow_item_ipv6_mask,
+				 &flow_tcf_mask_supported.ipv6,
+				 &flow_tcf_mask_empty.ipv6,
+				 sizeof(flow_tcf_mask_supported.ipv6),
+				 error);
+			assert(mask.ipv6);
+			if (!eth_type_set || !vlan_eth_type_set)
+				mnl_attr_put_u16(nlh,
+						 vlan_present ?
+						 TCA_FLOWER_KEY_VLAN_ETH_TYPE :
+						 TCA_FLOWER_KEY_ETH_TYPE,
+						 RTE_BE16(ETH_P_IPV6));
+			eth_type_set = 1;
+			vlan_eth_type_set = 1;
+			if (mask.ipv6 == &flow_tcf_mask_empty.ipv6)
+				break;
+			spec.ipv6 = items->spec;
+			if (mask.ipv6->hdr.proto) {
+				mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
+						spec.ipv6->hdr.proto);
+				ip_proto_set = 1;
+			}
+			if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.src_addr)) {
+				mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_SRC,
+					     sizeof(spec.ipv6->hdr.src_addr),
+					     spec.ipv6->hdr.src_addr);
+				mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_SRC_MASK,
+					     sizeof(mask.ipv6->hdr.src_addr),
+					     mask.ipv6->hdr.src_addr);
+			}
+			if (!IN6_IS_ADDR_UNSPECIFIED(mask.ipv6->hdr.dst_addr)) {
+				mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_DST,
+					     sizeof(spec.ipv6->hdr.dst_addr),
+					     spec.ipv6->hdr.dst_addr);
+				mnl_attr_put(nlh, TCA_FLOWER_KEY_IPV6_DST_MASK,
+					     sizeof(mask.ipv6->hdr.dst_addr),
+					     mask.ipv6->hdr.dst_addr);
+			}
+			break;
+		case RTE_FLOW_ITEM_TYPE_UDP:
+			mask.udp = flow_tcf_item_mask
+				(items, &rte_flow_item_udp_mask,
+				 &flow_tcf_mask_supported.udp,
+				 &flow_tcf_mask_empty.udp,
+				 sizeof(flow_tcf_mask_supported.udp),
+				 error);
+			assert(mask.udp);
+			if (!ip_proto_set)
+				mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
+						IPPROTO_UDP);
+			if (mask.udp == &flow_tcf_mask_empty.udp)
+				break;
+			spec.udp = items->spec;
+			if (mask.udp->hdr.src_port) {
+				mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_UDP_SRC,
+						 spec.udp->hdr.src_port);
+				mnl_attr_put_u16(nlh,
+						 TCA_FLOWER_KEY_UDP_SRC_MASK,
+						 mask.udp->hdr.src_port);
+			}
+			if (mask.udp->hdr.dst_port) {
+				mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_UDP_DST,
+						 spec.udp->hdr.dst_port);
+				mnl_attr_put_u16(nlh,
+						 TCA_FLOWER_KEY_UDP_DST_MASK,
+						 mask.udp->hdr.dst_port);
+			}
+			break;
+		case RTE_FLOW_ITEM_TYPE_TCP:
+			mask.tcp = flow_tcf_item_mask
+				(items, &rte_flow_item_tcp_mask,
+				 &flow_tcf_mask_supported.tcp,
+				 &flow_tcf_mask_empty.tcp,
+				 sizeof(flow_tcf_mask_supported.tcp),
+				 error);
+			assert(mask.tcp);
+			if (!ip_proto_set)
+				mnl_attr_put_u8(nlh, TCA_FLOWER_KEY_IP_PROTO,
+						IPPROTO_TCP);
+			if (mask.tcp == &flow_tcf_mask_empty.tcp)
+				break;
+			spec.tcp = items->spec;
+			if (mask.tcp->hdr.src_port) {
+				mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_SRC,
+						 spec.tcp->hdr.src_port);
+				mnl_attr_put_u16(nlh,
+						 TCA_FLOWER_KEY_TCP_SRC_MASK,
+						 mask.tcp->hdr.src_port);
+			}
+			if (mask.tcp->hdr.dst_port) {
+				mnl_attr_put_u16(nlh, TCA_FLOWER_KEY_TCP_DST,
+						 spec.tcp->hdr.dst_port);
+				mnl_attr_put_u16(nlh,
+						 TCA_FLOWER_KEY_TCP_DST_MASK,
+						 mask.tcp->hdr.dst_port);
+			}
+			break;
+		default:
+			return rte_flow_error_set(error, ENOTSUP,
+						  RTE_FLOW_ERROR_TYPE_ITEM,
+						  NULL, "item not supported");
+		}
+	}
+	na_flower_act = mnl_attr_nest_start(nlh, TCA_FLOWER_ACT);
+	na_act_index_cur = 1;
+	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
+		struct nlattr *na_act_index;
+		struct nlattr *na_act;
+		unsigned int vlan_act;
+		unsigned int i;
+
+		switch (actions->type) {
+		case RTE_FLOW_ACTION_TYPE_VOID:
+			break;
+		case RTE_FLOW_ACTION_TYPE_PORT_ID:
+			conf.port_id = actions->conf;
+			if (conf.port_id->original)
+				i = 0;
+			else
+				for (i = 0; ptoi[i].ifindex; ++i)
+					if (ptoi[i].port_id == conf.port_id->id)
+						break;
+			assert(ptoi[i].ifindex);
+			na_act_index =
+				mnl_attr_nest_start(nlh, na_act_index_cur++);
+			assert(na_act_index);
+			mnl_attr_put_strz(nlh, TCA_ACT_KIND, "mirred");
+			na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
+			assert(na_act);
+			mnl_attr_put(nlh, TCA_MIRRED_PARMS,
+				     sizeof(struct tc_mirred),
+				     &(struct tc_mirred){
+					.action = TC_ACT_STOLEN,
+					.eaction = TCA_EGRESS_REDIR,
+					.ifindex = ptoi[i].ifindex,
+				     });
+			mnl_attr_nest_end(nlh, na_act);
+			mnl_attr_nest_end(nlh, na_act_index);
+			break;
+		case RTE_FLOW_ACTION_TYPE_DROP:
+			na_act_index =
+				mnl_attr_nest_start(nlh, na_act_index_cur++);
+			assert(na_act_index);
+			mnl_attr_put_strz(nlh, TCA_ACT_KIND, "gact");
+			na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
+			assert(na_act);
+			mnl_attr_put(nlh, TCA_GACT_PARMS,
+				     sizeof(struct tc_gact),
+				     &(struct tc_gact){
+					.action = TC_ACT_SHOT,
+				     });
+			mnl_attr_nest_end(nlh, na_act);
+			mnl_attr_nest_end(nlh, na_act_index);
+			break;
+		case RTE_FLOW_ACTION_TYPE_OF_POP_VLAN:
+			conf.of_push_vlan = NULL;
+			vlan_act = TCA_VLAN_ACT_POP;
+			goto action_of_vlan;
+		case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
+			conf.of_push_vlan = actions->conf;
+			vlan_act = TCA_VLAN_ACT_PUSH;
+			goto action_of_vlan;
+		case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID:
+			conf.of_set_vlan_vid = actions->conf;
+			if (na_vlan_id)
+				goto override_na_vlan_id;
+			vlan_act = TCA_VLAN_ACT_MODIFY;
+			goto action_of_vlan;
+		case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP:
+			conf.of_set_vlan_pcp = actions->conf;
+			if (na_vlan_priority)
+				goto override_na_vlan_priority;
+			vlan_act = TCA_VLAN_ACT_MODIFY;
+			goto action_of_vlan;
+action_of_vlan:
+			na_act_index =
+				mnl_attr_nest_start(nlh, na_act_index_cur++);
+			assert(na_act_index);
+			mnl_attr_put_strz(nlh, TCA_ACT_KIND, "vlan");
+			na_act = mnl_attr_nest_start(nlh, TCA_ACT_OPTIONS);
+			assert(na_act);
+			mnl_attr_put(nlh, TCA_VLAN_PARMS,
+				     sizeof(struct tc_vlan),
+				     &(struct tc_vlan){
+					.action = TC_ACT_PIPE,
+					.v_action = vlan_act,
+				     });
+			if (vlan_act == TCA_VLAN_ACT_POP) {
+				mnl_attr_nest_end(nlh, na_act);
+				mnl_attr_nest_end(nlh, na_act_index);
+				break;
+			}
+			if (vlan_act == TCA_VLAN_ACT_PUSH)
+				mnl_attr_put_u16(nlh,
+						 TCA_VLAN_PUSH_VLAN_PROTOCOL,
+						 conf.of_push_vlan->ethertype);
+			na_vlan_id = mnl_nlmsg_get_payload_tail(nlh);
+			mnl_attr_put_u16(nlh, TCA_VLAN_PAD, 0);
+			na_vlan_priority = mnl_nlmsg_get_payload_tail(nlh);
+			mnl_attr_put_u8(nlh, TCA_VLAN_PAD, 0);
+			mnl_attr_nest_end(nlh, na_act);
+			mnl_attr_nest_end(nlh, na_act_index);
+			if (actions->type ==
+			    RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) {
+override_na_vlan_id:
+				na_vlan_id->nla_type = TCA_VLAN_PUSH_VLAN_ID;
+				*(uint16_t *)mnl_attr_get_payload(na_vlan_id) =
+					rte_be_to_cpu_16
+					(conf.of_set_vlan_vid->vlan_vid);
+			} else if (actions->type ==
+				   RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP) {
+override_na_vlan_priority:
+				na_vlan_priority->nla_type =
+					TCA_VLAN_PUSH_VLAN_PRIORITY;
+				*(uint8_t *)mnl_attr_get_payload
+					(na_vlan_priority) =
+					conf.of_set_vlan_pcp->vlan_pcp;
+			}
+			break;
+		default:
+			return rte_flow_error_set(error, ENOTSUP,
+						  RTE_FLOW_ERROR_TYPE_ACTION,
+						  actions,
+						  "action not supported");
+		}
+	}
+	assert(na_flower);
+	assert(na_flower_act);
+	mnl_attr_nest_end(nlh, na_flower_act);
+	mnl_attr_nest_end(nlh, na_flower);
+	return 0;
+}
+
+/**
+ * Send Netlink message with acknowledgment.
+ *
+ * @param nl
+ *   Libmnl socket to use.
+ * @param nlh
+ *   Message to send. This function always raises the NLM_F_ACK flag before
+ *   sending.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_tcf_nl_ack(struct mnl_socket *nl, struct nlmsghdr *nlh)
+{
+	alignas(struct nlmsghdr)
+	uint8_t ans[mnl_nlmsg_size(sizeof(struct nlmsgerr)) +
+		    nlh->nlmsg_len - sizeof(*nlh)];
+	uint32_t seq = random();
+	int ret;
+
+	nlh->nlmsg_flags |= NLM_F_ACK;
+	nlh->nlmsg_seq = seq;
+	ret = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len);
+	if (ret != -1)
+		ret = mnl_socket_recvfrom(nl, ans, sizeof(ans));
+	if (ret != -1)
+		ret = mnl_cb_run
+			(ans, ret, seq, mnl_socket_get_portid(nl), NULL, NULL);
+	if (ret > 0)
+		return 0;
+	rte_errno = errno;
+	return -rte_errno;
+}
+
+/**
+ * Apply flow to E-Switch by sending Netlink message.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet device.
+ * @param[in, out] flow
+ *   Pointer to the sub flow.
+ * @param[out] error
+ *   Pointer to the error structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_ernno is set.
+ */
+static int
+flow_tcf_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
+	       struct rte_flow_error *error)
+{
+	struct priv *priv = dev->data->dev_private;
+	struct mnl_socket *nl = priv->mnl_socket;
+	struct mlx5_flow *dev_flow;
+	struct nlmsghdr *nlh;
+
+	dev_flow = LIST_FIRST(&flow->dev_flows);
+	/* E-Switch flow can't be expanded. */
+	assert(!LIST_NEXT(dev_flow, next));
+	nlh = dev_flow->tcf.nlh;
+	nlh->nlmsg_type = RTM_NEWTFILTER;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
+	if (!flow_tcf_nl_ack(nl, nlh))
+		return 0;
+	return rte_flow_error_set(error, rte_errno,
+				  RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+				  "netlink: failed to create TC flow rule");
+}
+
+/**
+ * Remove flow from E-Switch by sending Netlink message.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet device.
+ * @param[in, out] flow
+ *   Pointer to the sub flow.
+ */
+static void
+flow_tcf_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
+{
+	struct priv *priv = dev->data->dev_private;
+	struct mnl_socket *nl = priv->mnl_socket;
+	struct mlx5_flow *dev_flow;
+	struct nlmsghdr *nlh;
+
+	if (!flow)
+		return;
+	dev_flow = LIST_FIRST(&flow->dev_flows);
+	if (!dev_flow)
+		return;
+	/* E-Switch flow can't be expanded. */
+	assert(!LIST_NEXT(dev_flow, next));
+	nlh = dev_flow->tcf.nlh;
+	nlh->nlmsg_type = RTM_DELTFILTER;
+	nlh->nlmsg_flags = NLM_F_REQUEST;
+	flow_tcf_nl_ack(nl, nlh);
+}
+
+/**
+ * Remove flow from E-Switch and release resources of the device flow.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet device.
+ * @param[in, out] flow
+ *   Pointer to the sub flow.
+ */
+static void
+flow_tcf_destroy(struct rte_eth_dev *dev, struct rte_flow *flow)
+{
+	struct mlx5_flow *dev_flow;
+
+	if (!flow)
+		return;
+	flow_tcf_remove(dev, flow);
+	dev_flow = LIST_FIRST(&flow->dev_flows);
+	if (!dev_flow)
+		return;
+	/* E-Switch flow can't be expanded. */
+	assert(!LIST_NEXT(dev_flow, next));
+	LIST_REMOVE(dev_flow, next);
+	rte_free(dev_flow);
+}
+
+const struct mlx5_flow_driver_ops mlx5_flow_tcf_drv_ops = {
+	.validate = flow_tcf_validate,
+	.prepare = flow_tcf_prepare,
+	.translate = flow_tcf_translate,
+	.apply = flow_tcf_apply,
+	.remove = flow_tcf_remove,
+	.destroy = flow_tcf_destroy,
+};
+
+/**
+ * Initialize ingress qdisc of a given network interface.
+ *
+ * @param nl
+ *   Libmnl socket of the @p NETLINK_ROUTE kind.
+ * @param ifindex
+ *   Index of network interface to initialize.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_tcf_init(struct mnl_socket *nl, unsigned int ifindex,
+		   struct rte_flow_error *error)
+{
+	struct nlmsghdr *nlh;
+	struct tcmsg *tcm;
+	alignas(struct nlmsghdr)
+	uint8_t buf[mnl_nlmsg_size(sizeof(*tcm) + 128)];
+
+	/* Destroy existing ingress qdisc and everything attached to it. */
+	nlh = mnl_nlmsg_put_header(buf);
+	nlh->nlmsg_type = RTM_DELQDISC;
+	nlh->nlmsg_flags = NLM_F_REQUEST;
+	tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
+	tcm->tcm_family = AF_UNSPEC;
+	tcm->tcm_ifindex = ifindex;
+	tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
+	tcm->tcm_parent = TC_H_INGRESS;
+	/* Ignore errors when qdisc is already absent. */
+	if (flow_tcf_nl_ack(nl, nlh) &&
+	    rte_errno != EINVAL && rte_errno != ENOENT)
+		return rte_flow_error_set(error, rte_errno,
+					  RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+					  "netlink: failed to remove ingress"
+					  " qdisc");
+	/* Create fresh ingress qdisc. */
+	nlh = mnl_nlmsg_put_header(buf);
+	nlh->nlmsg_type = RTM_NEWQDISC;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
+	tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
+	tcm->tcm_family = AF_UNSPEC;
+	tcm->tcm_ifindex = ifindex;
+	tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
+	tcm->tcm_parent = TC_H_INGRESS;
+	mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress");
+	if (flow_tcf_nl_ack(nl, nlh))
+		return rte_flow_error_set(error, rte_errno,
+					  RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+					  "netlink: failed to create ingress"
+					  " qdisc");
+	return 0;
+}
+
+/**
+ * Create and configure a libmnl socket for Netlink flow rules.
+ *
+ * @return
+ *   A valid libmnl socket object pointer on success, NULL otherwise and
+ *   rte_errno is set.
+ */
+struct mnl_socket *
+mlx5_flow_tcf_socket_create(void)
+{
+	struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE);
+
+	if (nl) {
+		mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 },
+				      sizeof(int));
+		if (!mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID))
+			return nl;
+	}
+	rte_errno = errno;
+	if (nl)
+		mnl_socket_close(nl);
+	return NULL;
+}
+
+/**
+ * Destroy a libmnl socket.
+ *
+ * @param nl
+ *   Libmnl socket of the @p NETLINK_ROUTE kind.
+ */
+void
+mlx5_flow_tcf_socket_destroy(struct mnl_socket *nl)
+{
+	mnl_socket_close(nl);
+}
-- 
2.11.0
    
    
More information about the dev
mailing list