[dpdk-dev] [RFC v1] net/mlx5: support e-switch flow count action

Moti Haimovsky motih at mellanox.com
Wed Aug 22 18:46:45 CEST 2018


Today, it is possible to offload an interface flow rules to the hardware
using DPDK flow commands.
With mlx5 it is also possible to offload a limited set of flow rules to
the mlxsw (or e-switch) using the same DPDK flow commands.
A 'transfer' attribute was added to the flow rule creation command in
order to distinguish between configuring port flows and E-switch flows.
The commands destined for the switch are transposed to TC flower rules
and are sent, as Netlink messages, to the mlx5 driver (or more precisely
to the netdev which represents the mlxsw port).
Each flow rule configured by the mlx5 driver is also assigned with a set
of flow counters. These counters can be retrieved when querying the flow
rule via Netlink, and they can be found in each flow action section of
that rule.
Currently the limited set of eswitch flow rules does not contain the
'count' action but since every rule contains a count we can still retrieve
these values as if we configured a 'count' action.
The purpose of this RFC is to propose a method to support the command
for configuring the mlx5 E-switch with 'count' action and to retrieve the
counters of it.

Supporting the 'count' action in the flow configuration command is
straightforward. When transposing the command to a tc flower Netlink
message, just ignore it instead of rejecting it.
So the following two commands will have the same effect and behavior:
  testpmd> flow create 0 transfer ingress pattern eth src is
           11:22:33:44:55:77 / end actions drop / end
  testpmd> flow create 0 transfer ingress pattern eth src is
           11:22:33:44:55:77 / end actions count / drop / end
In the flow query side, the command
  testpmd> flow query 0 0 count
should also return the counts for flows configured in the e-switch.
Therefore the routine mlx5_flow_query_count will be enhanced to also
support e-switch flows.
A routine, named mlx5_nl_flow_query_count, will be written in order to
query the status of a specific e-switch flow, parse the returned message
and extract the flow counts if any. The routine will be called from the
currently existing mlx5_flow_query_count.
Special care is taken in order to prevent Netlink message truncation
due to short buffers by using MNL_SOCKET_BUFFER_SIZE buffers. Due to the
size of these buffers (8 KB) it was decided to pre-allocate them
(per port instance) rather than use them as local variables of the routine
(hence allocated on the stack).
Below is the code implementing the above.

NOTE:
 The code in this proposal depends on another rework not yet committed
and therefore may be subject to modifications.

Signed-off-by: Moti Haimovsky <motih at mellanox.com>
---
v1:
 * Modified headline.
 * Removed invalid comments.
---
 drivers/net/mlx5/mlx5.c         |   9 +-
 drivers/net/mlx5/mlx5.h         |  10 ++
 drivers/net/mlx5/mlx5_flow.c    |  17 ++-
 drivers/net/mlx5/mlx5_nl_flow.c | 318 +++++++++++++++++++++++++++++++++++++++-
 4 files changed, 346 insertions(+), 8 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 30d4e70..9bc7e1c 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -284,6 +284,8 @@
 		close(priv->nl_socket_rdma);
 	if (priv->mnl_socket)
 		mlx5_nl_flow_socket_destroy(priv->mnl_socket);
+	if (priv->mnl_rcvbuf)
+		mlx5_nl_flow_rcv_buf_destroy(priv->mnl_rcvbuf);
 	ret = mlx5_hrxq_ibv_verify(dev);
 	if (ret)
 		DRV_LOG(WARNING, "port %u some hash Rx queue still remain",
@@ -1131,7 +1133,8 @@
 	if (vf && config.vf_nl_en)
 		mlx5_nl_mac_addr_sync(eth_dev);
 	priv->mnl_socket = mlx5_nl_flow_socket_create();
-	if (!priv->mnl_socket) {
+	priv->mnl_rcvbuf = mlx5_nl_flow_rcv_buf_create();
+	if (!priv->mnl_socket || !priv->mnl_rcvbuf) {
 		err = -rte_errno;
 		DRV_LOG(WARNING,
 			"flow rules relying on switch offloads will not be"
@@ -1155,7 +1158,9 @@
 				" not be supported: %s: %s",
 				error.message, strerror(rte_errno));
 			mlx5_nl_flow_socket_destroy(priv->mnl_socket);
+			mlx5_nl_flow_rcv_buf_destroy(priv->mnl_rcvbuf);
 			priv->mnl_socket = NULL;
+			priv->mnl_rcvbuf = NULL;
 		}
 	}
 	TAILQ_INIT(&priv->flows);
@@ -1212,6 +1217,8 @@
 			close(priv->nl_socket_rdma);
 		if (priv->mnl_socket)
 			mlx5_nl_flow_socket_destroy(priv->mnl_socket);
+		if (priv->mnl_rcvbuf)
+			mlx5_nl_flow_rcv_buf_destroy(priv->mnl_rcvbuf);
 		if (own_domain_id)
 			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
 		rte_free(priv);
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 35a196e..4004545 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -163,6 +163,7 @@ struct mlx5_nl_flow_ptoi {
 };
 
 struct mnl_socket;
+struct mlx5_nl_rbuf;
 
 struct priv {
 	LIST_ENTRY(priv) mem_event_cb; /* Called by memory event callback. */
@@ -229,6 +230,8 @@ struct priv {
 	/* UAR same-page access control required in 32bit implementations. */
 #endif
 	struct mnl_socket *mnl_socket; /* Libmnl socket. */
+	struct mlx5_nl_rbuf *mnl_rcvbuf;
+	/* Buffer for receiving libmnl messages. */
 };
 
 #define PORT_ID(priv) ((priv)->dev_data->port_id)
@@ -414,5 +417,12 @@ int mlx5_nl_flow_init(struct mnl_socket *nl, unsigned int ifindex,
 		      struct rte_flow_error *error);
 struct mnl_socket *mlx5_nl_flow_socket_create(void);
 void mlx5_nl_flow_socket_destroy(struct mnl_socket *nl);
+struct mlx5_nl_rbuf *mlx5_nl_flow_rcv_buf_create(void);
+void mlx5_nl_flow_rcv_buf_destroy(struct mlx5_nl_rbuf *rb);
+int mlx5_nl_flow_query_count(struct mnl_socket *nl,
+			     void *fbuf,
+			     struct mlx5_nl_rbuf *rbuf,
+			     struct rte_flow_query_count *qc,
+			     struct rte_flow_error *error);
 
 #endif /* RTE_PMD_MLX5_H_ */
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 3f548a9..0d6e3be 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -3370,13 +3370,20 @@ struct rte_flow *
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 static int
-mlx5_flow_query_count(struct rte_flow *flow __rte_unused,
-		      void *data __rte_unused,
+mlx5_flow_query_count(struct rte_eth_dev *dev,
+		      struct rte_flow *flow,
+		      void *data,
 		      struct rte_flow_error *error)
 {
+	struct priv *priv = dev->data->dev_private;
+	struct rte_flow_query_count *qc = data;
+
+	if (flow->nl_flow && priv->mnl_socket && priv->mnl_rcvbuf)
+		return mlx5_nl_flow_query_count(priv->mnl_socket,
+						flow->nl_flow,
+						priv->mnl_rcvbuf, qc, error);
 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
 	if (flow->modifier & MLX5_FLOW_MOD_COUNT) {
-		struct rte_flow_query_count *qc = data;
 		uint64_t counters[2] = {0, 0};
 		struct ibv_query_counter_set_attr query_cs_attr = {
 			.cs = flow->counter->cs,
@@ -3423,7 +3430,7 @@ struct rte_flow *
  * @see rte_flow_ops
  */
 int
-mlx5_flow_query(struct rte_eth_dev *dev __rte_unused,
+mlx5_flow_query(struct rte_eth_dev *dev,
 		struct rte_flow *flow,
 		const struct rte_flow_action *actions,
 		void *data,
@@ -3436,7 +3443,7 @@ struct rte_flow *
 		case RTE_FLOW_ACTION_TYPE_VOID:
 			break;
 		case RTE_FLOW_ACTION_TYPE_COUNT:
-			ret = mlx5_flow_query_count(flow, data, error);
+			ret = mlx5_flow_query_count(dev, flow, data, error);
 			break;
 		default:
 			return rte_flow_error_set(error, ENOTSUP,
diff --git a/drivers/net/mlx5/mlx5_nl_flow.c b/drivers/net/mlx5/mlx5_nl_flow.c
index beb03c9..fb1b64d 100644
--- a/drivers/net/mlx5/mlx5_nl_flow.c
+++ b/drivers/net/mlx5/mlx5_nl_flow.c
@@ -11,6 +11,7 @@
 #include <linux/pkt_cls.h>
 #include <linux/pkt_sched.h>
 #include <linux/rtnetlink.h>
+#include <linux/gen_stats.h>
 #include <linux/tc_act/tc_gact.h>
 #include <linux/tc_act/tc_mirred.h>
 #include <netinet/in.h>
@@ -25,6 +26,7 @@
 #include <rte_errno.h>
 #include <rte_ether.h>
 #include <rte_flow.h>
+#include <rte_malloc.h>
 
 #include "mlx5.h"
 #include "mlx5_autoconf.h"
@@ -147,6 +149,16 @@ struct tc_vlan {
 #define TCA_FLOWER_KEY_VLAN_ETH_TYPE 25
 #endif
 
+/**
+ * Structure for holding netlink message buffer of MNL_SOCKET_BUFFER_SIZE.
+ * Using this (8KB) buffer size ensures that netlink messages will never be
+ * truncated.
+ */
+struct mlx5_nl_rbuf {
+	uint8_t *buf;
+	uint16_t bsize;
+};
+
 /** Parser state definitions for mlx5_nl_flow_trans[]. */
 enum mlx5_nl_flow_trans {
 	INVALID,
@@ -169,6 +181,7 @@ enum mlx5_nl_flow_trans {
 	ACTION_OF_PUSH_VLAN,
 	ACTION_OF_SET_VLAN_VID,
 	ACTION_OF_SET_VLAN_PCP,
+	ACTION_COUNT,
 	END,
 };
 
@@ -178,7 +191,7 @@ enum mlx5_nl_flow_trans {
 	ITEM_VOID, ITEM_PORT_ID, ACTIONS
 #define ACTIONS_COMMON \
 	ACTION_VOID, ACTION_OF_POP_VLAN, ACTION_OF_PUSH_VLAN, \
-	ACTION_OF_SET_VLAN_VID, ACTION_OF_SET_VLAN_PCP
+	ACTION_OF_SET_VLAN_VID, ACTION_OF_SET_VLAN_PCP, ACTION_COUNT
 #define ACTIONS_FATE \
 	ACTION_PORT_ID, ACTION_DROP
 
@@ -204,6 +217,7 @@ enum mlx5_nl_flow_trans {
 	[ACTION_OF_PUSH_VLAN] = TRANS(ACTIONS_FATE, ACTIONS_COMMON),
 	[ACTION_OF_SET_VLAN_VID] = TRANS(ACTIONS_FATE, ACTIONS_COMMON),
 	[ACTION_OF_SET_VLAN_PCP] = TRANS(ACTIONS_FATE, ACTIONS_COMMON),
+	[ACTION_COUNT] = TRANS(ACTIONS_FATE, ACTIONS_COMMON),
 	[END] = NULL,
 };
 
@@ -869,6 +883,11 @@ enum mlx5_nl_flow_trans {
 			goto trans;
 		++action;
 		break;
+	case ACTION_COUNT:
+		if (action->type != RTE_FLOW_ACTION_TYPE_COUNT)
+			goto trans;
+		++action;
+		break;
 	case ACTION_PORT_ID:
 		if (action->type != RTE_FLOW_ACTION_TYPE_PORT_ID)
 			goto trans;
@@ -1042,7 +1061,7 @@ enum mlx5_nl_flow_trans {
  *   Unique 32-bit handle to use.
  */
 void
-mlx5_nl_flow_brand(void *buf, uint32_t handle)
+mlx5_nl_flow_brand(void *buf, uint32_t handle __rte_unused)
 {
 	struct tcmsg *tcm = mnl_nlmsg_get_payload(buf);
 
@@ -1141,6 +1160,256 @@ enum mlx5_nl_flow_trans {
 }
 
 /**
+ * Parse rtnetlink message attributes filling the attribute table with the info
+ * being retrieved.
+ *
+ * @param tb
+ *   Attribute table to be filled.
+ * @param max
+ *   Maximum entry in the attribute table.
+ * @param rta
+ *   The attributes section in the message to be parsed.
+ * @param len
+ *   The length of the attributes section in the message.
+ * @note
+ *   This routine returns nothing; the attribute table is filled in place.
+ */
+static void
+tc_parse_rtattr(struct rtattr *tb[], int max, struct rtattr *rta, int len)
+{
+	unsigned short type;
+
+	memset(tb, 0, sizeof(struct rtattr *) * (max + 1));
+	while (RTA_OK(rta, len)) {
+		type = rta->rta_type;
+		if (type <= max && !tb[type])
+			tb[type] = rta;
+		rta = RTA_NEXT(rta, len);
+	}
+}
+
+/**
+ * Extract action counters from flower action.
+ *
+ * @param rta
+ *   flower action stats properties in the Netlink message received.
+ * @param[out] qc
+ *   Count statistics retrieved from the message query.
+ * @return
+ *   0 on successful extraction of action counts, -1 otherwise.
+ */
+static int
+tc_flow_extract_stats_attr(struct rtattr *rta, struct rte_flow_query_count *qc)
+{
+	struct rtattr *tbs[TCA_STATS_MAX + 1];
+
+	tc_parse_rtattr(tbs, TCA_STATS_MAX, RTA_DATA(rta), RTA_PAYLOAD(rta));
+	if (tbs[TCA_STATS_BASIC]) {
+		struct gnet_stats_basic bs = {0};
+
+		memcpy(&bs, RTA_DATA(tbs[TCA_STATS_BASIC]),
+		       RTE_MIN(RTA_PAYLOAD(tbs[TCA_STATS_BASIC]),
+		       sizeof(bs)));
+		qc->bytes = bs.bytes;
+		qc->hits = bs.packets;
+		qc->bytes_set = 1;
+		qc->hits_set = 1;
+		return 0;
+	}
+	return -1;
+}
+
+/**
+ * Parse flower single action retrieving the flow counters from it if present.
+ *
+ * @param arg
+ *   flower action properties in the Netlink message received.
+ * @param[out] qc
+ *   Count statistics retrieved from the message query.
+ * @return
+ *   0 on successful retrieval of action counts, -1 otherwise.
+ */
+static int
+tc_flow_parse_one_action(struct rtattr *arg, struct rte_flow_query_count *qc)
+{
+	struct rtattr *tb[TCA_ACT_MAX + 1];
+
+	if (arg == NULL)
+		return -1;
+	tc_parse_rtattr(tb, TCA_ACT_MAX, RTA_DATA(arg), RTA_PAYLOAD(arg));
+	if (tb[TCA_ACT_KIND] == NULL)
+		return -1;
+	if (tb[TCA_ACT_STATS])
+		return tc_flow_extract_stats_attr(tb[TCA_ACT_STATS], qc);
+	return -1;
+}
+
+/**
+ * Parse flower action section in the message, retrieving the flow counters
+ * from the first action that contains them.
+ * flow counters are stored in the actions defined by the flow and not in the
+ * flow itself, therefore we need to traverse the flower action in search for
+ * them.
+ *
+ * @param arg
+ *   Flower action section in the Netlink message received.
+ * @param[out] qc
+ *   Count statistics retrieved from the message query.
+ */
+static void
+tc_flow_parse_action(const struct rtattr *arg, struct rte_flow_query_count *qc)
+{
+	struct rtattr *tb[TCA_ACT_MAX_PRIO + 1];
+	int i;
+
+	if (arg == NULL)
+		return;
+	tc_parse_rtattr(tb, TCA_ACT_MAX_PRIO, RTA_DATA(arg), RTA_PAYLOAD(arg));
+	for (i = 0; i <= TCA_ACT_MAX_PRIO; i++)
+		if (tb[i])
+			if (tc_flow_parse_one_action(tb[i], qc) == 0)
+				break;
+}
+
+/**
+ * Parse Netlink reply on flower type of filters, retrieving the flow counters
+ * from it.
+ *
+ * @param opt
+ *   flower section in the Netlink message received.
+ * @param[out] qc
+ *   Count statistics retrieved from the message query.
+ */
+static void
+tc_flower_parse_opt(struct rtattr *opt,
+		    struct rte_flow_query_count *qc)
+{
+	struct rtattr *tb[TCA_FLOWER_MAX + 1];
+
+	if (!opt)
+		return;
+	tc_parse_rtattr(tb, TCA_FLOWER_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt));
+	if (tb[TCA_FLOWER_ACT])
+		tc_flow_parse_action(tb[TCA_FLOWER_ACT], qc);
+}
+
+/**
+ * Parse Netlink reply on filter query, retrieving the flow counters.
+ *
+ * @param nlh
+ *   Message received from Netlink.
+ * @param[out] qc
+ *   Count statistics retrieved from the message query.
+ *
+ * @return
+ *   MNL_CB_ERROR on error, MNL_CB_OK value otherwise.
+ */
+static int
+mlx5_nl_flow_parse_filter(const struct nlmsghdr *nlh,
+			  struct rte_flow_query_count *qc)
+{
+	struct tcmsg *t = NLMSG_DATA(nlh);
+	int len = nlh->nlmsg_len;
+	struct rtattr *tb[TCA_MAX + 1] = { };
+
+	if (nlh->nlmsg_type != RTM_NEWTFILTER &&
+	    nlh->nlmsg_type != RTM_GETTFILTER &&
+	    nlh->nlmsg_type != RTM_DELTFILTER)
+		return MNL_CB_OK;
+	len -= NLMSG_LENGTH(sizeof(*t));
+	if (len < 0)
+		return MNL_CB_ERROR;
+	tc_parse_rtattr(tb, TCA_MAX, TCA_RTA(t), len);
+	if (tb[TCA_KIND])
+		if (strcmp(RTA_DATA(tb[TCA_KIND]), "flower") == 0)
+			tc_flower_parse_opt(tb[TCA_OPTIONS], qc);
+	return MNL_CB_OK;
+}
+
+/**
+ * A callback to parse Netlink reply on filter query attempting to retrieve the
+ * flow counters if present.
+ *
+ * @param nlh
+ *   Message received from Netlink.
+ * @param[out] data
+ *   pointer to the count statistics to be filled by the routine.
+ *
+ * @return
+ *   MNL_CB_ERROR on error, MNL_CB_OK value otherwise.
+ */
+static int
+mlx5_nl_flow_parse_message(const struct nlmsghdr *nlh, void *data)
+{
+	struct rte_flow_query_count *qc = (struct rte_flow_query_count *)data;
+
+	switch (nlh->nlmsg_type) {
+	case NLMSG_NOOP:
+		return MNL_CB_OK;
+	case NLMSG_ERROR:
+	case NLMSG_OVERRUN:
+		return MNL_CB_ERROR;
+	default:
+		break;
+	}
+	return mlx5_nl_flow_parse_filter(nlh, qc);
+}
+
+/**
+ * Query a Netlink flow rule for its statistics.
+ *
+ * @param nl
+ *   Libmnl socket to use.
+ * @param fbuf
+ *   Flow rule buffer previously initialized by mlx5_nl_flow_transpose().
+ * @param rbuf
+ *   Buffer for holding Netlink response.
+ * @param[out] qc
+ *   Count statistics retrieved by the query.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_nl_flow_query_count(struct mnl_socket *nl,
+			 void *fbuf,
+			 struct mlx5_nl_rbuf *rbuf,
+			 struct rte_flow_query_count *qc,
+			 struct rte_flow_error *error)
+{
+	struct nlmsghdr *nlh = fbuf;
+	uint32_t seq = random();
+	ssize_t ret;
+
+	if (qc == NULL)
+		return -EINVAL;
+
+	nlh->nlmsg_type = RTM_GETTFILTER;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ECHO;
+	nlh->nlmsg_seq = seq;
+	if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) == -1)
+		goto error_exit;
+	ret = mnl_socket_recvfrom(nl, rbuf->buf, rbuf->bsize);
+	if (ret == -1)
+		goto error_exit;
+	while (ret > 0) {
+		ret = mnl_cb_run(rbuf->buf, ret, seq,
+				 mnl_socket_get_portid(nl),
+				 mlx5_nl_flow_parse_message, qc);
+		if (ret <= MNL_CB_STOP)
+			break;
+		ret = mnl_socket_recvfrom(nl, rbuf->buf, rbuf->bsize);
+	}
+	return 0;
+error_exit:
+	return rte_flow_error_set
+			(error, errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+			 NULL, "netlink: failed to read flow rule statistics");
+}
+
+/**
  * Initialize ingress qdisc of a given network interface.
  *
  * @param nl
@@ -1226,3 +1495,48 @@ struct mnl_socket *
 {
 	mnl_socket_close(nl);
 }
+
+/**
+ * Create netlink receive buffer.
+ * Netlink queries may result in a large netlink reply; in case of a short
+ * receive buffer a reply message may be truncated. To avoid this,
+ * we allocate a buffer of MNL_SOCKET_BUFFER_SIZE (which is system dependent
+ * and usually 8 KB long). Using this buffer size ensures that netlink
+ * messages will be stored without truncation.
+ *
+ * @return
+ *   pointer to mlx5_nl_rbuf created, NULL value otherwise.
+ */
+struct mlx5_nl_rbuf *
+mlx5_nl_flow_rcv_buf_create(void)
+{
+	struct mlx5_nl_rbuf *rbuf = rte_zmalloc(__func__,
+						sizeof(struct mlx5_nl_rbuf),
+						sizeof(uint32_t));
+	uint8_t *buf = rte_zmalloc(__func__,
+				   MNL_SOCKET_BUFFER_SIZE,
+				   sizeof(uint32_t));
+	if (!buf || !rbuf) {
+		rte_free(buf);
+		rte_free(rbuf);
+		return NULL;
+	}
+	rbuf->buf = buf;
+	rbuf->bsize = MNL_SOCKET_BUFFER_SIZE;
+	return rbuf;
+}
+
+/**
+ * Destroy mlx5_nl_rbuf.
+ *
+ * @param rb
+ *   The receive buffer to destroy.
+ */
+void
+mlx5_nl_flow_rcv_buf_destroy(struct mlx5_nl_rbuf *rb)
+{
+	if (rb) {
+		rte_free(rb->buf);
+		rte_free(rb);
+	}
+}
-- 
1.8.3.1



More information about the dev mailing list