[dpdk-dev] [PATCH v2 1/4] net/tap: add remote netdevice traffic capture

Pascal Mazon pascal.mazon at 6wind.com
Tue Mar 7 17:38:51 CET 2017


By default, a tap netdevice is of no use when not fed by a separate
process. The ability to automatically feed it from another netdevice
allows applications to capture any kind of traffic normally destined to
the kernel stack.

This patch implements this ability through a new optional "remote"
parameter.

Packets matching filtering rules created with the flow API are matched
on the remote device and redirected to the tap PMD, where the relevant
action will be performed.

Signed-off-by: Pascal Mazon <pascal.mazon at 6wind.com>
Acked-by: Olga Shern <olgas at mellanox.com>
---
 doc/guides/nics/tap.rst       |  17 ++
 drivers/net/tap/rte_eth_tap.c |  79 +++++++-
 drivers/net/tap/tap.h         |   2 +
 drivers/net/tap/tap_flow.c    | 418 ++++++++++++++++++++++++++++++++++++++++--
 drivers/net/tap/tap_flow.h    |  24 +++
 5 files changed, 526 insertions(+), 14 deletions(-)

diff --git a/doc/guides/nics/tap.rst b/doc/guides/nics/tap.rst
index cdb528b5eae4..676a569b00ca 100644
--- a/doc/guides/nics/tap.rst
+++ b/doc/guides/nics/tap.rst
@@ -58,6 +58,23 @@ needed, but the interface does not enforce that speed, for example::
 
    --vdev=net_tap0,iface=foo0,speed=25000
 
+It is possible to specify a remote netdevice to capture packets from by adding
+``remote=foo1``, for example::
+
+   --vdev=net_tap,iface=tap0,remote=foo1
+
+If a ``remote`` is set, then all packets with the tap PMD's local MAC coming
+in on the remote netdevice will be redirected to the tap.
+If the tap is in promiscuous mode, then all packets will be redirected.
+In allmulti mode, all multicast packets will be redirected.
+It is possible to add explicit rte_flow rules on the tap PMD to capture specific
+traffic. For instance, in testpmd, the following rte_flow rule would capture
+packets with the given source MAC address from the remote, and send them to
+tap RX queue 3::
+
+   testpmd> flow create 0 ingress pattern eth src is 02:03:04:05:06:07 / \
+            end actions queue index 3 / end
+
 After the DPDK application is started you can send and receive packets on the
 interface using the standard rx_burst/tx_burst APIs in DPDK. From the host
 point of view you can use any host tool like tcpdump, Wireshark, ping, Pktgen
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 5727f6228b17..b29cfbfb41f3 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -63,6 +63,7 @@
 
 #define ETH_TAP_IFACE_ARG       "iface"
 #define ETH_TAP_SPEED_ARG       "speed"
+#define ETH_TAP_REMOTE_ARG      "remote"
 
 #ifdef IFF_MULTI_QUEUE
 #define RTE_PMD_TAP_MAX_QUEUES	16
@@ -77,6 +78,7 @@ static struct rte_vdev_driver pmd_tap_drv;
 static const char *valid_arguments[] = {
 	ETH_TAP_IFACE_ARG,
 	ETH_TAP_SPEED_ARG,
+	ETH_TAP_REMOTE_ARG,
 	NULL
 };
 
@@ -435,6 +437,7 @@ tap_dev_close(struct rte_eth_dev *dev __rte_unused)
 	struct pmd_internals *internals = dev->data->dev_private;
 
 	tap_link_set_down(dev);
+	tap_flow_implicit_flush(dev, NULL);
 
 	for (i = 0; i < internals->nb_queues; i++) {
 		if (internals->rxq[i].fd != -1)
@@ -480,6 +483,8 @@ tap_promisc_enable(struct rte_eth_dev *dev)
 
 	dev->data->promiscuous = 1;
 	tap_link_set_flags(pmd, IFF_PROMISC, 1);
+	if (pmd->remote_if_index)
+		tap_flow_implicit_create(pmd, TAP_REMOTE_PROMISC);
 }
 
 static void
@@ -489,6 +494,8 @@ tap_promisc_disable(struct rte_eth_dev *dev)
 
 	dev->data->promiscuous = 0;
 	tap_link_set_flags(pmd, IFF_PROMISC, 0);
+	if (pmd->remote_if_index)
+		tap_flow_implicit_destroy(dev, TAP_REMOTE_PROMISC);
 }
 
 static void
@@ -498,6 +505,8 @@ tap_allmulti_enable(struct rte_eth_dev *dev)
 
 	dev->data->all_multicast = 1;
 	tap_link_set_flags(pmd, IFF_ALLMULTI, 1);
+	if (pmd->remote_if_index)
+		tap_flow_implicit_create(pmd, TAP_REMOTE_ALLMULTI);
 }
 
 static void
@@ -507,6 +516,8 @@ tap_allmulti_disable(struct rte_eth_dev *dev)
 
 	dev->data->all_multicast = 0;
 	tap_link_set_flags(pmd, IFF_ALLMULTI, 0);
+	if (pmd->remote_if_index)
+		tap_flow_implicit_destroy(dev, TAP_REMOTE_ALLMULTI);
 }
 
 static void
@@ -632,9 +643,42 @@ tap_setup_queue(struct rte_eth_dev *dev,
 				pmd->name);
 			return fd;
 		}
+		if (pmd->remote_if_index) {
+			/*
+			 * Flush usually returns negative value because it tries
+			 * to delete every QDISC (and on a running device, one
+			 * QDISC at least is needed). Ignore negative return
+			 * value.
+			 */
+			qdisc_flush(pmd->nlsk_fd, pmd->remote_if_index);
+			if (qdisc_create_ingress(pmd->nlsk_fd,
+						 pmd->remote_if_index) < 0)
+				goto remote_fail;
+			LIST_INIT(&pmd->implicit_flows);
+			if (tap_flow_implicit_create(
+				    pmd, TAP_REMOTE_LOCAL_MAC) < 0)
+				goto remote_fail;
+			if (tap_flow_implicit_create(
+				    pmd, TAP_REMOTE_BROADCAST) < 0)
+				goto remote_fail;
+			if (tap_flow_implicit_create(
+				    pmd, TAP_REMOTE_BROADCASTV6) < 0)
+				goto remote_fail;
+			if (tap_flow_implicit_create(
+				    pmd, TAP_REMOTE_TX) < 0)
+				goto remote_fail;
+		}
 	}
 
 	return fd;
+
+remote_fail:
+	RTE_LOG(ERR, PMD,
+		"Could not set up remote flow rules for %s: remote disabled.\n",
+		pmd->name);
+	pmd->remote_if_index = 0;
+	tap_flow_implicit_flush(dev, NULL);
+	return fd;
 }
 
 static int
@@ -848,7 +892,7 @@ tap_kernel_support(struct pmd_internals *pmd)
 }
 
 static int
-eth_dev_tap_create(const char *name, char *tap_name)
+eth_dev_tap_create(const char *name, char *tap_name, char *remote_iface)
 {
 	int numa_node = rte_socket_id();
 	struct rte_eth_dev *dev = NULL;
@@ -917,6 +961,13 @@ eth_dev_tap_create(const char *name, char *tap_name)
 	 * creating/destroying flow rules.
 	 */
 	pmd->nlsk_fd = nl_init();
+	if (strlen(remote_iface)) {
+		pmd->remote_if_index = if_nametoindex(remote_iface);
+		if (!pmd->remote_if_index)
+			RTE_LOG(ERR, PMD, "Could not find %s ifindex: "
+				"remote interface will remain unconfigured\n",
+				remote_iface);
+	}
 
 	return 0;
 
@@ -957,6 +1008,19 @@ set_interface_speed(const char *key __rte_unused,
 	return 0;
 }
 
+/* kvargs callback: store the "remote" netdevice name in extra_args. */
+static int
+set_remote_iface(const char *key __rte_unused,
+		 const char *value, void *extra_args)
+{
+	char *remote_name = extra_args;
+
+	if (value != NULL)
+		snprintf(remote_name, RTE_ETH_NAME_MAX_LEN, "%s", value);
+
+	return 0;
+}
+
 /* Open a TAP interface device.
  */
 static int
@@ -966,6 +1030,7 @@ rte_pmd_tap_probe(const char *name, const char *params)
 	struct rte_kvargs *kvlist = NULL;
 	int speed;
 	char tap_name[RTE_ETH_NAME_MAX_LEN];
+	char remote_iface[RTE_ETH_NAME_MAX_LEN] = ""; /* "" means no remote */
 
 	speed = ETH_SPEED_NUM_10G;
 	snprintf(tap_name, sizeof(tap_name), "%s%d",
@@ -993,6 +1058,15 @@ rte_pmd_tap_probe(const char *name, const char *params)
 				if (ret == -1)
 					goto leave;
 			}
+
+			if (rte_kvargs_count(kvlist, ETH_TAP_REMOTE_ARG) == 1) {
+				ret = rte_kvargs_process(kvlist,
+							 ETH_TAP_REMOTE_ARG,
+							 &set_remote_iface,
+							 remote_iface);
+				if (ret == -1)
+					goto leave;
+			}
 		}
 	}
 	pmd_link.link_speed = speed;
@@ -1000,7 +1074,7 @@ rte_pmd_tap_probe(const char *name, const char *params)
 	RTE_LOG(NOTICE, PMD, "Initializing pmd_tap for %s as %s\n",
 		name, tap_name);
 
-	ret = eth_dev_tap_create(name, tap_name);
+	ret = eth_dev_tap_create(name, tap_name, remote_iface);
 
 leave:
 	if (ret == -1) {
@@ -1031,6 +1105,7 @@ rte_pmd_tap_remove(const char *name)
 		return 0;
 
 	tap_flow_flush(eth_dev, NULL);
+	tap_flow_implicit_flush(eth_dev, NULL);
 
 	internals = eth_dev->data->dev_private;
 	if (internals->flower_support && internals->nlsk_fd)
diff --git a/drivers/net/tap/tap.h b/drivers/net/tap/tap.h
index 2c8cc7d5b485..4c4de939f1cc 100644
--- a/drivers/net/tap/tap.h
+++ b/drivers/net/tap/tap.h
@@ -71,6 +71,8 @@ struct pmd_internals {
 	int flower_support;               /* 1 if kernel supports, else 0 */
 	int flower_vlan_support;          /* 1 if kernel supports, else 0 */
 	LIST_HEAD(tap_flows, rte_flow) flows;        /* rte_flow rules */
+	/* implicit rte_flow rules set when a remote device is active */
+	LIST_HEAD(tap_implicit_flows, rte_flow) implicit_flows;
 	struct rx_queue rxq[RTE_PMD_TAP_MAX_QUEUES]; /* List of RX queues */
 	struct tx_queue txq[RTE_PMD_TAP_MAX_QUEUES]; /* List of TX queues */
 };
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index 3fb28b1db917..25260570ee50 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -82,6 +82,7 @@ enum {
 
 struct rte_flow {
 	LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */
+	struct rte_flow *remote_flow; /* associated remote flow */
 	struct nlmsg msg;
 };
 
@@ -92,6 +93,12 @@ struct convert_data {
 	struct rte_flow *flow;
 };
 
+struct remote_rule {
+	struct rte_flow_attr attr; /* group, priority and direction */
+	struct rte_flow_item items[2]; /* one match item + END terminator */
+	int mirred; /* TCA_EGRESS_REDIR or TCA_EGRESS_MIRROR */
+};
+
 static int tap_flow_create_eth(const struct rte_flow_item *item, void *data);
 static int tap_flow_create_vlan(const struct rte_flow_item *item, void *data);
 static int tap_flow_create_ipv4(const struct rte_flow_item *item, void *data);
@@ -249,6 +256,114 @@ static const struct tap_flow_items tap_flow_items[] = {
 	},
 };
 
+static struct remote_rule implicit_rte_flows[TAP_REMOTE_MAX_IDX] = {
+	[TAP_REMOTE_LOCAL_MAC] = { /* full dst mask; spec is set at runtime */
+		.attr = {
+			.group = MAX_GROUP,
+			.priority = PRIORITY_MASK - TAP_REMOTE_LOCAL_MAC,
+			.ingress = 1,
+		},
+		.items[0] = {
+			.type = RTE_FLOW_ITEM_TYPE_ETH,
+			.mask =  &(const struct rte_flow_item_eth){
+				.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+			},
+		},
+		.items[1] = {
+			.type = RTE_FLOW_ITEM_TYPE_END,
+		},
+		.mirred = TCA_EGRESS_REDIR,
+	},
+	[TAP_REMOTE_BROADCAST] = { /* dst is exactly ff:ff:ff:ff:ff:ff */
+		.attr = {
+			.group = MAX_GROUP,
+			.priority = PRIORITY_MASK - TAP_REMOTE_BROADCAST,
+			.ingress = 1,
+		},
+		.items[0] = {
+			.type = RTE_FLOW_ITEM_TYPE_ETH,
+			.mask =  &(const struct rte_flow_item_eth){
+				.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+			},
+			.spec = &(const struct rte_flow_item_eth){
+				.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+			},
+		},
+		.items[1] = {
+			.type = RTE_FLOW_ITEM_TYPE_END,
+		},
+		.mirred = TCA_EGRESS_MIRROR,
+	},
+	[TAP_REMOTE_BROADCASTV6] = { /* dst has the 33:33 bits set (masked) */
+		.attr = {
+			.group = MAX_GROUP,
+			.priority = PRIORITY_MASK - TAP_REMOTE_BROADCASTV6,
+			.ingress = 1,
+		},
+		.items[0] = {
+			.type = RTE_FLOW_ITEM_TYPE_ETH,
+			.mask =  &(const struct rte_flow_item_eth){
+				.dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
+			},
+			.spec = &(const struct rte_flow_item_eth){
+				.dst.addr_bytes = "\x33\x33\x00\x00\x00\x00",
+			},
+		},
+		.items[1] = {
+			.type = RTE_FLOW_ITEM_TYPE_END,
+		},
+		.mirred = TCA_EGRESS_MIRROR,
+	},
+	[TAP_REMOTE_PROMISC] = { /* VOID item: no match criteria given */
+		.attr = {
+			.group = MAX_GROUP,
+			.priority = PRIORITY_MASK - TAP_REMOTE_PROMISC,
+			.ingress = 1,
+		},
+		.items[0] = {
+			.type = RTE_FLOW_ITEM_TYPE_VOID,
+		},
+		.items[1] = {
+			.type = RTE_FLOW_ITEM_TYPE_END,
+		},
+		.mirred = TCA_EGRESS_MIRROR,
+	},
+	[TAP_REMOTE_ALLMULTI] = { /* dst has bit 0x01 of its first byte set */
+		.attr = {
+			.group = MAX_GROUP,
+			.priority = PRIORITY_MASK - TAP_REMOTE_ALLMULTI,
+			.ingress = 1,
+		},
+		.items[0] = {
+			.type = RTE_FLOW_ITEM_TYPE_ETH,
+			.mask =  &(const struct rte_flow_item_eth){
+				.dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
+			},
+			.spec = &(const struct rte_flow_item_eth){
+				.dst.addr_bytes = "\x01\x00\x00\x00\x00\x00",
+			},
+		},
+		.items[1] = {
+			.type = RTE_FLOW_ITEM_TYPE_END,
+		},
+		.mirred = TCA_EGRESS_MIRROR,
+	},
+	[TAP_REMOTE_TX] = { /* only egress rule; installed on the tap ifindex */
+		.attr = {
+			.group = 0,
+			.priority = TAP_REMOTE_TX,
+			.egress = 1,
+		},
+		.items[0] = {
+			.type = RTE_FLOW_ITEM_TYPE_VOID,
+		},
+		.items[1] = {
+			.type = RTE_FLOW_ITEM_TYPE_END,
+		},
+		.mirred = TCA_EGRESS_MIRROR,
+	},
+};
+
 /**
  * Make as much checks as possible on an Ethernet item, and if a flow is
  * provided, fill it appropriately with Ethernet info.
@@ -673,6 +788,47 @@ add_action_gact(struct rte_flow *flow, int action)
 }
 
 /**
+ * Append a TC "mirred" action to the flow's netlink message.
+ *
+ * @param[in, out] flow
+ *   Flow whose netlink message receives the action.
+ * @param[in] ifindex
+ *   Index of the netdevice packets are mirrored/redirected to.
+ * @param[in] action_type
+ *   TCA_EGRESS_MIRROR to mirror packets, anything else means redirection.
+ *
+ * @return
+ *   0 on success, -1 if a nested netlink attribute could not be started.
+ */
+static int
+add_action_mirred(struct rte_flow *flow, uint16_t ifindex, uint16_t action_type)
+{
+	const size_t first_act = 1; /* index of the single action in the list */
+	struct nlmsg *nl = &flow->msg;
+	struct tc_mirred parms = {
+		/* TC_ACT_PIPE when mirroring, TC_ACT_STOLEN when redirecting */
+		.action = (action_type == TCA_EGRESS_MIRROR) ?
+			  TC_ACT_PIPE : TC_ACT_STOLEN,
+		.eaction = action_type,
+		.ifindex = ifindex,
+	};
+
+	/* open TCA_FLOWER_ACT, then the action entry inside it */
+	if (nlattr_nested_start(nl, TCA_FLOWER_ACT) < 0 ||
+	    nlattr_nested_start(nl, first_act) < 0)
+		return -1;
+	nlattr_add(&nl->nh, TCA_ACT_KIND, sizeof("mirred"), "mirred");
+	/* TCA_ACT_OPTIONS carries the mirred parameters */
+	if (nlattr_nested_start(nl, TCA_ACT_OPTIONS) < 0)
+		return -1;
+	nlattr_add(&nl->nh, TCA_MIRRED_PARMS, sizeof(parms), &parms);
+	nlattr_nested_finish(nl); /* closes TCA_ACT_OPTIONS */
+	nlattr_nested_finish(nl); /* closes the action entry */
+	nlattr_nested_finish(nl); /* closes TCA_FLOWER_ACT */
+	return 0;
+}
+
+/**
  * Transform a QUEUE action item in the provided flow for TC.
  *
  * @param[in, out] flow
@@ -723,6 +879,15 @@ add_action_skbedit(struct rte_flow *flow, uint16_t queue)
  *   Perform verbose error reporting if not NULL.
  * @param[in, out] flow
  *   Flow structure to update.
+ * @param[in] mirred
+ *   If set to TCA_EGRESS_REDIR, provided actions will be replaced with a
+ *   redirection to the tap netdevice, and the TC rule will be configured
+ *   on the remote netdevice in pmd.
+ *   If set to TCA_EGRESS_MIRROR, provided actions will be replaced with a
+ *   mirroring to the tap netdevice, and the TC rule will be configured
+ *   on the remote netdevice in pmd. Matching packets will thus be duplicated.
+ *   If set to 0, the standard behavior is to be used: set correct actions for
+ *   the TC rule, and apply it on the tap netdevice.
  *
  * @return
  *   0 on success, a negative errno value otherwise and rte_errno is set.
@@ -733,7 +898,8 @@ priv_flow_process(struct pmd_internals *pmd,
 		  const struct rte_flow_item items[],
 		  const struct rte_flow_action actions[],
 		  struct rte_flow_error *error,
-		  struct rte_flow *flow)
+		  struct rte_flow *flow,
+		  int mirred)
 {
 	const struct tap_flow_items *cur_item = tap_flow_items;
 	struct convert_data data = {
@@ -760,15 +926,21 @@ priv_flow_process(struct pmd_internals *pmd,
 		flow->msg.t.tcm_info = TC_H_MAKE(prio << 16,
 						 flow->msg.t.tcm_info);
 	}
-	if (!attr->ingress) {
-		rte_flow_error_set(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR,
-				   NULL, "direction should be ingress");
-		return -rte_errno;
-	}
-	/* rte_flow ingress is actually egress as seen in the kernel */
-	if (attr->ingress && flow)
-		flow->msg.t.tcm_parent = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
 	if (flow) {
+		if (mirred) {
+			/*
+			 * If attr->ingress, the rule applies on remote ingress
+			 * to match incoming packets
+			 * If attr->egress, the rule applies on tap ingress (as
+			 * seen from the kernel) to deal with packets going out
+			 * from the DPDK app.
+			 */
+			flow->msg.t.tcm_parent = TC_H_MAKE(TC_H_INGRESS, 0);
+		} else {
+			/* Standard rule on tap egress (kernel standpoint). */
+			flow->msg.t.tcm_parent =
+				TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
+		}
 		/* use flower filter type */
 		nlattr_add(&flow->msg.nh, TCA_KIND, sizeof("flower"), "flower");
 		if (nlattr_nested_start(&flow->msg, TCA_OPTIONS) < 0)
@@ -821,6 +993,22 @@ priv_flow_process(struct pmd_internals *pmd,
 				     data.eth_type);
 		}
 	}
+	if (mirred && flow) {
+		uint16_t if_index = pmd->if_index;
+
+		/*
+		 * If attr->egress && mirred, then this is a special
+		 * case where the rule must be applied on the tap, to
+		 * redirect packets coming from the DPDK App, out
+		 * through the remote netdevice.
+		 */
+		if (attr->egress)
+			if_index = pmd->remote_if_index;
+		if (add_action_mirred(flow, if_index, mirred) < 0)
+			goto exit_action_not_supported;
+		else
+			goto end;
+	}
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; ++actions) {
 		int err = 0;
 
@@ -855,6 +1043,7 @@ priv_flow_process(struct pmd_internals *pmd,
 		if (err)
 			goto exit_action_not_supported;
 	}
+end:
 	if (flow)
 		nlattr_nested_finish(&flow->msg); /* nested TCA_OPTIONS */
 	return 0;
@@ -885,7 +1074,7 @@ tap_flow_validate(struct rte_eth_dev *dev,
 {
 	struct pmd_internals *pmd = dev->data->dev_private;
 
-	return priv_flow_process(pmd, attr, items, actions, error, NULL);
+	return priv_flow_process(pmd, attr, items, actions, error, NULL, 0);
 }
 
 /**
@@ -941,6 +1130,7 @@ tap_flow_create(struct rte_eth_dev *dev,
 		struct rte_flow_error *error)
 {
 	struct pmd_internals *pmd = dev->data->dev_private;
+	struct rte_flow *remote_flow = NULL;
 	struct rte_flow *flow = NULL;
 	struct nlmsg *msg = NULL;
 	int err;
@@ -951,6 +1141,17 @@ tap_flow_create(struct rte_eth_dev *dev,
 				   "can't create rule, ifindex not found");
 		goto fail;
 	}
+	/*
+	 * No rules configured through standard rte_flow should be set on the
+	 * priorities used by implicit rules.
+	 */
+	if ((attr->group == MAX_GROUP) &&
+	    attr->priority > (MAX_PRIORITY - TAP_REMOTE_MAX_IDX)) {
+		rte_flow_error_set(
+			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
+			NULL, "priority value too big");
+		goto fail;
+	}
 	flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
 	if (!flow) {
 		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
@@ -962,7 +1163,7 @@ tap_flow_create(struct rte_eth_dev *dev,
 		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
 	msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
 	tap_flow_set_handle(flow);
-	if (priv_flow_process(pmd, attr, items, actions, error, flow))
+	if (priv_flow_process(pmd, attr, items, actions, error, flow, 0))
 		goto fail;
 	err = nl_send(pmd->nlsk_fd, &msg->nh);
 	if (err < 0) {
@@ -977,14 +1178,183 @@ tap_flow_create(struct rte_eth_dev *dev,
 		goto fail;
 	}
 	LIST_INSERT_HEAD(&pmd->flows, flow, next);
+	/**
+	 * If a remote device is configured, a TC rule with identical items for
+	 * matching must be set on that device, with a single action: redirect
+	 * to the local pmd->if_index.
+	 */
+	if (pmd->remote_if_index) {
+		remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
+		if (!remote_flow) {
+			rte_flow_error_set(
+				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
+				"cannot allocate memory for rte_flow");
+			goto fail;
+		}
+		msg = &remote_flow->msg;
+		/* set the rule if_index for the remote netdevice */
+		tc_init_msg(
+			msg, pmd->remote_if_index, RTM_NEWTFILTER,
+			NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
+		msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
+		tap_flow_set_handle(remote_flow);
+		if (priv_flow_process(pmd, attr, items, NULL,
+				      error, remote_flow, TCA_EGRESS_REDIR)) {
+			rte_flow_error_set(
+				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
+				NULL, "rte flow rule validation failed");
+			goto fail;
+		}
+		err = nl_send(pmd->nlsk_fd, &msg->nh);
+		if (err < 0) {
+			rte_flow_error_set(
+				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
+				NULL, "Failure sending nl request");
+			goto fail;
+		}
+		err = nl_recv_ack(pmd->nlsk_fd);
+		if (err < 0) {
+			rte_flow_error_set(
+				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
+				NULL, "overlapping rules");
+			goto fail;
+		}
+		flow->remote_flow = remote_flow;
+	}
 	return flow;
 fail:
+	if (remote_flow)
+		rte_free(remote_flow);
 	if (flow)
 		rte_free(flow);
 	return NULL;
 }
 
 /**
+ * Install one implicit TC rule: on the tap itself for TAP_REMOTE_TX, on the
+ * remote netdevice otherwise. On success it is linked in pmd->implicit_flows.
+ *
+ * @param pmd
+ *   Pointer to private structure.
+ * @param[in] idx
+ *   The idx in the implicit_rte_flows array specifying which rule to apply.
+ *
+ * @return -1 if the rule couldn't be applied, 0 otherwise.
+ */
+int tap_flow_implicit_create(struct pmd_internals *pmd,
+			     enum implicit_rule_index idx)
+{
+	struct rte_flow_item *items = implicit_rte_flows[idx].items;
+	struct rte_flow_attr *attr = &implicit_rte_flows[idx].attr;
+	struct rte_flow_item_eth eth_local = { .type = 0 };
+	uint16_t if_index = pmd->remote_if_index;
+	struct rte_flow *remote_flow = NULL;
+	struct nlmsg *msg = NULL;
+	int err = 0;
+	struct rte_flow_item items_local[2] = {
+		[0] = {
+			.type = items[0].type,
+			.spec = &eth_local,
+			.mask = items[0].mask,
+		},
+		[1] = {
+			.type = items[1].type,
+		}
+	};
+	/* zeroed so that remote_flow->remote_flow reads as NULL on destroy */
+	remote_flow = rte_zmalloc(__func__, sizeof(struct rte_flow), 0);
+	if (!remote_flow) {
+		RTE_LOG(ERR, PMD, "Cannot allocate memory for rte_flow\n");
+		goto fail;
+	}
+	msg = &remote_flow->msg;
+	if (idx == TAP_REMOTE_TX) {
+		if_index = pmd->if_index;
+	} else if (idx == TAP_REMOTE_LOCAL_MAC) {
+		/*
+		 * eth addr couldn't be set in implicit_rte_flows[] as it is not
+		 * known at compile time.
+		 */
+		memcpy(&eth_local.dst, &pmd->eth_addr, sizeof(pmd->eth_addr));
+		items = items_local;
+	}
+	tc_init_msg(msg, if_index, RTM_NEWTFILTER,
+		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
+	msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
+	tap_flow_set_handle(remote_flow);
+	if (priv_flow_process(pmd, attr, items, NULL, NULL,
+			      remote_flow, implicit_rte_flows[idx].mirred)) {
+		RTE_LOG(ERR, PMD, "rte flow rule validation failed\n");
+		goto fail;
+	}
+	err = nl_send(pmd->nlsk_fd, &msg->nh);
+	if (err < 0) {
+		RTE_LOG(ERR, PMD, "Failure sending nl request\n");
+		goto fail;
+	}
+	err = nl_recv_ack(pmd->nlsk_fd);
+	if (err < 0) {
+		RTE_LOG(ERR, PMD,
+			"Kernel refused TC filter rule creation\n");
+		goto fail;
+	}
+	LIST_INSERT_HEAD(&pmd->implicit_flows, remote_flow, next);
+	return 0;
+fail:
+	/* rte_free() accepts NULL, so no guard is needed */
+	rte_free(remote_flow);
+	return -1;
+}
+
+/**
+ * Remove the implicit flow rule matching a given index, if it is installed.
+ *
+ * @param dev
+ *   Pointer to the Ethernet device owning the implicit rules.
+ * @param[in] idx
+ *   The idx in the implicit_rte_flows array specifying which rule to remove.
+ *
+ * @return tap_flow_destroy() result for the rule, 0 if no rule matched.
+ */
+int tap_flow_implicit_destroy(struct rte_eth_dev *dev,
+			      enum implicit_rule_index idx)
+{
+	struct pmd_internals *pmd = dev->data->dev_private;
+	const int wanted_prio =
+		implicit_rte_flows[idx].attr.priority + PRIORITY_OFFSET;
+	struct rte_flow *rule = LIST_FIRST(&pmd->implicit_flows);
+
+	while (rule) {
+		int rule_prio = (rule->msg.t.tcm_info >> 16) & PRIORITY_MASK;
+
+		/* implicit rules are told apart by their TC priority */
+		if (rule_prio == wanted_prio)
+			return tap_flow_destroy(dev, rule, NULL);
+		rule = LIST_NEXT(rule, next);
+	}
+	return 0;
+}
+
+/**
+ * Tear down every rule of the implicit flows list, stopping at first failure.
+ *
+ * @see rte_flow_flush()
+ */
+int
+tap_flow_implicit_flush(struct rte_eth_dev *dev, struct rte_flow_error *error)
+{
+	struct pmd_internals *pmd = dev->data->dev_private;
+	struct rte_flow *head;
+
+	/* tap_flow_destroy() unlinks the rule: always restart from the head */
+	for (head = LIST_FIRST(&pmd->implicit_flows); head != NULL;
+	     head = LIST_FIRST(&pmd->implicit_flows))
+		if (tap_flow_destroy(dev, head, error) < 0)
+			return -1;
+	return 0;
+}
+
+/**
  * Destroy a flow.
  *
  * @see rte_flow_destroy()
@@ -996,6 +1366,7 @@ tap_flow_destroy(struct rte_eth_dev *dev,
 		 struct rte_flow_error *error)
 {
 	struct pmd_internals *pmd = dev->data->dev_private;
+	struct rte_flow *remote_flow = flow->remote_flow;
 	int ret = 0;
 
 	LIST_REMOVE(flow, next);
@@ -1009,11 +1380,34 @@ tap_flow_destroy(struct rte_eth_dev *dev,
 		goto end;
 	}
 	ret = nl_recv_ack(pmd->nlsk_fd);
-	if (ret < 0)
+	if (ret < 0) {
 		rte_flow_error_set(
 			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
 			"couldn't receive kernel ack to our request");
+		goto end;
+	}
+	if (remote_flow) {
+		remote_flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+		remote_flow->msg.nh.nlmsg_type = RTM_DELTFILTER;
+
+		ret = nl_send(pmd->nlsk_fd, &remote_flow->msg.nh);
+		if (ret < 0) {
+			rte_flow_error_set(
+				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
+				NULL, "Failure sending nl request");
+			goto end;
+		}
+		ret = nl_recv_ack(pmd->nlsk_fd);
+		if (ret < 0) {
+			rte_flow_error_set(
+				error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
+				NULL, "Failure trying to receive nl ack");
+			goto end;
+		}
+	}
 end:
+	if (remote_flow)
+		rte_free(remote_flow);
 	rte_free(flow);
 	return ret;
 }
diff --git a/drivers/net/tap/tap_flow.h b/drivers/net/tap/tap_flow.h
index a05e945df523..0134cdbaeb90 100644
--- a/drivers/net/tap/tap_flow.h
+++ b/drivers/net/tap/tap_flow.h
@@ -36,6 +36,7 @@
 
 #include <rte_flow.h>
 #include <rte_flow_driver.h>
+#include <tap.h>
 
 /**
  * In TC, priority 0 means we require the kernel to allocate one for us.
@@ -49,10 +50,33 @@
 #define GROUP_SHIFT 12
 #define MAX_GROUP GROUP_MASK
 
+/**
+ * These indexes are actually in reversed order: the rule priority is computed
+ * by subtracting the index value from the lowest priority (PRIORITY_MASK).
+ * Thus the first one will have the lowest priority in the end
+ * (but biggest value).
+ */
+enum implicit_rule_index {
+	TAP_REMOTE_TX, /* egress rule: its table entry uses the index as-is */
+	TAP_REMOTE_BROADCASTV6,
+	TAP_REMOTE_BROADCAST,
+	TAP_REMOTE_ALLMULTI,
+	TAP_REMOTE_PROMISC,
+	TAP_REMOTE_LOCAL_MAC,
+	TAP_REMOTE_MAX_IDX, /* number of entries, sizes implicit_rte_flows[] */
+};
+
 int tap_dev_filter_ctrl(struct rte_eth_dev *dev,
 			enum rte_filter_type filter_type,
 			enum rte_filter_op filter_op,
 			void *arg);
 int tap_flow_flush(struct rte_eth_dev *dev, struct rte_flow_error *error);
 
+int tap_flow_implicit_create(struct pmd_internals *pmd,
+			     enum implicit_rule_index idx);
+int tap_flow_implicit_destroy(struct rte_eth_dev *dev,
+			      enum implicit_rule_index idx);
+int tap_flow_implicit_flush(struct rte_eth_dev *dev,
+			    struct rte_flow_error *error);
+
 #endif /* _TAP_FLOW_H_ */
-- 
2.8.0.rc0



More information about the dev mailing list