[dpdk-dev] [PATCH 3/4] net/tap: add netlink back-end for flow API

Pascal Mazon pascal.mazon at 6wind.com
Fri Mar 3 11:45:56 CET 2017


Each kernel netdevice may have queueing disciplines set for it, which
determine how to handle the packet (mostly on egress). That's part of
the TC (Traffic Control) mechanism.

Through TC, it is possible to set filter rules that match specific
packets, and act according to what is in the rule. This is a perfect
candidate to implement the flow API for the tap PMD, as it has an
associated kernel netdevice automatically.

Each flow API rule will be translated into its TC counterpart.

To leverage TC, it is necessary to communicate with the kernel using
netlink. This patch introduces a library to help that communication.

Inside tap_netlink.c, functions are generic for any netlink messaging.
Inside tap_tcmsgs.c, functions are specific to deal with TC rules.

Signed-off-by: Pascal Mazon <pascal.mazon at 6wind.com>
Acked-by: Olga Shern <olgas at mellanox.com>
---
 drivers/net/tap/Makefile      |   2 +
 drivers/net/tap/tap_netlink.c | 367 ++++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_netlink.h |  69 ++++++++
 drivers/net/tap/tap_tcmsgs.c  | 378 ++++++++++++++++++++++++++++++++++++++++++
 drivers/net/tap/tap_tcmsgs.h  |  63 +++++++
 5 files changed, 879 insertions(+)
 create mode 100644 drivers/net/tap/tap_netlink.c
 create mode 100644 drivers/net/tap/tap_netlink.h
 create mode 100644 drivers/net/tap/tap_tcmsgs.c
 create mode 100644 drivers/net/tap/tap_tcmsgs.h

diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index 386b8b0594d3..4ae2ca6cfbab 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -48,6 +48,8 @@ CFLAGS += $(WERROR_FLAGS)
 #
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += rte_eth_tap.c
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_flow.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_netlink.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += tap_tcmsgs.c
 
 # this lib depends upon:
 DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_TAP) += lib/librte_eal
diff --git a/drivers/net/tap/tap_netlink.c b/drivers/net/tap/tap_netlink.c
new file mode 100644
index 000000000000..10f00d1931c6
--- /dev/null
+++ b/drivers/net/tap/tap_netlink.c
@@ -0,0 +1,367 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <errno.h>
+#include <inttypes.h>
+#include <linux/netlink.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#include <rte_malloc.h>
+#include <tap_netlink.h>
+#include <rte_random.h>
+
+/* Must be quite large to support dumping a huge list of QDISC or filters. */
+#define BUF_SIZE (32 * 1024) /* Size of the buffer to receive kernel messages */
+#define SNDBUF_SIZE 32768 /* Send buffer size for the netlink socket */
+#define RCVBUF_SIZE 32768 /* Receive buffer size for the netlink socket */
+
+struct nested_tail {
+	struct rtattr *tail; /* start of a nested attribute left open */
+	struct nested_tail *prev; /* entry that was on top before this one */
+};
+
+/**
+ * Initialize a netlink socket for communicating with the kernel.
+ *
+ * @return
+ *   netlink socket file descriptor on success, -1 otherwise (in which case
+ *   any socket created along the way has been closed).
+ */
+int
+nl_init(void)
+{
+	int fd, sndbuf_size = SNDBUF_SIZE, rcvbuf_size = RCVBUF_SIZE;
+	struct sockaddr_nl local = { .nl_family = AF_NETLINK };
+
+	fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
+	if (fd < 0) {
+		RTE_LOG(ERR, PMD, "Unable to create a netlink socket\n");
+		return -1;
+	}
+	if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, sizeof(int))) {
+		RTE_LOG(ERR, PMD, "Unable to set socket buffer send size\n");
+	} else if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size,
+			      sizeof(int))) {
+		RTE_LOG(ERR, PMD, "Unable to set socket buffer receive size\n");
+	} else if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0) {
+		RTE_LOG(ERR, PMD, "Unable to bind to the netlink socket\n");
+	} else {
+		return fd;
+	}
+	close(fd); /* setup failed: do not leak the descriptor */
+	return -1;
+}
+
+/**
+ * Clean up a netlink socket once all communicating with the kernel is finished.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ *
+ * @return
+ *   0 if the socket was closed successfully, -1 otherwise (the failure is
+ *   logged).
+ */
+int
+nl_final(int nlsk_fd)
+{
+	if (close(nlsk_fd) == 0)
+		return 0;
+	RTE_LOG(ERR, PMD, "Failed to close netlink socket: %s (%d)\n",
+		strerror(errno), errno);
+	return -1;
+}
+
+/**
+ * Send a message to the kernel on the netlink socket.
+ *
+ * The header's nlmsg_pid is reset to 0 and nlmsg_seq is overwritten with a
+ * random value before the message is sent.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in, out] nh
+ *   The netlink message to send to the kernel.
+ *
+ * @return
+ *   the number of sent bytes on success, -1 otherwise.
+ */
+int
+nl_send(int nlsk_fd, struct nlmsghdr *nh)
+{
+	/* man 7 netlink EXAMPLE */
+	struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
+	struct iovec vec = {
+		.iov_base = nh,
+		.iov_len = nh->nlmsg_len,
+	};
+	struct msghdr hdr = {
+		.msg_name = &kernel,
+		.msg_namelen = sizeof(kernel),
+		.msg_iov = &vec,
+		.msg_iovlen = 1,
+	};
+	int sent;
+
+	nh->nlmsg_pid = 0; /* communication with the kernel uses pid 0 */
+	nh->nlmsg_seq = (uint32_t)rte_rand();
+	sent = sendmsg(nlsk_fd, &hdr, 0);
+	if (sent >= 0)
+		return sent;
+	RTE_LOG(ERR, PMD, "Failed to send netlink message: %s (%d)\n",
+		strerror(errno), errno);
+	return -1;
+}
+
+/**
+ * Check that the kernel sends an appropriate ACK in response to an nl_send().
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ *
+ * @return
+ *   0 on success, -1 otherwise (same convention as nl_recv()).
+ */
+int
+nl_recv_ack(int nlsk_fd)
+{
+	return nl_recv(nlsk_fd, NULL, NULL);
+}
+
+/**
+ * Receive a message from the kernel on the netlink socket, following an
+ * nl_send().
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] cb
+ *   The callback function to call for each netlink message received, except
+ *   NLMSG_ERROR (including ACKs) and NLMSG_DONE messages. May be NULL.
+ * @param[in, out] arg
+ *   Custom arguments for the callback.
+ *
+ * @return
+ *   0 on success, -1 on receive failure, kernel error or callback failure.
+ */
+int
+nl_recv(int nlsk_fd, int (*cb)(struct nlmsghdr *, void *arg), void *arg)
+{
+	/* man 7 netlink EXAMPLE */
+	struct sockaddr_nl sa;
+	struct nlmsghdr *nh;
+	char buf[BUF_SIZE];
+	struct iovec iov = {
+		.iov_base = buf,
+		.iov_len = sizeof(buf),
+	};
+	struct msghdr msg = {
+		.msg_name = &sa,
+		.msg_namelen = sizeof(sa),
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+	};
+	int recv_bytes = 0, done = 0, multipart = 0, error = 0;
+
+read:
+	recv_bytes = recvmsg(nlsk_fd, &msg, 0);
+	if (recv_bytes < 0 && errno == EINTR)
+		goto read; /* interrupted by a signal: retry */
+	if (recv_bytes < 0)
+		return -1;
+	for (nh = (struct nlmsghdr *)buf;
+	     NLMSG_OK(nh, (unsigned int)recv_bytes);
+	     nh = NLMSG_NEXT(nh, recv_bytes)) {
+		/*
+		 * Multi-part messages and their trailing DONE carry the
+		 * NLM_F_MULTI flag: remember to keep reading until DONE.
+		 */
+		if (nh->nlmsg_flags & NLM_F_MULTI)
+			multipart = 1;
+		if (nh->nlmsg_type == NLMSG_ERROR) {
+			struct nlmsgerr *err_data = NLMSG_DATA(nh);
+
+			if (err_data->error == 0) {
+				RTE_LOG(DEBUG, PMD, "%s() ack message recvd\n",
+					__func__);
+			} else {
+				RTE_LOG(DEBUG, PMD,
+					"%s() error message recvd\n", __func__);
+				error = 1;
+			}
+			/* Payload is a struct nlmsgerr: not for cb. */
+			continue;
+		}
+		/* DONE ends a multipart message; no callback for it. */
+		if (nh->nlmsg_type == NLMSG_DONE)
+			done = 1;
+		else if (cb && cb(nh, arg) < 0)
+			error = 1;
+	}
+	if (multipart && !done)
+		goto read;
+	return error ? -1 : 0;
+}
+
+/**
+ * Append a netlink attribute to a message.
+ *
+ * @param[in, out] nh
+ *   The netlink message being built; its buffer must have room for the data.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data_len
+ *   The length of the data to append.
+ * @param[in] data
+ *   The data to append. May be NULL when data_len is 0.
+ */
+void
+nlattr_add(struct nlmsghdr *nh, unsigned short type,
+	   unsigned int data_len, const void *data)
+{
+	/* see man 3 rtnetlink */
+	struct rtattr *rta = (struct rtattr *)NLMSG_TAIL(nh);
+
+	rta->rta_len = RTA_LENGTH(data_len);
+	rta->rta_type = type;
+	if (data_len) /* empty attributes (nested headers) pass data == NULL */
+		memcpy(RTA_DATA(rta), data, data_len);
+	nh->nlmsg_len = NLMSG_ALIGN(nh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
+}
+
+/**
+ * Append a uint8_t netlink attribute to a message.
+ *
+ * @param[in, out] nh
+ *   The netlink message the attribute is appended to.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data
+ *   The 8-bit value stored as the attribute payload.
+ */
+void
+nlattr_add8(struct nlmsghdr *nh, unsigned short type, uint8_t data)
+{
+	nlattr_add(nh, type, sizeof(data), &data);
+}
+
+/**
+ * Append a uint16_t netlink attribute to a message.
+ *
+ * @param[in, out] nh
+ *   The netlink message the attribute is appended to.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data
+ *   The 16-bit value stored as the attribute payload.
+ */
+void
+nlattr_add16(struct nlmsghdr *nh, unsigned short type, uint16_t data)
+{
+	nlattr_add(nh, type, sizeof(data), &data);
+}
+
+/**
+ * Append a uint32_t netlink attribute to a message.
+ *
+ * @param[in, out] nh
+ *   The netlink message the attribute is appended to.
+ * @param[in] type
+ *   The type of attribute to append.
+ * @param[in] data
+ *   The 32-bit value stored as the attribute payload.
+ */
+void
+nlattr_add32(struct nlmsghdr *nh, unsigned short type, uint32_t data)
+{
+	nlattr_add(nh, type, sizeof(data), &data);
+}
+
+/**
+ * Start a nested netlink attribute.
+ * It must be followed later by a call to nlattr_nested_finish().
+ *
+ * @param[in, out] msg
+ *   The netlink message; an empty attribute of the given type is appended to
+ *   it and recorded in its nested_tails metadata.
+ * @param[in] type
+ *   The nested attribute type to append.
+ *
+ * @return
+ *   -1 if adding a nested netlink attribute failed, 0 otherwise.
+ */
+int
+nlattr_nested_start(struct nlmsg *msg, uint16_t type)
+{
+	struct nested_tail *entry = rte_zmalloc(NULL, sizeof(*entry), 0);
+
+	if (entry == NULL) {
+		RTE_LOG(ERR, PMD,
+			"Couldn't allocate memory for nested netlink"
+			" attribute\n");
+		return -1;
+	}
+
+	/* Remember where the attribute header starts before appending it. */
+	entry->tail = (struct rtattr *)NLMSG_TAIL(&msg->nh);
+	nlattr_add(&msg->nh, type, 0, NULL);
+
+	/* Push onto the stack of attributes still open in this message. */
+	entry->prev = msg->nested_tails;
+	msg->nested_tails = entry;
+
+	return 0;
+}
+
+/**
+ * End a nested netlink attribute.
+ * It follows a call to nlattr_nested_start().
+ * In effect, it will modify the nested attribute length to include every byte
+ * from the nested attribute start, up to here, then drop its tracking entry.
+ *
+ * @param[in, out] msg
+ *   The netlink message where to edit the nested_tails metadata.
+ */
+void
+nlattr_nested_finish(struct nlmsg *msg)
+{
+	struct nested_tail *tail = msg->nested_tails;
+
+	tail->tail->rta_len = (char *)NLMSG_TAIL(&msg->nh) - (char *)tail->tail;
+
+	/* Always pop, so that no pointer to the freed entry is left behind. */
+	msg->nested_tails = tail->prev;
+
+	rte_free(tail);
+}
diff --git a/drivers/net/tap/tap_netlink.h b/drivers/net/tap/tap_netlink.h
new file mode 100644
index 000000000000..52ba8c030dcc
--- /dev/null
+++ b/drivers/net/tap/tap_netlink.h
@@ -0,0 +1,69 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAP_NETLINK_H_
+#define _TAP_NETLINK_H_
+
+#include <ctype.h>
+#include <inttypes.h>
+#include <linux/rtnetlink.h>
+#include <linux/netlink.h>
+#include <stdio.h>
+
+#include <rte_log.h>
+
+#define NLMSG_BUF 512 /* size of the attribute area in struct nlmsg */
+
+struct nlmsg {
+	struct nlmsghdr nh; /* netlink header */
+	struct tcmsg t; /* TC header, filled by tc_init_msg() */
+	char buf[NLMSG_BUF]; /* presumably where appended attributes land */
+	struct nested_tail *nested_tails; /* stack of open nested attributes */
+};
+
+#define NLMSG_TAIL(nlh) (void *)((char *)(nlh) + NLMSG_ALIGN((nlh)->nlmsg_len))
+
+int nl_init(void);
+int nl_final(int nlsk_fd);
+int nl_send(int nlsk_fd, struct nlmsghdr *nh);
+int nl_recv(int nlsk_fd, int (*callback)(struct nlmsghdr *, void *), void *arg);
+int nl_recv_ack(int nlsk_fd);
+void nlattr_add(struct nlmsghdr *nh, unsigned short type,
+		unsigned int data_len, const void *data);
+void nlattr_add8(struct nlmsghdr *nh, unsigned short type, uint8_t data);
+void nlattr_add16(struct nlmsghdr *nh, unsigned short type, uint16_t data);
+void nlattr_add32(struct nlmsghdr *nh, unsigned short type, uint32_t data);
+int nlattr_nested_start(struct nlmsg *msg, uint16_t type);
+void nlattr_nested_finish(struct nlmsg *msg);
+
+#endif /* _TAP_NETLINK_H_ */
diff --git a/drivers/net/tap/tap_tcmsgs.c b/drivers/net/tap/tap_tcmsgs.c
new file mode 100644
index 000000000000..9a146d165b08
--- /dev/null
+++ b/drivers/net/tap/tap_tcmsgs.c
@@ -0,0 +1,378 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <inttypes.h>
+#include <linux/netlink.h>
+#include <net/if.h>
+#include <string.h>
+
+#include <rte_log.h>
+#include <tap_tcmsgs.h>
+
+struct qdisc {
+	uint32_t handle; /* QDISC handle (tcm_handle) */
+	uint32_t parent; /* QDISC parent (tcm_parent) */
+};
+
+struct list_args {
+	int nlsk_fd; /* netlink socket the dump was requested on */
+	uint16_t ifindex; /* callbacks skip messages for other netdevices */
+	void *custom_arg; /* callback-specific data, may be NULL */
+};
+
+struct qdisc_custom_arg {
+	uint32_t handle; /* handle of the QDISC to look for */
+	uint32_t parent; /* parent of the QDISC to look for */
+	uint8_t exists; /* incremented for each matching QDISC found */
+};
+
+/**
+ * Initialize a netlink message with a TC header (other TC fields are zeroed).
+ *
+ * @param[in, out] msg
+ *   The netlink message to fill.
+ * @param[in] ifindex
+ *   The netdevice ifindex where the rule will be applied.
+ * @param[in] type
+ *   The type of TC message to create (RTM_NEWTFILTER, RTM_NEWQDISC, etc.).
+ * @param[in] flags
+ *   Netlink flags for this msg; 0 selects NLM_F_REQUEST | NLM_F_ACK.
+ */
+void
+tc_init_msg(struct nlmsg *msg, uint16_t ifindex, uint16_t type, uint16_t flags)
+{
+	struct nlmsghdr *n = &msg->nh;
+
+	n->nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg));
+	n->nlmsg_type = type;
+	n->nlmsg_flags = flags ? flags : (NLM_F_REQUEST | NLM_F_ACK);
+	/* Zero the whole TC header: messages often live on the stack. */
+	msg->t = (struct tcmsg){
+		.tcm_family = AF_UNSPEC,
+		.tcm_ifindex = ifindex,
+	};
+}
+
+/**
+ * Delete a specific QDISC identified by its iface, and its handle and parent.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ *   If 0, a temporary socket is created for this request and closed before
+ *   returning, whether the request succeeded or not.
+ * @param[in] ifindex
+ *   The netdevice ifindex on whom the deletion will happen.
+ * @param[in] qinfo
+ *   Additional info to identify the QDISC (handle and parent).
+ *
+ * @return
+ *   0 on success, -1 otherwise.
+ */
+static int
+qdisc_del(int nlsk_fd, uint16_t ifindex, struct qdisc *qinfo)
+{
+	struct nlmsg msg;
+	int fd = nlsk_fd;
+	int ret = -1;
+
+	tc_init_msg(&msg, ifindex, RTM_DELQDISC, 0);
+	msg.t.tcm_handle = qinfo->handle;
+	msg.t.tcm_parent = qinfo->parent;
+	/* if no netlink socket is provided, create one */
+	if (!nlsk_fd) {
+		fd = nl_init();
+		if (fd < 0) {
+			RTE_LOG(ERR, PMD,
+				"Could not delete QDISC: null netlink socket\n");
+			return -1;
+		}
+	}
+	if (nl_send(fd, &msg.nh) >= 0 && nl_recv_ack(fd) >= 0)
+		ret = 0;
+	/* a socket created here must be closed even if the request failed */
+	if (!nlsk_fd && nl_final(fd) < 0)
+		ret = -1;
+	return ret;
+}
+
+/**
+ * Add the multiqueue QDISC with MULTIQ_MAJOR_HANDLE handle.
+ * The QDISC options are sent zeroed.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to add the multiqueue QDISC.
+ *
+ * @return
+ *   -1 if the qdisc cannot be added, and 0 otherwise.
+ */
+int
+qdisc_add_multiq(int nlsk_fd, uint16_t ifindex)
+{
+	/* Fully initialized: it is copied as-is into the message below. */
+	struct tc_multiq_qopt opt = { .bands = 0 };
+	struct nlmsg msg;
+
+	tc_init_msg(&msg, ifindex, RTM_NEWQDISC,
+		    NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
+	msg.t.tcm_handle = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
+	msg.t.tcm_parent = TC_H_ROOT;
+	nlattr_add(&msg.nh, TCA_KIND, sizeof("multiq"), "multiq");
+	nlattr_add(&msg.nh, TCA_OPTIONS, sizeof(opt), &opt);
+	if (nl_send(nlsk_fd, &msg.nh) < 0 || nl_recv_ack(nlsk_fd) < 0)
+		return -1;
+	return 0;
+}
+
+/**
+ * Add the ingress QDISC with default ffff: handle.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor the request is sent on.
+ * @param[in] ifindex
+ *   The ifindex of the netdevice that receives the QDISC.
+ *
+ * @return
+ *   -1 if the qdisc cannot be added, and 0 otherwise.
+ */
+int
+qdisc_add_ingress(int nlsk_fd, uint16_t ifindex)
+{
+	const uint16_t flags =
+		NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE;
+	struct nlmsg req;
+
+	tc_init_msg(&req, ifindex, RTM_NEWQDISC, flags);
+	req.t.tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
+	req.t.tcm_parent = TC_H_INGRESS;
+	nlattr_add(&req.nh, TCA_KIND, sizeof("ingress"), "ingress");
+	/* The request succeeds only if it is both sent and acknowledged. */
+	if (nl_send(nlsk_fd, &req.nh) < 0 || nl_recv_ack(nlsk_fd) < 0)
+		return -1;
+	return 0;
+}
+
+/**
+ * Callback function to check for QDISC existence.
+ * If the QDISC is found to exist, increment "exists" in the custom arg.
+ *
+ * @param[in] nh
+ *   The netlink message to parse, received from the kernel.
+ * @param[in, out] arg
+ *   A struct list_args whose custom_arg points to a struct qdisc_custom_arg
+ *   holding the handle and parent to look for.
+ *
+ * @return
+ *   0.
+ */
+static int
+qdisc_exist_cb(struct nlmsghdr *nh, void *arg)
+{
+	struct list_args *args = arg;
+	struct qdisc_custom_arg *wanted = args->custom_arg;
+	const struct tcmsg *tc = NLMSG_DATA(nh);
+
+	/* Count the QDISC only when iface, handle and parent all match. */
+	if (args->ifindex == (unsigned int)tc->tcm_ifindex &&
+	    tc->tcm_handle == wanted->handle &&
+	    tc->tcm_parent == wanted->parent)
+		wanted->exists++;
+	return 0;
+}
+
+/**
+ * Callback function to delete a QDISC.
+ *
+ * @param[in] nh
+ *   The netlink message to parse, received from the kernel.
+ * @param[in] arg
+ *   Custom arguments for the callback.
+ *
+ * @return
+ *   0 if the QDISC belongs to another iface; otherwise the result of the
+ *   deletion (0 on success, -1 on failure).
+ */
+static int
+qdisc_del_cb(struct nlmsghdr *nh, void *arg)
+{
+	const struct tcmsg *tc = NLMSG_DATA(nh);
+	const struct list_args *ctx = arg;
+	struct qdisc target;
+
+	/* filter out other ifaces' qdiscs */
+	if (ctx->ifindex != (unsigned int)tc->tcm_ifindex)
+		return 0;
+	target.handle = tc->tcm_handle;
+	target.parent = tc->tcm_parent;
+	/*
+	 * Passing 0 as nlsk_fd makes qdisc_del() use a socket of its own, so
+	 * that the dump being iterated on the caller's socket is not
+	 * disturbed.
+	 */
+	return qdisc_del(0, ctx->ifindex, &target);
+}
+
+/**
+ * Iterate over all QDISC, and call the callback() function for each.
+ * Answers are not filtered here: callbacks compare ifindex themselves.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to find QDISCs.
+ * @param[in] callback
+ *   The function to call for each QDISC. It receives a struct list_args.
+ * @param[in, out] arg
+ *   Made available to the callback as the custom_arg of its struct list_args.
+ *
+ * @return
+ *   -1 if sending the dump request failed, if receiving the answer failed,
+ *   or if the callback returned a negative value for one of the answers.
+ *   0 is returned otherwise.
+ */
+static int
+qdisc_iterate(int nlsk_fd, uint16_t ifindex,
+	      int (*callback)(struct nlmsghdr *, void *), void *arg)
+{
+	struct list_args ctx = {
+		.nlsk_fd = nlsk_fd,
+		.ifindex = ifindex,
+		.custom_arg = arg,
+	};
+	struct nlmsg req;
+
+	/* Ask for a dump, then hand every answer to the callback. */
+	tc_init_msg(&req, ifindex, RTM_GETQDISC, NLM_F_REQUEST | NLM_F_DUMP);
+	if (nl_send(nlsk_fd, &req.nh) < 0 ||
+	    nl_recv(nlsk_fd, callback, &ctx) < 0)
+		return -1;
+	return 0;
+}
+
+/**
+ * Check whether a given QDISC already exists for the netdevice.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex to check QDISC existence for.
+ * @param[in] handle
+ *   The handle of the QDISC to look for.
+ * @param[in] parent
+ *   The parent of the QDISC to look for.
+ *
+ * @return
+ *   1 if the qdisc exists, 0 otherwise.
+ */
+int
+qdisc_exists(int nlsk_fd, uint16_t ifindex, uint32_t handle, uint32_t parent)
+{
+	struct qdisc_custom_arg lookup = {
+		.handle = handle,
+		.parent = parent,
+		.exists = 0,
+	};
+
+	/* The iteration status is ignored: only the match counter is used. */
+	qdisc_iterate(nlsk_fd, ifindex, qdisc_exist_cb, &lookup);
+	return lookup.exists != 0;
+}
+
+/**
+ * Delete all QDISCs for a given netdevice.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor used for communication.
+ * @param[in] ifindex
+ *   The netdevice ifindex where to find QDISCs.
+ *
+ * @return
+ *   -1 if the lookup or the deletion of a QDISC failed, 0 otherwise.
+ */
+int
+qdisc_flush(int nlsk_fd, uint16_t ifindex)
+{
+	return qdisc_iterate(nlsk_fd, ifindex, qdisc_del_cb, NULL);
+}
+
+/**
+ * Create the multiqueue QDISC, only if it does not exist already.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor the requests are sent on.
+ * @param[in] ifindex
+ *   The ifindex of the netdevice that should carry the multiqueue QDISC.
+ *
+ * @return
+ *   0 if the qdisc already exists or has been successfully added.
+ *   Return -1 otherwise.
+ */
+int
+qdisc_create_multiq(int nlsk_fd, uint16_t ifindex)
+{
+	const uint32_t handle = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
+
+	if (qdisc_exists(nlsk_fd, ifindex, handle, TC_H_ROOT))
+		return 0;
+	if (qdisc_add_multiq(nlsk_fd, ifindex) >= 0)
+		return 0;
+	RTE_LOG(ERR, PMD, "Could not add multiq qdisc\n");
+	return -1;
+}
+
+/**
+ * Create the ingress QDISC, only if it does not exist already.
+ *
+ * @param[in] nlsk_fd
+ *   The netlink socket file descriptor the requests are sent on.
+ * @param[in] ifindex
+ *   The ifindex of the netdevice that should carry the ingress QDISC.
+ *
+ * @return
+ *   0 if the qdisc already exists or has been successfully added.
+ *   Return -1 otherwise.
+ */
+int
+qdisc_create_ingress(int nlsk_fd, uint16_t ifindex)
+{
+	const uint32_t handle = TC_H_MAKE(TC_H_INGRESS, 0);
+
+	if (qdisc_exists(nlsk_fd, ifindex, handle, TC_H_INGRESS))
+		return 0;
+	if (qdisc_add_ingress(nlsk_fd, ifindex) >= 0)
+		return 0;
+	RTE_LOG(ERR, PMD, "Could not add ingress qdisc\n");
+	return -1;
+}
diff --git a/drivers/net/tap/tap_tcmsgs.h b/drivers/net/tap/tap_tcmsgs.h
new file mode 100644
index 000000000000..a571a56d6964
--- /dev/null
+++ b/drivers/net/tap/tap_tcmsgs.h
@@ -0,0 +1,63 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2017 6WIND S.A.
+ *   Copyright 2017 Mellanox.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of 6WIND S.A. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAP_TCMSGS_H_
+#define _TAP_TCMSGS_H_
+
+#include <linux/if_ether.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_sched.h>
+#include <linux/pkt_cls.h>
+#include <linux/tc_act/tc_mirred.h>
+#include <linux/tc_act/tc_gact.h>
+#include <linux/tc_act/tc_skbedit.h>
+#include <inttypes.h>
+
+#include <rte_ether.h>
+#include <tap_netlink.h>
+
+#define MULTIQ_MAJOR_HANDLE (1 << 16)
+
+void tc_init_msg(struct nlmsg *msg, uint16_t ifindex, uint16_t type,
+		 uint16_t flags);
+int qdisc_exists(int nlsk_fd, uint16_t ifindex, uint32_t handle,
+		 uint32_t parent);
+int qdisc_list(int nlsk_fd, uint16_t ifindex);
+int qdisc_flush(int nlsk_fd, uint16_t ifindex);
+int qdisc_create_ingress(int nlsk_fd, uint16_t ifindex);
+int qdisc_create_multiq(int nlsk_fd, uint16_t ifindex);
+int qdisc_add_ingress(int nlsk_fd, uint16_t ifindex);
+int qdisc_add_multiq(int nlsk_fd, uint16_t ifindex);
+int filter_list_ingress(int nlsk_fd, uint16_t ifindex);
+
+#endif /* _TAP_TCMSGS_H_ */
-- 
2.8.0.rc0



More information about the dev mailing list