[dpdk-dev] [PATCH v2 7/7] net/mlx5: e-switch VXLAN rule cleanup routines

Yongseok Koh yskoh at mellanox.com
Thu Oct 25 02:36:44 CEST 2018


On Mon, Oct 15, 2018 at 02:13:35PM +0000, Viacheslav Ovsiienko wrote:
> The last part of the patchset contains the rule cleanup routines.
> These are part of the outer interface initialization performed at
> the moment of VXLAN VTEP attachment. The routines query the list
> of attached VXLAN devices, the list of local IP addresses with the
> peer and link scope attributes, and the list of permanent neigh
> rules; all such items found on the specified outer device are then
> flushed.
> 
> Suggested-by: Adrien Mazarguil <adrien.mazarguil at 6wind.com>
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo at mellanox.com>
> ---
>  drivers/net/mlx5/mlx5_flow_tcf.c | 505 ++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 499 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5_flow_tcf.c b/drivers/net/mlx5/mlx5_flow_tcf.c
> index a1d7733..a3348ea 100644
> --- a/drivers/net/mlx5/mlx5_flow_tcf.c
> +++ b/drivers/net/mlx5/mlx5_flow_tcf.c
> @@ -4012,6 +4012,502 @@ static LIST_HEAD(, mlx5_flow_tcf_vtep)
>  }
>  #endif /* HAVE_IFLA_VXLAN_COLLECT_METADATA */
>  
> +#define MNL_REQUEST_SIZE_MIN 256
> +#define MNL_REQUEST_SIZE_MAX 2048
> +#define MNL_REQUEST_SIZE RTE_MIN(RTE_MAX(sysconf(_SC_PAGESIZE), \
> +				 MNL_REQUEST_SIZE_MIN), MNL_REQUEST_SIZE_MAX)
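
Just to confirm my reading of this clamp: on systems with the usual 4 KiB page
size the request buffer ends up at the upper bound, i.e.

	MNL_REQUEST_SIZE = RTE_MIN(RTE_MAX(4096, 256), 2048) = 2048

and only a page size below 2 KiB would yield something smaller (never below
MNL_REQUEST_SIZE_MIN).
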
> +
> +/* Data structures used by flow_tcf_xxx_cb() routines. */
> +struct tcf_nlcb_buf {
> +	LIST_ENTRY(tcf_nlcb_buf) next;
> +	uint32_t size;
> +	alignas(struct nlmsghdr)
> +	uint8_t msg[]; /**< Netlink message data. */
> +};
> +
> +struct tcf_nlcb_context {
> +	unsigned int ifindex; /**< Base interface index. */
> +	uint32_t bufsize;
> +	LIST_HEAD(, tcf_nlcb_buf) nlbuf;
> +};
> +
> +/**
> + * Allocate space for a netlink command in the buffer list.
> + *
> + * @param[in, out] ctx
> + *   Pointer to callback context with command buffers list.
> + * @param[in] size
> + *   Required size of data buffer to be allocated.
> + *
> + * @return
> + *   Pointer to allocated memory, aligned as a message header.
> + *   NULL if an error occurred.
> + */
> +static struct nlmsghdr *
> +flow_tcf_alloc_nlcmd(struct tcf_nlcb_context *ctx, uint32_t size)
> +{
> +	struct tcf_nlcb_buf *buf;
> +	struct nlmsghdr *nlh;
> +
> +	size = NLMSG_ALIGN(size);
> +	buf = LIST_FIRST(&ctx->nlbuf);
> +	if (buf && (buf->size + size) <= ctx->bufsize) {
> +		nlh = (struct nlmsghdr *)&buf->msg[buf->size];
> +		buf->size += size;
> +		return nlh;
> +	}
> +	if (size > ctx->bufsize) {
> +		DRV_LOG(WARNING, "netlink: too long command buffer requested");
> +		return NULL;
> +	}
> +	buf = rte_malloc(__func__,
> +			ctx->bufsize + sizeof(struct tcf_nlcb_buf),
> +			alignof(struct tcf_nlcb_buf));
> +	if (!buf) {
> +		DRV_LOG(WARNING, "netlink: no memory for command buffer");
> +		return NULL;
> +	}
> +	LIST_INSERT_HEAD(&ctx->nlbuf, buf, next);
> +	buf->size = size;
> +	nlh = (struct nlmsghdr *)&buf->msg[0];
> +	return nlh;
> +}
> +
> +/**
> + * Set the NLM_F_ACK flag in the last netlink command in the buffer.
> + * Only the last command in the buffer will be acked by the system.
> + *
> + * @param[in, out] buf
> + *   Pointer to buffer with netlink commands.
> + */
> +static void
> +flow_tcf_setack_nlcmd(struct tcf_nlcb_buf *buf)
> +{
> +	struct nlmsghdr *nlh;
> +	uint32_t size = 0;
> +
> +	assert(buf->size);
> +	do {
> +		nlh = (struct nlmsghdr *)&buf->msg[size];
> +		size += NLMSG_ALIGN(nlh->nlmsg_len);
> +		if (size >= buf->size) {
> +			nlh->nlmsg_flags |= NLM_F_ACK;
> +			break;
> +		}
> +	} while (true);
> +}
> +
> +/**
> + * Send the buffers with prepared netlink commands. Scans the list and
> + * sends all found buffers. Buffers are sent and freed regardless of
> + * the outcome in order to prevent memory leaks if an error occurs
> + * while processing the received messages.
> + *
> + * @param[in] tcf
> + *   Context object initialized by mlx5_flow_tcf_context_create().
> + * @param[in, out] ctx
> + *   Pointer to callback context with command buffers list.
> + *
> + * @return
> + *   Zero value on success, negative errno value otherwise
> + *   and rte_errno is set.
> + */
> +static int
> +flow_tcf_send_nlcmd(struct mlx5_flow_tcf_context *tcf,
> +		    struct tcf_nlcb_context *ctx)
> +{
> +	struct tcf_nlcb_buf *bc, *bn;
> +	struct nlmsghdr *nlh;
> +	int ret = 0;
> +
> +	bc = LIST_FIRST(&ctx->nlbuf);
> +	while (bc) {
> +		int rc;
> +
> +		bn = LIST_NEXT(bc, next);
> +		if (bc->size) {
> +			flow_tcf_setack_nlcmd(bc);
> +			nlh = (struct nlmsghdr *)&bc->msg;
> +			rc = flow_tcf_nl_ack(tcf, nlh, bc->size, NULL, NULL);
> +			if (rc && !ret)
> +				ret = rc;
> +		}
> +		rte_free(bc);
> +		bc = bn;
> +	}
> +	LIST_INIT(&ctx->nlbuf);
> +	return ret;
> +}
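
For other readers of the series: as far as I can tell from the cleanup
routines below, the intended usage pattern of these helpers is roughly the
following (just a sketch, with tcf and ifindex assumed to be in scope, not
taken verbatim from the patch):

	struct tcf_nlcb_context ctx = {
		.ifindex = ifindex,
		.bufsize = MNL_REQUEST_SIZE,
		.nlbuf = LIST_HEAD_INITIALIZER(),
	};
	struct nlmsghdr *cmd;

	/* Reserve room for one delete command and fill it in. */
	cmd = flow_tcf_alloc_nlcmd(&ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) +
					 MNL_ALIGN(sizeof(struct ifinfomsg)));
	if (cmd) {
		cmd = mnl_nlmsg_put_header(cmd);
		cmd->nlmsg_type = RTM_DELLINK;
		cmd->nlmsg_flags = NLM_F_REQUEST;
		/* ... extra header and attributes go here ... */
	}
	/* Ack only the last queued command, send all, free the buffers. */
	flow_tcf_send_nlcmd(tcf, &ctx);
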
> +
> +/**
> + * Collect local IP address rules with the scope link attribute on the
> + * specified network device. This is a callback routine called by
> + * libmnl mnl_cb_run() in a loop for every message in the received
> + * packet.
> + *
> + * @param[in] nlh
> + *   Pointer to reply header.
> + * @param[in, out] arg
> + *   Opaque data pointer for this callback.
> + *
> + * @return
> + *   A positive, nonzero value on success, negative errno value otherwise
> + *   and rte_errno is set.
> + */
> +static int
> +flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg)
> +{
> +	struct tcf_nlcb_context *ctx = arg;
> +	struct nlmsghdr *cmd;
> +	struct ifaddrmsg *ifa;
> +	struct nlattr *na;
> +	struct nlattr *na_local = NULL;
> +	struct nlattr *na_peer = NULL;
> +	unsigned char family;
> +
> +	if (nlh->nlmsg_type != RTM_NEWADDR) {
> +		rte_errno = EINVAL;
> +		return -rte_errno;
> +	}
> +	ifa = mnl_nlmsg_get_payload(nlh);
> +	family = ifa->ifa_family;
> +	if (ifa->ifa_index != ctx->ifindex ||
> +	    ifa->ifa_scope != RT_SCOPE_LINK ||
> +	    !(ifa->ifa_flags & IFA_F_PERMANENT) ||
> +	    (family != AF_INET && family != AF_INET6))
> +		return 1;
> +	mnl_attr_for_each(na, nlh, sizeof(*ifa)) {
> +		switch (mnl_attr_get_type(na)) {
> +		case IFA_LOCAL:
> +			na_local = na;
> +			break;
> +		case IFA_ADDRESS:
> +			na_peer = na;
> +			break;
> +		}
> +		if (na_local && na_peer)
> +			break;
> +	}
> +	if (!na_local || !na_peer)
> +		return 1;
> +	/* Local rule found with scope link, permanent and assigned peer. */
> +	cmd = flow_tcf_alloc_nlcmd(ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) +
> +					MNL_ALIGN(sizeof(struct ifaddrmsg)) +
> +					(family == AF_INET6
> +					? 2 * SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
> +					: 2 * SZ_NLATTR_TYPE_OF(uint32_t)));

Better to use IPV4_ADDR_LEN instead?
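Something along these lines perhaps (untested sketch, assuming an
IPV4_ADDR_LEN macro defined as 4 is added next to the existing IPV6_ADDR_LEN),
so the IPv4 branch is expressed in the same terms as the IPv6 one:

	cmd = flow_tcf_alloc_nlcmd(ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) +
					MNL_ALIGN(sizeof(struct ifaddrmsg)) +
					(family == AF_INET6
					? 2 * SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
					: 2 * SZ_NLATTR_DATA_OF(IPV4_ADDR_LEN)));

If I read the SZ_NLATTR_* macros right, SZ_NLATTR_TYPE_OF(uint32_t) and
SZ_NLATTR_DATA_OF(4) should come out to the same aligned size, so this would
be purely a readability change.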

> +	if (!cmd) {
> +		rte_errno = ENOMEM;
> +		return -rte_errno;
> +	}
> +	cmd = mnl_nlmsg_put_header(cmd);
> +	cmd->nlmsg_type = RTM_DELADDR;
> +	cmd->nlmsg_flags = NLM_F_REQUEST;
> +	ifa = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifa));
> +	ifa->ifa_flags = IFA_F_PERMANENT;
> +	ifa->ifa_scope = RT_SCOPE_LINK;
> +	ifa->ifa_index = ctx->ifindex;
> +	if (family == AF_INET) {
> +		ifa->ifa_family = AF_INET;
> +		ifa->ifa_prefixlen = 32;
> +		mnl_attr_put_u32(cmd, IFA_LOCAL, mnl_attr_get_u32(na_local));
> +		mnl_attr_put_u32(cmd, IFA_ADDRESS, mnl_attr_get_u32(na_peer));
> +	} else {
> +		ifa->ifa_family = AF_INET6;
> +		ifa->ifa_prefixlen = 128;
> +		mnl_attr_put(cmd, IFA_LOCAL, IPV6_ADDR_LEN,
> +			mnl_attr_get_payload(na_local));
> +		mnl_attr_put(cmd, IFA_ADDRESS, IPV6_ADDR_LEN,
> +			mnl_attr_get_payload(na_peer));
> +	}
> +	return 1;
> +}
> +
> +/**
> + * Cleanup the local IP addresses on the outer interface.
> + *
> + * @param[in] tcf
> + *   Context object initialized by mlx5_flow_tcf_context_create().
> + * @param[in] ifindex
> + *   Network interface index to perform cleanup on.
> + */
> +static void
> +flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf,
> +			    unsigned int ifindex)
> +{
> +	struct nlmsghdr *nlh;
> +	struct ifaddrmsg *ifa;
> +	struct tcf_nlcb_context ctx = {
> +		.ifindex = ifindex,
> +		.bufsize = MNL_REQUEST_SIZE,
> +		.nlbuf = LIST_HEAD_INITIALIZER(),
> +	};
> +	int ret;
> +
> +	assert(ifindex);
> +	/*
> +	 * Seek and destroy leftovers of local IP addresses with
> +	 * matching properties "scope link".
> +	 */
> +	nlh = mnl_nlmsg_put_header(tcf->buf);
> +	nlh->nlmsg_type = RTM_GETADDR;
> +	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
> +	ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
> +	ifa->ifa_family = AF_UNSPEC;
> +	ifa->ifa_index = ifindex;
> +	ifa->ifa_scope = RT_SCOPE_LINK;
> +	ret = flow_tcf_nl_ack(tcf, nlh, 0, flow_tcf_collect_local_cb, &ctx);
> +	if (ret)
> +		DRV_LOG(WARNING, "netlink: query device list error %d", ret);
> +	ret = flow_tcf_send_nlcmd(tcf, &ctx);
> +	if (ret)
> +		DRV_LOG(WARNING, "netlink: device delete error %d", ret);
> +}
> +
> +/**
> + * Collect permanent neigh rules on the specified network device.
> + * This is a callback routine called by libmnl mnl_cb_run() in a loop
> + * for every message in the received packet.
> + *
> + * @param[in] nlh
> + *   Pointer to reply header.
> + * @param[in, out] arg
> + *   Opaque data pointer for this callback.
> + *
> + * @return
> + *   A positive, nonzero value on success, negative errno value otherwise
> + *   and rte_errno is set.
> + */
> +static int
> +flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg)
> +{
> +	struct tcf_nlcb_context *ctx = arg;
> +	struct nlmsghdr *cmd;
> +	struct ndmsg *ndm;
> +	struct nlattr *na;
> +	struct nlattr *na_ip = NULL;
> +	struct nlattr *na_mac = NULL;
> +	unsigned char family;
> +
> +	if (nlh->nlmsg_type != RTM_NEWNEIGH) {
> +		rte_errno = EINVAL;
> +		return -rte_errno;
> +	}
> +	ndm = mnl_nlmsg_get_payload(nlh);
> +	family = ndm->ndm_family;
> +	if (ndm->ndm_ifindex != (int)ctx->ifindex ||
> +	   !(ndm->ndm_state & NUD_PERMANENT) ||
> +	   (family != AF_INET && family != AF_INET6))
> +		return 1;
> +	mnl_attr_for_each(na, nlh, sizeof(*ndm)) {
> +		switch (mnl_attr_get_type(na)) {
> +		case NDA_DST:
> +			na_ip = na;
> +			break;
> +		case NDA_LLADDR:
> +			na_mac = na;
> +			break;
> +		}
> +		if (na_mac && na_ip)
> +			break;
> +	}
> +	if (!na_mac || !na_ip)
> +		return 1;
> +	/* Neigh rule with permanent attribute found. */
> +	cmd = flow_tcf_alloc_nlcmd(ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) +
> +					MNL_ALIGN(sizeof(struct ndmsg)) +
> +					SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) +
> +					(family == AF_INET6
> +					? SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
> +					: SZ_NLATTR_TYPE_OF(uint32_t)));

Better to use IPV4_ADDR_LEN instead?
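Same remark as above; a sketch with the same hypothetical IPV4_ADDR_LEN:

	cmd = flow_tcf_alloc_nlcmd(ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) +
					MNL_ALIGN(sizeof(struct ndmsg)) +
					SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) +
					(family == AF_INET6
					? SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN)
					: SZ_NLATTR_DATA_OF(IPV4_ADDR_LEN)));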

> +	if (!cmd) {
> +		rte_errno = ENOMEM;
> +		return -rte_errno;
> +	}
> +	cmd = mnl_nlmsg_put_header(cmd);
> +	cmd->nlmsg_type = RTM_DELNEIGH;
> +	cmd->nlmsg_flags = NLM_F_REQUEST;
> +	ndm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ndm));
> +	ndm->ndm_ifindex = ctx->ifindex;
> +	ndm->ndm_state = NUD_PERMANENT;
> +	ndm->ndm_flags = 0;
> +	ndm->ndm_type = 0;
> +	if (family == AF_INET) {
> +		ndm->ndm_family = AF_INET;
> +		mnl_attr_put_u32(cmd, NDA_DST, mnl_attr_get_u32(na_ip));
> +	} else {
> +		ndm->ndm_family = AF_INET6;
> +		mnl_attr_put(cmd, NDA_DST, IPV6_ADDR_LEN,
> +			     mnl_attr_get_payload(na_ip));
> +	}
> +	mnl_attr_put(cmd, NDA_LLADDR, ETHER_ADDR_LEN,
> +		     mnl_attr_get_payload(na_mac));
> +	return 1;
> +}
> +
> +/**
> + * Cleanup the neigh rules on the outer interface.
> + *
> + * @param[in] tcf
> + *   Context object initialized by mlx5_flow_tcf_context_create().
> + * @param[in] ifindex
> + *   Network interface index to perform cleanup on.
> + */
> +static void
> +flow_tcf_encap_neigh_cleanup(struct mlx5_flow_tcf_context *tcf,
> +			    unsigned int ifindex)
> +{
> +	struct nlmsghdr *nlh;
> +	struct ndmsg *ndm;
> +	struct tcf_nlcb_context ctx = {
> +		.ifindex = ifindex,
> +		.bufsize = MNL_REQUEST_SIZE,
> +		.nlbuf = LIST_HEAD_INITIALIZER(),
> +	};
> +	int ret;
> +
> +	assert(ifindex);
> +	/* Seek and destroy leftovers of neigh rules. */
> +	nlh = mnl_nlmsg_put_header(tcf->buf);
> +	nlh->nlmsg_type = RTM_GETNEIGH;
> +	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
> +	ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
> +	ndm->ndm_family = AF_UNSPEC;
> +	ndm->ndm_ifindex = ifindex;
> +	ndm->ndm_state = NUD_PERMANENT;
> +	ret = flow_tcf_nl_ack(tcf, nlh, 0, flow_tcf_collect_neigh_cb, &ctx);
> +	if (ret)
> +		DRV_LOG(WARNING, "netlink: query device list error %d", ret);
> +	ret = flow_tcf_send_nlcmd(tcf, &ctx);
> +	if (ret)
> +		DRV_LOG(WARNING, "netlink: device delete error %d", ret);
> +}
> +
> +/**
> + * Collect indices of VXLAN encap/decap interfaces associated with the
> + * device. This is a callback routine called by libmnl mnl_cb_run() in
> + * a loop for every message in the received packet.
> + *
> + * @param[in] nlh
> + *   Pointer to reply header.
> + * @param[in, out] arg
> + *   Opaque data pointer for this callback.
> + *
> + * @return
> + *   A positive, nonzero value on success, negative errno value otherwise
> + *   and rte_errno is set.
> + */
> +static int
> +flow_tcf_collect_vxlan_cb(const struct nlmsghdr *nlh, void *arg)
> +{
> +	struct tcf_nlcb_context *ctx = arg;
> +	struct nlmsghdr *cmd;
> +	struct ifinfomsg *ifm;
> +	struct nlattr *na;
> +	struct nlattr *na_info = NULL;
> +	struct nlattr *na_vxlan = NULL;
> +	bool found = false;
> +	unsigned int vxindex;
> +
> +	if (nlh->nlmsg_type != RTM_NEWLINK) {
> +		rte_errno = EINVAL;
> +		return -rte_errno;
> +	}
> +	ifm = mnl_nlmsg_get_payload(nlh);
> +	if (!ifm->ifi_index) {
> +		rte_errno = EINVAL;
> +		return -rte_errno;
> +	}
> +	mnl_attr_for_each(na, nlh, sizeof(*ifm))
> +		if (mnl_attr_get_type(na) == IFLA_LINKINFO) {
> +			na_info = na;
> +			break;
> +		}
> +	if (!na_info)
> +		return 1;
> +	mnl_attr_for_each_nested(na, na_info) {
> +		switch (mnl_attr_get_type(na)) {
> +		case IFLA_INFO_KIND:
> +			if (!strncmp("vxlan", mnl_attr_get_str(na),
> +				     mnl_attr_get_len(na)))
> +				found = true;
> +			break;
> +		case IFLA_INFO_DATA:
> +			na_vxlan = na;
> +			break;
> +		}
> +		if (found && na_vxlan)
> +			break;
> +	}
> +	if (!found || !na_vxlan)
> +		return 1;
> +	found = false;
> +	mnl_attr_for_each_nested(na, na_vxlan) {
> +		if (mnl_attr_get_type(na) == IFLA_VXLAN_LINK &&
> +		    mnl_attr_get_u32(na) == ctx->ifindex) {
> +			found = true;
> +			break;
> +		}
> +	}
> +	if (!found)
> +		return 1;
> +	/* Attached VXLAN device found, store the command to delete. */
> +	vxindex = ifm->ifi_index;
> +	cmd = flow_tcf_alloc_nlcmd(ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) +
> +					MNL_ALIGN(sizeof(struct ifinfomsg)));
> +	if (!cmd) {
> +		rte_errno = ENOMEM;
> +		return -rte_errno;
> +	}
> +	cmd = mnl_nlmsg_put_header(cmd);
> +	cmd->nlmsg_type = RTM_DELLINK;
> +	cmd->nlmsg_flags = NLM_F_REQUEST;
> +	ifm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifm));
> +	ifm->ifi_family = AF_UNSPEC;
> +	ifm->ifi_index = vxindex;
> +	return 1;
> +}
> +
> +/**
> + * Cleanup the outer interface. Removes all found VXLAN devices
> + * attached to the specified index, flushes the neigh and local IP
> + * databases.
> + *
> + * @param[in] tcf
> + *   Context object initialized by mlx5_flow_tcf_context_create().
> + * @param[in] ifindex
> + *   Network interface index to perform cleanup on.
> + */
> +static void
> +flow_tcf_encap_iface_cleanup(struct mlx5_flow_tcf_context *tcf,
> +			    unsigned int ifindex)
> +{
> +	struct nlmsghdr *nlh;
> +	struct ifinfomsg *ifm;
> +	struct tcf_nlcb_context ctx = {
> +		.ifindex = ifindex,
> +		.bufsize = MNL_REQUEST_SIZE,
> +		.nlbuf = LIST_HEAD_INITIALIZER(),
> +	};
> +	int ret;
> +
> +	assert(ifindex);
> +	/*
> +	 * Seek and destroy leftover VXLAN encap/decap interfaces with
> +	 * matching properties.
> +	 */
> +	nlh = mnl_nlmsg_put_header(tcf->buf);
> +	nlh->nlmsg_type = RTM_GETLINK;
> +	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
> +	ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm));
> +	ifm->ifi_family = AF_UNSPEC;
> +	ret = flow_tcf_nl_ack(tcf, nlh, 0, flow_tcf_collect_vxlan_cb, &ctx);
> +	if (ret)
> +		DRV_LOG(WARNING, "netlink: query device list error %d", ret);
> +	ret = flow_tcf_send_nlcmd(tcf, &ctx);
> +	if (ret)
> +		DRV_LOG(WARNING, "netlink: device delete error %d", ret);
> +}
> +
> +
>  /**
>   * Create target interface index for VXLAN tunneling decapsulation.
>   * In order to share the UDP port within the other interfaces the
> @@ -4100,12 +4596,9 @@ static LIST_HEAD(, mlx5_flow_tcf_vtep)
>  		uint16_t pcnt;
>  
>  		/* Not found, we should create the new attached VTEP. */
> -/*
> - * TODO: not implemented yet
> - * flow_tcf_encap_iface_cleanup(tcf, ifouter);
> - * flow_tcf_encap_local_cleanup(tcf, ifouter);
> - * flow_tcf_encap_neigh_cleanup(tcf, ifouter);
> - */
> +		flow_tcf_encap_iface_cleanup(tcf, ifouter);
> +		flow_tcf_encap_local_cleanup(tcf, ifouter);
> +		flow_tcf_encap_neigh_cleanup(tcf, ifouter);

I have a fundamental question. Why are these cleanups needed? If I read the
code correctly, they clean up VTEPs, IP address assignments and neigh entries
which were not created/set by the PMD. The reason we have to clean them up is
that the PMD exclusively owns the interface (ifouter). Is my understanding
correct?

Thanks,
Yongseok

>  		for (pcnt = 0; pcnt <= (MLX5_VXLAN_PORT_RANGE_MAX
>  				     - MLX5_VXLAN_PORT_RANGE_MIN); pcnt++) {
>  			encap_port++;
>

