[dpdk-dev] [PATCH v2 6/7] net/mlx5: e-switch VXLAN encapsulation rules management

Yongseok Koh yskoh at mellanox.com
Thu Oct 25 02:33:14 CEST 2018


On Mon, Oct 15, 2018 at 02:13:34PM +0000, Viacheslav Ovsiienko wrote:
> VXLAN encap rules are applied to the VF ingress traffic and have the
> VTEP as actual redirection destinations instead of outer PF.
> The encapsulation rule should provide:
> - redirection action VF->PF
> - VF port ID
> - some inner network parameters (MACs/IP)
> - the tunnel outer source IP (v4/v6)
> - the tunnel outer destination IP (v4/v6)
> - VNI - Virtual Network Identifier
> 
> There is no direct way found to provide kernel with all required
> encapsulation header parameters. The encapsulation VTEP is created
> attached to the outer interface and assumed as default path for
> egress encapsulated traffic. The outer tunnel IP addresses are
> assigned to interface using Netlink, the implicit route is
> created like this:
> 
>   ip addr add <src_ip> peer <dst_ip> dev <outer> scope link
> 
> Peer address provides implicit route, and scope link reduces
> the risk of conflicts. At initialization time all local scope
> link addresses are flushed from device (see next part of patchset).
> 
> The destination MAC address is provided via permanent neigh rule:
> 
>   ip neigh add dev <outer> lladdr <dst_mac> to <dst_ip> nud permanent
> 
> At initialization time all neigh rules of this type are flushed
> from device (see the next part of patchset).
> 
> Suggested-by: Adrien Mazarguil <adrien.mazarguil at 6wind.com>
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo at mellanox.com>
> ---
>  drivers/net/mlx5/mlx5_flow_tcf.c | 394 ++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 389 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5_flow_tcf.c b/drivers/net/mlx5/mlx5_flow_tcf.c
> index efa9c3b..a1d7733 100644
> --- a/drivers/net/mlx5/mlx5_flow_tcf.c
> +++ b/drivers/net/mlx5/mlx5_flow_tcf.c
> @@ -3443,6 +3443,376 @@ struct pedit_parser {
>  	return -err;
>  }
>  
> +/**
> + * Emit Netlink message to add/remove local address to the outer device.
> + * The address being added is visible within the link only (scope link).
> + *
> + * Note that an implicit route is maintained by the kernel due to the
> + * presence of a peer address (IFA_ADDRESS).
> + *
> + * These rules are used for encapsulation only and allow to assign
> + * the outer tunnel source IP address.
> + *
> + * @param[in] tcf
> + *   Libmnl socket context object.
> + * @param[in] encap
> + *   Encapsulation properties (source address and its peer).
> + * @param[in] ifindex
> + *   Network interface to apply rule.
> + * @param[in] enable
> + *   Toggle between add and remove.
> + * @param[out] error
> + *   Perform verbose error reporting if not NULL.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +flow_tcf_rule_local(struct mlx5_flow_tcf_context *tcf,
> +		    const struct mlx5_flow_tcf_vxlan_encap *encap,
> +		    unsigned int ifindex,
> +		    bool enable,
> +		    struct rte_flow_error *error)
> +{
> +	struct nlmsghdr *nlh;
> +	struct ifaddrmsg *ifa;
> +	alignas(struct nlmsghdr)
> +	uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)];
> +
> +	nlh = mnl_nlmsg_put_header(buf);
> +	nlh->nlmsg_type = enable ? RTM_NEWADDR : RTM_DELADDR;
> +	nlh->nlmsg_flags =
> +		NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
> +	nlh->nlmsg_seq = 0;
> +	ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
> +	ifa->ifa_flags = IFA_F_PERMANENT;
> +	ifa->ifa_scope = RT_SCOPE_LINK;
> +	ifa->ifa_index = ifindex;
> +	if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_SRC) {
> +		ifa->ifa_family = AF_INET;
> +		ifa->ifa_prefixlen = 32;
> +		mnl_attr_put_u32(nlh, IFA_LOCAL, encap->ipv4.src);
> +		if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST)
> +			mnl_attr_put_u32(nlh, IFA_ADDRESS,
> +					      encap->ipv4.dst);
> +	} else {
> +		assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_SRC);
> +		ifa->ifa_family = AF_INET6;
> +		ifa->ifa_prefixlen = 128;
> +		mnl_attr_put(nlh, IFA_LOCAL,
> +				  sizeof(encap->ipv6.src),
> +				  &encap->ipv6.src);
> +		if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_DST)
> +			mnl_attr_put(nlh, IFA_ADDRESS,
> +					  sizeof(encap->ipv6.dst),
> +					  &encap->ipv6.dst);
> +	}
> +	if (!flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL))
> +		return 0;
> +	return rte_flow_error_set
> +		(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
> +		 "netlink: cannot complete IFA request (ip addr add)");
> +}
> +
> +/**
> + * Emit Netlink message to add/remove neighbor.
> + *
> + * @param[in] tcf
> + *   Libmnl socket context object.
> + * @param[in] encap
> + *   Encapsulation properties (destination address).
> + * @param[in] ifindex
> + *   Network interface.
> + * @param[in] enable
> + *   Toggle between add and remove.
> + * @param[out] error
> + *   Perform verbose error reporting if not NULL.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf,
> +		     const struct mlx5_flow_tcf_vxlan_encap *encap,
> +		     unsigned int ifindex,
> +		     bool enable,
> +		     struct rte_flow_error *error)
> +{
> +	struct nlmsghdr *nlh;
> +	struct ndmsg *ndm;
> +	alignas(struct nlmsghdr)
> +	uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)];
> +
> +	nlh = mnl_nlmsg_put_header(buf);
> +	nlh->nlmsg_type = enable ? RTM_NEWNEIGH : RTM_DELNEIGH;
> +	nlh->nlmsg_flags =
> +		NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
> +	nlh->nlmsg_seq = 0;
> +	ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
> +	ndm->ndm_ifindex = ifindex;
> +	ndm->ndm_state = NUD_PERMANENT;
> +	ndm->ndm_flags = 0;
> +	ndm->ndm_type = 0;
> +	if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST) {
> +		ndm->ndm_family = AF_INET;
> +		mnl_attr_put_u32(nlh, NDA_DST, encap->ipv4.dst);
> +	} else {
> +		assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_DST);
> +		ndm->ndm_family = AF_INET6;
> +		mnl_attr_put(nlh, NDA_DST, sizeof(encap->ipv6.dst),
> +						 &encap->ipv6.dst);
> +	}
> +	if (encap->mask & MLX5_FLOW_TCF_ENCAP_ETH_SRC && enable)
> +		DRV_LOG(WARNING,
> +			"Outer ethernet source address cannot be "
> +			"forced for VXLAN encapsulation");
> +	if (encap->mask & MLX5_FLOW_TCF_ENCAP_ETH_DST)
> +		mnl_attr_put(nlh, NDA_LLADDR, sizeof(encap->eth.dst),
> +						    &encap->eth.dst);
> +	if (!flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL))
> +		return 0;
> +	return rte_flow_error_set
> +		(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
> +		 "netlink: cannot complete ND request (ip neigh)");
> +}
> +
> +/**
> + * Manage the local IP addresses and their peers IP addresses on the
> + * outer interface for encapsulation purposes. The kernel searches the
> + * appropriate device for tunnel egress traffic using the outer source
> + * IP, this IP should be assigned to the outer network device, otherwise
> + * kernel rejects the rule.
> + *
> + * Adds or removes the addresses using the Netlink command like this:
> + *   ip addr add <src_ip> peer <dst_ip> scope link dev <ifouter>
> + *
> + * The addresses are local to the netdev ("scope link"), this reduces
> + * the risk of conflicts. Note that an implicit route is maintained by
> + * the kernel due to the presence of a peer address (IFA_ADDRESS).
> + *
> + * @param[in] tcf
> + *   Libmnl socket context object.
> + * @param[in] vtep
> + *   VTEP object, contains rule database and ifouter index.
> + * @param[in] dev_flow
> + *   Flow object, contains the tunnel parameters (for encap only).
> + * @param[in] enable
> + *   Toggle between add and remove.
> + * @param[out] error
> + *   Perform verbose error reporting if not NULL.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf,
> +		     struct mlx5_flow_tcf_vtep *vtep,
> +		     struct mlx5_flow *dev_flow,
> +		     bool enable,
> +		     struct rte_flow_error *error)
> +{
> +	const struct mlx5_flow_tcf_vxlan_encap *encap =
> +						dev_flow->tcf.vxlan_encap;
> +	struct tcf_local_rule *rule;
> +	bool found = false;
> +	int ret;
> +
> +	assert(encap);
> +	assert(encap->hdr.type == MLX5_FLOW_TCF_TUNACT_VXLAN_ENCAP);
> +	if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_SRC) {
> +		assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST);
> +		LIST_FOREACH(rule, &vtep->local, next) {
> +			if (rule->mask & MLX5_FLOW_TCF_ENCAP_IPV4_SRC &&
> +			    encap->ipv4.src == rule->ipv4.src &&
> +			    encap->ipv4.dst == rule->ipv4.dst) {
> +				found = true;
> +				break;
> +			}
> +		}
> +	} else {
> +		assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_SRC);
> +		assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_DST);
> +		LIST_FOREACH(rule, &vtep->local, next) {
> +			if (rule->mask & MLX5_FLOW_TCF_ENCAP_IPV6_SRC &&
> +			    !memcmp(&encap->ipv6.src, &rule->ipv6.src,
> +					    sizeof(encap->ipv6.src)) &&
> +			    !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
> +					    sizeof(encap->ipv6.dst))) {
> +				found = true;
> +				break;
> +			}
> +		}
> +	}
> +	if (found) {
> +		if (enable) {
> +			rule->refcnt++;
> +			return 0;
> +		}
> +		if (!rule->refcnt || !--rule->refcnt) {

Same suggestion for this as that of vtep - refcnt handling and adding get()
func.

> +			LIST_REMOVE(rule, next);
> +			return flow_tcf_rule_local(tcf, encap,
> +					vtep->ifouter, false, error);
> +		}
> +		return 0;
> +	}
> +	if (!enable) {
> +		DRV_LOG(WARNING, "Disabling not existing local rule");
> +		rte_flow_error_set
> +			(error, ENOENT, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> +			 NULL, "Disabling not existing local rule");
> +		return -ENOENT;
> +	}
> +	rule = rte_zmalloc(__func__, sizeof(struct tcf_local_rule),
> +				alignof(struct tcf_local_rule));
> +	if (!rule) {
> +		rte_flow_error_set
> +			(error, ENOMEM, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> +			 NULL, "unable to allocate memory for local rule");
> +		return -rte_errno;
> +	}
> +	*rule = (struct tcf_local_rule){.refcnt = 0,
> +					.mask = 0,
> +					};

Is it effective? The allocated memory is already zeroed out.

> +	if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_SRC) {
> +		rule->mask = MLX5_FLOW_TCF_ENCAP_IPV4_SRC
> +			   | MLX5_FLOW_TCF_ENCAP_IPV4_DST;
> +		rule->ipv4.src = encap->ipv4.src;
> +		rule->ipv4.dst = encap->ipv4.dst;
> +	} else {
> +		rule->mask = MLX5_FLOW_TCF_ENCAP_IPV6_SRC
> +			   | MLX5_FLOW_TCF_ENCAP_IPV6_DST;
> +		memcpy(&rule->ipv6.src, &encap->ipv6.src,
> +				sizeof(rule->ipv6.src));
> +		memcpy(&rule->ipv6.dst, &encap->ipv6.dst,
> +				sizeof(rule->ipv6.dst));
> +	}
> +	ret = flow_tcf_rule_local(tcf, encap, vtep->ifouter, true, error);
> +	if (ret) {
> +		rte_free(rule);
> +		return ret;
> +	}
> +	rule->refcnt++;
> +	LIST_INSERT_HEAD(&vtep->local, rule, next);
> +	return 0;
> +}
> +
> +/**
> + * Manage the destination MAC/IP addresses neigh database, kernel uses
> + * this one to determine the destination MAC address within encapsulation
> + * header. Adds or removes the entries using the Netlink command like this:
> + *   ip neigh add dev <ifouter> lladdr <dst_mac> to <dst_ip> nud permanent
> + *
> + * @param[in] tcf
> + *   Libmnl socket context object.
> + * @param[in] vtep
> + *   VTEP object, contains rule database and ifouter index.
> + * @param[in] dev_flow
> + *   Flow object, contains the tunnel parameters (for encap only).
> + * @param[in] enable
> + *   Toggle between add and remove.
> + * @param[out] error
> + *   Perform verbose error reporting if not NULL.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf,
> +		     struct mlx5_flow_tcf_vtep *vtep,
> +		     struct mlx5_flow *dev_flow,
> +		     bool enable,
> +		     struct rte_flow_error *error)
> +{
> +	const struct mlx5_flow_tcf_vxlan_encap *encap =
> +						dev_flow->tcf.vxlan_encap;
> +	struct tcf_neigh_rule *rule;
> +	bool found = false;
> +	int ret;
> +
> +	assert(encap);
> +	assert(encap->hdr.type == MLX5_FLOW_TCF_TUNACT_VXLAN_ENCAP);
> +	if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST) {
> +		assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_SRC);
> +		LIST_FOREACH(rule, &vtep->neigh, next) {
> +			if (rule->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST &&
> +			    encap->ipv4.dst == rule->ipv4.dst) {
> +				found = true;
> +				break;
> +			}
> +		}
> +	} else {
> +		assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_SRC);
> +		assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_DST);
> +		LIST_FOREACH(rule, &vtep->neigh, next) {
> +			if (rule->mask & MLX5_FLOW_TCF_ENCAP_IPV6_DST &&
> +			    !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
> +						sizeof(encap->ipv6.dst))) {
> +				found = true;
> +				break;
> +			}
> +		}
> +	}
> +	if (found) {
> +		if (memcmp(&encap->eth.dst, &rule->eth,
> +			   sizeof(encap->eth.dst))) {
> +			DRV_LOG(WARNING, "Destination MAC differs"
> +					 " in neigh rule");
> +			rte_flow_error_set(error, EEXIST,
> +					   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> +					   NULL, "Different MAC address"
> +					   " neigh rule for the same"
> +					   " destination IP");
> +					return -EEXIST;
> +		}
> +		if (enable) {
> +			rule->refcnt++;
> +			return 0;
> +		}
> +		if (!rule->refcnt || !--rule->refcnt) {

Same suggestion for this as that of vtep - refcnt handling by adding
create()/get()/release() func.

> +			LIST_REMOVE(rule, next);
> +			return flow_tcf_rule_neigh(tcf, encap,
> +						   vtep->ifouter,
> +						   false, error);
> +		}
> +		return 0;
> +	}
> +	if (!enable) {
> +		DRV_LOG(WARNING, "Disabling not existing neigh rule");
> +		rte_flow_error_set
> +			(error, ENOENT, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> +			 NULL, "unable to allocate memory for neigh rule");
> +		return -ENOENT;
> +	}
> +	rule = rte_zmalloc(__func__, sizeof(struct tcf_neigh_rule),
> +				alignof(struct tcf_neigh_rule));
> +	if (!rule) {
> +		rte_flow_error_set
> +			(error, ENOMEM, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> +			 NULL, "unadble to allocate memory for neigh rule");
> +		return -rte_errno;
> +	}
> +	*rule = (struct tcf_neigh_rule){.refcnt = 0,
> +					.mask = 0,
> +					};

Is it effective? The allocated memory is already zeroed out.

> +	if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST) {
> +		rule->mask = MLX5_FLOW_TCF_ENCAP_IPV4_DST;
> +		rule->ipv4.dst = encap->ipv4.dst;
> +	} else {
> +		rule->mask = MLX5_FLOW_TCF_ENCAP_IPV6_DST;
> +		memcpy(&rule->ipv6.dst, &encap->ipv6.dst,
> +					sizeof(rule->ipv6.dst));
> +	}
> +	memcpy(&rule->eth, &encap->eth.dst, sizeof(rule->eth));
> +	ret = flow_tcf_rule_neigh(tcf, encap, vtep->ifouter, true, error);
> +	if (ret) {
> +		rte_free(rule);
> +		return ret;
> +	}
> +	rule->refcnt++;
> +	LIST_INSERT_HEAD(&vtep->neigh, rule, next);
> +	return 0;
> +}
> +
>  /* VTEP device list is shared between PMD port instances. */
>  static LIST_HEAD(, mlx5_flow_tcf_vtep)
>  			vtep_list_vxlan = LIST_HEAD_INITIALIZER();
> @@ -3715,6 +4085,7 @@ static LIST_HEAD(, mlx5_flow_tcf_vtep)
>  {
>  	static uint16_t encap_port = MLX5_VXLAN_PORT_RANGE_MIN - 1;
>  	struct mlx5_flow_tcf_vtep *vtep, *vlst;
> +	int ret;
>  
>  	assert(ifouter);
>  	/* Look whether the attached VTEP for encap is created. */
> @@ -3766,6 +4137,21 @@ static LIST_HEAD(, mlx5_flow_tcf_vtep)
>  	}
>  	if (!vtep)
>  		return 0;
> +	/* Create local ipaddr with peer to specify the outer IPs. */
> +	ret = flow_tcf_encap_local(tcf, vtep, dev_flow, true, error);
> +	if (ret) {
> +		if (!vtep->refcnt)
> +			flow_tcf_delete_iface(tcf, vtep);

There's no possibility of decreasing vtep->refcnt in flow_tcf_encap_local(),
then why do you expect it to be zero here? If it is already zero at this point,
it should've been deleted when it became zero.

> +		return 0;
> +	}
> +	/* Create neigh rule to specify outer destination MAC. */
> +	ret = flow_tcf_encap_neigh(tcf, vtep, dev_flow, true, error);
> +	if (ret) {
> +		flow_tcf_encap_local(tcf, vtep, dev_flow, false, error);
> +		if (!vtep->refcnt)
> +			flow_tcf_delete_iface(tcf, vtep);

Same here.

Thanks,
Yongseok

> +		return 0;
> +	}
>  	vtep->refcnt++;
>  	assert(vtep->ifindex);
>  	return vtep->ifindex;
> @@ -3848,11 +4234,9 @@ static LIST_HEAD(, mlx5_flow_tcf_vtep)
>  	case MLX5_FLOW_TCF_TUNACT_VXLAN_DECAP:
>  		break;
>  	case MLX5_FLOW_TCF_TUNACT_VXLAN_ENCAP:
> -/*
> - * TODO: Remove the encap ancillary rules first.
> - * flow_tcf_encap_neigh(tcf, vtep, dev_flow, false, NULL);
> - * flow_tcf_encap_local(tcf, vtep, dev_flow, false, NULL);
> - */
> +		/* Remove the encap ancillary rules first. */
> +		flow_tcf_encap_neigh(tcf, vtep, dev_flow, false, NULL);
> +		flow_tcf_encap_local(tcf, vtep, dev_flow, false, NULL);
>  		break;
>  	default:
>  		assert(false);
> 


More information about the dev mailing list