[dpdk-dev] [PATCH v2 6/7] net/mlx5: e-switch VXLAN encapsulation rules management
Yongseok Koh
yskoh at mellanox.com
Thu Oct 25 02:33:14 CEST 2018
On Mon, Oct 15, 2018 at 02:13:34PM +0000, Viacheslav Ovsiienko wrote:
> VXLAN encap rules are applied to the VF ingress traffic and have the
> VTEP as actual redirection destinations instead of outer PF.
> The encapsulation rule should provide:
> - redirection action VF->PF
> - VF port ID
> - some inner network parameters (MACs/IP)
> - the tunnel outer source IP (v4/v6)
> - the tunnel outer destination IP (v4/v6). Current
> - VNI - Virtual Network Identifier
>
> There is no direct way found to provide kernel with all required
> encapsulation header parameters. The encapsulation VTEP is created
> attached to the outer interface and assumed as default path for
> egress encapsulated traffic. The outer tunnel IP addresses are
> assigned to interface using Netlink, the implicit route is
> created like this:
>
> ip addr add <src_ip> peer <dst_ip> dev <outer> scope link
>
> The peer address provides an implicit route, and scope link reduces
> the risk of conflicts. At initialization time all local scope
> link addresses are flushed from device (see next part of patchset).
>
> The destination MAC address is provided via a permanent neigh rule:
>
> ip neigh add dev <outer> lladdr <dst_mac> to <dst_ip> nud permanent
>
> At initialization time all neigh rules of this type are flushed
> from device (see the next part of patchset).
>
> Suggested-by: Adrien Mazarguil <adrien.mazarguil at 6wind.com>
> Signed-off-by: Viacheslav Ovsiienko <viacheslavo at mellanox.com>
> ---
> drivers/net/mlx5/mlx5_flow_tcf.c | 394 ++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 389 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/net/mlx5/mlx5_flow_tcf.c b/drivers/net/mlx5/mlx5_flow_tcf.c
> index efa9c3b..a1d7733 100644
> --- a/drivers/net/mlx5/mlx5_flow_tcf.c
> +++ b/drivers/net/mlx5/mlx5_flow_tcf.c
> @@ -3443,6 +3443,376 @@ struct pedit_parser {
> return -err;
> }
>
> +/**
> + * Emit Netlink message to add/remove local address to the outer device.
> + * The address being added is visible within the link only (scope link).
> + *
> + * Note that an implicit route is maintained by the kernel due to the
> + * presence of a peer address (IFA_ADDRESS).
> + *
> + * These rules are used for encapsulation only and allow to assign
> + * the outer tunnel source IP address.
> + *
> + * @param[in] tcf
> + * Libmnl socket context object.
> + * @param[in] encap
> + * Encapsulation properties (source address and its peer).
> + * @param[in] ifindex
> + * Network interface to apply rule.
> + * @param[in] enable
> + * Toggle between add and remove.
> + * @param[out] error
> + * Perform verbose error reporting if not NULL.
> + *
> + * @return
> + * 0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +flow_tcf_rule_local(struct mlx5_flow_tcf_context *tcf,
> + const struct mlx5_flow_tcf_vxlan_encap *encap,
> + unsigned int ifindex,
> + bool enable,
> + struct rte_flow_error *error)
> +{
> + struct nlmsghdr *nlh;
> + struct ifaddrmsg *ifa;
> + alignas(struct nlmsghdr)
> + uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)];
> +
> + nlh = mnl_nlmsg_put_header(buf);
> + nlh->nlmsg_type = enable ? RTM_NEWADDR : RTM_DELADDR;
> + nlh->nlmsg_flags =
> + NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
> + nlh->nlmsg_seq = 0;
> + ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
> + ifa->ifa_flags = IFA_F_PERMANENT;
> + ifa->ifa_scope = RT_SCOPE_LINK;
> + ifa->ifa_index = ifindex;
> + if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_SRC) {
> + ifa->ifa_family = AF_INET;
> + ifa->ifa_prefixlen = 32;
> + mnl_attr_put_u32(nlh, IFA_LOCAL, encap->ipv4.src);
> + if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST)
> + mnl_attr_put_u32(nlh, IFA_ADDRESS,
> + encap->ipv4.dst);
> + } else {
> + assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_SRC);
> + ifa->ifa_family = AF_INET6;
> + ifa->ifa_prefixlen = 128;
> + mnl_attr_put(nlh, IFA_LOCAL,
> + sizeof(encap->ipv6.src),
> + &encap->ipv6.src);
> + if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_DST)
> + mnl_attr_put(nlh, IFA_ADDRESS,
> + sizeof(encap->ipv6.dst),
> + &encap->ipv6.dst);
> + }
> + if (!flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL))
> + return 0;
> + return rte_flow_error_set
> + (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
> + "netlink: cannot complete IFA request (ip addr add)");
> +}
> +
> +/**
> + * Emit Netlink message to add/remove neighbor.
> + *
> + * @param[in] tcf
> + * Libmnl socket context object.
> + * @param[in] encap
> + * Encapsulation properties (destination address).
> + * @param[in] ifindex
> + * Network interface.
> + * @param[in] enable
> + * Toggle between add and remove.
> + * @param[out] error
> + * Perform verbose error reporting if not NULL.
> + *
> + * @return
> + * 0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf,
> + const struct mlx5_flow_tcf_vxlan_encap *encap,
> + unsigned int ifindex,
> + bool enable,
> + struct rte_flow_error *error)
> +{
> + struct nlmsghdr *nlh;
> + struct ndmsg *ndm;
> + alignas(struct nlmsghdr)
> + uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)];
> +
> + nlh = mnl_nlmsg_put_header(buf);
> + nlh->nlmsg_type = enable ? RTM_NEWNEIGH : RTM_DELNEIGH;
> + nlh->nlmsg_flags =
> + NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
> + nlh->nlmsg_seq = 0;
> + ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
> + ndm->ndm_ifindex = ifindex;
> + ndm->ndm_state = NUD_PERMANENT;
> + ndm->ndm_flags = 0;
> + ndm->ndm_type = 0;
> + if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST) {
> + ndm->ndm_family = AF_INET;
> + mnl_attr_put_u32(nlh, NDA_DST, encap->ipv4.dst);
> + } else {
> + assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_DST);
> + ndm->ndm_family = AF_INET6;
> + mnl_attr_put(nlh, NDA_DST, sizeof(encap->ipv6.dst),
> + &encap->ipv6.dst);
> + }
> + if (encap->mask & MLX5_FLOW_TCF_ENCAP_ETH_SRC && enable)
> + DRV_LOG(WARNING,
> + "Outer ethernet source address cannot be "
> + "forced for VXLAN encapsulation");
> + if (encap->mask & MLX5_FLOW_TCF_ENCAP_ETH_DST)
> + mnl_attr_put(nlh, NDA_LLADDR, sizeof(encap->eth.dst),
> + &encap->eth.dst);
> + if (!flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL))
> + return 0;
> + return rte_flow_error_set
> + (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
> + "netlink: cannot complete ND request (ip neigh)");
> +}
> +
> +/**
> + * Manage the local IP addresses and their peers IP addresses on the
> + * outer interface for encapsulation purposes. The kernel searches the
> + * appropriate device for tunnel egress traffic using the outer source
> + * IP, this IP should be assigned to the outer network device, otherwise
> + * kernel rejects the rule.
> + *
> + * Adds or removes the addresses using the Netlink command like this:
> + * ip addr add <src_ip> peer <dst_ip> scope link dev <ifouter>
> + *
> + * The addresses are local to the netdev ("scope link"), this reduces
> + * the risk of conflicts. Note that an implicit route is maintained by
> + * the kernel due to the presence of a peer address (IFA_ADDRESS).
> + *
> + * @param[in] tcf
> + * Libmnl socket context object.
> + * @param[in] vtep
> + * VTEP object, contains rule database and ifouter index.
> + * @param[in] dev_flow
> + * Flow object, contains the tunnel parameters (for encap only).
> + * @param[in] enable
> + * Toggle between add and remove.
> + * @param[out] error
> + * Perform verbose error reporting if not NULL.
> + *
> + * @return
> + * 0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf,
> + struct mlx5_flow_tcf_vtep *vtep,
> + struct mlx5_flow *dev_flow,
> + bool enable,
> + struct rte_flow_error *error)
> +{
> + const struct mlx5_flow_tcf_vxlan_encap *encap =
> + dev_flow->tcf.vxlan_encap;
> + struct tcf_local_rule *rule;
> + bool found = false;
> + int ret;
> +
> + assert(encap);
> + assert(encap->hdr.type == MLX5_FLOW_TCF_TUNACT_VXLAN_ENCAP);
> + if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_SRC) {
> + assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST);
> + LIST_FOREACH(rule, &vtep->local, next) {
> + if (rule->mask & MLX5_FLOW_TCF_ENCAP_IPV4_SRC &&
> + encap->ipv4.src == rule->ipv4.src &&
> + encap->ipv4.dst == rule->ipv4.dst) {
> + found = true;
> + break;
> + }
> + }
> + } else {
> + assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_SRC);
> + assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_DST);
> + LIST_FOREACH(rule, &vtep->local, next) {
> + if (rule->mask & MLX5_FLOW_TCF_ENCAP_IPV6_SRC &&
> + !memcmp(&encap->ipv6.src, &rule->ipv6.src,
> + sizeof(encap->ipv6.src)) &&
> + !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
> + sizeof(encap->ipv6.dst))) {
> + found = true;
> + break;
> + }
> + }
> + }
> + if (found) {
> + if (enable) {
> + rule->refcnt++;
> + return 0;
> + }
> + if (!rule->refcnt || !--rule->refcnt) {
Same suggestion here as for the vtep - factor out the refcnt handling and add a
get() func.
> + LIST_REMOVE(rule, next);
> + return flow_tcf_rule_local(tcf, encap,
> + vtep->ifouter, false, error);
> + }
> + return 0;
> + }
> + if (!enable) {
> + DRV_LOG(WARNING, "Disabling not existing local rule");
> + rte_flow_error_set
> + (error, ENOENT, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> + NULL, "Disabling not existing local rule");
> + return -ENOENT;
> + }
> + rule = rte_zmalloc(__func__, sizeof(struct tcf_local_rule),
> + alignof(struct tcf_local_rule));
> + if (!rule) {
> + rte_flow_error_set
> + (error, ENOMEM, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> + NULL, "unable to allocate memory for local rule");
> + return -rte_errno;
> + }
> + *rule = (struct tcf_local_rule){.refcnt = 0,
> + .mask = 0,
> + };
Is this assignment necessary? rte_zmalloc() already zeroes the allocated memory.
> + if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_SRC) {
> + rule->mask = MLX5_FLOW_TCF_ENCAP_IPV4_SRC
> + | MLX5_FLOW_TCF_ENCAP_IPV4_DST;
> + rule->ipv4.src = encap->ipv4.src;
> + rule->ipv4.dst = encap->ipv4.dst;
> + } else {
> + rule->mask = MLX5_FLOW_TCF_ENCAP_IPV6_SRC
> + | MLX5_FLOW_TCF_ENCAP_IPV6_DST;
> + memcpy(&rule->ipv6.src, &encap->ipv6.src,
> + sizeof(rule->ipv6.src));
> + memcpy(&rule->ipv6.dst, &encap->ipv6.dst,
> + sizeof(rule->ipv6.dst));
> + }
> + ret = flow_tcf_rule_local(tcf, encap, vtep->ifouter, true, error);
> + if (ret) {
> + rte_free(rule);
> + return ret;
> + }
> + rule->refcnt++;
> + LIST_INSERT_HEAD(&vtep->local, rule, next);
> + return 0;
> +}
> +
> +/**
> + * Manage the destination MAC/IP addresses neigh database, kernel uses
> + * this one to determine the destination MAC address within encapsulation
> + * header. Adds or removes the entries using the Netlink command like this:
> + * ip neigh add dev <ifouter> lladdr <dst_mac> to <dst_ip> nud permanent
> + *
> + * @param[in] tcf
> + * Libmnl socket context object.
> + * @param[in] vtep
> + * VTEP object, contains rule database and ifouter index.
> + * @param[in] dev_flow
> + * Flow object, contains the tunnel parameters (for encap only).
> + * @param[in] enable
> + * Toggle between add and remove.
> + * @param[out] error
> + * Perform verbose error reporting if not NULL.
> + *
> + * @return
> + * 0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf,
> + struct mlx5_flow_tcf_vtep *vtep,
> + struct mlx5_flow *dev_flow,
> + bool enable,
> + struct rte_flow_error *error)
> +{
> + const struct mlx5_flow_tcf_vxlan_encap *encap =
> + dev_flow->tcf.vxlan_encap;
> + struct tcf_neigh_rule *rule;
> + bool found = false;
> + int ret;
> +
> + assert(encap);
> + assert(encap->hdr.type == MLX5_FLOW_TCF_TUNACT_VXLAN_ENCAP);
> + if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST) {
> + assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_SRC);
> + LIST_FOREACH(rule, &vtep->neigh, next) {
> + if (rule->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST &&
> + encap->ipv4.dst == rule->ipv4.dst) {
> + found = true;
> + break;
> + }
> + }
> + } else {
> + assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_SRC);
> + assert(encap->mask & MLX5_FLOW_TCF_ENCAP_IPV6_DST);
> + LIST_FOREACH(rule, &vtep->neigh, next) {
> + if (rule->mask & MLX5_FLOW_TCF_ENCAP_IPV6_DST &&
> + !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
> + sizeof(encap->ipv6.dst))) {
> + found = true;
> + break;
> + }
> + }
> + }
> + if (found) {
> + if (memcmp(&encap->eth.dst, &rule->eth,
> + sizeof(encap->eth.dst))) {
> + DRV_LOG(WARNING, "Destination MAC differs"
> + " in neigh rule");
> + rte_flow_error_set(error, EEXIST,
> + RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> + NULL, "Different MAC address"
> + " neigh rule for the same"
> + " destination IP");
> + return -EEXIST;
> + }
> + if (enable) {
> + rule->refcnt++;
> + return 0;
> + }
> + if (!rule->refcnt || !--rule->refcnt) {
Same suggestion for this as that of vtep - refcnt handling by adding
create()/get()/release() func.
> + LIST_REMOVE(rule, next);
> + return flow_tcf_rule_neigh(tcf, encap,
> + vtep->ifouter,
> + false, error);
> + }
> + return 0;
> + }
> + if (!enable) {
> + DRV_LOG(WARNING, "Disabling not existing neigh rule");
> + rte_flow_error_set
> + (error, ENOENT, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> + NULL, "unable to allocate memory for neigh rule");
> + return -ENOENT;
> + }
> + rule = rte_zmalloc(__func__, sizeof(struct tcf_neigh_rule),
> + alignof(struct tcf_neigh_rule));
> + if (!rule) {
> + rte_flow_error_set
> + (error, ENOMEM, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> + NULL, "unadble to allocate memory for neigh rule");
> + return -rte_errno;
> + }
> + *rule = (struct tcf_neigh_rule){.refcnt = 0,
> + .mask = 0,
> + };
Is this assignment necessary? rte_zmalloc() already zeroes the allocated memory.
> + if (encap->mask & MLX5_FLOW_TCF_ENCAP_IPV4_DST) {
> + rule->mask = MLX5_FLOW_TCF_ENCAP_IPV4_DST;
> + rule->ipv4.dst = encap->ipv4.dst;
> + } else {
> + rule->mask = MLX5_FLOW_TCF_ENCAP_IPV6_DST;
> + memcpy(&rule->ipv6.dst, &encap->ipv6.dst,
> + sizeof(rule->ipv6.dst));
> + }
> + memcpy(&rule->eth, &encap->eth.dst, sizeof(rule->eth));
> + ret = flow_tcf_rule_neigh(tcf, encap, vtep->ifouter, true, error);
> + if (ret) {
> + rte_free(rule);
> + return ret;
> + }
> + rule->refcnt++;
> + LIST_INSERT_HEAD(&vtep->neigh, rule, next);
> + return 0;
> +}
> +
> /* VTEP device list is shared between PMD port instances. */
> static LIST_HEAD(, mlx5_flow_tcf_vtep)
> vtep_list_vxlan = LIST_HEAD_INITIALIZER();
> @@ -3715,6 +4085,7 @@ static LIST_HEAD(, mlx5_flow_tcf_vtep)
> {
> static uint16_t encap_port = MLX5_VXLAN_PORT_RANGE_MIN - 1;
> struct mlx5_flow_tcf_vtep *vtep, *vlst;
> + int ret;
>
> assert(ifouter);
> /* Look whether the attached VTEP for encap is created. */
> @@ -3766,6 +4137,21 @@ static LIST_HEAD(, mlx5_flow_tcf_vtep)
> }
> if (!vtep)
> return 0;
> + /* Create local ipaddr with peer to specify the outer IPs. */
> + ret = flow_tcf_encap_local(tcf, vtep, dev_flow, true, error);
> + if (ret) {
> + if (!vtep->refcnt)
> + flow_tcf_delete_iface(tcf, vtep);
There's no possibility of decreasing vtep->refcnt in flow_tcf_encap_local(),
then why do you expect it to be zero here? If it is already zero at this point,
it should've been deleted when it became zero.
> + return 0;
> + }
> + /* Create neigh rule to specify outer destination MAC. */
> + ret = flow_tcf_encap_neigh(tcf, vtep, dev_flow, true, error);
> + if (ret) {
> + flow_tcf_encap_local(tcf, vtep, dev_flow, false, error);
> + if (!vtep->refcnt)
> + flow_tcf_delete_iface(tcf, vtep);
Same here.
Thanks,
Yongseok
> + return 0;
> + }
> vtep->refcnt++;
> assert(vtep->ifindex);
> return vtep->ifindex;
> @@ -3848,11 +4234,9 @@ static LIST_HEAD(, mlx5_flow_tcf_vtep)
> case MLX5_FLOW_TCF_TUNACT_VXLAN_DECAP:
> break;
> case MLX5_FLOW_TCF_TUNACT_VXLAN_ENCAP:
> -/*
> - * TODO: Remove the encap ancillary rules first.
> - * flow_tcf_encap_neigh(tcf, vtep, dev_flow, false, NULL);
> - * flow_tcf_encap_local(tcf, vtep, dev_flow, false, NULL);
> - */
> + /* Remove the encap ancillary rules first. */
> + flow_tcf_encap_neigh(tcf, vtep, dev_flow, false, NULL);
> + flow_tcf_encap_local(tcf, vtep, dev_flow, false, NULL);
> break;
> default:
> assert(false);
>
More information about the dev
mailing list