Roopa Prabhu <ro...@cumulusnetworks.com> writes:

> From: Roopa Prabhu <ro...@cumulusnetworks.com>
>
> This patch adds support for MPLS multipath routes.
>
> Includes following changes to support multipath:
> - splits struct mpls_route into 'struct mpls_route + struct mpls_nh'
>
> - 'struct mpls_nh' represents a mpls nexthop label forwarding entry
>
> - moves mpls route and nexthop structures into internal.h
>
> - A mpls_route can point to multiple mpls_nh structs
>
> - the nexthops are maintained as a list

So I am not certain I like nexthops being a list.  In the practical case
introducing this list guarantees that everyone will see at least an
extra cache line miss in the forwarding path.

In the more abstract sense, a list is the wrong data structure.  If the
list is so short that we can afford to walk it, an array is a better data
structure.  If we need enough entries to make the memory consumption
of an array a concern, we want some kind of hash table or tree data
structure, because a list will be too long in that case.

So can we please not use a list?

I expect we can simplify the data structures by noting that rt_via must
be an Ethernet MAC address today, so 6 bytes are enough; using 8 bytes
gives us a bit extra and aligns things nicely.

Also, I know it goes away in the next patch, but a spinlock taken on
every transit through the forwarding path really bugs me.

Eric

> - In the process of restructuring, this patch also consistently changes all
> labels to u8
>
> - Adds support to parse/fill RTA_MULTIPATH netlink attribute for
> multipath routes similar to ipv4/v6 fib
>
> - In this patch, the multipath route nexthop selection algorithm
> is a simple round robin picked up from ipv4 fib code and is replaced by
> a hash based algorithm from Robert Shearman in the next patch
>
> - mpls_route_update cleanup: remove 'dev' handling in mpls_route_update.
> mpls_route_update though implemented to update based on dev, it was never
> used that way. And the dev handling gets tricky with multiple nexthops. Cannot
> match against any single nexthops dev. So, this patch removes the unused
> 'dev' handling in mpls_route_update.

>
> Example:
>
> $ip -f mpls route add 100 nexthop as 200 via inet 10.1.1.2 dev swp1 \
>                 nexthop as 700 via inet 10.1.1.6 dev swp2 \
>                 nexthop as 800 via inet 40.1.1.2 dev swp3
>
> $ip  -f mpls route show
> 100
>         nexthop as to 200 via inet 10.1.1.2  dev swp1
>         nexthop as to 700 via inet 10.1.1.6  dev swp2
>         nexthop as to 800 via inet 40.1.1.2  dev swp3
>
> Signed-off-by: Roopa Prabhu <ro...@cumulusnetworks.com>
> ---
>  include/net/mpls_iptunnel.h |   2 +-
>  net/mpls/af_mpls.c          | 627 
> +++++++++++++++++++++++++++++++++-----------
>  net/mpls/internal.h         |  43 ++-
>  3 files changed, 516 insertions(+), 156 deletions(-)
>
> diff --git a/include/net/mpls_iptunnel.h b/include/net/mpls_iptunnel.h
> index 4757997..179253f 100644
> --- a/include/net/mpls_iptunnel.h
> +++ b/include/net/mpls_iptunnel.h
> @@ -18,7 +18,7 @@
>  
>  struct mpls_iptunnel_encap {
>       u32     label[MAX_NEW_LABELS];
> -     u32     labels;
> +     u8      labels;
>  };
>  
>  static inline struct mpls_iptunnel_encap *mpls_lwtunnel_encap(struct 
> lwtunnel_state *lwtstate)
> diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
> index 8c5707d..ae9e153 100644
> --- a/net/mpls/af_mpls.c
> +++ b/net/mpls/af_mpls.c
> @@ -19,39 +19,12 @@
>  #include <net/ipv6.h>
>  #include <net/addrconf.h>
>  #endif
> +#include <net/nexthop.h>
>  #include "internal.h"
>  
> -#define LABEL_NOT_SPECIFIED (1<<20)
> -#define MAX_NEW_LABELS 2
> -
> -/* This maximum ha length copied from the definition of struct neighbour */
> -#define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, sizeof(unsigned long)))
> -
> -enum mpls_payload_type {
> -     MPT_UNSPEC, /* IPv4 or IPv6 */
> -     MPT_IPV4 = 4,
> -     MPT_IPV6 = 6,
> -
> -     /* Other types not implemented:
> -      *  - Pseudo-wire with or without control word (RFC4385)
> -      *  - GAL (RFC5586)
> -      */
> -};
> -
> -struct mpls_route { /* next hop label forwarding entry */
> -     struct net_device __rcu *rt_dev;
> -     struct rcu_head         rt_rcu;
> -     u32                     rt_label[MAX_NEW_LABELS];
> -     u8                      rt_protocol; /* routing protocol that set this 
> entry */
> -     u8                      rt_payload_type;
> -     u8                      rt_labels;
> -     u8                      rt_via_alen;
> -     u8                      rt_via_table;
> -     u8                      rt_via[0];
> -};
> -
>  static int zero = 0;
>  static int label_limit = (1 << 20) - 1;
> +static DEFINE_SPINLOCK(mpls_multipath_lock);
>  
>  static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt,
>                      struct nlmsghdr *nlh, struct net *net, u32 portid,
> @@ -80,10 +53,10 @@ bool mpls_output_possible(const struct net_device *dev)
>  }
>  EXPORT_SYMBOL_GPL(mpls_output_possible);
>  
> -static unsigned int mpls_rt_header_size(const struct mpls_route *rt)
> +static unsigned int mpls_nh_header_size(const struct mpls_nh *nh)
>  {
>       /* The size of the layer 2.5 labels to be added for this route */
> -     return rt->rt_labels * sizeof(struct mpls_shim_hdr);
> +     return nh->nh_labels * sizeof(struct mpls_shim_hdr);
>  }
>  
>  unsigned int mpls_dev_mtu(const struct net_device *dev)
> @@ -105,8 +78,58 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned 
> int mtu)
>  }
>  EXPORT_SYMBOL_GPL(mpls_pkt_too_big);
>  
> -static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb,
> -                     struct mpls_entry_decoded dec)
> +/* This is a cut/copy/modify from fib_select_multipath */
> +static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt)
> +{
> +     struct mpls_nh *nh;
> +     struct mpls_nh *ret_nh;
> +     int nhsel = 0;
> +     int w;
> +
> +     spin_lock_bh(&mpls_multipath_lock);
> +     ret_nh = list_first_entry_or_null(&rt->rt_nhs, struct mpls_nh,
> +                                       nh_next);
> +     if (rt->rt_power <= 0) {
> +             int power = 0;
> +
> +             list_for_each_entry(nh, &rt->rt_nhs, nh_next) {
> +                     power += nh->nh_weight;
> +                     nh->nh_power = nh->nh_weight;
> +             }
> +             rt->rt_power = power;
> +             if (power <= 0) {
> +                     spin_unlock_bh(&mpls_multipath_lock);
> +                     /* Race condition: route has just become dead. */
> +                     return ret_nh;
> +             }
> +     }
> +
> +     /* w should be random number [0..rt->rt_power-1],
> +      * it is pretty bad approximation.
> +      */
> +     w = jiffies % rt->rt_power;
> +
> +     list_for_each_entry(nh, &rt->rt_nhs, nh_next) {
> +             if (nh->nh_power) {
> +                     w -= nh->nh_power;
> +                     if (w <= 0) {
> +                             nh->nh_power--;
> +                             rt->rt_power--;
> +                             ret_nh = nh;
> +                             spin_unlock_bh(&mpls_multipath_lock);
> +                             return ret_nh;
> +                     }
> +             }
> +             nhsel++;
> +     }
> +
> +     /* Race condition: route has just become dead. */
> +     spin_unlock_bh(&mpls_multipath_lock);
> +     return ret_nh;
> +}
> +
> +static bool mpls_egress(struct mpls_route *rt, struct mpls_nh *nh,
> +                     struct sk_buff *skb, struct mpls_entry_decoded dec)
>  {
>       enum mpls_payload_type payload_type;
>       bool success = false;
> @@ -159,6 +182,7 @@ static int mpls_forward(struct sk_buff *skb, struct 
> net_device *dev,
>       struct net *net = dev_net(dev);
>       struct mpls_shim_hdr *hdr;
>       struct mpls_route *rt;
> +     struct mpls_nh *nh;
>       struct mpls_entry_decoded dec;
>       struct net_device *out_dev;
>       struct mpls_dev *mdev;
> @@ -196,9 +220,13 @@ static int mpls_forward(struct sk_buff *skb, struct 
> net_device *dev,
>       if (!rt)
>               goto drop;
>  
> +     nh = mpls_select_multipath(rt);
> +     if (!nh)
> +             goto drop;
> +
>       /* Find the output device */
> -     out_dev = rcu_dereference(rt->rt_dev);
> -     if (!mpls_output_possible(out_dev))
> +     out_dev = rcu_dereference(nh->nh_dev);
> +     if (!out_dev || !mpls_output_possible(out_dev))
>               goto drop;
>  
>       if (skb_warn_if_lro(skb))
> @@ -212,7 +240,7 @@ static int mpls_forward(struct sk_buff *skb, struct 
> net_device *dev,
>       dec.ttl -= 1;
>  
>       /* Verify the destination can hold the packet */
> -     new_header_size = mpls_rt_header_size(rt);
> +     new_header_size = mpls_nh_header_size(nh);
>       mtu = mpls_dev_mtu(out_dev);
>       if (mpls_pkt_too_big(skb, mtu - new_header_size))
>               goto drop;
> @@ -230,7 +258,7 @@ static int mpls_forward(struct sk_buff *skb, struct 
> net_device *dev,
>  
>       if (unlikely(!new_header_size && dec.bos)) {
>               /* Penultimate hop popping */
> -             if (!mpls_egress(rt, skb, dec))
> +             if (!mpls_egress(rt, nh, skb, dec))
>                       goto drop;
>       } else {
>               bool bos;
> @@ -240,13 +268,14 @@ static int mpls_forward(struct sk_buff *skb, struct 
> net_device *dev,
>               /* Push the new labels */
>               hdr = mpls_hdr(skb);
>               bos = dec.bos;
> -             for (i = rt->rt_labels - 1; i >= 0; i--) {
> -                     hdr[i] = mpls_entry_encode(rt->rt_label[i], dec.ttl, 0, 
> bos);
> +             for (i = nh->nh_labels - 1; i >= 0; i--) {
> +                     hdr[i] = mpls_entry_encode(nh->nh_label[i],
> +                                                dec.ttl, 0, bos);
>                       bos = false;
>               }
>       }
>  
> -     err = neigh_xmit(rt->rt_via_table, out_dev, rt->rt_via, skb);
> +     err = neigh_xmit(nh->nh_via_table, out_dev, nh->nh_via, skb);
>       if (err)
>               net_dbg_ratelimited("%s: packet transmission failed: %d\n",
>                                   __func__, err);
> @@ -270,31 +299,43 @@ static const struct nla_policy 
> rtm_mpls_policy[RTA_MAX+1] = {
>  struct mpls_route_config {
>       u32                     rc_protocol;
>       u32                     rc_ifindex;
> -     u16                     rc_via_table;
> -     u16                     rc_via_alen;
> +     u8                      rc_via_table;
> +     u8                      rc_via_alen;
>       u8                      rc_via[MAX_VIA_ALEN];
> +     u8                      rc_output_labels;
>       u32                     rc_label;
> -     u32                     rc_output_labels;
>       u32                     rc_output_label[MAX_NEW_LABELS];
>       u32                     rc_nlflags;
>       enum mpls_payload_type  rc_payload_type;
>       struct nl_info          rc_nlinfo;
> +     struct rtnexthop        *rc_mp;
> +     int                     rc_mp_len;
>  };
>  
> -static struct mpls_route *mpls_rt_alloc(size_t alen)
> +static struct mpls_route *mpls_rt_alloc(int num_nh)
>  {
>       struct mpls_route *rt;
>  
> -     rt = kzalloc(sizeof(*rt) + alen, GFP_KERNEL);
> -     if (rt)
> -             rt->rt_via_alen = alen;
> +     rt = kzalloc(sizeof(*rt), GFP_KERNEL);
> +     if (rt) {
> +             rt->rt_nhn = num_nh;
> +             INIT_LIST_HEAD(&rt->rt_nhs);
> +     }
> +
>       return rt;
>  }
>  
>  static void mpls_rt_free(struct mpls_route *rt)
>  {
> -     if (rt)
> +     struct mpls_nh *nh, *nh_safe;
> +
> +     if (rt) {
> +             list_for_each_entry_safe(nh, nh_safe, &rt->rt_nhs, nh_next) {
> +                     list_del(&nh->nh_next);
> +                     kfree(nh);
> +             }
>               kfree_rcu(rt, rt_rcu);
> +     }
>  }
>  
>  static void mpls_notify_route(struct net *net, unsigned index,
> @@ -312,25 +353,22 @@ static void mpls_notify_route(struct net *net, unsigned 
> index,
>  }
>  
>  static void mpls_route_update(struct net *net, unsigned index,
> -                           struct net_device *dev, struct mpls_route *new,
> +                           struct mpls_route *new,
>                             const struct nl_info *info)
>  {
>       struct mpls_route __rcu **platform_label;
> -     struct mpls_route *rt, *old = NULL;
> +     struct mpls_route *rt;
>  
>       ASSERT_RTNL();
>  
>       platform_label = rtnl_dereference(net->mpls.platform_label);
>       rt = rtnl_dereference(platform_label[index]);
> -     if (!dev || (rt && (rtnl_dereference(rt->rt_dev) == dev))) {
> -             rcu_assign_pointer(platform_label[index], new);
> -             old = rt;
> -     }
> +     rcu_assign_pointer(platform_label[index], new);
>  
> -     mpls_notify_route(net, index, old, new, info);
> +     mpls_notify_route(net, index, rt, new, info);
>  
>       /* If we removed a route free it now */
> -     mpls_rt_free(old);
> +     mpls_rt_free(rt);
>  }
>  
>  static unsigned find_free_label(struct net *net)
> @@ -406,23 +444,23 @@ static struct net_device *inet6_fib_lookup_dev(struct 
> net *net, void *addr)
>  #endif
>  
>  static struct net_device *find_outdev(struct net *net,
> -                                   struct mpls_route_config *cfg)
> +                                   struct mpls_nh *nh, int oif)
>  {
>       struct net_device *dev = NULL;
>  
> -     if (!cfg->rc_ifindex) {
> -             switch (cfg->rc_via_table) {
> +     if (!oif) {
> +             switch (nh->nh_via_table) {
>               case NEIGH_ARP_TABLE:
> -                     dev = inet_fib_lookup_dev(net, cfg->rc_via);
> +                     dev = inet_fib_lookup_dev(net, nh->nh_via);
>                       break;
>               case NEIGH_ND_TABLE:
> -                     dev = inet6_fib_lookup_dev(net, cfg->rc_via);
> +                     dev = inet6_fib_lookup_dev(net, nh->nh_via);
>                       break;
>               case NEIGH_LINK_TABLE:
>                       break;
>               }
>       } else {
> -             dev = dev_get_by_index(net, cfg->rc_ifindex);
> +             dev = dev_get_by_index(net, oif);
>       }
>  
>       if (!dev)
> @@ -431,15 +469,208 @@ static struct net_device *find_outdev(struct net *net,
>       return dev;
>  }
>  
> +static int mpls_nh_assign_dev(struct net *net, struct mpls_nh *nh, int oif)
> +{
> +     struct net_device *dev = NULL;
> +     int err = -ENODEV;
> +
> +     dev = find_outdev(net, nh, oif);
> +     if (IS_ERR(dev)) {
> +             err = PTR_ERR(dev);
> +             dev = NULL;
> +             goto errout;
> +     }
> +
> +     /* Ensure this is a supported device */
> +     err = -EINVAL;
> +     if (!mpls_dev_get(dev))
> +             goto errout;
> +
> +     RCU_INIT_POINTER(nh->nh_dev, dev);
> +     dev_put(dev);
> +
> +     return 0;
> +
> +errout:
> +     if (dev)
> +             dev_put(dev);
> +     return err;
> +}
> +
> +static struct mpls_nh *mpls_nh_alloc(size_t alen)
> +{
> +     struct mpls_nh *nh;
> +
> +     nh = kzalloc(sizeof(*nh) + alen, GFP_KERNEL);
> +     if (nh)
> +             nh->nh_via_alen = alen;
> +
> +     return nh;
> +}
> +
> +static int mpls_nh_build_from_cfg(struct mpls_route_config *cfg,
> +                               struct mpls_route *rt)
> +{
> +     struct net *net = cfg->rc_nlinfo.nl_net;
> +     struct mpls_nh *nh = NULL;
> +     int err;
> +     int i;
> +
> +     err = -EINVAL;
> +     /* Ensure only a supported number of labels are present */
> +     if (cfg->rc_output_labels > MAX_NEW_LABELS)
> +             goto errout;
> +
> +     err = -ENOMEM;
> +     nh = mpls_nh_alloc(cfg->rc_via_alen);
> +     if (!nh)
> +             goto errout;
> +
> +     nh->nh_labels = cfg->rc_output_labels;
> +     for (i = 0; i < nh->nh_labels; i++)
> +             nh->nh_label[i] = cfg->rc_output_label[i];
> +
> +     nh->nh_via_table = cfg->rc_via_table;
> +     memcpy(nh->nh_via, cfg->rc_via, cfg->rc_via_alen);
> +     nh->nh_via_alen = cfg->rc_via_alen;
> +
> +     err = mpls_nh_assign_dev(net, nh, cfg->rc_ifindex);
> +     if (err)
> +             goto errout;
> +
> +     list_add_tail(&nh->nh_next, &rt->rt_nhs);
> +
> +     return 0;
> +
> +errout:
> +     kfree(nh);
> +
> +     return err;
> +}
> +
> +static int mpls_nh_build(struct net *net, struct mpls_nh **rt_nh,
> +                      int oif, struct nlattr *via_attr,
> +                      struct nlattr *newdst)
> +{
> +     struct mpls_nh *nh = NULL;
> +     int err;
> +     u8 via_alen;
> +     u8 via_table;
> +     u8 via[MAX_VIA_ALEN];
> +
> +     err = nla_get_via(via_attr, &via_alen, &via_table,
> +                       via);
> +     if (err)
> +             goto errout;
> +
> +     nh = mpls_nh_alloc(via_alen);
> +     if (!nh)
> +             goto errout;
> +
> +     if (newdst) {
> +             err = nla_get_labels(newdst, MAX_NEW_LABELS,
> +                                  &nh->nh_labels, nh->nh_label);
> +             if (err)
> +                     goto errout;
> +     }
> +     nh->nh_via_table = via_table;
> +     memcpy(nh->nh_via, via, via_alen);
> +
> +     err = mpls_nh_assign_dev(net, nh, oif);
> +     if (err)
> +             goto errout;
> +
> +     *rt_nh = nh;
> +
> +     return 0;
> +
> +errout:
> +     kfree(nh);
> +
> +     return err;
> +}
> +
> +static int mpls_count_nexthops(struct rtnexthop *rtnh, int len)
> +{
> +     int nhs = 0;
> +     int remaining = len;
> +
> +     while (rtnh_ok(rtnh, remaining)) {
> +             nhs++;
> +             rtnh = rtnh_next(rtnh, &remaining);
> +     }
> +
> +     /* leftover implies invalid nexthop configuration, discard it */
> +     return remaining > 0 ? 0 : nhs;
> +}
> +
> +static int mpls_nh_build_multi(struct mpls_route_config *cfg,
> +                            struct mpls_route *rt)
> +{
> +     struct rtnexthop *rtnh = cfg->rc_mp;
> +     struct nlattr *nla_via, *nla_newdst;
> +     int remaining = cfg->rc_mp_len;
> +     struct mpls_nh *nh, *nh_safe;
> +     int nhs = 0;
> +     int err = 0;
> +
> +     while (rtnh_ok(rtnh, remaining)) {
> +             int attrlen;
> +
> +             nla_via = NULL;
> +             nla_newdst = NULL;
> +             nh = NULL;
> +
> +             err = -EINVAL;
> +             if (!rtnh_ok(rtnh, remaining))
> +                     goto errout;
> +
> +             attrlen = rtnh_attrlen(rtnh);
> +             if (attrlen > 0) {
> +                     struct nlattr *attrs = rtnh_attrs(rtnh);
> +
> +                     nla_via = nla_find(attrs, attrlen, RTA_VIA);
> +                     nla_newdst = nla_find(attrs, attrlen, RTA_NEWDST);
> +             }
> +
> +             err = -EINVAL;
> +             if (!nla_via)
> +                     goto errout;
> +
> +             err = mpls_nh_build(cfg->rc_nlinfo.nl_net, &nh,
> +                                 rtnh->rtnh_ifindex, nla_via,
> +                                 nla_newdst);
> +             if (err)
> +                     goto errout;
> +
> +             nh->nh_weight = rtnh->rtnh_hops + 1;
> +             list_add_tail(&nh->nh_next, &rt->rt_nhs);
> +
> +             rtnh = rtnh_next(rtnh, &remaining);
> +             nhs++;
> +     }
> +
> +     rt->rt_nhn = nhs;
> +
> +     return 0;
> +
> +errout:
> +     list_for_each_entry_safe(nh, nh_safe, &rt->rt_nhs, nh_next) {
> +             list_del(&nh->nh_next);
> +             kfree(nh);
> +     }
> +
> +     return err;
> +}
> +
>  static int mpls_route_add(struct mpls_route_config *cfg)
>  {
>       struct mpls_route __rcu **platform_label;
>       struct net *net = cfg->rc_nlinfo.nl_net;
> -     struct net_device *dev = NULL;
>       struct mpls_route *rt, *old;
> -     unsigned index;
> -     int i;
>       int err = -EINVAL;
> +     unsigned index;
> +     int nhs = 1; /* default to one nexthop */
>  
>       index = cfg->rc_label;
>  
> @@ -457,27 +688,6 @@ static int mpls_route_add(struct mpls_route_config *cfg)
>       if (index >= net->mpls.platform_labels)
>               goto errout;
>  
> -     /* Ensure only a supported number of labels are present */
> -     if (cfg->rc_output_labels > MAX_NEW_LABELS)
> -             goto errout;
> -
> -     dev = find_outdev(net, cfg);
> -     if (IS_ERR(dev)) {
> -             err = PTR_ERR(dev);
> -             dev = NULL;
> -             goto errout;
> -     }
> -
> -     /* Ensure this is a supported device */
> -     err = -EINVAL;
> -     if (!mpls_dev_get(dev))
> -             goto errout;
> -
> -     err = -EINVAL;
> -     if ((cfg->rc_via_table == NEIGH_LINK_TABLE) &&
> -         (dev->addr_len != cfg->rc_via_alen))
> -             goto errout;
> -
>       /* Append makes no sense with mpls */
>       err = -EOPNOTSUPP;
>       if (cfg->rc_nlflags & NLM_F_APPEND)
> @@ -497,28 +707,34 @@ static int mpls_route_add(struct mpls_route_config *cfg)
>       if (!(cfg->rc_nlflags & NLM_F_CREATE) && !old)
>               goto errout;
>  
> +     if (cfg->rc_mp) {
> +             err = -EINVAL;
> +             nhs = mpls_count_nexthops(cfg->rc_mp, cfg->rc_mp_len);
> +             if (nhs == 0)
> +                     goto errout;
> +     }
> +
>       err = -ENOMEM;
> -     rt = mpls_rt_alloc(cfg->rc_via_alen);
> +     rt = mpls_rt_alloc(nhs);
>       if (!rt)
>               goto errout;
> -
> -     rt->rt_labels = cfg->rc_output_labels;
> -     for (i = 0; i < rt->rt_labels; i++)
> -             rt->rt_label[i] = cfg->rc_output_label[i];
>       rt->rt_protocol = cfg->rc_protocol;
> -     RCU_INIT_POINTER(rt->rt_dev, dev);
>       rt->rt_payload_type = cfg->rc_payload_type;
> -     rt->rt_via_table = cfg->rc_via_table;
> -     memcpy(rt->rt_via, cfg->rc_via, cfg->rc_via_alen);
>  
> -     mpls_route_update(net, index, NULL, rt, &cfg->rc_nlinfo);
> +     if (cfg->rc_mp)
> +             err = mpls_nh_build_multi(cfg, rt);
> +     else
> +             err = mpls_nh_build_from_cfg(cfg, rt);
> +     if (err)
> +             goto freert;
> +
> +     mpls_route_update(net, index, rt, &cfg->rc_nlinfo);
>  
> -     dev_put(dev);
>       return 0;
>  
> +freert:
> +     mpls_rt_free(rt);
>  errout:
> -     if (dev)
> -             dev_put(dev);
>       return err;
>  }
>  
> @@ -538,7 +754,7 @@ static int mpls_route_del(struct mpls_route_config *cfg)
>       if (index >= net->mpls.platform_labels)
>               goto errout;
>  
> -     mpls_route_update(net, index, NULL, NULL, &cfg->rc_nlinfo);
> +     mpls_route_update(net, index, NULL, &cfg->rc_nlinfo);
>  
>       err = 0;
>  errout:
> @@ -628,6 +844,7 @@ static void mpls_ifdown(struct net_device *dev)
>       struct mpls_route __rcu **platform_label;
>       struct net *net = dev_net(dev);
>       struct mpls_dev *mdev;
> +     struct mpls_nh *nh;
>       unsigned index;
>  
>       platform_label = rtnl_dereference(net->mpls.platform_label);
> @@ -635,9 +852,14 @@ static void mpls_ifdown(struct net_device *dev)
>               struct mpls_route *rt = rtnl_dereference(platform_label[index]);
>               if (!rt)
>                       continue;
> -             if (rtnl_dereference(rt->rt_dev) != dev)
> -                     continue;
> -             rt->rt_dev = NULL;
> +             list_for_each_entry(nh, &rt->rt_nhs, nh_next) {
> +                     struct net_device *mdev;
> +
> +                     mdev = rtnl_dereference(nh->nh_dev);
> +                     if (mdev != dev)
> +                             continue;
> +                     nh->nh_dev = NULL;
> +             }
>       }
>  
>       mdev = mpls_dev_get(dev);
> @@ -736,7 +958,7 @@ int nla_put_labels(struct sk_buff *skb, int attrtype,
>  EXPORT_SYMBOL_GPL(nla_put_labels);
>  
>  int nla_get_labels(const struct nlattr *nla,
> -                u32 max_labels, u32 *labels, u32 label[])
> +                u8 max_labels, u8 *labels, u32 label[])
>  {
>       unsigned len = nla_len(nla);
>       unsigned nla_labels;
> @@ -781,6 +1003,48 @@ int nla_get_labels(const struct nlattr *nla,
>  }
>  EXPORT_SYMBOL_GPL(nla_get_labels);
>  
> +int nla_get_via(const struct nlattr *nla, u8 *via_alen,
> +             u8 *via_table, u8 via_addr[])
> +{
> +     struct rtvia *via = nla_data(nla);
> +     int err = -EINVAL;
> +     u8 alen;
> +
> +     if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr))
> +             goto errout;
> +     alen = nla_len(nla) -
> +                     offsetof(struct rtvia, rtvia_addr);
> +     if (alen > MAX_VIA_ALEN)
> +             goto errout;
> +
> +     /* Validate the address family */
> +     switch (via->rtvia_family) {
> +     case AF_PACKET:
> +             *via_table = NEIGH_LINK_TABLE;
> +             break;
> +     case AF_INET:
> +             *via_table = NEIGH_ARP_TABLE;
> +             if (alen != 4)
> +                     goto errout;
> +             break;
> +     case AF_INET6:
> +             *via_table = NEIGH_ND_TABLE;
> +             if (alen != 16)
> +                     goto errout;
> +             break;
> +     default:
> +             /* Unsupported address family */
> +             goto errout;
> +     }
> +
> +     memcpy(via_addr, via->rtvia_addr, alen);
> +     *via_alen = alen;
> +     err = 0;
> +
> +errout:
> +     return err;
> +}
> +
>  static int rtm_to_route_config(struct sk_buff *skb,  struct nlmsghdr *nlh,
>                              struct mpls_route_config *cfg)
>  {
> @@ -844,7 +1108,7 @@ static int rtm_to_route_config(struct sk_buff *skb,  
> struct nlmsghdr *nlh,
>                       break;
>               case RTA_DST:
>               {
> -                     u32 label_count;
> +                     u8 label_count;
>                       if (nla_get_labels(nla, 1, &label_count,
>                                          &cfg->rc_label))
>                               goto errout;
> @@ -857,35 +1121,15 @@ static int rtm_to_route_config(struct sk_buff *skb,  
> struct nlmsghdr *nlh,
>               }
>               case RTA_VIA:
>               {
> -                     struct rtvia *via = nla_data(nla);
> -                     if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr))
> +                     if (nla_get_via(nla, &cfg->rc_via_alen,
> +                                     &cfg->rc_via_table, cfg->rc_via))
>                               goto errout;
> -                     cfg->rc_via_alen   = nla_len(nla) -
> -                             offsetof(struct rtvia, rtvia_addr);
> -                     if (cfg->rc_via_alen > MAX_VIA_ALEN)
> -                             goto errout;
> -
> -                     /* Validate the address family */
> -                     switch(via->rtvia_family) {
> -                     case AF_PACKET:
> -                             cfg->rc_via_table = NEIGH_LINK_TABLE;
> -                             break;
> -                     case AF_INET:
> -                             cfg->rc_via_table = NEIGH_ARP_TABLE;
> -                             if (cfg->rc_via_alen != 4)
> -                                     goto errout;
> -                             break;
> -                     case AF_INET6:
> -                             cfg->rc_via_table = NEIGH_ND_TABLE;
> -                             if (cfg->rc_via_alen != 16)
> -                                     goto errout;
> -                             break;
> -                     default:
> -                             /* Unsupported address family */
> -                             goto errout;
> -                     }
> -
> -                     memcpy(cfg->rc_via, via->rtvia_addr, cfg->rc_via_alen);
> +                     break;
> +             }
> +             case RTA_MULTIPATH:
> +             {
> +                     cfg->rc_mp = nla_data(nla);
> +                     cfg->rc_mp_len = nla_len(nla);
>                       break;
>               }
>               default:
> @@ -946,16 +1190,56 @@ static int mpls_dump_route(struct sk_buff *skb, u32 
> portid, u32 seq, int event,
>       rtm->rtm_type = RTN_UNICAST;
>       rtm->rtm_flags = 0;
>  
> -     if (rt->rt_labels &&
> -         nla_put_labels(skb, RTA_NEWDST, rt->rt_labels, rt->rt_label))
> -             goto nla_put_failure;
> -     if (nla_put_via(skb, rt->rt_via_table, rt->rt_via, rt->rt_via_alen))
> -             goto nla_put_failure;
> -     dev = rtnl_dereference(rt->rt_dev);
> -     if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex))
> -             goto nla_put_failure;
>       if (nla_put_labels(skb, RTA_DST, 1, &label))
>               goto nla_put_failure;
> +     if (rt->rt_nhn == 1) {
> +             struct mpls_nh *nh = list_first_entry_or_null(&rt->rt_nhs,
> +                                                     struct mpls_nh,
> +                                                     nh_next);
> +
> +             if (nh->nh_labels &&
> +                 nla_put_labels(skb, RTA_NEWDST, nh->nh_labels,
> +                                nh->nh_label))
> +                     goto nla_put_failure;
> +             if (nla_put_via(skb, nh->nh_via_table, nh->nh_via,
> +                             nh->nh_via_alen))
> +                     goto nla_put_failure;
> +             dev = rtnl_dereference(nh->nh_dev);
> +             if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex))
> +                     goto nla_put_failure;
> +     } else {
> +             struct rtnexthop *rtnh;
> +             struct nlattr *mp;
> +             struct mpls_nh *nh;
> +
> +             mp = nla_nest_start(skb, RTA_MULTIPATH);
> +             if (!mp)
> +                     goto nla_put_failure;
> +
> +             list_for_each_entry(nh, &rt->rt_nhs, nh_next) {
> +                     rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
> +                     if (!rtnh)
> +                             goto nla_put_failure;
> +
> +                     rtnh->rtnh_flags = nh->nh_flags & 0xFF;
> +                     dev = rtnl_dereference(nh->nh_dev);
> +                     if (dev)
> +                             rtnh->rtnh_ifindex = dev->ifindex;
> +                     if (nh->nh_labels &&
> +                         nla_put_labels(skb, RTA_NEWDST, nh->nh_labels,
> +                                        nh->nh_label))
> +                             goto nla_put_failure;
> +                     if (nla_put_via(skb, nh->nh_via_table,
> +                                     nh->nh_via,
> +                                     nh->nh_via_alen))
> +                             goto nla_put_failure;
> +
> +                     /* length of rtnetlink header + attributes */
> +                     rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
> +             }
> +
> +             nla_nest_end(skb, mp);
> +     }
>  
>       nlmsg_end(skb, nlh);
>       return 0;
> @@ -1000,12 +1284,34 @@ static inline size_t lfib_nlmsg_size(struct 
> mpls_route *rt)
>  {
>       size_t payload =
>               NLMSG_ALIGN(sizeof(struct rtmsg))
> -             + nla_total_size(2 + rt->rt_via_alen)   /* RTA_VIA */
>               + nla_total_size(4);                    /* RTA_DST */
> -     if (rt->rt_labels)                              /* RTA_NEWDST */
> -             payload += nla_total_size(rt->rt_labels * 4);
> -     if (rt->rt_dev)                                 /* RTA_OIF */
> -             payload += nla_total_size(4);
> +
> +     if (rt->rt_nhn == 1) {
> +             struct mpls_nh *nh = list_first_entry_or_null(&rt->rt_nhs,
> +                                                           struct mpls_nh,
> +                                                           nh_next);
> +
> +             if (nh->nh_dev)
> +                     payload += nla_total_size(4); /* RTA_OIF */
> +             payload += nla_total_size(2 + nh->nh_via_alen); /* RTA_VIA */
> +             if (nh->nh_labels) /* RTA_NEWDST */
> +                     payload += nla_total_size(nh->nh_labels * 4);
> +     } else {
> +             struct mpls_nh *nh;
> +             /* each nexthop is packed in an attribute */
> +             size_t nhsize = 0;
> +
> +             list_for_each_entry(nh, &rt->rt_nhs, nh_next) {
> +                     nhsize += nla_total_size(sizeof(struct rtnexthop)) +
> +                                     nla_total_size(nh->nh_via_alen +
> +                                                    2); /* RTA_VIA */
> +                     if (nh->nh_labels) /* RTA_NEWDST */
> +                             nhsize += nla_total_size(nh->nh_labels * 4);
> +             }
> +             /* nested attribute */
> +             payload += nla_total_size(nhsize);
> +     }
> +
>       return payload;
>  }
>  
> @@ -1057,25 +1363,37 @@ static int resize_platform_label_table(struct net *net, size_t limit)
>       /* In case the predefined labels need to be populated */
>       if (limit > MPLS_LABEL_IPV4NULL) {
>               struct net_device *lo = net->loopback_dev;
> -             rt0 = mpls_rt_alloc(lo->addr_len);
> +             struct mpls_nh *nh;
> +
> +             rt0 = mpls_rt_alloc(1);
>               if (!rt0)
>                       goto nort0;
> -             RCU_INIT_POINTER(rt0->rt_dev, lo);
>               rt0->rt_protocol = RTPROT_KERNEL;
>               rt0->rt_payload_type = MPT_IPV4;
> -             rt0->rt_via_table = NEIGH_LINK_TABLE;
> -             memcpy(rt0->rt_via, lo->dev_addr, lo->addr_len);
> +             nh = mpls_nh_alloc(lo->addr_len);
> +             if (!nh)
> +                     goto nort2;
> +             RCU_INIT_POINTER(nh->nh_dev, lo);
> +             nh->nh_via_table = NEIGH_LINK_TABLE;
> +             memcpy(nh->nh_via, lo->dev_addr, lo->addr_len);
> +             list_add_tail(&nh->nh_next, &rt0->rt_nhs);
>       }
>       if (limit > MPLS_LABEL_IPV6NULL) {
>               struct net_device *lo = net->loopback_dev;
> -             rt2 = mpls_rt_alloc(lo->addr_len);
> +             struct mpls_nh *nh;
> +
> +             rt2 = mpls_rt_alloc(1);
>               if (!rt2)
>                       goto nort2;
> -             RCU_INIT_POINTER(rt2->rt_dev, lo);
>               rt2->rt_protocol = RTPROT_KERNEL;
>               rt2->rt_payload_type = MPT_IPV6;
> -             rt2->rt_via_table = NEIGH_LINK_TABLE;
> -             memcpy(rt2->rt_via, lo->dev_addr, lo->addr_len);
> +             nh = mpls_nh_alloc(lo->addr_len);
> +             if (!nh)
> +                     goto nort2;
> +             RCU_INIT_POINTER(nh->nh_dev, lo);
> +             nh->nh_via_table = NEIGH_LINK_TABLE;
> +             memcpy(nh->nh_via, lo->dev_addr, lo->addr_len);
> +             list_add_tail(&nh->nh_next, &rt2->rt_nhs);
>       }
>  
>       rtnl_lock();
> @@ -1085,7 +1403,7 @@ static int resize_platform_label_table(struct net *net, size_t limit)
>  
>       /* Free any labels beyond the new table */
>       for (index = limit; index < old_limit; index++)
> -             mpls_route_update(net, index, NULL, NULL, NULL);
> +             mpls_route_update(net, index, NULL, NULL);
>  
>       /* Copy over the old labels */
>       cp_size = size;
> @@ -1124,6 +1442,7 @@ static int resize_platform_label_table(struct net *net, size_t limit)
>  
>  nort2:
>       mpls_rt_free(rt0);
> +     mpls_rt_free(rt2);
>  nort0:
>       kvfree(labels);
>  nolabels:
> diff --git a/net/mpls/internal.h b/net/mpls/internal.h
> index 2681a4b..9e18b58 100644
> --- a/net/mpls/internal.h
> +++ b/net/mpls/internal.h
> @@ -1,6 +1,17 @@
>  #ifndef MPLS_INTERNAL_H
>  #define MPLS_INTERNAL_H
>  
> +enum mpls_payload_type {
> +     MPT_UNSPEC, /* IPv4 or IPv6 */
> +     MPT_IPV4 = 4,
> +     MPT_IPV6 = 6,
> +
> +     /* Other types not implemented:
> +      *  - Pseudo-wire with or without control word (RFC4385)
> +      *  - GAL (RFC5586)
> +      */
> +};
> +
>  struct mpls_shim_hdr {
>       __be32 label_stack_entry;
>  };
> @@ -21,6 +32,34 @@ struct mpls_dev {
>  
>  struct sk_buff;
>  
> +#define LABEL_NOT_SPECIFIED (1 << 20)
> +#define MAX_NEW_LABELS 2
> +
> +/* This maximum ha length copied from the definition of struct neighbour */
> +#define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, sizeof(unsigned long)))
> +
> +struct mpls_nh {
> +     struct net_device __rcu *nh_dev;
> +     u32                     nh_label[MAX_NEW_LABELS];
> +     unsigned int            nh_flags;
> +     int                     nh_weight;
> +     int                     nh_power;
> +     struct list_head        nh_next;
> +     u8                      nh_labels;
> +     u8                      nh_via_alen;
> +     u8                      nh_via_table;
> +     u8                      nh_via[0];
> +};
> +
> +struct mpls_route {
> +     struct rcu_head         rt_rcu;
> +     u8                      rt_protocol;
> +     u8                      rt_payload_type;
> +     int                     rt_power;
> +     int                     rt_nhn;
> +     struct list_head        rt_nhs;
> +};
> +
>  static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb)
>  {
>       return (struct mpls_shim_hdr *)skb_network_header(skb);
> @@ -52,8 +91,10 @@ static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr *
>  
>  int nla_put_labels(struct sk_buff *skb, int attrtype,  u8 labels,
>                  const u32 label[]);
> -int nla_get_labels(const struct nlattr *nla, u32 max_labels, u32 *labels,
> +int nla_get_labels(const struct nlattr *nla, u8 max_labels, u8 *labels,
>                  u32 label[]);
> +int nla_get_via(const struct nlattr *nla, u8 *via_alen, u8 *via_table,
> +             u8 via[]);
>  bool mpls_output_possible(const struct net_device *dev);
>  unsigned int mpls_dev_mtu(const struct net_device *dev);
>  bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu);
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to