ebied...@xmission.com (Eric W. Biederman) writes: > Roopa Prabhu <ro...@cumulusnetworks.com> writes: > >> From: Roopa Prabhu <ro...@cumulusnetworks.com> >> >> This patch adds support for MPLS multipath routes. >> >> Includes following changes to support multipath: >> - splits struct mpls_route into 'struct mpls_route + struct mpls_nh' >> >> - 'struct mpls_nh' represents a mpls nexthop label forwarding entry >> >> - moves mpls route and nexthop structures into internal.h >> >> - A mpls_route can point to multiple mpls_nh structs >> >> - the nexthops are maintained as a list > > So I am not certain I like nexthops being a list. In the practical case > introducing this list guarantees that everyone will see at least an > extra cache line miss in the forwarding path. > > In the more abstract sense a list is the wrong data structure. If the > list is so short we can afford to walk it an array is a better data > structure. If we need enough entries to make the memory consumption > of an array a concern we want some kind of hash table or tree data > structure, because a list will be too long in that case. > > So can we please not use a list? > > I expect we can simplify the data structures by noting that rt_via must > be an ethernet mac today so that 6 bytes are enough and 8 bytes gives us > a bit extra and aligns things nicely.
Grr. My mistake. The current worst case is 16 bytes for an IPv6 address in rt_via. But the point remains that a fixed-size array of bytes in rt_via allows the use of an array and not a list for nexthops. At least for the single nexthop case I really want something that is small enough that it fits in a single 64-byte cache line. The performance difference compared to anything else is going to be noticeable. Eric > Also I know it goes away in the next patch but a spinlock taken for > every transit through the forwarding path really bugs me. > > Eric > >> - In the process of restructuring, this patch also consistently changes all >> labels to u8 >> >> - Adds support to parse/fill RTA_MULTIPATH netlink attribute for >> multipath routes similar to ipv4/v6 fib >> >> - In this patch, the multipath route nexthop selection algorithm >> is a simple round robin picked up from ipv4 fib code and is replaced by >> a hash based algorithm from Robert Shearman in the next patch >> >> - mpls_route_update cleanup: remove 'dev' handling in mpls_route_update. >> mpls_route_update though implemented to update based on dev, it was never >> used that way. And the dev handling gets tricky with multiple nexthops. >> Cannot >> match against any single nexthops dev. So, this patch removes the unused >> 'dev' handling in mpls_route_update. 
> >> >> Example: >> >> $ip -f mpls route add 100 nexthop as 200 via inet 10.1.1.2 dev swp1 \ >> nexthop as 700 via inet 10.1.1.6 dev swp2 \ >> nexthop as 800 via inet 40.1.1.2 dev swp3 >> >> $ip -f mpls route show >> 100 >> nexthop as to 200 via inet 10.1.1.2 dev swp1 >> nexthop as to 700 via inet 10.1.1.6 dev swp2 >> nexthop as to 800 via inet 40.1.1.2 dev swp3 >> >> Signed-off-by: Roopa Prabhu <ro...@cumulusnetworks.com> >> --- >> include/net/mpls_iptunnel.h | 2 +- >> net/mpls/af_mpls.c | 627 >> +++++++++++++++++++++++++++++++++----------- >> net/mpls/internal.h | 43 ++- >> 3 files changed, 516 insertions(+), 156 deletions(-) >> >> diff --git a/include/net/mpls_iptunnel.h b/include/net/mpls_iptunnel.h >> index 4757997..179253f 100644 >> --- a/include/net/mpls_iptunnel.h >> +++ b/include/net/mpls_iptunnel.h >> @@ -18,7 +18,7 @@ >> >> struct mpls_iptunnel_encap { >> u32 label[MAX_NEW_LABELS]; >> - u32 labels; >> + u8 labels; >> }; >> >> static inline struct mpls_iptunnel_encap *mpls_lwtunnel_encap(struct >> lwtunnel_state *lwtstate) >> diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c >> index 8c5707d..ae9e153 100644 >> --- a/net/mpls/af_mpls.c >> +++ b/net/mpls/af_mpls.c >> @@ -19,39 +19,12 @@ >> #include <net/ipv6.h> >> #include <net/addrconf.h> >> #endif >> +#include <net/nexthop.h> >> #include "internal.h" >> >> -#define LABEL_NOT_SPECIFIED (1<<20) >> -#define MAX_NEW_LABELS 2 >> - >> -/* This maximum ha length copied from the definition of struct neighbour */ >> -#define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))) >> - >> -enum mpls_payload_type { >> - MPT_UNSPEC, /* IPv4 or IPv6 */ >> - MPT_IPV4 = 4, >> - MPT_IPV6 = 6, >> - >> - /* Other types not implemented: >> - * - Pseudo-wire with or without control word (RFC4385) >> - * - GAL (RFC5586) >> - */ >> -}; >> - >> -struct mpls_route { /* next hop label forwarding entry */ >> - struct net_device __rcu *rt_dev; >> - struct rcu_head rt_rcu; >> - u32 rt_label[MAX_NEW_LABELS]; >> - u8 rt_protocol; 
/* routing protocol that set this >> entry */ >> - u8 rt_payload_type; >> - u8 rt_labels; >> - u8 rt_via_alen; >> - u8 rt_via_table; >> - u8 rt_via[0]; >> -}; >> - >> static int zero = 0; >> static int label_limit = (1 << 20) - 1; >> +static DEFINE_SPINLOCK(mpls_multipath_lock); >> >> static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt, >> struct nlmsghdr *nlh, struct net *net, u32 portid, >> @@ -80,10 +53,10 @@ bool mpls_output_possible(const struct net_device *dev) >> } >> EXPORT_SYMBOL_GPL(mpls_output_possible); >> >> -static unsigned int mpls_rt_header_size(const struct mpls_route *rt) >> +static unsigned int mpls_nh_header_size(const struct mpls_nh *nh) >> { >> /* The size of the layer 2.5 labels to be added for this route */ >> - return rt->rt_labels * sizeof(struct mpls_shim_hdr); >> + return nh->nh_labels * sizeof(struct mpls_shim_hdr); >> } >> >> unsigned int mpls_dev_mtu(const struct net_device *dev) >> @@ -105,8 +78,58 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, >> unsigned int mtu) >> } >> EXPORT_SYMBOL_GPL(mpls_pkt_too_big); >> >> -static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb, >> - struct mpls_entry_decoded dec) >> +/* This is a cut/copy/modify from fib_select_multipath */ >> +static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt) >> +{ >> + struct mpls_nh *nh; >> + struct mpls_nh *ret_nh; >> + int nhsel = 0; >> + int w; >> + >> + spin_lock_bh(&mpls_multipath_lock); >> + ret_nh = list_first_entry_or_null(&rt->rt_nhs, struct mpls_nh, >> + nh_next); >> + if (rt->rt_power <= 0) { >> + int power = 0; >> + >> + list_for_each_entry(nh, &rt->rt_nhs, nh_next) { >> + power += nh->nh_weight; >> + nh->nh_power = nh->nh_weight; >> + } >> + rt->rt_power = power; >> + if (power <= 0) { >> + spin_unlock_bh(&mpls_multipath_lock); >> + /* Race condition: route has just become dead. 
*/ >> + return ret_nh; >> + } >> + } >> + >> + /* w should be random number [0..rt->rt_power-1], >> + * it is pretty bad approximation. >> + */ >> + w = jiffies % rt->rt_power; >> + >> + list_for_each_entry(nh, &rt->rt_nhs, nh_next) { >> + if (nh->nh_power) { >> + w -= nh->nh_power; >> + if (w <= 0) { >> + nh->nh_power--; >> + rt->rt_power--; >> + ret_nh = nh; >> + spin_unlock_bh(&mpls_multipath_lock); >> + return ret_nh; >> + } >> + } >> + nhsel++; >> + } >> + >> + /* Race condition: route has just become dead. */ >> + spin_unlock_bh(&mpls_multipath_lock); >> + return ret_nh; >> +} >> + >> +static bool mpls_egress(struct mpls_route *rt, struct mpls_nh *nh, >> + struct sk_buff *skb, struct mpls_entry_decoded dec) >> { >> enum mpls_payload_type payload_type; >> bool success = false; >> @@ -159,6 +182,7 @@ static int mpls_forward(struct sk_buff *skb, struct >> net_device *dev, >> struct net *net = dev_net(dev); >> struct mpls_shim_hdr *hdr; >> struct mpls_route *rt; >> + struct mpls_nh *nh; >> struct mpls_entry_decoded dec; >> struct net_device *out_dev; >> struct mpls_dev *mdev; >> @@ -196,9 +220,13 @@ static int mpls_forward(struct sk_buff *skb, struct >> net_device *dev, >> if (!rt) >> goto drop; >> >> + nh = mpls_select_multipath(rt); >> + if (!nh) >> + goto drop; >> + >> /* Find the output device */ >> - out_dev = rcu_dereference(rt->rt_dev); >> - if (!mpls_output_possible(out_dev)) >> + out_dev = rcu_dereference(nh->nh_dev); >> + if (!out_dev || !mpls_output_possible(out_dev)) >> goto drop; >> >> if (skb_warn_if_lro(skb)) >> @@ -212,7 +240,7 @@ static int mpls_forward(struct sk_buff *skb, struct >> net_device *dev, >> dec.ttl -= 1; >> >> /* Verify the destination can hold the packet */ >> - new_header_size = mpls_rt_header_size(rt); >> + new_header_size = mpls_nh_header_size(nh); >> mtu = mpls_dev_mtu(out_dev); >> if (mpls_pkt_too_big(skb, mtu - new_header_size)) >> goto drop; >> @@ -230,7 +258,7 @@ static int mpls_forward(struct sk_buff *skb, struct >> 
net_device *dev, >> >> if (unlikely(!new_header_size && dec.bos)) { >> /* Penultimate hop popping */ >> - if (!mpls_egress(rt, skb, dec)) >> + if (!mpls_egress(rt, nh, skb, dec)) >> goto drop; >> } else { >> bool bos; >> @@ -240,13 +268,14 @@ static int mpls_forward(struct sk_buff *skb, struct >> net_device *dev, >> /* Push the new labels */ >> hdr = mpls_hdr(skb); >> bos = dec.bos; >> - for (i = rt->rt_labels - 1; i >= 0; i--) { >> - hdr[i] = mpls_entry_encode(rt->rt_label[i], dec.ttl, 0, >> bos); >> + for (i = nh->nh_labels - 1; i >= 0; i--) { >> + hdr[i] = mpls_entry_encode(nh->nh_label[i], >> + dec.ttl, 0, bos); >> bos = false; >> } >> } >> >> - err = neigh_xmit(rt->rt_via_table, out_dev, rt->rt_via, skb); >> + err = neigh_xmit(nh->nh_via_table, out_dev, nh->nh_via, skb); >> if (err) >> net_dbg_ratelimited("%s: packet transmission failed: %d\n", >> __func__, err); >> @@ -270,31 +299,43 @@ static const struct nla_policy >> rtm_mpls_policy[RTA_MAX+1] = { >> struct mpls_route_config { >> u32 rc_protocol; >> u32 rc_ifindex; >> - u16 rc_via_table; >> - u16 rc_via_alen; >> + u8 rc_via_table; >> + u8 rc_via_alen; >> u8 rc_via[MAX_VIA_ALEN]; >> + u8 rc_output_labels; >> u32 rc_label; >> - u32 rc_output_labels; >> u32 rc_output_label[MAX_NEW_LABELS]; >> u32 rc_nlflags; >> enum mpls_payload_type rc_payload_type; >> struct nl_info rc_nlinfo; >> + struct rtnexthop *rc_mp; >> + int rc_mp_len; >> }; >> >> -static struct mpls_route *mpls_rt_alloc(size_t alen) >> +static struct mpls_route *mpls_rt_alloc(int num_nh) >> { >> struct mpls_route *rt; >> >> - rt = kzalloc(sizeof(*rt) + alen, GFP_KERNEL); >> - if (rt) >> - rt->rt_via_alen = alen; >> + rt = kzalloc(sizeof(*rt), GFP_KERNEL); >> + if (rt) { >> + rt->rt_nhn = num_nh; >> + INIT_LIST_HEAD(&rt->rt_nhs); >> + } >> + >> return rt; >> } >> >> static void mpls_rt_free(struct mpls_route *rt) >> { >> - if (rt) >> + struct mpls_nh *nh, *nh_safe; >> + >> + if (rt) { >> + list_for_each_entry_safe(nh, nh_safe, &rt->rt_nhs, nh_next) { 
>> + list_del(&nh->nh_next); >> + kfree(nh); >> + } >> kfree_rcu(rt, rt_rcu); >> + } >> } >> >> static void mpls_notify_route(struct net *net, unsigned index, >> @@ -312,25 +353,22 @@ static void mpls_notify_route(struct net *net, >> unsigned index, >> } >> >> static void mpls_route_update(struct net *net, unsigned index, >> - struct net_device *dev, struct mpls_route *new, >> + struct mpls_route *new, >> const struct nl_info *info) >> { >> struct mpls_route __rcu **platform_label; >> - struct mpls_route *rt, *old = NULL; >> + struct mpls_route *rt; >> >> ASSERT_RTNL(); >> >> platform_label = rtnl_dereference(net->mpls.platform_label); >> rt = rtnl_dereference(platform_label[index]); >> - if (!dev || (rt && (rtnl_dereference(rt->rt_dev) == dev))) { >> - rcu_assign_pointer(platform_label[index], new); >> - old = rt; >> - } >> + rcu_assign_pointer(platform_label[index], new); >> >> - mpls_notify_route(net, index, old, new, info); >> + mpls_notify_route(net, index, rt, new, info); >> >> /* If we removed a route free it now */ >> - mpls_rt_free(old); >> + mpls_rt_free(rt); >> } >> >> static unsigned find_free_label(struct net *net) >> @@ -406,23 +444,23 @@ static struct net_device *inet6_fib_lookup_dev(struct >> net *net, void *addr) >> #endif >> >> static struct net_device *find_outdev(struct net *net, >> - struct mpls_route_config *cfg) >> + struct mpls_nh *nh, int oif) >> { >> struct net_device *dev = NULL; >> >> - if (!cfg->rc_ifindex) { >> - switch (cfg->rc_via_table) { >> + if (!oif) { >> + switch (nh->nh_via_table) { >> case NEIGH_ARP_TABLE: >> - dev = inet_fib_lookup_dev(net, cfg->rc_via); >> + dev = inet_fib_lookup_dev(net, nh->nh_via); >> break; >> case NEIGH_ND_TABLE: >> - dev = inet6_fib_lookup_dev(net, cfg->rc_via); >> + dev = inet6_fib_lookup_dev(net, nh->nh_via); >> break; >> case NEIGH_LINK_TABLE: >> break; >> } >> } else { >> - dev = dev_get_by_index(net, cfg->rc_ifindex); >> + dev = dev_get_by_index(net, oif); >> } >> >> if (!dev) >> @@ -431,15 
+469,208 @@ static struct net_device *find_outdev(struct net *net, >> return dev; >> } >> >> +static int mpls_nh_assign_dev(struct net *net, struct mpls_nh *nh, int oif) >> +{ >> + struct net_device *dev = NULL; >> + int err = -ENODEV; >> + >> + dev = find_outdev(net, nh, oif); >> + if (IS_ERR(dev)) { >> + err = PTR_ERR(dev); >> + dev = NULL; >> + goto errout; >> + } >> + >> + /* Ensure this is a supported device */ >> + err = -EINVAL; >> + if (!mpls_dev_get(dev)) >> + goto errout; >> + >> + RCU_INIT_POINTER(nh->nh_dev, dev); >> + dev_put(dev); >> + >> + return 0; >> + >> +errout: >> + if (dev) >> + dev_put(dev); >> + return err; >> +} >> + >> +static struct mpls_nh *mpls_nh_alloc(size_t alen) >> +{ >> + struct mpls_nh *nh; >> + >> + nh = kzalloc(sizeof(*nh) + alen, GFP_KERNEL); >> + if (nh) >> + nh->nh_via_alen = alen; >> + >> + return nh; >> +} >> + >> +static int mpls_nh_build_from_cfg(struct mpls_route_config *cfg, >> + struct mpls_route *rt) >> +{ >> + struct net *net = cfg->rc_nlinfo.nl_net; >> + struct mpls_nh *nh = NULL; >> + int err; >> + int i; >> + >> + err = -EINVAL; >> + /* Ensure only a supported number of labels are present */ >> + if (cfg->rc_output_labels > MAX_NEW_LABELS) >> + goto errout; >> + >> + err = -ENOMEM; >> + nh = mpls_nh_alloc(cfg->rc_via_alen); >> + if (!nh) >> + goto errout; >> + >> + nh->nh_labels = cfg->rc_output_labels; >> + for (i = 0; i < nh->nh_labels; i++) >> + nh->nh_label[i] = cfg->rc_output_label[i]; >> + >> + nh->nh_via_table = cfg->rc_via_table; >> + memcpy(nh->nh_via, cfg->rc_via, cfg->rc_via_alen); >> + nh->nh_via_alen = cfg->rc_via_alen; >> + >> + err = mpls_nh_assign_dev(net, nh, cfg->rc_ifindex); >> + if (err) >> + goto errout; >> + >> + list_add_tail(&nh->nh_next, &rt->rt_nhs); >> + >> + return 0; >> + >> +errout: >> + kfree(nh); >> + >> + return err; >> +} >> + >> +static int mpls_nh_build(struct net *net, struct mpls_nh **rt_nh, >> + int oif, struct nlattr *via_attr, >> + struct nlattr *newdst) >> +{ >> + struct 
mpls_nh *nh = NULL; >> + int err; >> + u8 via_alen; >> + u8 via_table; >> + u8 via[MAX_VIA_ALEN]; >> + >> + err = nla_get_via(via_attr, &via_alen, &via_table, >> + via); >> + if (err) >> + goto errout; >> + >> + nh = mpls_nh_alloc(via_alen); >> + if (!nh) >> + goto errout; >> + >> + if (newdst) { >> + err = nla_get_labels(newdst, MAX_NEW_LABELS, >> + &nh->nh_labels, nh->nh_label); >> + if (err) >> + goto errout; >> + } >> + nh->nh_via_table = via_table; >> + memcpy(nh->nh_via, via, via_alen); >> + >> + err = mpls_nh_assign_dev(net, nh, oif); >> + if (err) >> + goto errout; >> + >> + *rt_nh = nh; >> + >> + return 0; >> + >> +errout: >> + kfree(nh); >> + >> + return err; >> +} >> + >> +static int mpls_count_nexthops(struct rtnexthop *rtnh, int len) >> +{ >> + int nhs = 0; >> + int remaining = len; >> + >> + while (rtnh_ok(rtnh, remaining)) { >> + nhs++; >> + rtnh = rtnh_next(rtnh, &remaining); >> + } >> + >> + /* leftover implies invalid nexthop configuration, discard it */ >> + return remaining > 0 ? 
0 : nhs; >> +} >> + >> +static int mpls_nh_build_multi(struct mpls_route_config *cfg, >> + struct mpls_route *rt) >> +{ >> + struct rtnexthop *rtnh = cfg->rc_mp; >> + struct nlattr *nla_via, *nla_newdst; >> + int remaining = cfg->rc_mp_len; >> + struct mpls_nh *nh, *nh_safe; >> + int nhs = 0; >> + int err = 0; >> + >> + while (rtnh_ok(rtnh, remaining)) { >> + int attrlen; >> + >> + nla_via = NULL; >> + nla_newdst = NULL; >> + nh = NULL; >> + >> + err = -EINVAL; >> + if (!rtnh_ok(rtnh, remaining)) >> + goto errout; >> + >> + attrlen = rtnh_attrlen(rtnh); >> + if (attrlen > 0) { >> + struct nlattr *attrs = rtnh_attrs(rtnh); >> + >> + nla_via = nla_find(attrs, attrlen, RTA_VIA); >> + nla_newdst = nla_find(attrs, attrlen, RTA_NEWDST); >> + } >> + >> + err = -EINVAL; >> + if (!nla_via) >> + goto errout; >> + >> + err = mpls_nh_build(cfg->rc_nlinfo.nl_net, &nh, >> + rtnh->rtnh_ifindex, nla_via, >> + nla_newdst); >> + if (err) >> + goto errout; >> + >> + nh->nh_weight = rtnh->rtnh_hops + 1; >> + list_add_tail(&nh->nh_next, &rt->rt_nhs); >> + >> + rtnh = rtnh_next(rtnh, &remaining); >> + nhs++; >> + } >> + >> + rt->rt_nhn = nhs; >> + >> + return 0; >> + >> +errout: >> + list_for_each_entry_safe(nh, nh_safe, &rt->rt_nhs, nh_next) { >> + list_del(&nh->nh_next); >> + kfree(nh); >> + } >> + >> + return err; >> +} >> + >> static int mpls_route_add(struct mpls_route_config *cfg) >> { >> struct mpls_route __rcu **platform_label; >> struct net *net = cfg->rc_nlinfo.nl_net; >> - struct net_device *dev = NULL; >> struct mpls_route *rt, *old; >> - unsigned index; >> - int i; >> int err = -EINVAL; >> + unsigned index; >> + int nhs = 1; /* default to one nexthop */ >> >> index = cfg->rc_label; >> >> @@ -457,27 +688,6 @@ static int mpls_route_add(struct mpls_route_config *cfg) >> if (index >= net->mpls.platform_labels) >> goto errout; >> >> - /* Ensure only a supported number of labels are present */ >> - if (cfg->rc_output_labels > MAX_NEW_LABELS) >> - goto errout; >> - >> - dev = 
find_outdev(net, cfg); >> - if (IS_ERR(dev)) { >> - err = PTR_ERR(dev); >> - dev = NULL; >> - goto errout; >> - } >> - >> - /* Ensure this is a supported device */ >> - err = -EINVAL; >> - if (!mpls_dev_get(dev)) >> - goto errout; >> - >> - err = -EINVAL; >> - if ((cfg->rc_via_table == NEIGH_LINK_TABLE) && >> - (dev->addr_len != cfg->rc_via_alen)) >> - goto errout; >> - >> /* Append makes no sense with mpls */ >> err = -EOPNOTSUPP; >> if (cfg->rc_nlflags & NLM_F_APPEND) >> @@ -497,28 +707,34 @@ static int mpls_route_add(struct mpls_route_config >> *cfg) >> if (!(cfg->rc_nlflags & NLM_F_CREATE) && !old) >> goto errout; >> >> + if (cfg->rc_mp) { >> + err = -EINVAL; >> + nhs = mpls_count_nexthops(cfg->rc_mp, cfg->rc_mp_len); >> + if (nhs == 0) >> + goto errout; >> + } >> + >> err = -ENOMEM; >> - rt = mpls_rt_alloc(cfg->rc_via_alen); >> + rt = mpls_rt_alloc(nhs); >> if (!rt) >> goto errout; >> - >> - rt->rt_labels = cfg->rc_output_labels; >> - for (i = 0; i < rt->rt_labels; i++) >> - rt->rt_label[i] = cfg->rc_output_label[i]; >> rt->rt_protocol = cfg->rc_protocol; >> - RCU_INIT_POINTER(rt->rt_dev, dev); >> rt->rt_payload_type = cfg->rc_payload_type; >> - rt->rt_via_table = cfg->rc_via_table; >> - memcpy(rt->rt_via, cfg->rc_via, cfg->rc_via_alen); >> >> - mpls_route_update(net, index, NULL, rt, &cfg->rc_nlinfo); >> + if (cfg->rc_mp) >> + err = mpls_nh_build_multi(cfg, rt); >> + else >> + err = mpls_nh_build_from_cfg(cfg, rt); >> + if (err) >> + goto freert; >> + >> + mpls_route_update(net, index, rt, &cfg->rc_nlinfo); >> >> - dev_put(dev); >> return 0; >> >> +freert: >> + mpls_rt_free(rt); >> errout: >> - if (dev) >> - dev_put(dev); >> return err; >> } >> >> @@ -538,7 +754,7 @@ static int mpls_route_del(struct mpls_route_config *cfg) >> if (index >= net->mpls.platform_labels) >> goto errout; >> >> - mpls_route_update(net, index, NULL, NULL, &cfg->rc_nlinfo); >> + mpls_route_update(net, index, NULL, &cfg->rc_nlinfo); >> >> err = 0; >> errout: >> @@ -628,6 +844,7 @@ 
static void mpls_ifdown(struct net_device *dev) >> struct mpls_route __rcu **platform_label; >> struct net *net = dev_net(dev); >> struct mpls_dev *mdev; >> + struct mpls_nh *nh; >> unsigned index; >> >> platform_label = rtnl_dereference(net->mpls.platform_label); >> @@ -635,9 +852,14 @@ static void mpls_ifdown(struct net_device *dev) >> struct mpls_route *rt = rtnl_dereference(platform_label[index]); >> if (!rt) >> continue; >> - if (rtnl_dereference(rt->rt_dev) != dev) >> - continue; >> - rt->rt_dev = NULL; >> + list_for_each_entry(nh, &rt->rt_nhs, nh_next) { >> + struct net_device *mdev; >> + >> + mdev = rtnl_dereference(nh->nh_dev); >> + if (mdev != dev) >> + continue; >> + nh->nh_dev = NULL; >> + } >> } >> >> mdev = mpls_dev_get(dev); >> @@ -736,7 +958,7 @@ int nla_put_labels(struct sk_buff *skb, int attrtype, >> EXPORT_SYMBOL_GPL(nla_put_labels); >> >> int nla_get_labels(const struct nlattr *nla, >> - u32 max_labels, u32 *labels, u32 label[]) >> + u8 max_labels, u8 *labels, u32 label[]) >> { >> unsigned len = nla_len(nla); >> unsigned nla_labels; >> @@ -781,6 +1003,48 @@ int nla_get_labels(const struct nlattr *nla, >> } >> EXPORT_SYMBOL_GPL(nla_get_labels); >> >> +int nla_get_via(const struct nlattr *nla, u8 *via_alen, >> + u8 *via_table, u8 via_addr[]) >> +{ >> + struct rtvia *via = nla_data(nla); >> + int err = -EINVAL; >> + u8 alen; >> + >> + if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) >> + goto errout; >> + alen = nla_len(nla) - >> + offsetof(struct rtvia, rtvia_addr); >> + if (alen > MAX_VIA_ALEN) >> + goto errout; >> + >> + /* Validate the address family */ >> + switch (via->rtvia_family) { >> + case AF_PACKET: >> + *via_table = NEIGH_LINK_TABLE; >> + break; >> + case AF_INET: >> + *via_table = NEIGH_ARP_TABLE; >> + if (alen != 4) >> + goto errout; >> + break; >> + case AF_INET6: >> + *via_table = NEIGH_ND_TABLE; >> + if (alen != 16) >> + goto errout; >> + break; >> + default: >> + /* Unsupported address family */ >> + goto errout; >> + } >> 
+ >> + memcpy(via_addr, via->rtvia_addr, alen); >> + *via_alen = alen; >> + err = 0; >> + >> +errout: >> + return err; >> +} >> + >> static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, >> struct mpls_route_config *cfg) >> { >> @@ -844,7 +1108,7 @@ static int rtm_to_route_config(struct sk_buff *skb, >> struct nlmsghdr *nlh, >> break; >> case RTA_DST: >> { >> - u32 label_count; >> + u8 label_count; >> if (nla_get_labels(nla, 1, &label_count, >> &cfg->rc_label)) >> goto errout; >> @@ -857,35 +1121,15 @@ static int rtm_to_route_config(struct sk_buff *skb, >> struct nlmsghdr *nlh, >> } >> case RTA_VIA: >> { >> - struct rtvia *via = nla_data(nla); >> - if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) >> + if (nla_get_via(nla, &cfg->rc_via_alen, >> + &cfg->rc_via_table, cfg->rc_via)) >> goto errout; >> - cfg->rc_via_alen = nla_len(nla) - >> - offsetof(struct rtvia, rtvia_addr); >> - if (cfg->rc_via_alen > MAX_VIA_ALEN) >> - goto errout; >> - >> - /* Validate the address family */ >> - switch(via->rtvia_family) { >> - case AF_PACKET: >> - cfg->rc_via_table = NEIGH_LINK_TABLE; >> - break; >> - case AF_INET: >> - cfg->rc_via_table = NEIGH_ARP_TABLE; >> - if (cfg->rc_via_alen != 4) >> - goto errout; >> - break; >> - case AF_INET6: >> - cfg->rc_via_table = NEIGH_ND_TABLE; >> - if (cfg->rc_via_alen != 16) >> - goto errout; >> - break; >> - default: >> - /* Unsupported address family */ >> - goto errout; >> - } >> - >> - memcpy(cfg->rc_via, via->rtvia_addr, cfg->rc_via_alen); >> + break; >> + } >> + case RTA_MULTIPATH: >> + { >> + cfg->rc_mp = nla_data(nla); >> + cfg->rc_mp_len = nla_len(nla); >> break; >> } >> default: >> @@ -946,16 +1190,56 @@ static int mpls_dump_route(struct sk_buff *skb, u32 >> portid, u32 seq, int event, >> rtm->rtm_type = RTN_UNICAST; >> rtm->rtm_flags = 0; >> >> - if (rt->rt_labels && >> - nla_put_labels(skb, RTA_NEWDST, rt->rt_labels, rt->rt_label)) >> - goto nla_put_failure; >> - if (nla_put_via(skb, rt->rt_via_table, 
rt->rt_via, rt->rt_via_alen)) >> - goto nla_put_failure; >> - dev = rtnl_dereference(rt->rt_dev); >> - if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex)) >> - goto nla_put_failure; >> if (nla_put_labels(skb, RTA_DST, 1, &label)) >> goto nla_put_failure; >> + if (rt->rt_nhn == 1) { >> + struct mpls_nh *nh = list_first_entry_or_null(&rt->rt_nhs, >> + struct mpls_nh, >> + nh_next); >> + >> + if (nh->nh_labels && >> + nla_put_labels(skb, RTA_NEWDST, nh->nh_labels, >> + nh->nh_label)) >> + goto nla_put_failure; >> + if (nla_put_via(skb, nh->nh_via_table, nh->nh_via, >> + nh->nh_via_alen)) >> + goto nla_put_failure; >> + dev = rtnl_dereference(nh->nh_dev); >> + if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex)) >> + goto nla_put_failure; >> + } else { >> + struct rtnexthop *rtnh; >> + struct nlattr *mp; >> + struct mpls_nh *nh; >> + >> + mp = nla_nest_start(skb, RTA_MULTIPATH); >> + if (!mp) >> + goto nla_put_failure; >> + >> + list_for_each_entry(nh, &rt->rt_nhs, nh_next) { >> + rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); >> + if (!rtnh) >> + goto nla_put_failure; >> + >> + rtnh->rtnh_flags = nh->nh_flags & 0xFF; >> + dev = rtnl_dereference(nh->nh_dev); >> + if (dev) >> + rtnh->rtnh_ifindex = dev->ifindex; >> + if (nh->nh_labels && >> + nla_put_labels(skb, RTA_NEWDST, nh->nh_labels, >> + nh->nh_label)) >> + goto nla_put_failure; >> + if (nla_put_via(skb, nh->nh_via_table, >> + nh->nh_via, >> + nh->nh_via_alen)) >> + goto nla_put_failure; >> + >> + /* length of rtnetlink header + attributes */ >> + rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; >> + } >> + >> + nla_nest_end(skb, mp); >> + } >> >> nlmsg_end(skb, nlh); >> return 0; >> @@ -1000,12 +1284,34 @@ static inline size_t lfib_nlmsg_size(struct >> mpls_route *rt) >> { >> size_t payload = >> NLMSG_ALIGN(sizeof(struct rtmsg)) >> - + nla_total_size(2 + rt->rt_via_alen) /* RTA_VIA */ >> + nla_total_size(4); /* RTA_DST */ >> - if (rt->rt_labels) /* RTA_NEWDST */ >> - payload += nla_total_size(rt->rt_labels * 4); 
>> - if (rt->rt_dev) /* RTA_OIF */ >> - payload += nla_total_size(4); >> + >> + if (rt->rt_nhn == 1) { >> + struct mpls_nh *nh = list_first_entry_or_null(&rt->rt_nhs, >> + struct mpls_nh, >> + nh_next); >> + >> + if (nh->nh_dev) >> + payload += nla_total_size(4); /* RTA_OIF */ >> + payload += nla_total_size(2 + nh->nh_via_alen); /* RTA_VIA */ >> + if (nh->nh_labels) /* RTA_NEWDST */ >> + payload += nla_total_size(nh->nh_labels * 4); >> + } else { >> + struct mpls_nh *nh; >> + /* each nexthop is packed in an attribute */ >> + size_t nhsize = 0; >> + >> + list_for_each_entry(nh, &rt->rt_nhs, nh_next) { >> + nhsize += nla_total_size(sizeof(struct rtnexthop)) + >> + nla_total_size(nh->nh_via_alen + >> + 2); /* RTA_VIA */ >> + if (nh->nh_labels) /* RTA_NEWDST */ >> + nhsize += nla_total_size(nh->nh_labels * 4); >> + } >> + /* nested attribute */ >> + payload += nla_total_size(nhsize); >> + } >> + >> return payload; >> } >> >> @@ -1057,25 +1363,37 @@ static int resize_platform_label_table(struct net >> *net, size_t limit) >> /* In case the predefined labels need to be populated */ >> if (limit > MPLS_LABEL_IPV4NULL) { >> struct net_device *lo = net->loopback_dev; >> - rt0 = mpls_rt_alloc(lo->addr_len); >> + struct mpls_nh *nh; >> + >> + rt0 = mpls_rt_alloc(1); >> if (!rt0) >> goto nort0; >> - RCU_INIT_POINTER(rt0->rt_dev, lo); >> rt0->rt_protocol = RTPROT_KERNEL; >> rt0->rt_payload_type = MPT_IPV4; >> - rt0->rt_via_table = NEIGH_LINK_TABLE; >> - memcpy(rt0->rt_via, lo->dev_addr, lo->addr_len); >> + nh = mpls_nh_alloc(lo->addr_len); >> + if (!nh) >> + goto nort2; >> + RCU_INIT_POINTER(nh->nh_dev, lo); >> + nh->nh_via_table = NEIGH_LINK_TABLE; >> + memcpy(nh->nh_via, lo->dev_addr, lo->addr_len); >> + list_add_tail(&nh->nh_next, &rt0->rt_nhs); >> } >> if (limit > MPLS_LABEL_IPV6NULL) { >> struct net_device *lo = net->loopback_dev; >> - rt2 = mpls_rt_alloc(lo->addr_len); >> + struct mpls_nh *nh; >> + >> + rt2 = mpls_rt_alloc(1); >> if (!rt2) >> goto nort2; >> - 
RCU_INIT_POINTER(rt2->rt_dev, lo); >> rt2->rt_protocol = RTPROT_KERNEL; >> rt2->rt_payload_type = MPT_IPV6; >> - rt2->rt_via_table = NEIGH_LINK_TABLE; >> - memcpy(rt2->rt_via, lo->dev_addr, lo->addr_len); >> + nh = mpls_nh_alloc(lo->addr_len); >> + if (!nh) >> + goto nort2; >> + RCU_INIT_POINTER(nh->nh_dev, lo); >> + nh->nh_via_table = NEIGH_LINK_TABLE; >> + memcpy(nh->nh_via, lo->dev_addr, lo->addr_len); >> + list_add_tail(&nh->nh_next, &rt2->rt_nhs); >> } >> >> rtnl_lock(); >> @@ -1085,7 +1403,7 @@ static int resize_platform_label_table(struct net >> *net, size_t limit) >> >> /* Free any labels beyond the new table */ >> for (index = limit; index < old_limit; index++) >> - mpls_route_update(net, index, NULL, NULL, NULL); >> + mpls_route_update(net, index, NULL, NULL); >> >> /* Copy over the old labels */ >> cp_size = size; >> @@ -1124,6 +1442,7 @@ static int resize_platform_label_table(struct net >> *net, size_t limit) >> >> nort2: >> mpls_rt_free(rt0); >> + mpls_rt_free(rt2); >> nort0: >> kvfree(labels); >> nolabels: >> diff --git a/net/mpls/internal.h b/net/mpls/internal.h >> index 2681a4b..9e18b58 100644 >> --- a/net/mpls/internal.h >> +++ b/net/mpls/internal.h >> @@ -1,6 +1,17 @@ >> #ifndef MPLS_INTERNAL_H >> #define MPLS_INTERNAL_H >> >> +enum mpls_payload_type { >> + MPT_UNSPEC, /* IPv4 or IPv6 */ >> + MPT_IPV4 = 4, >> + MPT_IPV6 = 6, >> + >> + /* Other types not implemented: >> + * - Pseudo-wire with or without control word (RFC4385) >> + * - GAL (RFC5586) >> + */ >> +}; >> + >> struct mpls_shim_hdr { >> __be32 label_stack_entry; >> }; >> @@ -21,6 +32,34 @@ struct mpls_dev { >> >> struct sk_buff; >> >> +#define LABEL_NOT_SPECIFIED (1 << 20) >> +#define MAX_NEW_LABELS 2 >> + >> +/* This maximum ha length copied from the definition of struct neighbour */ >> +#define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))) >> + >> +struct mpls_nh { >> + struct net_device __rcu *nh_dev; >> + u32 nh_label[MAX_NEW_LABELS]; >> + unsigned int nh_flags; >> + int 
nh_weight; >> + int nh_power; >> + struct list_head nh_next; >> + u8 nh_labels; >> + u8 nh_via_alen; >> + u8 nh_via_table; >> + u8 nh_via[0]; >> +}; >> + >> +struct mpls_route { >> + struct rcu_head rt_rcu; >> + u8 rt_protocol; >> + u8 rt_payload_type; >> + int rt_power; >> + int rt_nhn; >> + struct list_head rt_nhs; >> +}; >> + >> static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb) >> { >> return (struct mpls_shim_hdr *)skb_network_header(skb); >> @@ -52,8 +91,10 @@ static inline struct mpls_entry_decoded >> mpls_entry_decode(struct mpls_shim_hdr * >> >> int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, >> const u32 label[]); >> -int nla_get_labels(const struct nlattr *nla, u32 max_labels, u32 *labels, >> +int nla_get_labels(const struct nlattr *nla, u8 max_labels, u8 *labels, >> u32 label[]); >> +int nla_get_via(const struct nlattr *nla, u8 *via_alen, u8 *via_table, >> + u8 via[]); >> bool mpls_output_possible(const struct net_device *dev); >> unsigned int mpls_dev_mtu(const struct net_device *dev); >> bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu); -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html