On 03/11/15 05:13, Roopa Prabhu wrote:
From: Roopa Prabhu <[email protected]>

Adds support for RTNH_F_DEAD and RTNH_F_LINKDOWN flags on mpls
routes due to link events. Also adds code to ignore dead
routes during route selection

Signed-off-by: Roopa Prabhu <[email protected]>
---
RFC to v1:
         Addressed a few comments from Eric and Robert:
         - remove support for weighted nexthops
         - Use rt_nhn_alive in the rt structure to keep count of alive
         routes.
         What i have not done is: sort nexthops on link events.
         I am not comfortable recreating or sorting nexthops on
         every carrier change. This leaves scope for optimizing in the future

v1 to v2:
        Fix dead nexthop checks as suggested by dave

  net/mpls/af_mpls.c  | 191 ++++++++++++++++++++++++++++++++++++++++++++--------
  net/mpls/internal.h |   3 +
  2 files changed, 166 insertions(+), 28 deletions(-)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index c70d750..5e88118 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -96,22 +96,15 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned 
int mtu)
  }
  EXPORT_SYMBOL_GPL(mpls_pkt_too_big);

-static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
-                                            struct sk_buff *skb, bool bos)
+static u32 mpls_multipath_hash(struct mpls_route *rt,
+                              struct sk_buff *skb, bool bos)
  {
        struct mpls_entry_decoded dec;
        struct mpls_shim_hdr *hdr;
        bool eli_seen = false;
        int label_index;
-       int nh_index = 0;
        u32 hash = 0;

-       /* No need to look further into packet if there's only
-        * one path
-        */
-       if (rt->rt_nhn == 1)
-               goto out;
-
        for (label_index = 0; label_index < MAX_MP_SELECT_LABELS && !bos;
             label_index++) {
                if (!pskb_may_pull(skb, sizeof(*hdr) * label_index))
@@ -165,9 +158,37 @@ static struct mpls_nh *mpls_select_multipath(struct 
mpls_route *rt,
                }
        }

-       nh_index = hash % rt->rt_nhn;
+       return hash;
+}
+
+static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
+                                            struct sk_buff *skb, bool bos)
+{
+       u32 hash = 0;
+       int nh_index;
+       int n = 0;
+
+       /* No need to look further into packet if there's only
+        * one path
+        */
+       if (rt->rt_nhn == 1)
+               goto out;
+
+       if (rt->rt_nhn_alive <= 0)
+               return NULL;
+
+       hash = mpls_multipath_hash(rt, skb, bos);
+       nh_index = hash % rt->rt_nhn_alive;
+       for_nexthops(rt) {

This should be optimised in the common rt->rt_nhn_alive == rt->rt_nhn case by doing the direct array index and avoid the nh walk.

+               if (nh->nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
+                       continue;
+               if (n == nh_index)
+                       return nh;
+               n++;
+       } endfor_nexthops(rt);
+
  out:
-       return &rt->rt_nh[nh_index];
+       return &rt->rt_nh[0];
  }

  static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb,
@@ -365,6 +386,7 @@ static struct mpls_route *mpls_rt_alloc(int num_nh, u8 
max_alen)
                     GFP_KERNEL);
        if (rt) {
                rt->rt_nhn = num_nh;
+               rt->rt_nhn_alive = num_nh;
                rt->rt_max_alen = max_alen_aligned;
        }

@@ -536,6 +558,15 @@ static int mpls_nh_assign_dev(struct net *net, struct 
mpls_route *rt,

        RCU_INIT_POINTER(nh->nh_dev, dev);

+       if (!netif_carrier_ok(dev))
+               nh->nh_flags |= RTNH_F_LINKDOWN;
+
+       if (!(dev->flags & IFF_UP))
+               nh->nh_flags |= RTNH_F_DEAD;

Below you're testing IFF_RUNNING | IFF_LOWER_UP. Is the difference intended here?

+
+       if (nh->nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
+               rt->rt_nhn_alive--;

You don't update your new rt->rt_flags field here. This highlights the issue with duplicating data, which you're doing with the rt_flags field.

+
        return 0;

  errout:
@@ -577,7 +608,7 @@ errout:
  }

  static int mpls_nh_build(struct net *net, struct mpls_route *rt,
-                        struct mpls_nh *nh, int oif,
+                        struct mpls_nh *nh, int oif, int hops,

This change isn't required.

                         struct nlattr *via, struct nlattr *newdst)
  {
        int err = -ENOMEM;
@@ -666,7 +697,7 @@ static int mpls_nh_build_multi(struct mpls_route_config 
*cfg,
                /* neither weighted multipath nor any flags
                 * are supported
                 */
-               if (rtnh->rtnh_hops || rtnh->rtnh_flags)
+               if (rtnh->rtnh_flags || rtnh->rtnh_flags)

As the build bot has pointed out, this change is not entirely correct.

                        goto errout;

                attrlen = rtnh_attrlen(rtnh);
@@ -681,8 +712,8 @@ static int mpls_nh_build_multi(struct mpls_route_config 
*cfg,
                        goto errout;

                err = mpls_nh_build(cfg->rc_nlinfo.nl_net, rt, nh,
-                                   rtnh->rtnh_ifindex, nla_via,
-                                   nla_newdst);
+                                   rtnh->rtnh_ifindex, rtnh->rtnh_hops,
+                                   nla_via, nla_newdst);

This change isn't required.

                if (err)
                        goto errout;

@@ -875,34 +906,100 @@ free:
        return ERR_PTR(err);
  }

-static void mpls_ifdown(struct net_device *dev)
+static void mpls_ifdown(struct net_device *dev, int event)
  {
        struct mpls_route __rcu **platform_label;
        struct net *net = dev_net(dev);
-       struct mpls_dev *mdev;
        unsigned index;
+       int dead;

        platform_label = rtnl_dereference(net->mpls.platform_label);
        for (index = 0; index < net->mpls.platform_labels; index++) {
                struct mpls_route *rt = rtnl_dereference(platform_label[index]);
+
                if (!rt)
                        continue;
+               dead = 0;
                for_nexthops(rt) {
+                       if ((event == NETDEV_DOWN &&
+                            (nh->nh_flags & RTNH_F_DEAD)) ||
+                            (event == NETDEV_CHANGE &&
+                            (nh->nh_flags & RTNH_F_LINKDOWN))) {
+                               dead++;
+                               continue;
+                       }
+
                        if (rtnl_dereference(nh->nh_dev) != dev)
                                continue;
-                       nh->nh_dev = NULL;
+                       switch (event) {
+                       case NETDEV_DOWN:
+                       case NETDEV_UNREGISTER:
+                               nh->nh_flags |= RTNH_F_DEAD;
+                               /* fall through */
+                       case NETDEV_CHANGE:
+                               nh->nh_flags |= RTNH_F_LINKDOWN;
+                               rt->rt_nhn_alive--;

Are these operations atomic on all architectures?

Even if they are, I'm a bit worried about the RCU-correctness of this. I think the fallthrough case in mpls_select_multipath saves you, causing traffic to get via nh0 instead of the one selected by the hash (which is fine transiently).

+                               break;
+                       }
+                       if (event == NETDEV_UNREGISTER) {
+                               nh->nh_dev = NULL;

I realise this was just moved from the original code, but I think this should be at least RCU_INIT_POINTER(nh->nh_dev, NULL), if not rcu_assign_pointer.

+                               dead = rt->rt_nhn;
+                               break;
+                       }
+                       dead++;
                } endfor_nexthops(rt);
+
+               if (dead == rt->rt_nhn) {
+                       switch (event) {
+                       case NETDEV_DOWN:
+                       case NETDEV_UNREGISTER:
+                               rt->rt_flags |= RTNH_F_DEAD;
+                               /* fall through */
+                       case NETDEV_CHANGE:
+                               rt->rt_flags |= RTNH_F_LINKDOWN;
+                               rt->rt_nhn_alive = 0;

Won't rt_nhn_alive be zero already at this point?

There's also the problem that depending on the type of the last event, rt->rt_flags will get set differently.

E.g.

- NETDEV_DOWN for nh0 followed by NETDEV_CHANGE[down] for nh1 then rt->rt_flags will be RTNH_F_LINKDOWN. - NETDEV_CHANGE[down] for nh1 followed by NETDEV_DOWN for nh0 then rt->rt_flags will be RTNH_F_LINKDOWN|RTNH_F_DEAD.

That doesn't seem like desirable behaviour. What are expected semantics?

+                               break;
+                       }
+               }
        }

-       mdev = mpls_dev_get(dev);
-       if (!mdev)
-               return;
+       return;
+}
+
+static void mpls_ifup(struct net_device *dev, unsigned int nh_flags)
+{
+       struct mpls_route __rcu **platform_label;
+       struct net *net = dev_net(dev);
+       unsigned index;
+       int alive;
+
+       platform_label = rtnl_dereference(net->mpls.platform_label);
+       for (index = 0; index < net->mpls.platform_labels; index++) {
+               struct mpls_route *rt = rtnl_dereference(platform_label[index]);
+
+               if (!rt)
+                       continue;
+               alive = 0;
+               for_nexthops(rt) {
+                       struct net_device *nh_dev =
+                               rtnl_dereference(nh->nh_dev);

-       mpls_dev_sysctl_unregister(mdev);
+                       if (!(nh->nh_flags & nh_flags)) {
+                               alive++;
+                               continue;
+                       }
+                       if (nh_dev != dev)
+                               continue;
+                       alive++;
+                       nh->nh_flags &= ~nh_flags;
+               } endfor_nexthops(rt);

-       RCU_INIT_POINTER(dev->mpls_ptr, NULL);
+               if (alive > 0)
+                       rt->rt_flags &= ~nh_flags;

Again, this creates a dependence on the ordering of events.

+               rt->rt_nhn_alive = alive;
+       }

-       kfree_rcu(mdev, rcu);
+       return;
  }

  static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
@@ -910,9 +1007,9 @@ static int mpls_dev_notify(struct notifier_block *this, 
unsigned long event,
  {
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct mpls_dev *mdev;
+       unsigned int flags;

-       switch(event) {
-       case NETDEV_REGISTER:
+       if (event == NETDEV_REGISTER) {
                /* For now just support ethernet devices */
                if ((dev->type == ARPHRD_ETHER) ||
                    (dev->type == ARPHRD_LOOPBACK)) {
@@ -920,10 +1017,39 @@ static int mpls_dev_notify(struct notifier_block *this, 
unsigned long event,
                        if (IS_ERR(mdev))
                                return notifier_from_errno(PTR_ERR(mdev));
                }
-               break;
+               return NOTIFY_OK;
+       }

+       mdev = mpls_dev_get(dev);
+       if (!mdev)
+               return NOTIFY_OK;
+
+       switch (event) {
+       case NETDEV_DOWN:
+               mpls_ifdown(dev, event);
+               break;
+       case NETDEV_UP:
+               flags = dev_get_flags(dev);
+               if (flags & (IFF_RUNNING | IFF_LOWER_UP))
+                       mpls_ifup(dev, RTNH_F_DEAD | RTNH_F_LINKDOWN);
+               else
+                       mpls_ifup(dev, RTNH_F_DEAD);
+               break;
+       case NETDEV_CHANGE:
+               flags = dev_get_flags(dev);
+               if (flags & (IFF_RUNNING | IFF_LOWER_UP))
+                       mpls_ifup(dev, RTNH_F_DEAD | RTNH_F_LINKDOWN);
+               else
+                       mpls_ifdown(dev, event);
+               break;
        case NETDEV_UNREGISTER:
-               mpls_ifdown(dev);
+               mpls_ifdown(dev, event);
+               mdev = mpls_dev_get(dev);
+               if (mdev) {
+                       mpls_dev_sysctl_unregister(mdev);
+                       RCU_INIT_POINTER(dev->mpls_ptr, NULL);
+                       kfree_rcu(mdev, rcu);
+               }
                break;
        case NETDEV_CHANGENAME:
                mdev = mpls_dev_get(dev);
@@ -1237,6 +1363,10 @@ static int mpls_dump_route(struct sk_buff *skb, u32 
portid, u32 seq, int event,
                dev = rtnl_dereference(nh->nh_dev);
                if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex))
                        goto nla_put_failure;
+               if (nh->nh_flags & RTNH_F_LINKDOWN)
+                       rtm->rtm_flags |= RTNH_F_LINKDOWN;
+               if (nh->nh_flags & RTNH_F_DEAD)
+                       rtm->rtm_flags |= RTNH_F_DEAD;
        } else {
                struct rtnexthop *rtnh;
                struct nlattr *mp;
@@ -1253,6 +1383,11 @@ static int mpls_dump_route(struct sk_buff *skb, u32 
portid, u32 seq, int event,
                        dev = rtnl_dereference(nh->nh_dev);
                        if (dev)
                                rtnh->rtnh_ifindex = dev->ifindex;
+                       if (nh->nh_flags & RTNH_F_LINKDOWN)
+                               rtnh->rtnh_flags |= RTNH_F_LINKDOWN;
+                       if (nh->nh_flags & RTNH_F_DEAD)
+                               rtnh->rtnh_flags |= RTNH_F_DEAD;
+

You never read from rt->rt_flags. Was your intention to set rtm->rtm_flags using that field in this function?

                        if (nh->nh_labels && nla_put_labels(skb, RTA_NEWDST,
                                                            nh->nh_labels,
                                                            nh->nh_label))
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
index bde52ce..4f9bf2b 100644
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -41,6 +41,7 @@ enum mpls_payload_type {

  struct mpls_nh { /* next hop label forwarding entry */
        struct net_device __rcu *nh_dev;
+       unsigned int            nh_flags;

This could be implemented as a u8 (or even two 1-bit fields) after nh_via_table (making use of the 1-byte hole already there) without increasing the size of the structure by a fifth.

        u32                     nh_label[MAX_NEW_LABELS];
        u8                      nh_labels;
        u8                      nh_via_alen;
@@ -70,10 +71,12 @@ struct mpls_nh { /* next hop label forwarding entry */
   */
  struct mpls_route { /* next hop label forwarding entry */
        struct rcu_head         rt_rcu;
+       unsigned int            rt_flags;

Storing this is unnecessary - it can be derived from rt_nhn_alive == 0 and looking at the nexthop flags, which also avoids the event ordering problems described above.

Thanks,
Rob

        u8                      rt_protocol;
        u8                      rt_payload_type;
        u8                      rt_max_alen;
        unsigned int            rt_nhn;
+       unsigned int            rt_nhn_alive;
        struct mpls_nh          rt_nh[0];
  };


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to