From: David Ahern <dsah...@gmail.com>

Exceptions are really per device, so move rt6i_exception_bucket to
fib_nh_common as a generic nhc_exceptions pointer. Move the flushed
flag to fib_nh_common as well (nhc_exceptions_flushed). Placing both in
fib_nh_common is a deliberate choice to reduce memory consumption: moving
them to fib6_nh pushes the struct over 256 bytes, which increases the
actual allocation of a fib entry to 512 bytes.

Exception flushing when a fib entry is deleted is limited to the exceptions
per nexthop that reference the to-be-deleted fib entry (i.e., 'from' points
to it). When a fib6_nh is released, all of its exceptions are flushed.

Move the core logic of rt6_flush_exceptions, rt6_remove_exception_rt and
rt6_update_exception_stamp_rt to helpers that can be invoked per fib6_nh.
In fib6_nh_flush_exceptions, only remove an exception if 'from' is NULL
(i.e., flushing all exceptions) or the exception's rt6_info->from matches
'from' (fib entry delete).

Signed-off-by: David Ahern <dsah...@gmail.com>
---
 include/net/ip6_fib.h |   6 +--
 include/net/ip_fib.h  |   4 +-
 net/ipv6/ip6_fib.c    |   7 ---
 net/ipv6/route.c      | 140 ++++++++++++++++++++++++++++++++------------------
 4 files changed, 96 insertions(+), 61 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 58dbb4e82908..c1d1e32e1a19 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -153,7 +153,6 @@ struct fib6_info {
        struct rt6key                   fib6_prefsrc;
 
        struct rt6_info * __percpu      *rt6i_pcpu;
-       struct rt6_exception_bucket __rcu *rt6i_exception_bucket;
 
 #ifdef CONFIG_IPV6_ROUTER_PREF
        unsigned long                   last_probe;
@@ -162,12 +161,11 @@ struct fib6_info {
        u32                             fib6_metric;
        u8                              fib6_protocol;
        u8                              fib6_type;
-       u8                              exception_bucket_flushed:1,
-                                       should_flush:1,
+       u8                              should_flush:1,
                                        dst_nocount:1,
                                        dst_nopolicy:1,
                                        dst_host:1,
-                                       unused:3;
+                                       unused:4;
 
        struct fib6_nh                  fib6_nh;
        struct rcu_head                 rcu;
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index cce437a1b2ff..063430ca0c6e 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -84,7 +84,8 @@ struct fib_nh_common {
        unsigned char           nhc_scope;
        u8                      nhc_family;
        u8                      nhc_has_gw:1,
-                               unused:7;
+                               nhc_exceptions_flushed:1,
+                               unused:6;
        union {
                __be32          ipv4;
                struct in6_addr ipv6;
@@ -96,6 +97,7 @@ struct fib_nh_common {
        /* v4 specific, but allows v6 gw with v4 routes */
        struct rtable __rcu * __percpu *nhc_pcpu_rth_output;
        struct rtable __rcu     *nhc_rth_input;
+       void __rcu              *nhc_exceptions;
 };
 
 struct fib_nh {
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 8c00609a1513..cce976a59a8c 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -170,16 +170,9 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
 void fib6_info_destroy_rcu(struct rcu_head *head)
 {
        struct fib6_info *f6i = container_of(head, struct fib6_info, rcu);
-       struct rt6_exception_bucket *bucket;
 
        WARN_ON(f6i->fib6_node);
 
-       bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1);
-       if (bucket) {
-               f6i->rt6i_exception_bucket = NULL;
-               kfree(bucket);
-       }
-
        if (f6i->rt6i_pcpu) {
                int cpu;
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index e0ee30cbd079..c66b9ac37036 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1413,6 +1413,7 @@ static unsigned int fib6_mtu(const struct fib6_info *rt)
 static int rt6_insert_exception(struct rt6_info *nrt,
                                struct fib6_info *ort)
 {
+       struct fib_nh_common *nhc = &ort->fib6_nh.nh_common;
        struct net *net = dev_net(nrt->dst.dev);
        struct rt6_exception_bucket *bucket;
        struct in6_addr *src_key = NULL;
@@ -1421,12 +1422,12 @@ static int rt6_insert_exception(struct rt6_info *nrt,
 
        spin_lock_bh(&rt6_exception_lock);
 
-       if (ort->exception_bucket_flushed) {
+       if (nhc->nhc_exceptions_flushed) {
                err = -EINVAL;
                goto out;
        }
 
-       bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
+       bucket = rcu_dereference_protected(nhc->nhc_exceptions,
                                        lockdep_is_held(&rt6_exception_lock));
        if (!bucket) {
                bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
@@ -1435,7 +1436,7 @@ static int rt6_insert_exception(struct rt6_info *nrt,
                        err = -ENOMEM;
                        goto out;
                }
-               rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
+               rcu_assign_pointer(nhc->nhc_exceptions, bucket);
        }
 
 #ifdef CONFIG_IPV6_SUBTREES
@@ -1490,8 +1491,9 @@ static int rt6_insert_exception(struct rt6_info *nrt,
        return err;
 }
 
-void rt6_flush_exceptions(struct fib6_info *rt)
+static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info 
*from)
 {
+       struct fib_nh_common *nhc = &nh->nh_common;
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *rt6_ex;
        struct hlist_node *tmp;
@@ -1499,17 +1501,21 @@ void rt6_flush_exceptions(struct fib6_info *rt)
 
        spin_lock_bh(&rt6_exception_lock);
        /* Prevent rt6_insert_exception() to recreate the bucket list */
-       rt->exception_bucket_flushed = 1;
+       if (!from)
+               nhc->nhc_exceptions_flushed = 1;
 
-       bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
+       bucket = rcu_dereference_protected(nhc->nhc_exceptions,
                                    lockdep_is_held(&rt6_exception_lock));
        if (!bucket)
                goto out;
 
        for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
-               hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
-                       rt6_remove_exception(bucket, rt6_ex);
-               WARN_ON_ONCE(bucket->depth);
+               hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) {
+                       if (!from ||
+                           rcu_access_pointer(rt6_ex->rt6i->from) == from)
+                               rt6_remove_exception(bucket, rt6_ex);
+               }
+               WARN_ON_ONCE(!from && bucket->depth);
                bucket++;
        }
 
@@ -1517,6 +1523,11 @@ void rt6_flush_exceptions(struct fib6_info *rt)
        spin_unlock_bh(&rt6_exception_lock);
 }
 
+void rt6_flush_exceptions(struct fib6_info *f6i)
+{
+       fib6_nh_flush_exceptions(&f6i->fib6_nh, f6i);
+}
+
 /* Find cached rt in the hash table inside passed in rt
  * Caller has to hold rcu_read_lock()
  */
@@ -1524,12 +1535,13 @@ static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
                                           struct in6_addr *daddr,
                                           struct in6_addr *saddr)
 {
+       struct fib_nh_common *nhc = &rt->fib6_nh.nh_common;
        struct rt6_exception_bucket *bucket;
        struct in6_addr *src_key = NULL;
        struct rt6_exception *rt6_ex;
        struct rt6_info *res = NULL;
 
-       bucket = rcu_dereference(rt->rt6i_exception_bucket);
+       bucket = rcu_dereference(nhc->nhc_exceptions);
 
 #ifdef CONFIG_IPV6_SUBTREES
        /* rt6i_src.plen != 0 indicates rt is in subtree
@@ -1549,25 +1561,20 @@ static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
        return res;
 }
 
-/* Remove the passed in cached rt from the hash table that contains it */
-static int rt6_remove_exception_rt(struct rt6_info *rt)
+static int fib6_nh_remove_exception(struct fib6_nh *nh, int plen,
+                                   const struct rt6_info *rt)
 {
+       struct fib_nh_common *nhc = &nh->nh_common;
+       const struct in6_addr *src_key = NULL;
        struct rt6_exception_bucket *bucket;
-       struct in6_addr *src_key = NULL;
        struct rt6_exception *rt6_ex;
-       struct fib6_info *from;
-       int err;
-
-       from = rcu_dereference(rt->from);
-       if (!from ||
-           !(rt->rt6i_flags & RTF_CACHE))
-               return -EINVAL;
+       int err = 0;
 
-       if (!rcu_access_pointer(from->rt6i_exception_bucket))
+       if (!rcu_access_pointer(nhc->nhc_exceptions))
                return -ENOENT;
 
        spin_lock_bh(&rt6_exception_lock);
-       bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
+       bucket = rcu_dereference_protected(nhc->nhc_exceptions,
                                    lockdep_is_held(&rt6_exception_lock));
 #ifdef CONFIG_IPV6_SUBTREES
        /* rt6i_src.plen != 0 indicates 'from' is in subtree
@@ -1576,39 +1583,43 @@ static int rt6_remove_exception_rt(struct rt6_info *rt)
         * Otherwise, the exception table is indexed by
         * a hash of only rt6i_dst.
         */
-       if (from->fib6_src.plen)
+       if (plen)
                src_key = &rt->rt6i_src.addr;
 #endif
        rt6_ex = __rt6_find_exception_spinlock(&bucket,
                                               &rt->rt6i_dst.addr,
                                               src_key);
-       if (rt6_ex) {
+       if (rt6_ex)
                rt6_remove_exception(bucket, rt6_ex);
-               err = 0;
-       } else {
+       else
                err = -ENOENT;
-       }
 
        spin_unlock_bh(&rt6_exception_lock);
        return err;
 }
 
-/* Find rt6_ex which contains the passed in rt cache and
- * refresh its stamp
- */
-static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
+/* Remove the passed in cached rt from the hash table that contains it */
+static int rt6_remove_exception_rt(struct rt6_info *rt)
 {
-       struct rt6_exception_bucket *bucket;
-       struct in6_addr *src_key = NULL;
-       struct rt6_exception *rt6_ex;
        struct fib6_info *from;
 
-       rcu_read_lock();
        from = rcu_dereference(rt->from);
        if (!from || !(rt->rt6i_flags & RTF_CACHE))
-               goto unlock;
+               return -EINVAL;
 
-       bucket = rcu_dereference(from->rt6i_exception_bucket);
+       return fib6_nh_remove_exception(&from->fib6_nh,
+                                       from->fib6_src.plen, rt);
+}
+
+static void fib6_nh_update_exception(struct fib6_nh *nh, int plen,
+                                    const struct rt6_info *rt)
+{
+       struct fib_nh_common *nhc = &nh->nh_common;
+       const struct in6_addr *src_key = NULL;
+       struct rt6_exception_bucket *bucket;
+       struct rt6_exception *rt6_ex;
+
+       bucket = rcu_dereference(nhc->nhc_exceptions);
 
 #ifdef CONFIG_IPV6_SUBTREES
        /* rt6i_src.plen != 0 indicates 'from' is in subtree
@@ -1617,15 +1628,28 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
         * Otherwise, the exception table is indexed by
         * a hash of only rt6i_dst.
         */
-       if (from->fib6_src.plen)
+       if (plen)
                src_key = &rt->rt6i_src.addr;
 #endif
-       rt6_ex = __rt6_find_exception_rcu(&bucket,
-                                         &rt->rt6i_dst.addr,
-                                         src_key);
+       rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key);
        if (rt6_ex)
                rt6_ex->stamp = jiffies;
+}
+
+/* Find rt6_ex which contains the passed in rt cache and
+ * refresh its stamp
+ */
+static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
+{
+       struct fib6_info *from;
+
+       rcu_read_lock();
 
+       from = rcu_dereference(rt->from);
+       if (!from || !(rt->rt6i_flags & RTF_CACHE))
+               goto unlock;
+
+       fib6_nh_update_exception(&from->fib6_nh, from->fib6_src.plen, rt);
 unlock:
        rcu_read_unlock();
 }
@@ -1655,11 +1679,12 @@ static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
                                       struct fib6_info *rt, int mtu)
 {
+       struct fib_nh_common *nhc = &rt->fib6_nh.nh_common;
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *rt6_ex;
        int i;
 
-       bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
+       bucket = rcu_dereference_protected(nhc->nhc_exceptions,
                                        lockdep_is_held(&rt6_exception_lock));
 
        if (!bucket)
@@ -1686,16 +1711,17 @@ static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
                                        struct in6_addr *gateway)
 {
+       struct fib_nh_common *nhc = &rt->fib6_nh.nh_common;
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *rt6_ex;
        struct hlist_node *tmp;
        int i;
 
-       if (!rcu_access_pointer(rt->rt6i_exception_bucket))
+       if (!rcu_access_pointer(nhc->nhc_exceptions))
                return;
 
        spin_lock_bh(&rt6_exception_lock);
-       bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
+       bucket = rcu_dereference_protected(nhc->nhc_exceptions,
                                     lockdep_is_held(&rt6_exception_lock));
 
        if (bucket) {
@@ -1768,15 +1794,18 @@ void rt6_age_exceptions(struct fib6_info *rt,
 {
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *rt6_ex;
+       struct fib_nh_common *nhc;
        struct hlist_node *tmp;
        int i;
 
-       if (!rcu_access_pointer(rt->rt6i_exception_bucket))
-               return;
-
        rcu_read_lock_bh();
+
+       nhc = &rt->fib6_nh.nh_common;
+       if (!rcu_access_pointer(nhc->nhc_exceptions))
+               goto out;
+
        spin_lock(&rt6_exception_lock);
-       bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
+       bucket = rcu_dereference_protected(nhc->nhc_exceptions,
                                    lockdep_is_held(&rt6_exception_lock));
 
        if (bucket) {
@@ -1790,6 +1819,7 @@ void rt6_age_exceptions(struct fib6_info *rt,
                }
        }
        spin_unlock(&rt6_exception_lock);
+out:
        rcu_read_unlock_bh();
 }
 
@@ -2596,6 +2626,7 @@ static unsigned int ip6_mtu(const struct dst_entry *dst)
 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
                      struct in6_addr *saddr)
 {
+       struct fib_nh_common *nhc = &f6i->fib6_nh.nh_common;
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *rt6_ex;
        struct in6_addr *src_key;
@@ -2614,7 +2645,7 @@ u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
                src_key = saddr;
 #endif
 
-       bucket = rcu_dereference(f6i->rt6i_exception_bucket);
+       bucket = rcu_dereference(nhc->nhc_exceptions);
        rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
        if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
                mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
@@ -3011,6 +3042,17 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
 
 void fib6_nh_release(struct fib6_nh *fib6_nh)
 {
+       struct fib_nh_common *nhc = &fib6_nh->nh_common;
+       struct rt6_exception_bucket *bucket;
+
+       fib6_nh_flush_exceptions(fib6_nh, NULL);
+
+       bucket = rcu_dereference_protected(nhc->nhc_exceptions, 1);
+       if (bucket) {
+               rcu_assign_pointer(nhc->nhc_exceptions, NULL);
+               kfree(bucket);
+       }
+
        fib_nh_common_release(&fib6_nh->nh_common);
 }
 
-- 
2.11.0

Reply via email to