On Thu, May 11, 2017 at 5:07 PM, Cong Wang <xiyou.wangc...@gmail.com> wrote:
> So, if I understand you correctly it is safe to NULL'ing
> nh_dev in NETDEV_UNREGISTER_FINAL, right?
>
> If still not, how about transfer nh_dev's to loopback_dev
> too in NETDEV_UNREGISTER? Like we transfer dst->dev.
>
> I don't want to touch the fast path to check for NULL, as
> it will change more code and slow down performance.

Finally I come up with the attached patch. Please let me know if
I still miss anything.
commit edc38ecab7101487b35fc9152e166a2670467e49
Author: Cong Wang <xiyou.wangc...@gmail.com>
Date:   Tue May 9 14:35:10 2017 -0700

    ipv4: restore rt->fi for reference counting, try #2
    
    Signed-off-by: Cong Wang <xiyou.wangc...@gmail.com>

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 6692c57..0f04f4d 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -392,6 +392,7 @@ int fib_unmerge(struct net *net);
 /* Exported by fib_semantics.c */
 int ip_fib_check_default(__be32 gw, struct net_device *dev);
 int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force);
+void fib_put_nh_devs(struct net_device *dev);
 int fib_sync_down_addr(struct net_device *dev, __be32 local);
 int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
 
diff --git a/include/net/route.h b/include/net/route.h
index 2cc0e14..4335eb7 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -69,6 +69,7 @@ struct rtable {
 
        struct list_head        rt_uncached;
        struct uncached_list    *rt_uncached_list;
+       struct fib_info         *fi; /* for refcnt to shared metrics */
 };
 
 static inline bool rt_is_input_route(const struct rtable *rt)
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 39bd1ed..59b0a1d 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1178,6 +1178,9 @@ static int fib_netdev_event(struct notifier_block *this, 
unsigned long event, vo
                fib_disable_ip(dev, event, true);
                rt_flush_dev(dev);
                return NOTIFY_DONE;
+       } else if (event == NETDEV_UNREGISTER_FINAL) {
+               fib_put_nh_devs(dev);
+               return NOTIFY_DONE;
        }
 
        in_dev = __in_dev_get_rtnl(dev);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index da449dd..b85c5bb 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1453,6 +1453,36 @@ int fib_sync_down_dev(struct net_device *dev, unsigned 
long event, bool force)
        return ret;
 }
 
+/* We have to release these nh_dev here because a dst could still hold a
+ * fib_info via rt->fi, we can't wait for GC, a socket could hold the dst
+ * for a long time.
+ */
+void fib_put_nh_devs(struct net_device *dev)
+{
+       struct fib_info *prev_fi = NULL;
+       unsigned int hash = fib_devindex_hashfn(dev->ifindex);
+       struct hlist_head *head = &fib_info_devhash[hash];
+       struct fib_nh *nh;
+
+       hlist_for_each_entry(nh, head, nh_hash) {
+               struct fib_info *fi = nh->nh_parent;
+
+               if (nh->nh_dev != dev || fi == prev_fi)
+                       continue;
+               prev_fi = fi;
+               change_nexthops(fi) {
+                       if (nexthop_nh->nh_dev == dev) {
+                               /* This should be safe, we are on unregister
+                                * path, after synchronize_net() and
+                                * rcu_barrier(), no one could see this.
+                                */
+                               RCU_INIT_POINTER(nexthop_nh->nh_dev, NULL);
+                               dev_put(dev);
+                       }
+               } endfor_nexthops(fi)
+       }
+}
+
 /* Must be invoked inside of an RCU protected region.  */
 static void fib_select_default(const struct flowi4 *flp, struct fib_result 
*res)
 {
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 655d9ee..f647310 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1387,6 +1387,11 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
 {
        struct rtable *rt = (struct rtable *) dst;
 
+       if (rt->fi) {
+               fib_info_put(rt->fi);
+               rt->fi = NULL;
+       }
+
        if (!list_empty(&rt->rt_uncached)) {
                struct uncached_list *ul = rt->rt_uncached_list;
 
@@ -1424,6 +1429,16 @@ static bool rt_cache_valid(const struct rtable *rt)
                !rt_is_expired(rt);
 }
 
+static void rt_init_metrics(struct rtable *rt, struct fib_info *fi)
+{
+       if (fi->fib_metrics != (u32 *)dst_default_metrics) {
+               fib_info_hold(fi);
+               rt->fi = fi;
+       }
+
+       dst_init_metrics(&rt->dst, fi->fib_metrics, true);
+}
+
 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
@@ -1438,7 +1453,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 
daddr,
                        rt->rt_gateway = nh->nh_gw;
                        rt->rt_uses_gateway = 1;
                }
-               dst_init_metrics(&rt->dst, fi->fib_metrics, true);
+               rt_init_metrics(rt, fi);
 #ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
 #endif
@@ -1490,6 +1505,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
                rt->rt_gateway = 0;
                rt->rt_uses_gateway = 0;
                rt->rt_table_id = 0;
+               rt->fi = NULL;
                INIT_LIST_HEAD(&rt->rt_uncached);
 
                rt->dst.output = ip_output;

Reply via email to