In setups like the following:

  Host A -- tun0 -- ipsec -- eth0 ---- eth0 -- ipsec -- tun0 -- Host B
where the tun0 devices are tunnel devices using dst_cache (ipip, ipip6,
etc.), unregistration of an underlying eth0 device leads to the
following log messages:

  unregister_netdevice: waiting for eth0 to become free. Usage count = 2

This is because the device references held by xfrm dsts are not
released upon device unregistration when the xfrm dst is cached in a
dst_cache.

Several approaches for fixing this were discussed in [1]; this commit
keeps track of allocated xdsts and releases their device references on
a netdev unregister event.

Signed-off-by: Eyal Birger <eyal.bir...@gmail.com>

[1] https://patchwork.ozlabs.org/patch/869025/
---
v2:
  - call gc flush from the existing netdev notifier, per Shannon
    Nelson's suggestion
---
 include/net/xfrm.h     | 11 +++------
 net/xfrm/xfrm_device.c |  2 ++
 net/xfrm/xfrm_policy.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 72 insertions(+), 7 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 7d20776..4614047 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -324,6 +324,7 @@ void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo);
 void km_policy_notify(struct xfrm_policy *xp, int dir,
                       const struct km_event *c);
 void xfrm_policy_cache_flush(void);
+void xfrm_policy_dst_gc_flush_dev(struct net_device *dev);
 void km_state_notify(struct xfrm_state *x, const struct km_event *c);
 
 struct xfrm_tmpl;
@@ -994,6 +995,8 @@ struct xfrm_dst {
         u32 child_mtu_cached;
         u32 route_cookie;
         u32 path_cookie;
+        struct list_head xfrm_dst_gc;
+        struct xfrm_dst_list *xfrm_dst_gc_list;
 };
 
 static inline struct dst_entry *xfrm_dst_path(const struct dst_entry *dst)
@@ -1025,13 +1028,7 @@ static inline void xfrm_dst_set_child(struct xfrm_dst *xdst, struct dst_entry *c
         xdst->child = child;
 }
 
-static inline void xfrm_dst_destroy(struct xfrm_dst *xdst)
-{
-        xfrm_pols_put(xdst->pols, xdst->num_pols);
-        dst_release(xdst->route);
-        if (likely(xdst->u.dst.xfrm))
-                xfrm_state_put(xdst->u.dst.xfrm);
-}
+void xfrm_dst_destroy(struct xfrm_dst *xdst);
 #endif
 
 void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev);
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 8e70291..5bd9892 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -309,6 +309,7 @@ static int xfrm_dev_register(struct net_device *dev)
 static int xfrm_dev_unregister(struct net_device *dev)
 {
         xfrm_policy_cache_flush();
+        xfrm_policy_dst_gc_flush_dev(dev);
         return NOTIFY_DONE;
 }
 
@@ -323,6 +324,7 @@ static int xfrm_dev_down(struct net_device *dev)
                 xfrm_dev_state_flush(dev_net(dev), dev, true);
 
         xfrm_policy_cache_flush();
+        xfrm_policy_dst_gc_flush_dev(dev);
         return NOTIFY_DONE;
 }
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 7a23078..c7b9a9a 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -45,6 +45,12 @@ struct xfrm_flo {
         u8 flags;
 };
 
+struct xfrm_dst_list {
+        spinlock_t lock; /* sync writers */
+        struct list_head head;
+};
+
+static DEFINE_PER_CPU_ALIGNED(struct xfrm_dst_list, xfrm_dst_gc_list);
 static DEFINE_PER_CPU(struct xfrm_dst *, xfrm_last_dst);
 static struct work_struct *xfrm_pcpu_work __read_mostly;
 static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
@@ -94,6 +100,51 @@ __xfrm6_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
                (fl6->flowi6_oif == sel->ifindex || !sel->ifindex);
 }
 
+static void xfrm_dst_add_to_gc_list(struct xfrm_dst *xdst)
+{
+        struct xfrm_dst_list *xl = raw_cpu_ptr(&xfrm_dst_gc_list);
+
+        xdst->xfrm_dst_gc_list = xl;
+
+        spin_lock_bh(&xl->lock);
+        list_add_tail(&xdst->xfrm_dst_gc, &xl->head);
+        spin_unlock_bh(&xl->lock);
+}
+
+void xfrm_dst_destroy(struct xfrm_dst *xdst)
+{
+        xfrm_pols_put(xdst->pols, xdst->num_pols);
+        dst_release(xdst->route);
+        if (likely(xdst->u.dst.xfrm))
+                xfrm_state_put(xdst->u.dst.xfrm);
+        if (!list_empty(&xdst->xfrm_dst_gc)) {
+                struct xfrm_dst_list *xl = xdst->xfrm_dst_gc_list;
+
+                spin_lock_bh(&xl->lock);
+                list_del(&xdst->xfrm_dst_gc);
+                spin_unlock_bh(&xl->lock);
+        }
+}
+EXPORT_SYMBOL_GPL(xfrm_dst_destroy);
+
+void xfrm_policy_dst_gc_flush_dev(struct net_device *dev)
+{
+        struct xfrm_dst *xdst;
+        int cpu;
+
+        for_each_possible_cpu(cpu) {
+                struct xfrm_dst_list *xl = &per_cpu(xfrm_dst_gc_list, cpu);
+
+                spin_lock_bh(&xl->lock);
+                list_for_each_entry(xdst, &xl->head, xfrm_dst_gc) {
+                        if (xdst->u.dst.dev != dev)
+                                continue;
+                        dst_dev_put(&xdst->u.dst);
+                }
+                spin_unlock_bh(&xl->lock);
+        }
+}
+
 bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl,
                          unsigned short family)
 {
@@ -1581,6 +1632,8 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
                 }
                 bundle[i] = xdst;
 
+                xfrm_dst_add_to_gc_list(xdst);
+
                 if (!xdst_prev)
                         xdst0 = xdst;
                 else
@@ -2984,6 +3037,18 @@ static struct pernet_operations __net_initdata xfrm_net_ops = {
         .exit = xfrm_net_exit,
 };
 
+static void __init xfrm_dst_gc_init(void)
+{
+        int cpu;
+
+        for_each_possible_cpu(cpu) {
+                struct xfrm_dst_list *xl = &per_cpu(xfrm_dst_gc_list, cpu);
+
+                INIT_LIST_HEAD(&xl->head);
+                spin_lock_init(&xl->lock);
+        }
+}
+
 void __init xfrm_init(void)
 {
         int i;
@@ -2998,6 +3063,7 @@ void __init xfrm_init(void)
         register_pernet_subsys(&xfrm_net_ops);
         seqcount_init(&xfrm_policy_hash_generation);
         xfrm_input_init();
+        xfrm_dst_gc_init();
 }
 
 #ifdef CONFIG_AUDITSYSCALL
-- 
2.7.4
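
For readers less familiar with the pattern, below is a minimal
userspace C sketch of the track-and-flush bookkeeping the patch
introduces. It is a simplification, not kernel code: a single global
list stands in for the per-CPU lists, a pthread mutex for the BH
spinlock, a plain counter for the netdev refcount, and hand-rolled
linkage for <linux/list.h>; all names (struct dev, xdst_alloc,
gc_flush_dev, ...) are hypothetical.

/*
 * Sketch: every "xdst" is linked onto a GC list when allocated, unlinks
 * itself when destroyed, and a flush pass drops the device reference of
 * any entry that still pins a device being unregistered.
 */
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

struct dev {
        const char *name;
        int refcnt;                     /* stands in for the netdev refcount */
};

struct xdst {
        struct dev *dev;                /* reference pinning the device */
        struct xdst *prev, *next;       /* linkage on the GC list */
};

static struct {
        pthread_mutex_t lock;           /* sync writers, like xl->lock */
        struct xdst head;               /* circular list head */
} gc_list = {
        PTHREAD_MUTEX_INITIALIZER,
        { NULL, &gc_list.head, &gc_list.head }
};

static struct xdst *xdst_alloc(struct dev *dev)
{
        struct xdst *x = malloc(sizeof(*x));

        x->dev = dev;
        dev->refcnt++;                  /* take the device reference */

        pthread_mutex_lock(&gc_list.lock);
        x->prev = gc_list.head.prev;    /* list_add_tail() equivalent */
        x->next = &gc_list.head;
        x->prev->next = x;
        gc_list.head.prev = x;
        pthread_mutex_unlock(&gc_list.lock);
        return x;
}

static void xdst_destroy(struct xdst *x)
{
        pthread_mutex_lock(&gc_list.lock);
        x->prev->next = x->next;        /* list_del() equivalent */
        x->next->prev = x->prev;
        pthread_mutex_unlock(&gc_list.lock);

        if (x->dev)                     /* skip if already flushed */
                x->dev->refcnt--;
        free(x);
}

/* On "unregister", drop device refs of entries still cached on the list. */
static void gc_flush_dev(struct dev *dev)
{
        struct xdst *x;

        pthread_mutex_lock(&gc_list.lock);
        for (x = gc_list.head.next; x != &gc_list.head; x = x->next) {
                if (x->dev != dev)
                        continue;
                x->dev->refcnt--;       /* like dst_dev_put() releasing dev */
                x->dev = NULL;          /* don't drop the ref twice later */
        }
        pthread_mutex_unlock(&gc_list.lock);
}

int main(void)
{
        struct dev eth0 = { "eth0", 1 };        /* 1 = base reference */
        struct xdst *a = xdst_alloc(&eth0);     /* refcnt -> 2 */
        struct xdst *b = xdst_alloc(&eth0);     /* refcnt -> 3 */

        xdst_destroy(a);                        /* normal path: refcnt -> 2 */
        gc_flush_dev(&eth0);                    /* b still cached: refcnt -> 1 */
        printf("eth0 refcnt = %d (expect 1)\n", eth0.refcnt);
        xdst_destroy(b);                        /* already flushed: no change */
        return 0;
}

The kernel version shards this list per CPU (DEFINE_PER_CPU_ALIGNED) so
the hot allocation path in xfrm_bundle_create() only takes a CPU-local
lock, while xfrm_policy_dst_gc_flush_dev() walks every CPU's list; that
trade-off is reasonable because device unregistration is rare.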