As part of the effort to remove the dependency on rtnl lock, the cls API is being converted to use fine-grained locking mechanisms instead of the global rtnl lock. However, the chain_head_change callback for the ingress Qdisc is a sleeping function and cannot be executed while holding a spinlock.
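The general pattern applied here is to defer the sleeping operation to a workqueue: the atomic path only records state and queues a work item, and the work handler, running in process context, is free to sleep. A minimal illustrative sketch of that pattern follows (the foo_* names are hypothetical, and the system workqueue stands in for the ordered workqueue added below):

#include <linux/workqueue.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>

struct foo {
        struct work_struct work;
        /* ... state that is torn down under rtnl lock ... */
};

/* The work handler runs in process context, so taking sleeping locks such
 * as rtnl_lock() is allowed here.
 */
static void foo_destroy_work(struct work_struct *work)
{
        struct foo *f = container_of(work, struct foo, work);

        rtnl_lock();
        /* ... sleeping teardown of f's state ... */
        rtnl_unlock();

        kfree(f);
}

/* May be called from atomic context (for example with a spinlock held):
 * it only queues the work item and never sleeps itself.
 */
static void foo_destroy(struct foo *f)
{
        INIT_WORK(&f->work, foo_destroy_work);
        schedule_work(&f->work);
}

The patch itself uses queue_work() on an ordered workqueue created with alloc_ordered_workqueue() rather than schedule_work(), so that work items execute one at a time in queueing order; that ordering is what guarantees a proto is destroyed before the Qdisc that contains it.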
Extend the cls API with a new workqueue intended to be used for tcf_proto lifetime management. Modify tcf_proto_destroy() to deallocate the proto asynchronously on the workqueue in order to ensure that all chain_head_change callbacks involving the proto complete before it is freed.

Convert mini_qdisc_pair_swap(), which is used as the chain_head_change callback for the ingress and clsact Qdiscs, to use the workqueue. Move Qdisc deallocation to the tc_proto_wq ordered workqueue that is used to destroy tcf_proto instances. This is necessary to ensure that a Qdisc is destroyed after all instances of chain/proto that it contains, in order to prevent a use-after-free in tc_chain_notify_delete().

Signed-off-by: Vlad Buslov <vla...@mellanox.com>
Suggested-by: Jiri Pirko <j...@mellanox.com>
---
Changes from V1 to V2:
- Use smp_store_release() instead of xchg() for setting miniqp->tp_head.
- Move Qdisc deallocation to the tc_proto_wq ordered workqueue that is used
  to destroy tcf proto instances. This is necessary to ensure that Qdisc is
  destroyed after all instances of chain/proto that it contains in order to
  prevent use-after-free error in tc_chain_notify_delete().
- Cache parent net device ifindex in block to prevent use-after-free of dev
  queue in tc_chain_notify_delete().

 include/net/sch_generic.h | 14 +++++++++++-
 net/sched/cls_api.c       | 52 ++++++++++++++++++++++++++++++++++++-------
 net/sched/sch_generic.c   | 56 ++++++++++++++++++++++++++++++++++++++++----
 net/sched/sch_ingress.c   |  4 ++++
 4 files changed, 113 insertions(+), 13 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 9481f2c142e2..80fc06468c79 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -109,7 +109,13 @@ struct Qdisc {
         spinlock_t busylock ____cacheline_aligned_in_smp;
         spinlock_t seqlock;
-        struct rcu_head rcu;
+        int ifindex; /* cached interface index of parent net
+                      * device
+                      */
+        union {
+                struct rcu_head rcu;
+                struct work_struct work;
+        };
 };

 static inline void qdisc_refcount_inc(struct Qdisc *qdisc)
@@ -322,6 +328,7 @@ struct tcf_proto {
         const struct tcf_proto_ops *ops;
         struct tcf_chain *chain;
         struct rcu_head rcu;
+        struct work_struct work;
 };

 struct qdisc_skb_cb {
@@ -400,6 +407,8 @@ tc_cls_offload_cnt_update(struct tcf_block *block, u32 *cnt,
         }
 }

+bool tc_queue_proto_work(struct work_struct *work);
+
 static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
 {
         struct qdisc_skb_cb *qcb;
@@ -1175,12 +1184,15 @@ struct mini_Qdisc_pair {
         struct mini_Qdisc miniq1;
         struct mini_Qdisc miniq2;
         struct mini_Qdisc __rcu **p_miniq;
+        struct tcf_proto *tp_head;
+        struct work_struct work;
 };

 void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
                           struct tcf_proto *tp_head);
 void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
                           struct mini_Qdisc __rcu **p_miniq);
+void mini_qdisc_pair_cleanup(struct mini_Qdisc_pair *miniqp);

 static inline void skb_tc_reinsert(struct sk_buff *skb, struct tcf_result *res)
 {
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index d92f44ac4c39..08f245abf04a 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -108,12 +108,14 @@ int register_tcf_proto_ops(struct tcf_proto_ops *ops)
 EXPORT_SYMBOL(register_tcf_proto_ops);

 static struct workqueue_struct *tc_filter_wq;
+static struct workqueue_struct *tc_proto_wq;

 int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
 {
         struct tcf_proto_ops *t;
         int rc = -ENOENT;

+        flush_workqueue(tc_proto_wq);
         /* Wait for outstanding call_rcu()s, if any, from a
          * tcf_proto_ops's destroy() handler.
          */
@@ -140,6 +142,12 @@ bool tcf_queue_work(struct rcu_work *rwork, work_func_t func)
 }
 EXPORT_SYMBOL(tcf_queue_work);

+bool tc_queue_proto_work(struct work_struct *work)
+{
+        return queue_work(tc_proto_wq, work);
+}
+EXPORT_SYMBOL(tc_queue_proto_work);
+
 /* Select new prio value from the range, managed by kernel. */

 static inline u32 tcf_auto_prio(struct tcf_proto *tp)
@@ -152,6 +160,23 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp)
         return TC_H_MAJ(first);
 }

+static void tcf_chain_put(struct tcf_chain *chain);
+
+static void tcf_proto_destroy_work(struct work_struct *work)
+{
+        struct tcf_proto *tp = container_of(work, struct tcf_proto, work);
+        struct tcf_chain *chain = tp->chain;
+
+        rtnl_lock();
+
+        tp->ops->destroy(tp, NULL);
+        module_put(tp->ops->owner);
+        kfree_rcu(tp, rcu);
+        tcf_chain_put(chain);
+
+        rtnl_unlock();
+}
+
 static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol,
                                           u32 prio, struct tcf_chain *chain,
                                           struct netlink_ext_ack *extack)
@@ -172,6 +197,7 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol,
         tp->protocol = protocol;
         tp->prio = prio;
         tp->chain = chain;
+        INIT_WORK(&tp->work, tcf_proto_destroy_work);

         err = tp->ops->init(tp);
         if (err) {
@@ -188,9 +214,7 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol,
 static void tcf_proto_destroy(struct tcf_proto *tp,
                               struct netlink_ext_ack *extack)
 {
-        tp->ops->destroy(tp, extack);
-        module_put(tp->ops->owner);
-        kfree_rcu(tp, rcu);
+        tc_queue_proto_work(&tp->work);
 }

 struct tcf_filter_chain_list_item {
@@ -362,7 +386,6 @@ static void tcf_chain_flush(struct tcf_chain *chain)
                 RCU_INIT_POINTER(chain->filter_chain, tp->next);
                 tcf_proto_destroy(tp, NULL);
                 tp = rtnl_dereference(chain->filter_chain);
-                tcf_chain_put(chain);
         }
 }

@@ -1378,7 +1401,6 @@ static void tcf_chain_tp_remove(struct tcf_chain *chain,
         if (tp == chain->filter_chain)
                 tcf_chain0_head_change(chain, next);
         RCU_INIT_POINTER(*chain_info->pprev, next);
-        tcf_chain_put(chain);
 }

 static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain,
@@ -1425,7 +1447,7 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb,
         tcm->tcm__pad1 = 0;
         tcm->tcm__pad2 = 0;
         if (q) {
-                tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
+                tcm->tcm_ifindex = q->ifindex;
                 tcm->tcm_parent = parent;
         } else {
                 tcm->tcm_ifindex = TCM_IFINDEX_MAGIC_BLOCK;
@@ -1667,8 +1689,14 @@ static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
                 tfilter_notify(net, skb, n, tp, block, q, parent, fh,
                                RTM_NEWTFILTER, false);
         } else {
-                if (tp_created)
+                if (tp_created) {
+                        /* tp wasn't inserted to chain tp list. Take reference
+                         * to chain manually for tcf_proto_destroy() to
+                         * release.
+                         */
+                        tcf_chain_hold(chain);
                         tcf_proto_destroy(tp, NULL);
+                }
         }

 errout:
@@ -2074,7 +2102,7 @@ static int tc_chain_fill_node(struct tcf_chain *chain, struct net *net,
         tcm->tcm__pad2 = 0;
         tcm->tcm_handle = 0;
         if (block->q) {
-                tcm->tcm_ifindex = qdisc_dev(block->q)->ifindex;
+                tcm->tcm_ifindex = block->q->ifindex;
                 tcm->tcm_parent = block->q->handle;
         } else {
                 tcm->tcm_ifindex = TCM_IFINDEX_MAGIC_BLOCK;
@@ -2598,6 +2626,12 @@ static int __init tc_filter_init(void)
         if (!tc_filter_wq)
                 return -ENOMEM;

+        tc_proto_wq = alloc_ordered_workqueue("tc_proto_workqueue", 0);
+        if (!tc_proto_wq) {
+                err = -ENOMEM;
+                goto err_proto_wq;
+        }
+
         err = register_pernet_subsys(&tcf_net_ops);
         if (err)
                 goto err_register_pernet_subsys;
@@ -2621,6 +2655,8 @@ static int __init tc_filter_init(void)
 err_rhash_setup_block_ht:
         unregister_pernet_subsys(&tcf_net_ops);
 err_register_pernet_subsys:
+        destroy_workqueue(tc_proto_wq);
+err_proto_wq:
         destroy_workqueue(tc_filter_wq);
         return err;
 }
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index de1663f7d3ad..617b1dbceb01 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -28,6 +28,7 @@
 #include <linux/if_vlan.h>
 #include <linux/skb_array.h>
 #include <linux/if_macvlan.h>
+#include <asm/barrier.h>
 #include <net/sch_generic.h>
 #include <net/pkt_sched.h>
 #include <net/dst.h>
@@ -881,6 +882,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
         sch->enqueue = ops->enqueue;
         sch->dequeue = ops->dequeue;
         sch->dev_queue = dev_queue;
+        sch->ifindex = dev->ifindex;
         dev_hold(dev);
         refcount_set(&sch->refcnt, 1);

@@ -953,11 +955,19 @@ void qdisc_free(struct Qdisc *qdisc)
         kfree((char *) qdisc - qdisc->padded);
 }

+static void qdisc_free_work(struct work_struct *work)
+{
+        struct Qdisc *qdisc = container_of(work, struct Qdisc, work);
+
+        qdisc_free(qdisc);
+}
+
 static void qdisc_free_cb(struct rcu_head *head)
 {
         struct Qdisc *q = container_of(head, struct Qdisc, rcu);

-        qdisc_free(q);
+        INIT_WORK(&q->work, qdisc_free_work);
+        tc_queue_proto_work(&q->work);
 }

 static void qdisc_destroy(struct Qdisc *qdisc)
@@ -1363,10 +1373,14 @@ static void mini_qdisc_rcu_func(struct rcu_head *head)
 {
 }

-void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
-                          struct tcf_proto *tp_head)
+void __mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
+                            struct tcf_proto *tp_head)
 {
-        struct mini_Qdisc *miniq_old = rtnl_dereference(*miniqp->p_miniq);
+        /* p_miniq is either changed on ordered workqueue or during miniqp
+         * cleanup. In both cases no concurrent modification is possible.
+         */
+        struct mini_Qdisc *miniq_old =
+                rcu_dereference_protected(*miniqp->p_miniq, 1);
         struct mini_Qdisc *miniq;

         if (!tp_head) {
@@ -1394,6 +1408,32 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
          */
         call_rcu_bh(&miniq_old->rcu, mini_qdisc_rcu_func);
 }
+
+void mini_qdisc_pair_swap_work(struct work_struct *work)
+{
+        struct mini_Qdisc_pair *miniqp = container_of(work,
+                                                       struct mini_Qdisc_pair,
+                                                       work);
+        struct tcf_proto *tp_head;
+
+        /* Reset tp_head pointer to error in order to differentiate it from NULL
+         * pointer, which is a valid value. Paired with smp_store_release().
+         */
+        tp_head = xchg(&miniqp->tp_head, ERR_PTR(-ENOENT));
+        if (!IS_ERR(tp_head))
+                __mini_qdisc_pair_swap(miniqp, tp_head);
+}
+
+void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
+                          struct tcf_proto *tp_head)
+{
+        /* Set tp_head to new value. We don't care if tp_head already contains
+         * some valid pointer, we just need to set miniq->filter_list to current
+         * tp_head.
+         */
+        smp_store_release(&miniqp->tp_head, tp_head);
+        tc_queue_proto_work(&miniqp->work);
+}
 EXPORT_SYMBOL(mini_qdisc_pair_swap);

 void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
@@ -1404,5 +1444,13 @@ void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
         miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats;
         miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats;
         miniqp->p_miniq = p_miniq;
+        INIT_WORK(&miniqp->work, mini_qdisc_pair_swap_work);
 }
 EXPORT_SYMBOL(mini_qdisc_pair_init);
+
+void mini_qdisc_pair_cleanup(struct mini_Qdisc_pair *miniqp)
+{
+        cancel_work_sync(&miniqp->work);
+        __mini_qdisc_pair_swap(miniqp, NULL);
+}
+EXPORT_SYMBOL(mini_qdisc_pair_cleanup);
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index ce3f55259d0d..8edc3c46bd9c 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -99,6 +99,7 @@ static void ingress_destroy(struct Qdisc *sch)
         struct ingress_sched_data *q = qdisc_priv(sch);

         tcf_block_put_ext(q->block, sch, &q->block_info);
+        mini_qdisc_pair_cleanup(&q->miniqp);
         net_dec_ingress_queue();
 }

@@ -245,6 +246,9 @@ static void clsact_destroy(struct Qdisc *sch)
         tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info);
         tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info);

+        mini_qdisc_pair_cleanup(&q->miniqp_ingress);
+        mini_qdisc_pair_cleanup(&q->miniqp_egress);
+
         net_dec_ingress_queue();
         net_dec_egress_queue();
 }
--
2.7.5