> diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c
> index 716df49d7647..1f726a7b22e3 100644
> --- a/mm/bpf_memcontrol.c
> +++ b/mm/bpf_memcontrol.c

[ ... ]

> +/* Unregister the struct ops instance */
> +static void bpf_memcg_ops_unreg(void *kdata, struct bpf_link *link)
> +{
> +     struct bpf_struct_ops_link *ops_link;
> +     struct memcg_bpf_ops *ops = kdata;
> +     struct cgroup_subsys_state *css;
> +     struct mem_cgroup *memcg;
> +     struct mem_cgroup *iter;
> +     struct memcg_bpf_ops *parent_bpf_ops = NULL;
> +     u32 parent_bpf_ops_flags = 0;
> +
> +     if (!link)
> +             return;
> +     ops_link = container_of(link, struct bpf_struct_ops_link, link);
> +     if (!ops_link->cgroup)
> +             return;

Can this early return skip a required synchronize_srcu(&memcg_bpf_srcu)?

When a cgroup that owns a memcg_bpf_ops link is destroyed,
cgroup_bpf_release() takes cgroup_mutex, splices the struct_ops_links,
sets st_link->cgroup = NULL, drops cgroup_mutex, and then calls
link->ops->detach() outside the mutex.  detach()
(bpf_struct_ops_map_link_detach) invokes unreg() with link->cgroup
already NULL, which hits this early return:

        if (!ops_link->cgroup)
                return;

At that point descendants of the destroyed cgroup still hold
memcg->bpf_ops pointing at the registered ops (kdata).  Memcgs in that
subtree are not freed immediately because folios pin them via
folio->memcg_data after offline, and uncharge paths
(memcg_uncharge() -> bpf_memcg_uncharged()) keep dereferencing those
memcgs.

bpf_memcg_uncharged()/bpf_memcg_charged() use the BPF_MEMCG_CALL macros,
which take srcu_read_lock(&memcg_bpf_srcu), READ_ONCE(memcg->bpf_ops),
and then dereference __ops->op.  After unreg() returns, the
bpf_struct_ops infrastructure eventually frees kdata via
bpf_struct_ops_map_free(), which calls
synchronize_rcu_mult(call_rcu, call_rcu_tasks) and waits only for
classic RCU and RCU-tasks grace periods, not SRCU.

If the unreg() path returns without calling
synchronize_srcu(&memcg_bpf_srcu), can an SRCU reader holding a pointer
into the about-to-be-freed kdata observe a use-after-free when invoking
__ops->op()?

Would moving the synchronize_srcu(&memcg_bpf_srcu) call so it runs on
both early-return paths (or unconditionally before returning) be
appropriate here?  The other early return (link == NULL) looks benign
because reg() returns -EOPNOTSUPP for link == NULL and never writes any
memcg->bpf_ops, so no synchronize is required on that path.

> +     cgroup_lock();
> +
> +     css = cgroup_e_css(ops_link->cgroup, &memory_cgrp_subsys);
> +     if (!css)
> +             goto unlock_out;
> +     memcg = mem_cgroup_from_css(css);
> +
> +     /* Get the parent bpf_ops and bpf_ops_flags */
> +     iter = parent_mem_cgroup(memcg);
> +     if (iter) {
> +             parent_bpf_ops = READ_ONCE(iter->bpf_ops);
> +             parent_bpf_ops_flags = iter->bpf_ops_flags;
> +     }
> +
> +     iter = NULL;
> +     while ((iter = mem_cgroup_iter(memcg, iter, NULL))) {
> +             if (READ_ONCE(iter->bpf_ops) == ops) {
> +                     WRITE_ONCE(iter->bpf_ops, parent_bpf_ops);
> +                     iter->bpf_ops_flags = parent_bpf_ops_flags;
> +             }
> +     }
> +
> +unlock_out:
> +     cgroup_unlock();
> +     synchronize_srcu(&memcg_bpf_srcu);
> +}

[ ... ]

> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 749c128b4fad..3494f36d792d 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c

[ ... ]

> @@ -2516,11 +2522,15 @@ void __mem_cgroup_handle_over_high(gfp_t gfp_mask)
>        * memory.high is breached and reclaim is unable to keep up. Throttle
>        * allocators proactively to slow down excessive growth.
>        */
> -     penalty_jiffies = calculate_high_delay(memcg, nr_pages,
> -                                            mem_find_max_overage(memcg));
> +     if (nr_pages) {
> +             penalty_jiffies = calculate_high_delay(
> +                     memcg, nr_pages, mem_find_max_overage(memcg));
>
> -     penalty_jiffies += calculate_high_delay(memcg, nr_pages,
> -                                             swap_find_max_overage(memcg));
> +             penalty_jiffies += calculate_high_delay(
> +                     memcg, nr_pages, swap_find_max_overage(memcg));
> +     } else
> +             penalty_jiffies = 0;
> +     penalty_jiffies = max(penalty_jiffies, bpf_high_delay);

This isn't a bug, but the if/else here mixes braced and unbraced
branches.  Documentation/process/coding-style.rst (Placing Braces and
Spaces) asks that when one arm of an if/else uses braces, the other arm
use them too.  Would something like the following be preferable, and
quiet checkpatch.pl?

        if (nr_pages) {
                penalty_jiffies = calculate_high_delay(
                        memcg, nr_pages, mem_find_max_overage(memcg));

                penalty_jiffies += calculate_high_delay(
                        memcg, nr_pages, swap_find_max_overage(memcg));
        } else {
                penalty_jiffies = 0;
        }


---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/26429228214

Reply via email to