> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index dd4f34e041ab..bd1b94a618ad 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1922,6 +1922,9 @@ struct bpf_raw_tp_link {
> struct bpf_struct_ops_link {
> struct bpf_link link;
> struct bpf_map __rcu *map;
> + struct cgroup *cgroup;
> + bool cgroup_removed;
> + struct list_head list;
> wait_queue_head_t wait_hup;
> };
Yafang Shao raised a design concern about the struct_ops_link structure:
The struct has hardcoded cgroup-specific fields. He suggested a more
generic solution using 'int type' and 'void *private' fields to support
other structs (task, etc.) in the future. This design feedback was not
addressed or responded to.
Reference:
https://lore.kernel.org/bpf/CALOAHbDXmgi=yeB1c2zzQ7-Lz2+MEZvtbxQp1=mwxmjssg3...@mail.gmail.com/
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index aec171ccb6ef..f547613986cc 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1246,6 +1246,7 @@ enum bpf_perf_event_type {
> #define BPF_F_AFTER (1U << 4)
> #define BPF_F_ID (1U << 5)
> #define BPF_F_PREORDER (1U << 6)
> +#define BPF_F_CGROUP_FD (1U << 7)
> #define BPF_F_LINK BPF_F_LINK /* 1 << 13 */
Since both fdinfo and link_info show the cgroup ID, would BPF_F_CGROUP_ID
be a better name than BPF_F_CGROUP_FD for alignment?
This naming suggestion came from Yafang Shao and was not addressed or
responded to.
Reference:
https://lore.kernel.org/bpf/CALOAHbDXmgi=yeB1c2zzQ7-Lz2+MEZvtbxQp1=mwxmjssg3...@mail.gmail.com/
> diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
> index 7ad3b1a49dee..e63f926d8728 100644
> --- a/kernel/bpf/bpf_struct_ops.c
> +++ b/kernel/bpf/bpf_struct_ops.c
> @@ -1264,6 +1270,7 @@ static void bpf_struct_ops_map_link_show_fdinfo(const
> struct bpf_link *link,
> {
> struct bpf_struct_ops_link *st_link;
> struct bpf_map *map;
> + u64 cgrp_id = 0;
Assigning 0 to cgrp_id would technically be incorrect. The cgroup_id()
function returns 1 for !CONFIG_CGROUPS, and for CONFIG_CGROUPS the ID
allocation via idr_alloc_cyclic() uses a range between 1 and INT_MAX.
Would a sentinel value like (u64)-1 be more appropriate?
Matt Bobrowski raised this concern and Roman acknowledged the review,
but this was not fixed in the current version.
Reference: https://lore.kernel.org/bpf/[email protected]/
> @@ -1271,6 +1278,14 @@ static void bpf_struct_ops_map_link_show_fdinfo(const
> struct bpf_link *link,
> if (map)
> seq_printf(seq, "map_id:\t%d\n", map->id);
> rcu_read_unlock();
> +
> + cgroup_lock();
> + if (st_link->cgroup)
> + cgrp_id = cgroup_id(st_link->cgroup);
> + cgroup_unlock();
Could an inline helper be introduced for the cgroup_lock()/cgroup_id()/
cgroup_unlock() pattern? This same dance appears in both
bpf_struct_ops_map_link_show_fdinfo() and
bpf_struct_ops_map_link_fill_link_info().
Matt Bobrowski suggested this and Roman said he would try, but the
helper was not added in the current version.
Reference: https://lore.kernel.org/bpf/[email protected]/
> @@ -1393,8 +1422,12 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
> struct bpf_link_primer link_primer;
> struct bpf_struct_ops_map *st_map;
> struct bpf_map *map;
> + struct cgroup *cgrp;
> int err;
>
> + if (attr->link_create.flags & ~BPF_F_CGROUP_FD)
> + return -EINVAL;
Should all the cgroup-related logic be protected by CONFIG_CGROUPS ifdef?
Both cgroup_get_from_fd() and cgroup_put() lack stubs when building with
!CONFIG_CGROUPS. Should BPF_F_CGROUP_FD only be accepted when
CONFIG_BPF_CGROUP is enabled, returning -EOPNOTSUPP otherwise?
Matt Bobrowski raised this and Roman acknowledged the review, but these
guards were not added in the current version.
Reference: https://lore.kernel.org/bpf/[email protected]/
> @@ -1414,11 +1447,26 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
> bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS,
> &bpf_struct_ops_map_lops, NULL,
> attr->link_create.attach_type);
>
> + init_waitqueue_head(&link->wait_hup);
> +
> + if (attr->link_create.flags & BPF_F_CGROUP_FD) {
> + cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
> + if (IS_ERR(cgrp)) {
> + err = PTR_ERR(cgrp);
> + goto err_out;
> + }
> + link->cgroup = cgrp;
> + err = cgroup_bpf_attach_struct_ops(cgrp, link);
> + if (err) {
> + cgroup_put(cgrp);
> + link->cgroup = NULL;
> + goto err_out;
> + }
> + }
> +
> err = bpf_link_prime(&link->link, &link_primer);
> if (err)
> - goto err_out;
> -
> - init_waitqueue_head(&link->wait_hup);
> + goto err_put_cgroup;
>
> /* Hold the update_mutex such that the subsystem cannot
> * do link->ops->detach() before the link is fully initialized.
> @@ -1429,13 +1477,16 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
> mutex_unlock(&update_mutex);
> bpf_link_cleanup(&link_primer);
> link = NULL;
> - goto err_out;
> + goto err_put_cgroup;
> }
> RCU_INIT_POINTER(link->map, map);
> mutex_unlock(&update_mutex);
>
> return bpf_link_settle(&link_primer);
>
> +err_put_cgroup:
> + if (link && link->cgroup)
> + cgroup_bpf_detach_struct_ops(link->cgroup, link);
> err_out:
> bpf_map_put(map);
> kfree(link);
Can the error path here trigger a use-after-free?
After cgroup_bpf_attach_struct_ops() succeeds, the link is added to
cgrp->bpf.struct_ops_links and becomes reachable by cgroup_bpf_release().
If bpf_link_prime() then fails (FD exhaustion or memory pressure), the
error path checks link->cgroup and calls cgroup_bpf_detach_struct_ops().
However, if the cgroup is concurrently destroyed, cgroup_bpf_release()
can run:
cgroup_bpf_release():
cgroup_lock()
list_splice_init(&cgrp->bpf.struct_ops_links, &st_links)
for each st_link in st_links:
st_link->cgroup = NULL
st_link->cgroup_removed = true
bpf_link_inc_not_zero(&st_link->link) // refcount 1->2
cgroup_unlock()
...
st_link->link.ops->detach(&st_link->link)
bpf_link_put(&st_link->link)
At this point link->cgroup is NULL, so the err_put_cgroup check becomes
a no-op. Control proceeds to err_out and calls kfree(link), freeing the
memory.
Meanwhile, cgroup_bpf_release resumes its second loop and accesses the
already-freed link memory via st_link->link.ops->detach() and
bpf_link_put().
The error path uses kfree(link) directly, bypassing the bpf_link
refcount mechanism. Once the link is added to cgrp->bpf.struct_ops_links,
it becomes exposed to concurrent cgroup_bpf_release which can bump the
refcount. Direct kfree() ignores the additional reference and frees the
memory while the release work still holds a raw pointer.
In contrast, when the post-bpf_link_prime reg() failure occurs, the code
uses bpf_link_cleanup() and sets link = NULL, which is safe because it
relies on the refcount mechanism.
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index 876f6a81a9b6..b593ebb30a4e 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -307,12 +308,23 @@ static void cgroup_bpf_release(struct work_struct *work)
> bpf.release_work);
> struct bpf_prog_array *old_array;
> struct list_head *storages = &cgrp->bpf.storages;
> + struct bpf_struct_ops_link *st_link, *st_tmp;
> struct bpf_cgroup_storage *storage, *stmp;
> + LIST_HEAD(st_links);
>
> unsigned int atype;
>
> cgroup_lock();
>
> + list_splice_init(&cgrp->bpf.struct_ops_links, &st_links);
> + list_for_each_entry_safe(st_link, st_tmp, &st_links, list) {
> + st_link->cgroup = NULL;
> + st_link->cgroup_removed = true;
> + cgroup_put(cgrp);
> + if (IS_ERR(bpf_link_inc_not_zero(&st_link->link)))
> + list_del(&st_link->list);
> + }
Does this loop have the correct ordering for concurrent link dealloc?
The first loop writes to st_link memory (st_link->cgroup = NULL,
st_link->cgroup_removed = true) before calling bpf_link_inc_not_zero()
to take a reference.
Meanwhile, if userspace closes the link FD and the refcount drops to 0,
bpf_struct_ops_map_link_dealloc() runs:
bpf_struct_ops_map_link_dealloc():
st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link);
if (st_link->cgroup)
cgroup_bpf_detach_struct_ops(st_link->cgroup, st_link);
kfree(st_link);
The dealloc path reads st_link->cgroup without holding cgroup_mutex.
If this read sees the NULL written by cgroup_bpf_release, dealloc skips
cgroup_bpf_detach_struct_ops() and proceeds to kfree(st_link).
At that point, cgroup_bpf_release continues and performs use-after-free
on the freed st_link memory:
st_link->cgroup_removed = true // UAF write
bpf_link_inc_not_zero(&st_link->link) // UAF atomic on freed memory
list_del(&st_link->list) // UAF / list corruption
The link is on cgrp->bpf.struct_ops_links but list membership does not
increment the link refcount. The cgroup_lock would only synchronize the
dealloc path if dealloc unconditionally took it, but dealloc takes
cgroup_lock conditionally on (st_link->cgroup), which the racing writer
just nulled.
Would it be safer to call bpf_link_inc_not_zero() first before touching
st_link's other fields?
> @@ -2755,3 +2773,31 @@ cgroup_common_func_proto(enum bpf_func_id func_id,
> const struct bpf_prog *prog)
> +void cgroup_bpf_detach_struct_ops(struct cgroup *cgrp,
> + struct bpf_struct_ops_link *link)
> +{
> + cgroup_lock();
> + if (link->cgroup == cgrp) {
> + list_del(&link->list);
> + link->cgroup = NULL;
> + cgroup_put(cgrp);
> + }
> + cgroup_unlock();
> +}
Is the cgrp parameter needed in cgroup_bpf_attach_struct_ops() and
cgroup_bpf_detach_struct_ops()? Both Matt Bobrowski and Josh Don
suggested that both functions should operate on link->cgroup instead,
with a !link->cgroup being considered -EINVAL. Roman acknowledged with
'Ack' and 'Sure, good point' but the parameter was not removed.
Reference: https://lore.kernel.org/bpf/[email protected]/
---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
CI run summary: https://github.com/kernel-patches/bpf/actions/runs/26429228214