Add a new ve feature, VE_FEATURE_BPF, to allow bpf cgroup device
controllers on a per-container basis. This is needed to support Docker
inside Virtuozzo containers on cgroup-v2.

When the feature is enabled, it allows bpf_prog_query(BPF_CGROUP_DEVICE)
for tasks that have CAP_NET_ADMIN or CAP_SYS_ADMIN in the container. This
is needed to let Docker read back the information about the device
control programs it attaches to cgroups.

It also allows bpf_prog_load(BPF_PROG_TYPE_CGROUP_DEVICE) for tasks that
have CAP_BPF+CAP_NET_ADMIN or CAP_SYS_ADMIN in the container. This lets
Docker actually load device control programs.

All the capability checks, including the fallback to CAP_SYS_ADMIN,
mirror the capability checks in the original code path, with the
exception that they are now relative to the ve.

VE_FEATURE_BPF is similar to the kernel.unprivileged_bpf_disabled sysctl,
which allows a small number of program types to be loaded without
privileges, subject to restrictions such as an instruction count check
(4096 as opposed to 1M in privileged mode) and an unaligned access check.
It should therefore be less prone to verifier-targeted exploits than
allowing it without restrictions.

https://virtuozzo.atlassian.net/browse/VSTOR-126504
Signed-off-by: Pavel Tikhomirov <[email protected]>

--
Tested by adding VE_FEATURE_BPF to VE_FEATURES_DEF, so that it is enabled
by default. In real life, though, we had to implement the new "bpf"
feature in libvzctl to actually be able to toggle it.

Possible problems:

- Even with all the extra restrictions of the unprivileged bpf context, we
can still face exploits or kernel crashes from allowing this.

- Limited bpf JIT memory budget. We can later add per-VE bpf program count
limits to prevent one container from consuming all JIT memory on the
system by creating an excessive number of programs.

- The verifier uses the global bpf_verifier_lock, so allowing containers
to take it can lead to lock contention.

- If Docker's device controllers start to use maps, this will not be
enough and we will need to patch more bpf checks.
---
 include/uapi/linux/vzcalluser.h |  1 +
 kernel/bpf/syscall.c            | 34 ++++++++++++++++++++++++---------
 2 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/include/uapi/linux/vzcalluser.h b/include/uapi/linux/vzcalluser.h
index b04594d31666..000e3ee107ad 100644
--- a/include/uapi/linux/vzcalluser.h
+++ b/include/uapi/linux/vzcalluser.h
@@ -48,6 +48,7 @@ struct vzctl_ve_configure {
 #define VE_FEATURE_BRIDGE      (1ULL << 7)
 #define VE_FEATURE_NFSD                (1ULL << 8)
 #define VE_FEATURE_TIME                (1ULL << 9)
+#define VE_FEATURE_BPF         (1ULL << 10)
 
 #define VE_FEATURES_OLD                (VE_FEATURE_SYSFS)
 #define VE_FEATURES_DEF                (VE_FEATURE_SYSFS | 
VE_FEATURE_DEF_PERMS)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2c901fc67570..62d945ad4c41 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -36,6 +36,7 @@
 #include <linux/rcupdate_trace.h>
 #include <linux/memcontrol.h>
 #include <linux/trace_events.h>
+#include <linux/vzcalluser.h>
 
 #include <net/netfilter/nf_bpf_link.h>
 #include <net/netkit.h>
@@ -1217,6 +1218,12 @@ static bool bpf_net_capable(void)
        return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
 }
 
+static bool ve_bpf_capable(int cap)
+{
+       return feature_capable(VE_FEATURE_BPF, cap) ||
+              (cap != CAP_SYS_ADMIN && feature_capable(VE_FEATURE_BPF, 
CAP_SYS_ADMIN));
+}
+
 #define BPF_MAP_CREATE_LAST_FIELD map_token_fd
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
@@ -2706,21 +2713,27 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t 
uattr, u32 uattr_size)
         * capability checks are still carried out for these
         * and other operations.
         */
-       if (sysctl_unprivileged_bpf_disabled && !bpf_cap)
-               goto put_token;
-
+       if (!bpf_cap) {
+               if (type == BPF_PROG_TYPE_CGROUP_DEVICE) {
+                       if (!ve_bpf_capable(CAP_BPF))
+                               goto put_token;
+               } else if (type == BPF_PROG_TYPE_SOCKET_FILTER ||
+                          type == BPF_PROG_TYPE_CGROUP_SKB) {
+                       if (sysctl_unprivileged_bpf_disabled)
+                               goto put_token;
+               } else {
+                       goto put_token;
+               }
+       }
        if (attr->insn_cnt == 0 ||
            attr->insn_cnt > (bpf_cap ? BPF_COMPLEXITY_LIMIT_INSNS : 
BPF_MAXINSNS)) {
                err = -E2BIG;
                goto put_token;
        }
-       if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
-           type != BPF_PROG_TYPE_CGROUP_SKB &&
-           !bpf_cap)
-               goto put_token;
 
        if (is_net_admin_prog_type(type) && !bpf_token_capable(token, 
CAP_NET_ADMIN))
-               goto put_token;
+               if (type != BPF_PROG_TYPE_CGROUP_DEVICE || 
!ve_bpf_capable(CAP_NET_ADMIN))
+                       goto put_token;
        if (is_perfmon_prog_type(type) && !bpf_token_capable(token, 
CAP_PERFMON))
                goto put_token;
 
@@ -4198,8 +4211,11 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 static int bpf_prog_query(const union bpf_attr *attr,
                          union bpf_attr __user *uattr)
 {
-       if (!bpf_net_capable())
+       if (!bpf_net_capable()
+           && !(attr->query.attach_type == BPF_CGROUP_DEVICE &&
+                ve_bpf_capable(CAP_NET_ADMIN))) {
                return -EPERM;
+       }
        if (CHECK_ATTR(BPF_PROG_QUERY))
                return -EINVAL;
        if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE)
-- 
2.53.0

_______________________________________________
Devel mailing list
[email protected]
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to