On 8/7/19 6:22 PM, Carlos Antonio Neira Bustos wrote: > The code has been modified to avoid syscalls that could sleep. > Please let me know if any other modification is needed. > > From be0384c0fa209a78c1567936e8db4e35b9a7c0f8 Mon Sep 17 00:00:00 2001 > From: Carlos <cneirabus...@gmail.com> > Date: Wed, 7 Aug 2019 20:04:30 -0400 > Subject: [PATCH] [PATCH v5 bpf-next] BPF: New helper to obtain namespace data > from current task > > This helper obtains the active namespace from current and returns pid, tgid, > device and namespace id as seen from that namespace, allowing to instrument > a process inside a container. > Device is read from /proc/self/ns/pid, as in the future it's possible that > different pid_ns files may belong to different devices, according > to the discussion between Eric Biederman and Yonghong in 2017 linux plumbers > conference. > Currently bpf_get_current_pid_tgid(), is used to do pid filtering in bcc's > scripts but this helper returns the pid as seen by the root namespace which is > fine when a bcc script is not executed inside a container. > When the process of interest is inside a container, pid filtering will not > work > if bpf_get_current_pid_tgid() is used. This helper addresses this limitation > returning the pid as it's seen by the current namespace where the script is > executing. > > This helper has the same use cases as bpf_get_current_pid_tgid() as it can be > used to do pid filtering even inside a container. > > For example a bcc script using bpf_get_current_pid_tgid() > (tools/funccount.py): > > u32 pid = bpf_get_current_pid_tgid() >> 32; > if (pid != <pid_arg_passed_in>) > return 0; > Could be modified to use bpf_get_current_pidns_info() as follows: > > struct bpf_pidns pidns; > bpf_get_current_pidns_info(&pidns, sizeof(struct bpf_pidns)); > u32 pid = pidns.tgid; > u32 nsid = pidns.nsid; > if ((pid != <pid_arg_passed_in>) && (nsid != <nsid_arg_passed_in>)) > return 0; > > To find out the name PID namespace id of a process, you could use this > command: > > $ ps -h -o pidns -p <pid_of_interest> > > Or this other command: > > $ ls -Li /proc/<pid_of_interest>/ns/pid > > Signed-off-by: Carlos Neira <cneirabus...@gmail.com> > --- > fs/namei.c | 2 +- > include/linux/bpf.h | 1 + > include/linux/namei.h | 4 + > include/uapi/linux/bpf.h | 29 ++++- > kernel/bpf/core.c | 1 + > kernel/bpf/helpers.c | 78 ++++++++++++ > kernel/trace/bpf_trace.c | 2 + > samples/bpf/Makefile | 3 + > samples/bpf/trace_ns_info_user.c | 35 ++++++ > samples/bpf/trace_ns_info_user_kern.c | 44 +++++++ > tools/include/uapi/linux/bpf.h | 29 ++++- > tools/testing/selftests/bpf/Makefile | 2 +- > tools/testing/selftests/bpf/bpf_helpers.h | 3 + > .../testing/selftests/bpf/progs/test_pidns_kern.c | 51 ++++++++ > tools/testing/selftests/bpf/test_pidns.c | 138 > +++++++++++++++++++++ > 15 files changed, 418 insertions(+), 4 deletions(-) > create mode 100644 samples/bpf/trace_ns_info_user.c > create mode 100644 samples/bpf/trace_ns_info_user_kern.c > create mode 100644 tools/testing/selftests/bpf/progs/test_pidns_kern.c > create mode 100644 tools/testing/selftests/bpf/test_pidns.c > > diff --git a/fs/namei.c b/fs/namei.c > index 209c51a5226c..d1eca36972d2 100644 > --- a/fs/namei.c > +++ b/fs/namei.c > @@ -19,7 +19,6 @@ > #include <linux/export.h> > #include <linux/kernel.h> > #include <linux/slab.h> > -#include <linux/fs.h> > #include <linux/namei.h> > #include <linux/pagemap.h> > #include <linux/fsnotify.h> > @@ -2355,6 +2354,7 @@ int filename_lookup(int dfd, struct filename *name, > unsigned flags, > putname(name); > return retval; > } > +EXPORT_SYMBOL(filename_lookup);
No need to export symbols. bpf uses it and bpf is in the core, not in modules. > > /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ > static int path_parentat(struct nameidata *nd, unsigned flags, > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index f9a506147c8a..e4adf5e05afd 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -1050,6 +1050,7 @@ extern const struct bpf_func_proto > bpf_get_local_storage_proto; > extern const struct bpf_func_proto bpf_strtol_proto; > extern const struct bpf_func_proto bpf_strtoul_proto; > extern const struct bpf_func_proto bpf_tcp_sock_proto; > +extern const struct bpf_func_proto bpf_get_current_pidns_info_proto; > > /* Shared helpers among cBPF and eBPF. */ > void bpf_user_rnd_init_once(void); > diff --git a/include/linux/namei.h b/include/linux/namei.h > index 9138b4471dbf..2c24e8c71d46 100644 > --- a/include/linux/namei.h > +++ b/include/linux/namei.h > @@ -6,6 +6,7 @@ > #include <linux/path.h> > #include <linux/fcntl.h> > #include <linux/errno.h> > +#include <linux/fs.h> > > enum { MAX_NESTED_LINKS = 8 }; > > @@ -97,6 +98,9 @@ extern void unlock_rename(struct dentry *, struct dentry *); > > extern void nd_jump_link(struct path *path); > > +extern int filename_lookup(int dfd, struct filename *name, unsigned int > flags, > + struct path *path, struct path *root); The previous definition in fs/internal.h should be removed. > + > static inline void nd_terminate_link(void *name, size_t len, size_t maxlen) > { > ((char *) name)[min(len, maxlen)] = '\0'; > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 4393bd4b2419..6f601f7106e2 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -2741,6 +2741,26 @@ union bpf_attr { > * **-EOPNOTSUPP** kernel configuration does not enable SYN cookies > * > * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 > + * > + * int bpf_get_current_pidns_info(struct bpf_pidns_info *pidns, u32 > size_of_pidns) > + * Description > + * Copies into *pidns* pid, namespace id and tgid as seen by the > + * current namespace and also device from /proc/self/ns/pid. > + * *size_of_pidns* must be the size of *pidns* > + * > + * This helper is used when pid filtering is needed inside a > + * container as bpf_get_current_tgid() helper returns always the > + * pid id as seen by the root namespace. > + * Return > + * 0 on success > + * > + * **-EINVAL** if unable to get ns, pid or tgid of current task. > + * Or if size_of_pidns is not valid. Maybe reword by following the code sequence. if *size_of_pidns* is not valid or unable to get ns, pid or tgid of the current task. > + * > + * **-ENOMEM** if allocation fails. Maybe some other error codes in filename_lookup() function? > + * > + * If unable to get the inode from /proc/self/ns/pid an error code > + * will be returned. You do not need this. The description of error code cases should cover this. > */ > #define __BPF_FUNC_MAPPER(FN) \ > FN(unspec), \ > @@ -2853,7 +2873,8 @@ union bpf_attr { > FN(sk_storage_get), \ > FN(sk_storage_delete), \ > FN(send_signal), \ > - FN(tcp_gen_syncookie), > + FN(tcp_gen_syncookie), \ > + FN(get_current_pidns_info), > > /* integer value in 'imm' field of BPF_CALL instruction selects which helper > * function eBPF program intends to call > @@ -3604,4 +3625,10 @@ struct bpf_sockopt { > __s32 retval; > }; > > +struct bpf_pidns_info { > + __u32 dev; > + __u32 nsid; > + __u32 tgid; > + __u32 pid; > +}; > #endif /* _UAPI__LINUX_BPF_H__ */ > diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c > index 8191a7db2777..3159f2a0188c 100644 > --- a/kernel/bpf/core.c > +++ b/kernel/bpf/core.c > @@ -2038,6 +2038,7 @@ const struct bpf_func_proto > bpf_get_current_uid_gid_proto __weak; > const struct bpf_func_proto bpf_get_current_comm_proto __weak; > const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; > const struct bpf_func_proto bpf_get_local_storage_proto __weak; > +const struct bpf_func_proto bpf_get_current_pidns_info __weak; > > const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) > { > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c > index 5e28718928ca..571f24077db2 100644 > --- a/kernel/bpf/helpers.c > +++ b/kernel/bpf/helpers.c > @@ -11,6 +11,12 @@ > #include <linux/uidgid.h> > #include <linux/filter.h> > #include <linux/ctype.h> > +#include <linux/pid_namespace.h> > +#include <linux/major.h> > +#include <linux/stat.h> > +#include <linux/namei.h> > +#include <linux/version.h> > + > > #include "../../lib/kstrtox.h" > > @@ -312,6 +318,78 @@ void copy_map_value_locked(struct bpf_map *map, void > *dst, void *src, > preempt_enable(); > } > > +BPF_CALL_2(bpf_get_current_pidns_info, struct bpf_pidns_info *, pidns_info, > u32, > + size) > +{ > + const char *name = "/proc/self/ns/pid"; maybe rename this variable to pidns_path? > + struct pid_namespace *pidns = NULL; > + struct filename *tmp = NULL; Maybe rename this variable to name? > + int len = strlen(name) + 1; We can delay this assignment later until it is needed. > + struct inode *inode; > + struct path kp; > + pid_t tgid = 0; > + pid_t pid = 0; > + int ret; > + > + if (unlikely(size != sizeof(struct bpf_pidns_info))) > + return -EINVAL; > + > + pidns = task_active_pid_ns(current); > + we can save an empty line here. > + if (unlikely(!pidns)) > + goto clear; > + > + pidns_info->nsid = pidns->ns.inum; > + pid = task_pid_nr_ns(current, pidns); > + We can save an empty line here. > + if (unlikely(!pid)) > + goto clear; > + > + tgid = task_tgid_nr_ns(current, pidns); > + ditto. save an empty line. > + if (unlikely(!tgid)) > + goto clear; > + > + pidns_info->tgid = (u32) tgid; > + pidns_info->pid = (u32) pid; > + > + tmp = kmem_cache_alloc(names_cachep, GFP_ATOMIC); > + if (unlikely(!tmp)) { > + memset((void *)pidns_info, 0, (size_t) size); > + return -ENOMEM; > + } > + > + memcpy((char *)tmp->name, name, len); > + tmp->uptr = NULL; > + tmp->aname = NULL; > + tmp->refcnt = 1; > + ditto. save an empty line. > + ret = filename_lookup(AT_FDCWD, tmp, 0, &kp, NULL); > + ditto. save an empty line. > + if (ret) { > + memset((void *)pidns_info, 0, (size_t) size); > + return ret; > + } > + > + inode = d_backing_inode(kp.dentry); > + pidns_info->dev = inode->i_sb->s_dev; > + > + return 0; > + > +clear: > + memset((void *)pidns_info, 0, (size_t) size); > + save an empty line. > + return -EINVAL; > +} > + > +const struct bpf_func_proto bpf_get_current_pidns_info_proto = { > + .func = bpf_get_current_pidns_info, make the "= " aligned with others? > + .gpl_only = false, > + .ret_type = RET_INTEGER, > + .arg1_type = ARG_PTR_TO_UNINIT_MEM, > + .arg2_type = ARG_CONST_SIZE, > +}; > + > #ifdef CONFIG_CGROUPS > BPF_CALL_0(bpf_get_current_cgroup_id) > { > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c > index ca1255d14576..5e1dc22765a5 100644 > --- a/kernel/trace/bpf_trace.c > +++ b/kernel/trace/bpf_trace.c > @@ -709,6 +709,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct > bpf_prog *prog) > #endif > case BPF_FUNC_send_signal: > return &bpf_send_signal_proto; > + case BPF_FUNC_get_current_pidns_info: > + return &bpf_get_current_pidns_info_proto; > default: > return NULL; > } > diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile > index 1d9be26b4edd..238453ff27d2 100644 > --- a/samples/bpf/Makefile > +++ b/samples/bpf/Makefile > @@ -53,6 +53,7 @@ hostprogs-y += task_fd_query > hostprogs-y += xdp_sample_pkts > hostprogs-y += ibumad > hostprogs-y += hbm > +hostprogs-y += trace_ns_info [...]