Use a proper CPU barrier instead of just a compiler barrier when fetching the ring's data_head in bpf_perf_event_read_simple(); a compiler barrier alone is not correct there. Also, add two small helpers, bpf_perf_read_head() and bpf_perf_write_tail(), to make the barriers used more obvious, together with a comment explaining what they pair with.
Fixes: d0cabbb021be ("tools: bpf: move the event reading loop to libbpf") Fixes: 39111695b1b8 ("samples: bpf: add bpf_perf_event_output example") Signed-off-by: Daniel Borkmann <dan...@iogearbox.net> --- tools/lib/bpf/libbpf.c | 52 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 176cf55..1ac8856 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -20,6 +20,7 @@ #include <fcntl.h> #include <errno.h> #include <asm/unistd.h> +#include <asm/barrier.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/bpf.h> @@ -27,6 +28,7 @@ #include <linux/list.h> #include <linux/limits.h> #include <linux/perf_event.h> +#include <linux/compiler.h> #include <sys/stat.h> #include <sys/types.h> #include <sys/vfs.h> @@ -2404,18 +2406,58 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, return 0; } +/* + * Comment from kernel/events/ring_buffer.c: + * + * Since the mmap() consumer (userspace) can run on a different CPU: + * + * kernel user + * + * if (LOAD ->data_tail) { LOAD ->data_head + * (A) smp_rmb() (C) + * STORE $data LOAD $data + * smp_wmb() (B) smp_mb() (D) + * STORE ->data_head STORE ->data_tail + * } + * + * Where A pairs with D, and B pairs with C. + * + * In our case (A) is a control dependency that separates the load of + * the ->data_tail and the stores of $data. In case ->data_tail + * indicates there is no room in the buffer to store $data we do not. + * + * D needs to be a full barrier since it separates the data READ + * from the tail WRITE. + * + * For B a WMB is sufficient since it separates two WRITEs, and for C + * an RMB is sufficient since it separates two READs. 
+ */ +static __u64 bpf_perf_read_head(struct perf_event_mmap_page *header) +{ + __u64 data_head = READ_ONCE(header->data_head); + + rmb(); + return data_head; +} + +static void bpf_perf_write_tail(struct perf_event_mmap_page *header, + __u64 data_tail) +{ + mb(); + header->data_tail = data_tail; +} + enum bpf_perf_event_ret bpf_perf_event_read_simple(void *mem, unsigned long size, unsigned long page_size, void **buf, size_t *buf_len, bpf_perf_event_print_t fn, void *priv) { - volatile struct perf_event_mmap_page *header = mem; + struct perf_event_mmap_page *header = mem; + __u64 data_head = bpf_perf_read_head(header); __u64 data_tail = header->data_tail; - __u64 data_head = header->data_head; int ret = LIBBPF_PERF_EVENT_ERROR; void *base, *begin, *end; - asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */ if (data_head == data_tail) return LIBBPF_PERF_EVENT_CONT; @@ -2458,8 +2500,6 @@ bpf_perf_event_read_simple(void *mem, unsigned long size, data_tail += ehdr->size; } - __sync_synchronize(); /* smp_mb() */ - header->data_tail = data_tail; - + bpf_perf_write_tail(header, data_tail); return ret; } -- 2.9.5