From: Xu Kuohai <[email protected]>

Add an --rb-overwrite option to benchmark the BPF ring buffer in overwrite
mode. Since libbpf does not yet support consuming from an overwrite-mode
ring buffer, also add an --rb-bench-producer option to benchmark the
producer directly, without a consumer.

Benchmarks on an x86_64 and an arm64 CPU are shown below for reference.

- AMD EPYC 9654 (x86_64)

Ringbuf, multi-producer contention in overwrite mode, no consumer
=================================================================
rb-prod nr_prod 1    32.180 ± 0.033M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 2    9.617 ± 0.003M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 3    8.810 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 4    9.272 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 8    9.173 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 12   3.086 ± 0.032M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 16   2.945 ± 0.021M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 20   2.519 ± 0.021M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 24   2.545 ± 0.021M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 28   2.363 ± 0.024M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 32   2.357 ± 0.021M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 36   2.267 ± 0.011M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 40   2.284 ± 0.020M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 44   2.215 ± 0.025M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 48   2.193 ± 0.023M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 52   2.208 ± 0.024M/s (drops 0.000 ± 0.000M/s)

- HiSilicon Kunpeng 920 (arm64)

Ringbuf, multi-producer contention in overwrite mode, no consumer
=================================================================
rb-prod nr_prod 1    14.478 ± 0.006M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 2    21.787 ± 0.010M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 3    6.045 ± 0.001M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 4    5.352 ± 0.003M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 8    4.850 ± 0.002M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 12   3.542 ± 0.016M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 16   3.509 ± 0.021M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 20   3.171 ± 0.010M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 24   3.154 ± 0.014M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 28   2.974 ± 0.015M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 32   3.167 ± 0.014M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 36   2.903 ± 0.010M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 40   2.866 ± 0.010M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 44   2.914 ± 0.010M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 48   2.806 ± 0.012M/s (drops 0.000 ± 0.000M/s)
rb-prod nr_prod 52   2.840 ± 0.012M/s (drops 0.000 ± 0.000M/s)

Signed-off-by: Xu Kuohai <[email protected]>
---
 .../selftests/bpf/benchs/bench_ringbufs.c     | 66 +++++++++++++++++--
 .../bpf/benchs/run_bench_ringbufs.sh          |  4 ++
 .../selftests/bpf/progs/ringbuf_bench.c       | 11 ++++
 3 files changed, 75 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/bpf/benchs/bench_ringbufs.c 
b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c
index e1ee979e6acc..212859fb2961 100644
--- a/tools/testing/selftests/bpf/benchs/bench_ringbufs.c
+++ b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c
@@ -19,6 +19,8 @@ static struct {
        int ringbuf_sz; /* per-ringbuf, in bytes */
        bool ringbuf_use_output; /* use slower output API */
        int perfbuf_sz; /* per-CPU size, in pages */
+       bool overwrite;
+       bool bench_producer;
 } args = {
        .back2back = false,
        .batch_cnt = 500,
@@ -27,6 +29,8 @@ static struct {
        .ringbuf_sz = 512 * 1024,
        .ringbuf_use_output = false,
        .perfbuf_sz = 128,
+       .overwrite = false,
+       .bench_producer = false,
 };
 
 enum {
@@ -35,6 +39,8 @@ enum {
        ARG_RB_BATCH_CNT = 2002,
        ARG_RB_SAMPLED = 2003,
        ARG_RB_SAMPLE_RATE = 2004,
+       ARG_RB_OVERWRITE = 2005,
+       ARG_RB_BENCH_PRODUCER = 2006,
 };
 
 static const struct argp_option opts[] = {
@@ -43,6 +49,8 @@ static const struct argp_option opts[] = {
        { "rb-batch-cnt", ARG_RB_BATCH_CNT, "CNT", 0, "Set BPF-side record 
batch count"},
        { "rb-sampled", ARG_RB_SAMPLED, NULL, 0, "Notification sampling"},
        { "rb-sample-rate", ARG_RB_SAMPLE_RATE, "RATE", 0, "Notification sample 
rate"},
+       { "rb-overwrite", ARG_RB_OVERWRITE, NULL, 0, "Overwrite mode"},
+       { "rb-bench-producer", ARG_RB_BENCH_PRODUCER, NULL, 0, "Benchmark 
producer"},
        {},
 };
 
@@ -72,6 +80,12 @@ static error_t parse_arg(int key, char *arg, struct 
argp_state *state)
                        argp_usage(state);
                }
                break;
+       case ARG_RB_OVERWRITE:
+               args.overwrite = true;
+               break;
+       case ARG_RB_BENCH_PRODUCER:
+               args.bench_producer = true;
+               break;
        default:
                return ARGP_ERR_UNKNOWN;
        }
@@ -95,8 +109,33 @@ static inline void bufs_trigger_batch(void)
 
 static void bufs_validate(void)
 {
-       if (env.consumer_cnt != 1) {
-               fprintf(stderr, "rb-libbpf benchmark needs one consumer!\n");
+       if (args.bench_producer && strcmp(env.bench_name, "rb-libbpf")) {
+               fprintf(stderr, "--rb-bench-producer only works with 
rb-libbpf!\n");
+               exit(1);
+       }
+
+       if (args.overwrite && !args.bench_producer) {
+               fprintf(stderr, "overwrite mode only works with 
--rb-bench-producer for now!\n");
+               exit(1);
+       }
+
+       if (args.bench_producer && env.consumer_cnt != 0) {
+               fprintf(stderr, "no consumer is needed for 
--rb-bench-producer!\n");
+               exit(1);
+       }
+
+       if (args.bench_producer && args.back2back) {
+               fprintf(stderr, "back-to-back mode makes no sense for 
--rb-bench-producer!\n");
+               exit(1);
+       }
+
+       if (args.bench_producer && args.sampled) {
+               fprintf(stderr, "sampling mode makes no sense for 
--rb-bench-producer!\n");
+               exit(1);
+       }
+
+       if (!args.bench_producer && env.consumer_cnt != 1) {
+               fprintf(stderr, "benchmarks without --rb-bench-producer require 
exactly one consumer!\n");
                exit(1);
        }
 
@@ -128,12 +167,17 @@ static void ringbuf_libbpf_measure(struct bench_res *res)
 {
        struct ringbuf_libbpf_ctx *ctx = &ringbuf_libbpf_ctx;
 
-       res->hits = atomic_swap(&buf_hits.value, 0);
+       if (args.bench_producer)
+               res->hits = atomic_swap(&ctx->skel->bss->hits, 0);
+       else
+               res->hits = atomic_swap(&buf_hits.value, 0);
        res->drops = atomic_swap(&ctx->skel->bss->dropped, 0);
 }
 
 static struct ringbuf_bench *ringbuf_setup_skeleton(void)
 {
+       __u32 flags;
+       struct bpf_map *ringbuf;
        struct ringbuf_bench *skel;
 
        setup_libbpf();
@@ -146,12 +190,19 @@ static struct ringbuf_bench *ringbuf_setup_skeleton(void)
 
        skel->rodata->batch_cnt = args.batch_cnt;
        skel->rodata->use_output = args.ringbuf_use_output ? 1 : 0;
+       skel->rodata->bench_producer = args.bench_producer;
 
        if (args.sampled)
                /* record data + header take 16 bytes */
                skel->rodata->wakeup_data_size = args.sample_rate * 16;
 
-       bpf_map__set_max_entries(skel->maps.ringbuf, args.ringbuf_sz);
+       ringbuf = skel->maps.ringbuf;
+       if (args.overwrite) {
+               flags = bpf_map__map_flags(ringbuf) | BPF_F_RB_OVERWRITE;
+               bpf_map__set_map_flags(ringbuf, flags);
+       }
+
+       bpf_map__set_max_entries(ringbuf, args.ringbuf_sz);
 
        if (ringbuf_bench__load(skel)) {
                fprintf(stderr, "failed to load skeleton\n");
@@ -171,10 +222,13 @@ static void ringbuf_libbpf_setup(void)
 {
        struct ringbuf_libbpf_ctx *ctx = &ringbuf_libbpf_ctx;
        struct bpf_link *link;
+       int map_fd;
 
        ctx->skel = ringbuf_setup_skeleton();
-       ctx->ringbuf = ring_buffer__new(bpf_map__fd(ctx->skel->maps.ringbuf),
-                                       buf_process_sample, NULL, NULL);
+
+       map_fd = bpf_map__fd(ctx->skel->maps.ringbuf);
+       ctx->ringbuf = ring_buffer__new(map_fd, buf_process_sample,
+                       NULL, NULL);
        if (!ctx->ringbuf) {
                fprintf(stderr, "failed to create ringbuf\n");
                exit(1);
diff --git a/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh 
b/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh
index 91e3567962ff..83e05e837871 100755
--- a/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh
+++ b/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh
@@ -49,3 +49,7 @@ for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do
        summarize "rb-libbpf nr_prod $b" "$($RUN_RB_BENCH -p$b --rb-batch-cnt 
50 rb-libbpf)"
 done
 
+header "Ringbuf, multi-producer contention in overwrite mode, no consumer"
+for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do
+       summarize "rb-prod nr_prod $b" "$($RUN_BENCH -p$b --rb-batch-cnt 50 
--rb-overwrite --rb-bench-producer rb-libbpf)"
+done
diff --git a/tools/testing/selftests/bpf/progs/ringbuf_bench.c 
b/tools/testing/selftests/bpf/progs/ringbuf_bench.c
index 6a468496f539..d96c7d1e8fc2 100644
--- a/tools/testing/selftests/bpf/progs/ringbuf_bench.c
+++ b/tools/testing/selftests/bpf/progs/ringbuf_bench.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 // Copyright (c) 2020 Facebook
 
+#include <stdbool.h>
 #include <linux/bpf.h>
 #include <stdint.h>
 #include <bpf/bpf_helpers.h>
@@ -14,9 +15,11 @@ struct {
 
 const volatile int batch_cnt = 0;
 const volatile long use_output = 0;
+const volatile bool bench_producer = false;
 
 long sample_val = 42;
 long dropped __attribute__((aligned(128))) = 0;
+long hits __attribute__((aligned(128))) = 0;
 
 const volatile long wakeup_data_size = 0;
 
@@ -24,6 +27,9 @@ static __always_inline long get_flags()
 {
        long sz;
 
+       if (bench_producer)
+               return BPF_RB_NO_WAKEUP;
+
        if (!wakeup_data_size)
                return 0;
 
@@ -47,6 +53,8 @@ int bench_ringbuf(void *ctx)
                                *sample = sample_val;
                                flags = get_flags();
                                bpf_ringbuf_submit(sample, flags);
+                               if (bench_producer)
+                                       __sync_add_and_fetch(&hits, 1);
                        }
                }
        } else {
@@ -55,6 +63,9 @@ int bench_ringbuf(void *ctx)
                        if (bpf_ringbuf_output(&ringbuf, &sample_val,
                                               sizeof(sample_val), flags))
                                __sync_add_and_fetch(&dropped, 1);
+                       else if (bench_producer)
+                               __sync_add_and_fetch(&hits, 1);
+
                }
        }
        return 0;
-- 
2.43.0


Reply via email to