path: root/tools/testing/selftests/bpf/bench.h
author    Andrii Nakryiko <andriin@fb.com>  2020-05-12 12:24:43 -0700
committer Alexei Starovoitov <ast@kernel.org>  2020-05-13 12:19:38 -0700
commit    8e7c2a023ac04e04c72cd7b640329511dda92672 (patch)
tree      08c86f56a5d23adc94594beb4aac21a9cd3e960e /tools/testing/selftests/bpf/bench.h
parent    cd49291ce18aeef3f2ec950bc99bd72d5a05fa86 (diff)
selftests/bpf: Add benchmark runner infrastructure
While working on the BPF ringbuf implementation, testing, and benchmarking, I've developed a pretty generic and modular benchmark runner, which seems to be more broadly useful, as I've already used it for one more purpose (testing the fastest way to trigger a BPF program, to minimize the overhead of in-kernel code). This patch adds the generic part of the benchmark runner and sets up the Makefile for extending it with more sets of benchmarks.

The benchmarker itself operates by spinning up the specified number of producer and consumer threads and setting up an interval timer that sends a SIGALRM signal to the application once a second. Every second, a snapshot of the hits/drops counters is collected and stored in an array. Drops are useful for producer/consumer benchmarks in which producers might overwhelm consumers. Once the test finishes, after the given number of warm-up and testing seconds, mean and stddev are calculated (ignoring the warm-up results) and printed to stdout. This setup seems to give consistent and accurate results.

To validate the behavior, I added two atomic counting tests: global and local. For the global one, all producer threads atomically increment the same counter as fast as possible. This, of course, leads to a huge drop in performance once there is more than one producer thread, due to CPUs fighting over the same memory location. Local counting, on the other hand, maintains one counter per producer thread, incremented independently. Once per second, all counters are read and added together to form the final "counting throughput" measurement. As expected, such a setup demonstrates linear scalability with the number of producers (as long as there are enough physical CPU cores, of course). See the example output below. This setup can also nicely demonstrate the disastrous effects of false sharing if care is not taken to separate the per-producer counters onto independent cache lines.

The demo output shows the global counter first with 1 producer, then with 4; both total and per-producer performance drop significantly. The last run is the local counter with 4 producers, demonstrating near-perfect scalability.

$ ./bench -a -w1 -d2 -p1 count-global
Setting up benchmark 'count-global'...
Benchmark 'count-global' started.
Iter 0 ( 24.822us): hits 148.179M/s (148.179M/prod), drops 0.000M/s
Iter 1 ( 37.939us): hits 149.308M/s (149.308M/prod), drops 0.000M/s
Iter 2 (-10.774us): hits 150.717M/s (150.717M/prod), drops 0.000M/s
Iter 3 ( 3.807us): hits 151.435M/s (151.435M/prod), drops 0.000M/s
Summary: hits 150.488 ± 1.079M/s (150.488M/prod), drops 0.000 ± 0.000M/s

$ ./bench -a -w1 -d2 -p4 count-global
Setting up benchmark 'count-global'...
Benchmark 'count-global' started.
Iter 0 ( 60.659us): hits 53.910M/s ( 13.477M/prod), drops 0.000M/s
Iter 1 (-17.658us): hits 53.722M/s ( 13.431M/prod), drops 0.000M/s
Iter 2 ( 5.865us): hits 53.495M/s ( 13.374M/prod), drops 0.000M/s
Iter 3 ( 0.104us): hits 53.606M/s ( 13.402M/prod), drops 0.000M/s
Summary: hits 53.608 ± 0.113M/s ( 13.402M/prod), drops 0.000 ± 0.000M/s

$ ./bench -a -w1 -d2 -p4 count-local
Setting up benchmark 'count-local'...
Benchmark 'count-local' started.
Iter 0 ( 23.388us): hits 640.450M/s (160.113M/prod), drops 0.000M/s
Iter 1 ( 2.291us): hits 605.661M/s (151.415M/prod), drops 0.000M/s
Iter 2 ( -6.415us): hits 607.092M/s (151.773M/prod), drops 0.000M/s
Iter 3 ( -1.361us): hits 601.796M/s (150.449M/prod), drops 0.000M/s
Summary: hits 604.849 ± 2.739M/s (151.212M/prod), drops 0.000 ± 0.000M/s
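As an illustration of the local-counting scheme, and of why the per-producer counters must live on separate cache lines, here is a minimal, hypothetical sketch built on the bench.h interface added in the diff below. The count_local_* names, MAX_PRODUCERS, and local_hits are assumptions made for this example; this is not the benchmark code added by this series.

#include "bench.h"

/* Hypothetical local-counter benchmark sketch, not the code from this series.
 * Each producer increments its own struct counter; bench.h aligns the struct
 * to 128 bytes, so producers never share a cache line and never contend.
 */
#define MAX_PRODUCERS 128

static struct counter local_hits[MAX_PRODUCERS];

static void *count_local_producer(void *ctx)
{
	struct counter *c = &local_hits[(long)ctx]; /* ctx carries the producer index */

	while (true)
		atomic_inc(&c->value); /* relaxed atomic add, runs until the process exits */
	return NULL;
}

static void count_local_measure(struct bench_res *res)
{
	int i;

	res->hits = 0;
	res->drops = 0;
	/* read-and-reset every per-producer counter and sum into the snapshot */
	for (i = 0; i < env.producer_cnt; i++)
		res->hits += atomic_swap(&local_hits[i].value, 0);
}

The same structure with a single shared struct counter would reproduce the count-global contention shown above.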
The benchmark runner supports setting thread affinity for producer and consumer threads. You can use the -a flag for the default CPU selection scheme, where the first consumer gets CPU #0, the next one gets CPU #1, and so on; producer threads then pick up the next CPU and increment one-by-one as well. Alternatively, the user can specify sets of CPUs independently for producers and consumers with --prod-affinity 1,2-10,15 and --cons-affinity <set-of-cpus>. The latter makes it possible to force producers and consumers to share the same set of CPUs, if necessary.

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20200512192445.2351848-3-andriin@fb.com
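The default -a scheme described above could be implemented roughly as sketched here. This is a simplified, hypothetical example: pin_to_cpu() and assign_default_affinity() are made-up names, error handling and the --prod-affinity/--cons-affinity parsing are omitted, and this is not the actual bench.c code from this series.

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>

/* Round-robin CPU assignment: consumers get CPU #0, #1, ...,
 * then producers continue from the next unused CPU.
 */
static void pin_to_cpu(pthread_t thread, int cpu)
{
	cpu_set_t cpuset;

	CPU_ZERO(&cpuset);
	CPU_SET(cpu, &cpuset);
	pthread_setaffinity_np(thread, sizeof(cpuset), &cpuset);
}

static void assign_default_affinity(pthread_t *consumers, int cons_cnt,
				    pthread_t *producers, int prod_cnt)
{
	int i, cpu = 0;

	for (i = 0; i < cons_cnt; i++)
		pin_to_cpu(consumers[i], cpu++);
	for (i = 0; i < prod_cnt; i++)
		pin_to_cpu(producers[i], cpu++);
}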
Diffstat (limited to 'tools/testing/selftests/bpf/bench.h')
-rw-r--r--  tools/testing/selftests/bpf/bench.h  81
1 file changed, 81 insertions, 0 deletions
diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h
new file mode 100644
index 000000000000..c1f48a473b02
--- /dev/null
+++ b/tools/testing/selftests/bpf/bench.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#pragma once
+#include <stdlib.h>
+#include <stdbool.h>
+#include <linux/err.h>
+#include <errno.h>
+#include <unistd.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include <math.h>
+#include <time.h>
+#include <sys/syscall.h>
+
+struct cpu_set {
+ bool *cpus;
+ int cpus_len;
+ int next_cpu;
+};
+
+struct env {
+ char *bench_name;
+ int duration_sec;
+ int warmup_sec;
+ bool verbose;
+ bool list;
+ bool affinity;
+ int consumer_cnt;
+ int producer_cnt;
+ struct cpu_set prod_cpus;
+ struct cpu_set cons_cpus;
+};
+
+struct bench_res {
+ long hits;
+ long drops;
+};
+
+struct bench {
+ const char *name;
+ void (*validate)();
+ void (*setup)();
+ void *(*producer_thread)(void *ctx);
+ void *(*consumer_thread)(void *ctx);
+ void (*measure)(struct bench_res* res);
+ void (*report_progress)(int iter, struct bench_res* res, long delta_ns);
+ void (*report_final)(struct bench_res res[], int res_cnt);
+};
+
+struct counter {
+ long value;
+} __attribute__((aligned(128)));
+
+extern struct env env;
+extern const struct bench *bench;
+
+void setup_libbpf();
+void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns);
+void hits_drops_report_final(struct bench_res res[], int res_cnt);
+
+static inline __u64 get_time_ns() {
+ struct timespec t;
+
+ clock_gettime(CLOCK_MONOTONIC, &t);
+
+ return (u64)t.tv_sec * 1000000000 + t.tv_nsec;
+}
+
+static inline void atomic_inc(long *value)
+{
+ (void)__atomic_add_fetch(value, 1, __ATOMIC_RELAXED);
+}
+
+static inline void atomic_add(long *value, long n)
+{
+ (void)__atomic_add_fetch(value, n, __ATOMIC_RELAXED);
+}
+
+static inline long atomic_swap(long *value, long n)
+{
+ return __atomic_exchange_n(value, n, __ATOMIC_RELAXED);
+}
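To show how the declarations above are meant to be used, here is a hedged sketch of the once-a-second SIGALRM-driven collection and the warm-up-aware summary described in the commit message. The results[] array, sigalarm_handler(), and start_timer() are illustrative names chosen for this example, not the actual bench.c implementation added by this series.

#include <signal.h>
#include <sys/time.h>
#include "bench.h"

/* Hypothetical driver-side sketch, not the code from this series. An interval
 * timer fires SIGALRM once a second; each tick snapshots the benchmark's
 * counters into results[] and prints per-iteration progress.
 */
static struct bench_res results[128];
static int iter;
static __u64 last_time_ns;

static void sigalarm_handler(int sig)
{
	__u64 now = get_time_ns();
	long delta_ns = (long)(now - last_time_ns) - 1000000000; /* deviation from 1s */

	(void)sig;
	last_time_ns = now;
	bench->measure(&results[iter]);                 /* hits/drops snapshot */
	bench->report_progress(iter, &results[iter], delta_ns);
	iter++;
}

static void start_timer(void)
{
	struct itimerval timer = {
		.it_interval = { .tv_sec = 1 },  /* re-arm every second */
		.it_value    = { .tv_sec = 1 },
	};

	signal(SIGALRM, sigalarm_handler);
	last_time_ns = get_time_ns();
	setitimer(ITIMER_REAL, &timer, NULL);
}

/* After env.warmup_sec + env.duration_sec ticks, the warm-up samples are
 * skipped and mean ± stddev is reported, for example via:
 *   bench->report_final(results + env.warmup_sec, iter - env.warmup_sec);
 */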