add per cpu map op bench

eunomia-bpf · Aug 16, 2024 · bfc0447 · bfc0447
1 parent b9d3c2b
commit bfc0447
Show file tree

Hide file tree

Showing 10 changed files with 228 additions and 256 deletions.
diff --git a/benchmark/README.md b/benchmark/README.md
@@ -2,17 +2,27 @@
 
 With userspace eBPF runntime, we can:
 
-- speed up the uprobe and uretprobe by approximate 10x
-- with out any kernel patch or modify the tracing eBPF program
+- Speed up the uprobe and uretprobe by approximate `10x`
+- The userspace read and write user memory is approximate `10x` faster than kernel (~5ns vs ~50ns)
+- With out any kernel patch or modify the tracing eBPF program
 - No privilege is needed for running the eBPF tracing program.
 
+Probes:
+
 | Probe/Tracepoint Types | Kernel (ns)  | Userspace (ns) | Insn Count |
 |------------------------|-------------:|---------------:|---------------:|
 | Uprobe                 | 3224.172760  | 314.569110     | 4    |
 | Uretprobe              | 3996.799580  | 381.270270     | 2    |
 | Syscall Tracepoint     | 151.82801    | 232.57691      | 4    |
 | Embedding runtime      | Not avaliable |  110.008430   | 4    |
 
+Read and write user memory:
+
+| Probe/Tracepoint Types  | Kernel (ns)     | Userspace (ns) |
+|-------------------------|----------------:|---------------:|
+| bpf_probe_read - uprobe  | 46.820830       | 2.200530       |
+| bpf_probe_write_user - uprobe | 45.004100  | 8.101980       |
+
 ## Suggest build configuration
 
 ```sh
@@ -41,13 +51,6 @@ cd benchmark
 python3 run_benchmark.py
 ```
 
-## test environment
-
-```console
-$ uname -a
-Linux yunwei37server 6.2.0-32-generic #32-Ubuntu SMP PREEMPT_DYNAMIC Mon Aug 14 10:03:50 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux
-```
-
 ## base line
 
 ```sh
@@ -150,7 +153,7 @@ You can use python script to run the benchmark:
 python3 benchmark/tools/driving.py
 ```
 
-## Test syscall trace and untrace
+## Test syscall trace and untrace with syscount
 
 run the test:
 
@@ -175,9 +178,24 @@ Average read() time over 10 runs: 531 ns
 Average sendmsg() time over 10 runs: 3681 ns
 ```
 
+## Results for uprobe, uretprobe, and syscall tracepoint
+
+| Probe/Tracepoint Types | Kernel (ns)  | Userspace (ns) | Insn Count |
+|------------------------|-------------:|---------------:|---------------:|
+| Uprobe                 | 3224.172760  | 314.569110     | 4    |
+| Uretprobe              | 3996.799580  | 381.270270     | 2    |
+| Syscall Tracepoint     | 151.82801    | 232.57691      | 4    |
+| Embedding runtime      | Not avaliable |  110.008430   | 4    |
+
+Tested on `6.2.0-32-generic` kernel and `Intel(R) Core(TM) i7-11800H CPU @ 2.30GHz`.
+
 ## Results on another machine
 
-kernel:
+Tested on `kernel version 6.2` and `Intel(R) Xeon(R) Gold 5418Y` CPU.
+
+### Uprobe and read/write with `bpf_probe_write_user` and `bpf_probe_read_user`
+
+Userspace:
 
 ```txt
 Benchmarking __bench_uprobe_uretprobe in thread 1
@@ -214,3 +232,121 @@ Average time usage 383.135720 ns, iter 100000 times
 Benchmarking __bench_write in thread 1
 Average time usage 389.037170 ns, iter 100000 times
 ```
+
+### maps operations
+
+Run the map op 1000 times in one function. Userspace map op is also faster than the kernel in the current version. Current version is 10x faster than stupid old version.
+
+```c
+SEC("uprobe/benchmark/test:__bench_hash_map_lookup")
+int test_lookup(struct pt_regs *ctx)
+{
+    for (int i = 0; i < 1000; i++) {
+        u32 key = i;
+        u64 value = i;
+        bpf_map_lookup_elem(&test_hash_map, &key);
+    }
+    return 0;
+}
+```
+
+Kernel map op cost:
+
+```txt
+
+Benchmarking __bench_hash_map_update in thread 1
+Average time usage 64738.264680 ns, iter 100000 times
+
+Benchmarking __bench_hash_map_lookup in thread 1
+Average time usage 17805.898280 ns, iter 100000 times
+
+Benchmarking __bench_hash_map_delete in thread 1
+Average time usage 21795.665340 ns, iter 100000 times
+
+Benchmarking __bench_array_map_update in thread 1
+Average time usage 11449.295960 ns, iter 100000 times
+
+Benchmarking __bench_array_map_lookup in thread 1
+Average time usage 2093.886500 ns, iter 100000 times
+
+Benchmarking __bench_array_map_delete in thread 1
+Average time usage 2126.820310 ns, iter 100000 times
+
+Benchmarking __bench_per_cpu_hash_map_update in thread 1
+Average time usage 35050.915650 ns, iter 100000 times
+
+Benchmarking __bench_per_cpu_hash_map_lookup in thread 1
+Average time usage 15999.969590 ns, iter 100000 times
+
+Benchmarking __bench_per_cpu_hash_map_delete in thread 1
+Average time usage 21664.294940 ns, iter 100000 times
+
+Benchmarking __bench_per_cpu_array_map_update in thread 1
+Average time usage 10886.969860 ns, iter 100000 times
+
+Benchmarking __bench_per_cpu_array_map_lookup in thread 1
+Average time usage 2749.468760 ns, iter 100000 times
+
+Benchmarking __bench_per_cpu_array_map_delete in thread 1
+Average time usage 2778.679460 ns, iter 100000 times
+```
+
+Userspace map op cost:
+
+```txt
+Benchmarking __bench_hash_map_update in thread 1
+Average time usage 30676.986820 ns, iter 100000 times
+
+Benchmarking __bench_hash_map_lookup in thread 1
+Average time usage 23486.304570 ns, iter 100000 times
+
+Benchmarking __bench_hash_map_delete in thread 1
+Average time usage 13435.901160 ns, iter 100000 times
+
+Benchmarking __bench_array_map_update in thread 1
+Average time usage 7081.922160 ns, iter 100000 times
+
+Benchmarking __bench_array_map_lookup in thread 1
+Average time usage 4685.450360 ns, iter 100000 times
+
+Benchmarking __bench_array_map_delete in thread 1
+Average time usage 6367.443010 ns, iter 100000 times
+
+Benchmarking __bench_per_cpu_hash_map_update in thread 1
+Average time usage 95918.602090 ns, iter 100000 times
+
+Benchmarking __bench_per_cpu_hash_map_lookup in thread 1
+Average time usage 63294.791110 ns, iter 100000 times
+
+Benchmarking __bench_per_cpu_hash_map_delete in thread 1
+Average time usage 460207.364100 ns, iter 100000 times
+
+Benchmarking __bench_per_cpu_array_map_update in thread 1
+Average time usage 26109.863360 ns, iter 100000 times
+
+Benchmarking __bench_per_cpu_array_map_lookup in thread 1
+Average time usage 9139.355980 ns, iter 100000 times
+
+Benchmarking __bench_per_cpu_array_map_delete in thread 1
+Average time usage 5203.339320 ns, iter 100000 times
+```
+
+The benchmark without inline the map op function:
+
+| Map Operation                      | Kernel (op - uprobe) (ns) | Userspace (op - uprobe) (ns) |
+|------------------------------------|--------------------------:|-----------------------------:|
+| __bench_hash_map_update            | 62827.533320              | 30296.051630                 |
+| __bench_hash_map_lookup            | 15895.166920              | 23005.369380                 |
+| __bench_hash_map_delete            | 19884.933980              | 13054.965970                 |
+| __bench_array_map_update           | 9538.564600               | 6701.987970                  |
+| __bench_array_map_lookup           |  183.155140               | 4305.515170                  |
+| __bench_array_map_delete           |  216.088950               | 5987.507820                  |
+| __bench_per_cpu_hash_map_update    | 33140.184290              | 95537.666900                 |
+| __bench_per_cpu_hash_map_lookup    | 14089.238230              | 62913.855920                 |
+| __bench_per_cpu_hash_map_delete    | 19753.563580              | 459826.428910                |
+| __bench_per_cpu_array_map_update   |  8885.238500              | 25728.928170                 |
+| __bench_per_cpu_array_map_lookup   |  1838.737400              | 8759.420790                  |
+| __bench_per_cpu_array_map_delete   |  1867.948100              | 4802.404130                  |
+
+- Some overhead can be reduced by inlining the map op function.
+- We need to fix the performance issue of the per-cpu map in the userspace runtime.
diff --git a/benchmark/test.c b/benchmark/test.c
@@ -4,54 +4,29 @@
 #include <stdint.h>
 #include <pthread.h>
 
-
-__attribute_noinline__ uint64_t __bench_map_lookup(char *a, int b,
-							   uint64_t c)
-{
-	return a[b] + c;
-}
-
-__attribute_noinline__ uint64_t __bench_map_delete(char *a, int b,
-							   uint64_t c)
-{
-	return a[b] + c;
-}
-
-__attribute_noinline__ uint64_t __bench_map_update(char *a, int b,
-							   uint64_t c)
-{
-	return a[b] + c;
-}
-
-
-__attribute_noinline__ uint64_t __bench_read(char *a, int b,
-							   uint64_t c)
-{
-	return a[b] + c;
-}
-
-__attribute_noinline__ uint64_t __bench_write(char *a, int b,
-							   uint64_t c)
-{
-	return a[b] + c;
-}
-__attribute_noinline__ uint64_t __bench_uprobe(char *a, int b,
-							   uint64_t c)
-{
-	return a[b] + c;
+#define BENCH_FUNC(name) \
+__attribute_noinline__ uint64_t name(char *a, int b, uint64_t c) \
+{ \
+    return a[b] + c; \
 }
 
-__attribute_noinline__ uint64_t __bench_uretprobe(char *a, int b,
-							   uint64_t c)
-{
-	return a[b] + c;
-}
-
-__attribute_noinline__ uint64_t __bench_uprobe_uretprobe(char *a, int b,
-							   uint64_t c)
-{
-	return a[b] + c;
-}
+BENCH_FUNC(__bench_array_map_lookup)
+BENCH_FUNC(__bench_array_map_delete)
+BENCH_FUNC(__bench_array_map_update)
+BENCH_FUNC(__bench_hash_map_lookup)
+BENCH_FUNC(__bench_hash_map_delete)
+BENCH_FUNC(__bench_hash_map_update)
+BENCH_FUNC(__bench_per_cpu_hash_map_lookup)
+BENCH_FUNC(__bench_per_cpu_hash_map_delete)
+BENCH_FUNC(__bench_per_cpu_hash_map_update)
+BENCH_FUNC(__bench_per_cpu_array_map_lookup)
+BENCH_FUNC(__bench_per_cpu_array_map_delete)
+BENCH_FUNC(__bench_per_cpu_array_map_update)
+BENCH_FUNC(__bench_read)
+BENCH_FUNC(__bench_write)
+BENCH_FUNC(__bench_uprobe)
+BENCH_FUNC(__bench_uretprobe)
+BENCH_FUNC(__bench_uprobe_uretprobe)
 
 typedef uint64_t (*benchmark_test_function_t)(char *, int, uint64_t);
 
@@ -104,7 +79,7 @@ void do_benchmark_userspace(benchmark_test_function_t func, const char *name,
 
 #define do_benchmark_func(func, iter, id)                                      \
 	do {                                                                   \
-		do_benchmark_userspace(func, #func, iter, id);                        \
+		do_benchmark_userspace(func, #func, iter, id);                 \
 	} while (0)
 
 int iter = 100 * 1000;
@@ -118,9 +93,18 @@ void *run_bench_functions(void *id_ptr)
 	do_benchmark_func(__bench_uprobe, iter, id);
 	do_benchmark_func(__bench_read, iter, id);
 	do_benchmark_func(__bench_write, iter, id);
-	do_benchmark_func(__bench_map_update, iter, id);
-	do_benchmark_func(__bench_map_lookup, iter, id);
-	do_benchmark_func(__bench_map_delete, iter, id);
+	do_benchmark_func(__bench_hash_map_update, iter, id);
+	do_benchmark_func(__bench_hash_map_lookup, iter, id);
+	do_benchmark_func(__bench_hash_map_delete, iter, id);
+	do_benchmark_func(__bench_array_map_update, iter, id);
+	do_benchmark_func(__bench_array_map_lookup, iter, id);
+	do_benchmark_func(__bench_array_map_delete, iter, id);
+	do_benchmark_func(__bench_per_cpu_hash_map_update, iter, id);
+	do_benchmark_func(__bench_per_cpu_hash_map_lookup, iter, id);
+	do_benchmark_func(__bench_per_cpu_hash_map_delete, iter, id);
+	do_benchmark_func(__bench_per_cpu_array_map_update, iter, id);
+	do_benchmark_func(__bench_per_cpu_array_map_lookup, iter, id);
+	do_benchmark_func(__bench_per_cpu_array_map_delete, iter, id);
 	return NULL;
 }
 

diff --git a/benchmark/tools/.gitignore b/benchmark/tools/.gitignore
diff --git a/benchmark/tools/Makefile b/benchmark/tools/Makefile