From 5cc74e62843083754714328f041ff2a0c12ea018 Mon Sep 17 00:00:00 2001
From: yunwei37 <1067852565@qq.com>
Date: Thu, 15 Aug 2024 22:37:42 -0700
Subject: [PATCH 1/6] add read write result

---
 benchmark/README.md                | 42 +++++++++++++++++++++++++++---
 benchmark/hash_map/README.md       | 12 ++++-----
 benchmark/hash_map/uprobe.bpf.c    |  6 ++---
 benchmark/run_benchmark.py         |  4 +--
 benchmark/test.c                   | 33 ++++++++++++++++-------
 benchmark/test_embed.c             |  4 +--
 benchmark/uprobe/uprobe-override.c |  2 +-
 benchmark/uprobe/uprobe.bpf.c      | 30 ++++++++++++++++-----
 benchmark/uprobe/uprobe.c          |  6 ++---
 9 files changed, 101 insertions(+), 38 deletions(-)

diff --git a/benchmark/README.md b/benchmark/README.md
index 3d71c865..93d32b0a 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -222,8 +222,42 @@ You can use python script to run the benchmark:
 python3 benchmark/tools/driving.py
 ```
 
-## Benchmark runner
+## Results on another machine
 
-### Usage
-- `make -C ./benchmark`
-- `python3 ./benchmark/run_benchmark.py`
+kernel:
+
+```txt
+Benchmarking __bench_uprobe_uretprobe in thread 1
+Average time usage 3060.196770 ns, iter 100000 times
+
+Benchmarking __bench_uretprobe in thread 1
+Average time usage 2958.493390 ns, iter 100000 times
+
+Benchmarking __bench_uprobe in thread 1
+Average time usage 1910.731360 ns, iter 100000 times
+
+Benchmarking __bench_read in thread 1
+Average time usage 1957.552190 ns, iter 100000 times
+
+Benchmarking __bench_write in thread 1
+Average time usage 1955.735460 ns, iter 100000 times
+```
+
+Userspace:
+
+```txt
+Benchmarking __bench_uprobe_uretprobe in thread 1
+Average time usage 412.607790 ns, iter 100000 times
+
+Benchmarking __bench_uretprobe in thread 1
+Average time usage 389.096230 ns, iter 100000 times
+
+Benchmarking __bench_uprobe in thread 1
+Average time usage 387.022160 ns, iter 100000 times
+
+Benchmarking __bench_read in thread 1
+Average time usage 415.350530 ns, iter 100000 times
+
+Benchmarking __bench_write in thread 1
+Average time usage 414.350230 ns, iter 100000 times
+```
diff --git a/benchmark/hash_map/README.md b/benchmark/hash_map/README.md
index c0b80d5a..092e26c9 100644
--- a/benchmark/hash_map/README.md
+++ b/benchmark/hash_map/README.md
@@ -1,8 +1,8 @@
 # benchmark of hash maps
 
-- __benchmark_test_function1: hashmap bpf_map_lookup_elem
-- __benchmark_test_function2: hashmap bpf_map_delete_elem
-- __benchmark_test_function3: hashmap bpf_map_update_elem
+- __bench_uprobe_uretprobe: hashmap bpf_map_lookup_elem
+- __bench_uretprobe: hashmap bpf_map_delete_elem
+- __bench_probe: hashmap bpf_map_update_elem
 
 run the uprobe:
 
@@ -23,17 +23,17 @@ in another terminal, run the benchmark:
 ```console
 $ LD_PRELOAD=build/runtime/agent/libbpftime-agent.so benchmark/test
 
-Benchmarking __benchmark_test_function1
+Benchmarking __bench_uprobe_uretprobe
 a[b] + c for 100000 times
 Elapsed time: 0.038217773 seconds
 Average time usage 382.177730 ns
 
-Benchmarking __benchmark_test_function2
+Benchmarking __bench_uretprobe
 a[b] + c for 100000 times
 Elapsed time: 0.020004455 seconds
 Average time usage 200.044550 ns
 
-Benchmarking __benchmark_test_function3
+Benchmarking __bench_probe
 a[b] + c for 100000 times
 Elapsed time: 0.047916014 seconds
 Average time usage 479.160140 ns
diff --git a/benchmark/hash_map/uprobe.bpf.c b/benchmark/hash_map/uprobe.bpf.c
index ca9bc011..8cbc53be 100644
--- a/benchmark/hash_map/uprobe.bpf.c
+++ b/benchmark/hash_map/uprobe.bpf.c
@@ -10,7 +10,7 @@ struct {
 	__type(value, u64);
 } libc_malloc_calls_total SEC(".maps");
 
-SEC("uprobe/benchmark/test:__benchmark_test_function3")
+SEC("uprobe/benchmark/test:__bench_probe")
 int test_update(struct pt_regs *ctx)
 {
 	u32 key = 0;
@@ -20,7 +20,7 @@ int test_update(struct pt_regs *ctx)
 	return 0;
 }
 
-SEC("uprobe/benchmark/test:__benchmark_test_function2")
+SEC("uprobe/benchmark/test:__bench_uretprobe")
 int test_delete(struct pt_regs *ctx)
 {
 	u32 key = 0;
@@ -30,7 +30,7 @@ int test_delete(struct pt_regs *ctx)
 	return 0;
 }
 
-SEC("uprobe/benchmark/test:__benchmark_test_function1")
+SEC("uprobe/benchmark/test:__bench_uprobe_uretprobe")
 int test_lookup(struct pt_regs *ctx)
 {
 	u32 key = 0;
diff --git a/benchmark/run_benchmark.py b/benchmark/run_benchmark.py
index 5f1b964c..a2a78bea 100644
--- a/benchmark/run_benchmark.py
+++ b/benchmark/run_benchmark.py
@@ -84,7 +84,7 @@ async def run_userspace_uprobe_test():
             server.stdout,
             should_exit,
             "SERVER",
-            (server_start_cb, "__benchmark_test_function3 is for uprobe only"),
+            (server_start_cb, "__bench_probe is for uprobe only"),
         )
     )
     await server_start_cb.wait()
@@ -131,7 +131,7 @@ async def run_kernel_uprobe_test():
             server.stdout,
             should_exit,
             "SERVER",
-            (server_start_cb, "__benchmark_test_function3 is for uprobe only"),
+            (server_start_cb, "__bench_probe is for uprobe only"),
         )
     )
     await server_start_cb.wait()
diff --git a/benchmark/test.c b/benchmark/test.c
index ae654c58..81bd5d77 100644
--- a/benchmark/test.c
+++ b/benchmark/test.c
@@ -4,27 +4,37 @@
 #include <stdint.h>
 #include <pthread.h>
 
-__attribute_noinline__ uint64_t __benchmark_test_function3(const char *a, int b,
+__attribute_noinline__ uint64_t __bench_read(char *a, int b,
 							   uint64_t c)
 {
 	return a[b] + c;
 }
 
-__attribute_noinline__ uint64_t __benchmark_test_function2(const char *a, int b,
+__attribute_noinline__ uint64_t __bench_write(char *a, int b,
 							   uint64_t c)
 {
-	static int i = 0;
-	__sync_fetch_and_add(&i, 1);
 	return a[b] + c;
 }
 
-__attribute_noinline__ uint64_t __benchmark_test_function1(const char *a, int b,
+__attribute_noinline__ uint64_t __bench_uprobe(char *a, int b,
 							   uint64_t c)
 {
 	return a[b] + c;
 }
 
-typedef uint64_t (*benchmark_test_function_t)(const char *, int, uint64_t);
+__attribute_noinline__ uint64_t __bench_uretprobe(char *a, int b,
+							   uint64_t c)
+{
+	return a[b] + c;
+}
+
+__attribute_noinline__ uint64_t __bench_uprobe_uretprobe(char *a, int b,
+							   uint64_t c)
+{
+	return a[b] + c;
+}
+
+typedef uint64_t (*benchmark_test_function_t)(char *, int, uint64_t);
 
 void start_timer(struct timespec *start_time)
 {
@@ -53,9 +63,10 @@ static double get_function_time(benchmark_test_function_t func, int iter)
 	// The timespec struct holds seconds and nanoseconds
 	struct timespec start_time, end_time;
 	start_timer(&start_time);
+	char buffer[20] = "hello world";
 	// test base line
 	for (int i = 0; i < iter; i++) {
-		func("hello", i % 4, i);
+		func(buffer, i % 4, i);
 	}
 	end_timer(&end_time);
 	double time = get_elapsed_time(start_time, end_time);
@@ -83,9 +94,11 @@ void *run_bench_functions(void *id_ptr)
 {
 	int id = *(int *)id_ptr;
 	printf("id: %d\n", id);
-	do_benchmark_func(__benchmark_test_function1, iter, id);
-	do_benchmark_func(__benchmark_test_function2, iter, id);
-	do_benchmark_func(__benchmark_test_function3, iter, id);
+	do_benchmark_func(__bench_uprobe_uretprobe, iter, id);
+	do_benchmark_func(__bench_uretprobe, iter, id);
+	do_benchmark_func(__bench_uprobe, iter, id);
+	do_benchmark_func(__bench_read, iter, id);
+	do_benchmark_func(__bench_write, iter, id);
 	return NULL;
 }
 
diff --git a/benchmark/test_embed.c b/benchmark/test_embed.c
index 4fc90266..ee595972 100644
--- a/benchmark/test_embed.c
+++ b/benchmark/test_embed.c
@@ -76,7 +76,7 @@ void end_timer()
 	clock_gettime(CLOCK_MONOTONIC_RAW, &end_time);
 }
 
-__attribute_noinline__ uint64_t __benchmark_test_function3(const char *a, int b,
+__attribute_noinline__ uint64_t __bench_probe(const char *a, int b,
 							   uint64_t c)
 {
 	return a[b] + c;
@@ -93,7 +93,7 @@ uint64_t test_func_wrapper(const char *a, int b, uint64_t c)
 		PT_REGS_PARM3(&regs) = c;
 		ebpf_exec(begin_vm, &regs, sizeof(regs), &ret);
 	}
-	uint64_t hook_func_ret = __benchmark_test_function3(a, b, c);
+	uint64_t hook_func_ret = __bench_probe(a, b, c);
 	if (enable_ebpf) {
 		memset(&regs, 0, sizeof(regs));
 		PT_REGS_PARM1(&regs) = hook_func_ret;
diff --git a/benchmark/uprobe/uprobe-override.c b/benchmark/uprobe/uprobe-override.c
index a16003f2..96b9f5da 100644
--- a/benchmark/uprobe/uprobe-override.c
+++ b/benchmark/uprobe/uprobe-override.c
@@ -55,7 +55,7 @@ int main(int argc, char **argv)
 	}
 	err = bpf_prog_attach_uprobe_with_override(
 		bpf_program__fd(skel->progs.do_uprobe_override_patch), "benchmark/test",
-		"__benchmark_test_function1");
+		"__bench_uprobe_uretprobe");
 	if (err) {
 		fprintf(stderr, "Failed to attach BPF program\n");
 		goto cleanup;
diff --git a/benchmark/uprobe/uprobe.bpf.c b/benchmark/uprobe/uprobe.bpf.c
index 7ac7471e..5450828d 100644
--- a/benchmark/uprobe/uprobe.bpf.c
+++ b/benchmark/uprobe/uprobe.bpf.c
@@ -3,25 +3,41 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
-SEC("uprobe/benchmark/test:__benchmark_test_function3")
-int BPF_UPROBE(__benchmark_test_function3, const char *a, int b, uint64_t c)
+SEC("uprobe/benchmark/test:__bench_write")
+int BPF_UPROBE(__bench_write, char *a, int b, uint64_t c)
 {
+	char buffer[5] = "text";
+	bpf_probe_write_user(a, buffer, sizeof(buffer));
 	return b + c;
 }
 
-SEC("uretprobe/benchmark/test:__benchmark_test_function2")
-int BPF_URETPROBE(__benchmark_test_function2, int ret)
+SEC("uprobe/benchmark/test:__bench_read")
+int BPF_UPROBE(__bench_read, char *a, int b, uint64_t c)
+{
+	char buffer[5];
+	int res = bpf_probe_read_user(buffer, sizeof(buffer), a);
+	return b + c + res + buffer[1];
+}
+
+SEC("uprobe/benchmark/test:__bench_uprobe")
+int BPF_UPROBE(__bench_uprobe, char *a, int b, uint64_t c)
+{
+	return b + c;
+}
+
+SEC("uretprobe/benchmark/test:__bench_uretprobe")
+int BPF_URETPROBE(__bench_uretprobe, int ret)
 {
 	return ret;
 }
 
-SEC("uprobe/benchmark/test:__benchmark_test_function1")
-int BPF_UPROBE(__benchmark_test_function1_1, const char *a, int b, uint64_t c)
+SEC("uprobe/benchmark/test:__bench_uprobe_uretprobe")
+int BPF_UPROBE(__bench_uprobe_uretprobe_1, char *a, int b, uint64_t c)
 {
 	return b + c;
 }
 
-SEC("uretprobe/benchmark/test:__benchmark_test_function1")
+SEC("uretprobe/benchmark/test:__bench_uprobe_uretprobe")
 int BPF_URETPROBE(__benchmark_test_function_1_2, int ret)
 {
 	return ret;
diff --git a/benchmark/uprobe/uprobe.c b/benchmark/uprobe/uprobe.c
index 38e52abb..9eff05c3 100644
--- a/benchmark/uprobe/uprobe.c
+++ b/benchmark/uprobe/uprobe.c
@@ -57,9 +57,9 @@ int main(int argc, char **argv)
 	}
 
 	printf("Successfully started! Press Ctrl+C to stop.\n");
-	printf("__benchmark_test_function1 is for both uprobe and uretprobe\n");
-	printf("__benchmark_test_function2 is for uretprobe only\n");
-	printf("__benchmark_test_function3 is for uprobe only\n");
+	printf("__bench_uprobe_uretprobe is for both uprobe and uretprobe\n");
+	printf("__bench_uretprobe is for uretprobe only\n");
+	printf("__bench_probe is for uprobe only\n");
 	fflush(stdout);
 	while (!exiting) {
 		sleep(1);

From c01b07fa8ea1a59866ed044f526409f3e65b231a Mon Sep 17 00:00:00 2001
From: yunwei37 <1067852565@qq.com>
Date: Fri, 16 Aug 2024 07:35:28 -0700
Subject: [PATCH 2/6] Add new syscount bench

---
 Makefile                          |   2 +-
 benchmark/.gitignore              |   1 +
 benchmark/README.md               | 145 ++++++++++--------------------
 benchmark/syscount/Makefile       |   2 +
 benchmark/syscount/read-sendmsg.c | 100 +++++++++++++++++++++
 benchmark/syscount/test.sh        |  21 +++++
 benchmark/syscount/testfile.txt   |   1 +
 example/minimal/README.md         |   2 +-
 8 files changed, 176 insertions(+), 98 deletions(-)
 create mode 100644 benchmark/syscount/Makefile
 create mode 100644 benchmark/syscount/read-sendmsg.c
 create mode 100644 benchmark/syscount/test.sh
 create mode 100644 benchmark/syscount/testfile.txt

diff --git a/Makefile b/Makefile
index 55ebb63d..884fd33f 100644
--- a/Makefile
+++ b/Makefile
@@ -65,7 +65,7 @@ release: ## build the release version
 
 release-with-llvm-jit: ## build the package, with llvm-jit
 	cmake -Bbuild  -DCMAKE_BUILD_TYPE:STRING=RelWithDebInfo \
-				   -DBPFTIME_LLVM_JIT=1
+				   -DBPFTIME_LLVM_JIT=1 \
 				   -DBUILD_BPFTIME_DAEMON=1
 	cmake --build build --config RelWithDebInfo --target install -j$(JOBS)
 
diff --git a/benchmark/.gitignore b/benchmark/.gitignore
index 173b04ce..e904185d 100644
--- a/benchmark/.gitignore
+++ b/benchmark/.gitignore
@@ -1,3 +1,4 @@
 test
 micro-bench
 benchmark-output.json
+syscount/read-sendmsg
\ No newline at end of file
diff --git a/benchmark/README.md b/benchmark/README.md
index 93d32b0a..9ed517b4 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -13,6 +13,13 @@ With userspace eBPF runntime, we can:
 | Syscall Tracepoint     | 151.82801    | 232.57691      | 4    |
 | Embedding runtime      | Not avaliable |  110.008430   | 4    |
 
+## Suggested build configuration
+
+```sh
+cmake -Bbuild -DLLVM_DIR=/usr/lib/llvm-15/cmake -DCMAKE_BUILD_TYPE:STRING=RelWithDebInfo -DBPFTIME_LLVM_JIT=1 -DBPFTIME_ENABLE_LTO=1
+cmake --build build --config RelWithDebInfo --target install -j
+```
+
 ## build and run at a click
 
 Build the agent first. In project root:
@@ -43,11 +50,8 @@ Linux yunwei37server 6.2.0-32-generic #32-Ubuntu SMP PREEMPT_DYNAMIC Mon Aug 14
 
 ## base line
 
-```console
-$ benchmark/test
-a[b] + c for 100000 times
-Elapsed time: 0.000446995 seconds
-avg function elapse time: 4.469950 ns
+```sh
+benchmark/test
 ```
 
 The base line function elapsed time is 0.000243087 seconds, for the test function:
@@ -72,22 +76,14 @@ make -C benchmark/uretprobe
 
 run the uprobe:
 
-```console
-$  sudo benchmark/uprobe/uprobe
-libbpf: loading object 'uprobe_bpf' from buffer
-libbpf: elf: section(2) .symtab, size 120, link 1, flags 0, type=2
-...
-loaded ebpf program...
-...
+```sh
+sudo benchmark/uprobe/uprobe
 ```
 
 in another terminal, run the benchmark:
 
-```console
-$ benchmark/test
-a[b] + c for 100000 times
-Elapsed time: 0.322417276 seconds
-avg function elapse time: 3224.172760 ns
+```sh
+benchmark/test
 ```
 
 The uprobe or uretprobe function we used is like:
@@ -100,53 +96,18 @@ int BPF_UPROBE(__benchmark_test_function, const char *a, int b, uint64_t c)
 }
 ```
 
-## kernel uretuprobe
-
-run the uretprobe:
-
-```console
-$  sudo benchmark/uretprobe/uretprobe
-libbpf: loading object 'uprobe_bpf' from buffer
-libbpf: elf: section(2) .symtab, size 120, link 1, flags 0, type=2
-...
-loaded ebpf program...
-...
-
-in another terminal, run the benchmark:
-
-```console
-$ benchmark/test
-a[b] + c for 100000 times
-Elapsed time: 0.589970682 seconds
-avg function elapse time: 3996.799580 ns
-```
-
 ## userspace uprobe
 
 run the uprobe:
 
-```console
-$ LD_PRELOAD=build/runtime/syscall-server/libbpftime-syscall-server.so benchmark/uprobe/uprobe
-manager constructed
-global_shm_open_type 0 for bpftime_maps_shm
-Closing 3
-libbpf: loading object 'uprobe_bpf' from buffer
-libbpf: elf: section(2) .symtab, size 120, link 1, flags 0, type=2
-...
-loaded ebpf program...
-...
+```sh
+LD_PRELOAD=build/runtime/syscall-server/libbpftime-syscall-server.so benchmark/uprobe/uprobe
 ```
 
 in another terminal, run the benchmark:
 
-```console
-$ LD_PRELOAD=build/runtime/agent/libbpftime-agent.so benchmark/test
-attaching prog 3 to fd 4
-Successfully attached
-
-a[b] + c for 100000 times
-Elapsed time: 0.031456911 seconds
-avg function elapse time: 314.569110 ns
+```sh
+LD_PRELOAD=build/runtime/agent/libbpftime-agent.so benchmark/test
 ```
 
 If errors like:
@@ -159,43 +120,10 @@ Aborted (core dumped)
 
 happpens, try to use `sudo` mode.
 
-## userspace uretprobe
-
-run the uretprobe:
-
-```console
-$ LD_PRELOAD=build/runtime/syscall-server/libbpftime-syscall-server.so benchmark/uretprobe/uretprobe
-manager constructed
-global_shm_open_type 0 for bpftime_maps_shm
-Closing 3
-libbpf: loading object 'uprobe_bpf' from buffer
-libbpf: elf: section(2) .symtab, size 120, link 1, flags 0, type=2
-...
-loaded ebpf program...
-...
-```
-
-in another terminal, run the benchmark:
-
-```console
-$ LD_PRELOAD=build/runtime/agent/libbpftime-agent.so benchmark/test
-attaching prog 3 to fd 4
-Successfully attached
-
-a[b] + c for 100000 times
-Elapsed time: 0.038127027 seconds
-avg function elapse time: 381.270270 ns
-```
-
 ## embed runtime
 
-```console
-$ build/benchmark/simple-benchmark-with-embed-ebpf-calling
-uprobe elf: /home/yunwei/bpftime/build/benchmark/uprobe_prog.bpf.o
-uretprobe elf:/home/yunwei/bpftime/build/benchmark/uretprobe_prog.bpf.o
-a[b] + c for 100000 times
-Elapsed time: 0.011000843 seconds
-avg function elapse time: 110.008430 ns
+```sh
+build/benchmark/simple-benchmark-with-embed-ebpf-calling
 ```
 
 ## userspace syscall
@@ -222,6 +150,31 @@ You can use python script to run the benchmark:
 python3 benchmark/tools/driving.py
 ```
 
+## Test syscall trace and untrace
+
+run the test:
+
+```sh
+bash ./benchmark/syscount/test.sh
+```
+
+result:
+
+```txt
+# baseline, no trace syscall
+Average read() time over 10 runs: 349 ns
+Average sendmsg() time over 10 runs: 3640 ns
+# trace with syscount
+Average read() time over 10 runs: 437 ns
+Average sendmsg() time over 10 runs: 3952 ns
+# filter out the pid
+Average read() time over 10 runs: 398 ns
+Average sendmsg() time over 10 runs: 3690 ns
+# trace with userspace syscall tracepoint
+Average read() time over 10 runs: 531 ns
+Average sendmsg() time over 10 runs: 3681 ns
+```
+
 ## Results on another machine
 
 kernel:
@@ -247,17 +200,17 @@ Userspace:
 
 ```txt
 Benchmarking __bench_uprobe_uretprobe in thread 1
-Average time usage 412.607790 ns, iter 100000 times
+Average time usage 391.967450 ns, iter 100000 times
 
 Benchmarking __bench_uretprobe in thread 1
-Average time usage 389.096230 ns, iter 100000 times
+Average time usage 383.851670 ns, iter 100000 times
 
 Benchmarking __bench_uprobe in thread 1
-Average time usage 387.022160 ns, iter 100000 times
+Average time usage 380.935190 ns, iter 100000 times
 
 Benchmarking __bench_read in thread 1
-Average time usage 415.350530 ns, iter 100000 times
+Average time usage 383.135720 ns, iter 100000 times
 
 Benchmarking __bench_write in thread 1
-Average time usage 414.350230 ns, iter 100000 times
+Average time usage 389.037170 ns, iter 100000 times
 ```
diff --git a/benchmark/syscount/Makefile b/benchmark/syscount/Makefile
new file mode 100644
index 00000000..75f7fc0b
--- /dev/null
+++ b/benchmark/syscount/Makefile
@@ -0,0 +1,2 @@
+read-sendmsg syscount-driver: read-sendmsg.c  # first target is the real output; syscount-driver kept as an alias
+	gcc read-sendmsg.c -g -O3 -lpthread -o read-sendmsg
diff --git a/benchmark/syscount/read-sendmsg.c b/benchmark/syscount/read-sendmsg.c
new file mode 100644
index 00000000..1884beb0
--- /dev/null
+++ b/benchmark/syscount/read-sendmsg.c
@@ -0,0 +1,100 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <time.h>
+#include <fcntl.h>
+#include <string.h>
+#include <errno.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#define NUM_ITERATIONS 1000000
+
+/* Time NUM_ITERATIONS read() calls on fd and print the mean latency in ns. */
+void measure_read_time(int fd) {
+    struct timespec t0, t1;
+    char scratch[1024];
+    long elapsed_sum_ns = 0;
+    int remaining = NUM_ITERATIONS;
+
+    while (remaining-- > 0) {
+        /* Only the read() itself sits between the two clock samples. */
+        clock_gettime(CLOCK_MONOTONIC, &t0);
+        ssize_t got = read(fd, scratch, sizeof(scratch));
+        clock_gettime(CLOCK_MONOTONIC, &t1);
+        if (got == -1) {
+            perror("read");
+            exit(EXIT_FAILURE);
+        }
+        long secs_ns = (t1.tv_sec - t0.tv_sec) * 1e9;
+        elapsed_sum_ns += secs_ns + (t1.tv_nsec - t0.tv_nsec);
+    }
+    printf("Average read() time: %ld ns\n", elapsed_sum_ns / NUM_ITERATIONS);
+}
+
+/* Time NUM_ITERATIONS 1024-byte sendmsg() calls to 127.0.0.1:12345; print the mean in ns. */
+void measure_sendmsg_time(int sockfd) {
+    char payload[1024] = "test message";
+    struct timespec t0, t1;
+    long elapsed_sum_ns = 0;
+
+    /* Destination: loopback address, arbitrary port number. */
+    struct sockaddr_in peer;
+    memset(&peer, 0, sizeof(peer));
+    peer.sin_family = AF_INET;
+    peer.sin_port = htons(12345);
+    inet_pton(AF_INET, "127.0.0.1", &peer.sin_addr);
+
+    /* A single iovec spanning the whole payload buffer. */
+    struct iovec vec;
+    vec.iov_base = payload;
+    vec.iov_len = sizeof(payload);
+
+    struct msghdr hdr;
+    memset(&hdr, 0, sizeof(hdr));
+    hdr.msg_name = &peer;
+    hdr.msg_namelen = sizeof(peer);
+    hdr.msg_iov = &vec;
+    hdr.msg_iovlen = 1;
+
+    for (int done = 0; done != NUM_ITERATIONS; ++done) {
+        clock_gettime(CLOCK_MONOTONIC, &t0);
+        ssize_t sent = sendmsg(sockfd, &hdr, 0);
+        clock_gettime(CLOCK_MONOTONIC, &t1);
+        if (sent == -1) {
+            perror("sendmsg");
+            exit(EXIT_FAILURE);
+        }
+        long secs_ns = (t1.tv_sec - t0.tv_sec) * 1e9;
+        elapsed_sum_ns += secs_ns + (t1.tv_nsec - t0.tv_nsec);
+    }
+    printf("Average sendmsg() time: %ld ns\n", elapsed_sum_ns / NUM_ITERATIONS);
+}
+
+int main() { // Runs the read() benchmark first, then the sendmsg() benchmark
+    // Input file for the read() benchmark; the path is relative to the current working directory
+    int fd = open("./benchmark/syscount/testfile.txt", O_RDONLY);
+    if (fd == -1) {
+        perror("open");
+        exit(EXIT_FAILURE);
+    }
+
+    // IPv4 datagram socket handed to the sendmsg() benchmark
+    int sockfd = socket(AF_INET, SOCK_DGRAM, 0);
+    if (sockfd == -1) {
+        perror("socket");
+        close(fd);
+        exit(EXIT_FAILURE);
+    }
+
+    // Prints "Average read() time: <N> ns"
+    measure_read_time(fd);
+
+    // Prints "Average sendmsg() time: <N> ns"
+    measure_sendmsg_time(sockfd);
+
+    close(fd);
+    close(sockfd);
+    return 0;
+}
diff --git a/benchmark/syscount/test.sh b/benchmark/syscount/test.sh
new file mode 100644
index 00000000..99db667c
--- /dev/null
+++ b/benchmark/syscount/test.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+total_read_time=0
+total_sendmsg_time=0
+
+for i in {1..10}
+do
+    output=$(sudo AGENT_SO=/tmp/bpftime/runtime/agent/libbpftime-agent.so LD_PRELOAD=/tmp/bpftime/attach/text_segment_transformer/libbpftime-agent-transformer.so ./benchmark/syscount/read-sendmsg)
+    # output=$(sudo ./benchmark/syscount/read-sendmsg)
+    read_time=$(echo "$output" | grep "Average read() time" | awk '{print $4}')
+    sendmsg_time=$(echo "$output" | grep "Average sendmsg() time" | awk '{print $4}')
+    
+    total_read_time=$((total_read_time + read_time))
+    total_sendmsg_time=$((total_sendmsg_time + sendmsg_time))
+done
+
+avg_read_time=$((total_read_time / 10))  # integer division; 10 is the run count of the {1..10} loop
+avg_sendmsg_time=$((total_sendmsg_time / 10))  # same divisor: one sample per loop iteration
+
+echo "Average read() time over 10 runs: $avg_read_time ns"
+echo "Average sendmsg() time over 10 runs: $avg_sendmsg_time ns"
diff --git a/benchmark/syscount/testfile.txt b/benchmark/syscount/testfile.txt
new file mode 100644
index 00000000..0fd26800
--- /dev/null
+++ b/benchmark/syscount/testfile.txt
@@ -0,0 +1 @@
+hhhhhhhhhhhhhhhhhh
diff --git a/example/minimal/README.md b/example/minimal/README.md
index 33d89c71..528a4756 100644
--- a/example/minimal/README.md
+++ b/example/minimal/README.md
@@ -121,5 +121,5 @@ client
 
 ```sh
 sudo ~/.bpftime/bpftime start -s ./victim
-# or AGENT_SO=build/runtime/agent/libbpftime-agent.so LD_PRELOAD=build/runtime/agent-transformer/libbpftime-agent-transformer.so ./victim
+# or AGENT_SO=build/runtime/agent/libbpftime-agent.so LD_PRELOAD=build/attach/text_segment_transformer/libbpftime-agent-transformer.so ./victim
 ```

From da78a39046c8ca5d8287ab0eaa4ff8dd9c581677 Mon Sep 17 00:00:00 2001
From: yunwei37 <1067852565@qq.com>
Date: Fri, 16 Aug 2024 07:42:37 -0700
Subject: [PATCH 3/6] fix bench command

---
 benchmark/syscount/test.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmark/syscount/test.sh b/benchmark/syscount/test.sh
index 99db667c..c3369ecd 100644
--- a/benchmark/syscount/test.sh
+++ b/benchmark/syscount/test.sh
@@ -5,8 +5,8 @@ total_sendmsg_time=0
 
 for i in {1..10}
 do
-    output=$(sudo AGENT_SO=/tmp/bpftime/runtime/agent/libbpftime-agent.so LD_PRELOAD=/tmp/bpftime/attach/text_segment_transformer/libbpftime-agent-transformer.so ./benchmark/syscount/read-sendmsg)
-    # output=$(sudo ./benchmark/syscount/read-sendmsg)
+    # output=$(sudo AGENT_SO=/tmp/bpftime/runtime/agent/libbpftime-agent.so LD_PRELOAD=/tmp/bpftime/attach/text_segment_transformer/libbpftime-agent-transformer.so ./benchmark/syscount/read-sendmsg)
+    output=$(sudo ./benchmark/syscount/read-sendmsg)
     read_time=$(echo "$output" | grep "Average read() time" | awk '{print $4}')
     sendmsg_time=$(echo "$output" | grep "Average sendmsg() time" | awk '{print $4}')
     

From eefb384fc6c6259d8ddd6bf75a0a4189d8390b12 Mon Sep 17 00:00:00 2001
From: yunwei37 <1067852565@qq.com>
Date: Fri, 16 Aug 2024 07:47:34 -0700
Subject: [PATCH 4/6] Add hash map op bench

---
 benchmark/hash_map/.gitignore   |  10 ---
 benchmark/hash_map/Makefile     | 138 --------------------------------
 benchmark/hash_map/README.md    |  42 ----------
 benchmark/hash_map/uprobe.bpf.c |  43 ----------
 benchmark/hash_map/uprobe.c     |  68 ----------------
 benchmark/test.c                |  24 +++++-
 benchmark/uprobe/uprobe.bpf.c   |  39 +++++++++
 7 files changed, 62 insertions(+), 302 deletions(-)
 delete mode 100644 benchmark/hash_map/.gitignore
 delete mode 100644 benchmark/hash_map/Makefile
 delete mode 100644 benchmark/hash_map/README.md
 delete mode 100644 benchmark/hash_map/uprobe.bpf.c
 delete mode 100644 benchmark/hash_map/uprobe.c

diff --git a/benchmark/hash_map/.gitignore b/benchmark/hash_map/.gitignore
deleted file mode 100644
index 3028a00f..00000000
--- a/benchmark/hash_map/.gitignore
+++ /dev/null
@@ -1,10 +0,0 @@
-.vscode
-package.json
-*.o
-*.skel.json
-*.skel.yaml
-package.yaml
-ecli
-.output
-test
-uprobe
diff --git a/benchmark/hash_map/Makefile b/benchmark/hash_map/Makefile
deleted file mode 100644
index 87a6287c..00000000
--- a/benchmark/hash_map/Makefile
+++ /dev/null
@@ -1,138 +0,0 @@
-# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
-OUTPUT := .output
-CLANG ?= clang
-LIBBPF_SRC := $(abspath ../../third_party/libbpf/src)
-BPFTOOL_SRC := $(abspath ../../third_party/bpftool/src)
-LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a)
-BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool)
-BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool
-ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \
-			 | sed 's/arm.*/arm/' \
-			 | sed 's/aarch64/arm64/' \
-			 | sed 's/ppc64le/powerpc/' \
-			 | sed 's/mips.*/mips/' \
-			 | sed 's/riscv64/riscv/' \
-			 | sed 's/loongarch64/loongarch/')
-VMLINUX := ../../third_party/vmlinux/$(ARCH)/vmlinux.h
-# Use our own libbpf API headers and Linux UAPI headers distributed with
-# libbpf to avoid dependency on system-wide headers, which could be missing or
-# outdated
-INCLUDES := -I$(OUTPUT) -I../../third_party/libbpf/include/uapi -I$(dir $(VMLINUX))
-CFLAGS := -g -Wall
-ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS)
-
-APPS = uprobe # minimal minimal_legacy  kprobe fentry usdt sockfilter tc ksyscall
-
-CARGO ?= $(shell which cargo)
-ifeq ($(strip $(CARGO)),)
-BZS_APPS :=
-else
-BZS_APPS := # profile
-APPS += $(BZS_APPS)
-# Required by libblazesym
-ALL_LDFLAGS += -lrt -ldl -lpthread -lm
-endif
-
-# Get Clang's default includes on this system. We'll explicitly add these dirs
-# to the includes list when compiling with `-target bpf` because otherwise some
-# architecture-specific dirs will be "missing" on some architectures/distros -
-# headers such as asm/types.h, asm/byteorder.h, asm/socket.h, asm/sockios.h,
-# sys/cdefs.h etc. might be missing.
-#
-# Use '-idirafter': Don't interfere with include mechanics except where the
-# build would have failed anyways.
-CLANG_BPF_SYS_INCLUDES ?= $(shell $(CLANG) -v -E - </dev/null 2>&1 \
-	| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')
-
-ifeq ($(V),1)
-	Q =
-	msg =
-else
-	Q = @
-	msg = @printf '  %-8s %s%s\n'					\
-		      "$(1)"						\
-		      "$(patsubst $(abspath $(OUTPUT))/%,%,$(2))"	\
-		      "$(if $(3), $(3))";
-	MAKEFLAGS += --no-print-directory
-endif
-
-define allow-override
-  $(if $(or $(findstring environment,$(origin $(1))),\
-            $(findstring command line,$(origin $(1)))),,\
-    $(eval $(1) = $(2)))
-endef
-
-$(call allow-override,CC,$(CROSS_COMPILE)cc)
-$(call allow-override,LD,$(CROSS_COMPILE)ld)
-
-.PHONY: all
-all: $(APPS)
-
-.PHONY: clean
-clean:
-	$(call msg,CLEAN)
-	$(Q)rm -rf $(OUTPUT) $(APPS)
-
-$(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT):
-	$(call msg,MKDIR,$@)
-	$(Q)mkdir -p $@
-
-# Build libbpf
-$(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf
-	$(call msg,LIB,$@)
-	$(Q)$(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1		      \
-		    OBJDIR=$(dir $@)/libbpf DESTDIR=$(dir $@)		      \
-		    INCLUDEDIR= LIBDIR= UAPIDIR=			      \
-		    install
-
-# Build bpftool
-$(BPFTOOL): | $(BPFTOOL_OUTPUT)
-	$(call msg,BPFTOOL,$@)
-	$(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap
-
-
-$(LIBBLAZESYM_SRC)/target/release/libblazesym.a::
-	$(Q)cd $(LIBBLAZESYM_SRC) && $(CARGO) build --features=cheader,dont-generate-test-files --release
-
-$(LIBBLAZESYM_OBJ): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT)
-	$(call msg,LIB, $@)
-	$(Q)cp $(LIBBLAZESYM_SRC)/target/release/libblazesym.a $@
-
-$(LIBBLAZESYM_HEADER): $(LIBBLAZESYM_SRC)/target/release/libblazesym.a | $(OUTPUT)
-	$(call msg,LIB,$@)
-	$(Q)cp $(LIBBLAZESYM_SRC)/target/release/blazesym.h $@
-
-# Build BPF code
-$(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) $(VMLINUX) | $(OUTPUT) $(BPFTOOL)
-	$(call msg,BPF,$@)
-	$(Q)$(CLANG) -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH)		      \
-		     $(INCLUDES) $(CLANG_BPF_SYS_INCLUDES)		      \
-		     -c $(filter %.c,$^) -o $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
-	$(Q)$(BPFTOOL) gen object $@ $(patsubst %.bpf.o,%.tmp.bpf.o,$@)
-
-# Generate BPF skeletons
-$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT) $(BPFTOOL)
-	$(call msg,GEN-SKEL,$@)
-	$(Q)$(BPFTOOL) gen skeleton $< > $@
-
-# Build user-space code
-$(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h
-
-$(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT)
-	$(call msg,CC,$@)
-	$(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@
-
-$(patsubst %,$(OUTPUT)/%.o,$(BZS_APPS)): $(LIBBLAZESYM_HEADER)
-
-$(BZS_APPS): $(LIBBLAZESYM_OBJ)
-
-# Build application binary
-$(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT)
-	$(call msg,BINARY,$@)
-	$(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@
-
-# delete failed targets
-.DELETE_ON_ERROR:
-
-# keep intermediate (.skel.h, .bpf.o, etc) targets
-.SECONDARY:
diff --git a/benchmark/hash_map/README.md b/benchmark/hash_map/README.md
deleted file mode 100644
index 092e26c9..00000000
--- a/benchmark/hash_map/README.md
+++ /dev/null
@@ -1,42 +0,0 @@
-# benchmark of hash maps
-
-- __bench_uprobe_uretprobe: hashmap bpf_map_lookup_elem
-- __bench_uretprobe: hashmap bpf_map_delete_elem
-- __bench_probe: hashmap bpf_map_update_elem
-
-run the uprobe:
-
-```console
-$ LD_PRELOAD=build/runtime/syscall-server/libbpftime-syscall-server.so benchmark/hash_map/uprobe
-manager constructed
-global_shm_open_type 0 for bpftime_maps_shm
-Closing 3
-libbpf: loading object 'uprobe_bpf' from buffer
-libbpf: elf: section(2) .symtab, size 120, link 1, flags 0, type=2
-...
-loaded ebpf program...
-...
-```
-
-in another terminal, run the benchmark:
-
-```console
-$ LD_PRELOAD=build/runtime/agent/libbpftime-agent.so benchmark/test
-
-Benchmarking __bench_uprobe_uretprobe
-a[b] + c for 100000 times
-Elapsed time: 0.038217773 seconds
-Average time usage 382.177730 ns
-
-Benchmarking __bench_uretprobe
-a[b] + c for 100000 times
-Elapsed time: 0.020004455 seconds
-Average time usage 200.044550 ns
-
-Benchmarking __bench_probe
-a[b] + c for 100000 times
-Elapsed time: 0.047916014 seconds
-Average time usage 479.160140 ns
-
-INFO [34534]: Global shm destructed
-```
diff --git a/benchmark/hash_map/uprobe.bpf.c b/benchmark/hash_map/uprobe.bpf.c
deleted file mode 100644
index 8cbc53be..00000000
--- a/benchmark/hash_map/uprobe.bpf.c
+++ /dev/null
@@ -1,43 +0,0 @@
-#define BPF_NO_GLOBAL_DATA
-#include <vmlinux.h>
-#include <bpf/bpf_helpers.h>
-#include <bpf/bpf_tracing.h>
-
-struct {
-	__uint(type, BPF_MAP_TYPE_HASH);
-	__uint(max_entries, 1024);
-	__type(key, u32);
-	__type(value, u64);
-} libc_malloc_calls_total SEC(".maps");
-
-SEC("uprobe/benchmark/test:__bench_probe")
-int test_update(struct pt_regs *ctx)
-{
-	u32 key = 0;
-	u64 value = 0;
-	bpf_map_update_elem(&libc_malloc_calls_total, &key, &value, 0);
-
-	return 0;
-}
-
-SEC("uprobe/benchmark/test:__bench_uretprobe")
-int test_delete(struct pt_regs *ctx)
-{
-	u32 key = 0;
-	u64 value = 0;
-	bpf_map_delete_elem(&libc_malloc_calls_total, &key);
-
-	return 0;
-}
-
-SEC("uprobe/benchmark/test:__bench_uprobe_uretprobe")
-int test_lookup(struct pt_regs *ctx)
-{
-	u32 key = 0;
-	u64 value = 0;
-	bpf_map_lookup_elem(&libc_malloc_calls_total, &key);
-
-	return 0;
-}
-
-char LICENSE[] SEC("license") = "GPL";
diff --git a/benchmark/hash_map/uprobe.c b/benchmark/hash_map/uprobe.c
deleted file mode 100644
index c0ed0303..00000000
--- a/benchmark/hash_map/uprobe.c
+++ /dev/null
@@ -1,68 +0,0 @@
-// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
-/* Copyright (c) 2020 Facebook */
-#include <signal.h>
-#include <stdio.h>
-#include <time.h>
-#include <stdint.h>
-#include <sys/resource.h>
-#include <bpf/libbpf.h>
-#include <bpf/bpf.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include "uprobe.skel.h"
-#define warn(...) fprintf(stderr, __VA_ARGS__)
-
-static int libbpf_print_fn(enum libbpf_print_level level, const char *format,
-			   va_list args)
-{
-	return vfprintf(stderr, format, args);
-}
-
-static volatile bool exiting = false;
-
-static void sig_handler(int sig)
-{
-	exiting = true;
-}
-
-int main(int argc, char **argv)
-{
-	struct uprobe_bpf *skel;
-	int err;
-
-	/* Set up libbpf errors and debug info callback */
-	libbpf_set_print(libbpf_print_fn);
-
-	/* Cleaner handling of Ctrl-C */
-	signal(SIGINT, sig_handler);
-	signal(SIGTERM, sig_handler);
-
-	/* Load and verify BPF application */
-	skel = uprobe_bpf__open();
-	if (!skel) {
-		fprintf(stderr, "Failed to open and load BPF skeleton\n");
-		return 1;
-	}
-
-	/* Load & verify BPF programs */
-	err = uprobe_bpf__load(skel);
-	if (err) {
-		fprintf(stderr, "Failed to load and verify BPF skeleton\n");
-		goto cleanup;
-	}
-	err = uprobe_bpf__attach(skel);
-	if (err) {
-		fprintf(stderr, "Failed to attach BPF skeleton\n");
-		goto cleanup;
-	}
-
-	while (!exiting) {
-		sleep(1);
-		printf("loaded ebpf program...\n");
-	}
-cleanup:
-	/* Clean up */
-	uprobe_bpf__destroy(skel);
-
-	return err < 0 ? -err : 0;
-}
diff --git a/benchmark/test.c b/benchmark/test.c
index 81bd5d77..79cc3e6b 100644
--- a/benchmark/test.c
+++ b/benchmark/test.c
@@ -4,6 +4,26 @@
 #include <stdint.h>
 #include <pthread.h>
 
+
+__attribute_noinline__ uint64_t __bench_map_lookup(char *a, int b,
+							   uint64_t c)
+{
+	return a[b] + c;
+}
+
+__attribute_noinline__ uint64_t __bench_map_delete(char *a, int b,
+							   uint64_t c)
+{
+	return a[b] + c;
+}
+
+__attribute_noinline__ uint64_t __bench_map_update(char *a, int b,
+							   uint64_t c)
+{
+	return a[b] + c;
+}
+
+
 __attribute_noinline__ uint64_t __bench_read(char *a, int b,
 							   uint64_t c)
 {
@@ -15,7 +35,6 @@ __attribute_noinline__ uint64_t __bench_write(char *a, int b,
 {
 	return a[b] + c;
 }
-
 __attribute_noinline__ uint64_t __bench_uprobe(char *a, int b,
 							   uint64_t c)
 {
@@ -99,6 +118,9 @@ void *run_bench_functions(void *id_ptr)
 	do_benchmark_func(__bench_uprobe, iter, id);
 	do_benchmark_func(__bench_read, iter, id);
 	do_benchmark_func(__bench_write, iter, id);
+	do_benchmark_func(__bench_map_update, iter, id);
+	do_benchmark_func(__bench_map_delete, iter, id);
+	do_benchmark_func(__bench_map_lookup, iter, id);
 	return NULL;
 }
 
diff --git a/benchmark/uprobe/uprobe.bpf.c b/benchmark/uprobe/uprobe.bpf.c
index 5450828d..e4e3696d 100644
--- a/benchmark/uprobe/uprobe.bpf.c
+++ b/benchmark/uprobe/uprobe.bpf.c
@@ -3,6 +3,45 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, 1024);
+	__type(key, u32);
+	__type(value, u64);
+} libc_malloc_calls_total SEC(".maps");
+
+SEC("uprobe/benchmark/test:__bench_map_update")
+int test_update(struct pt_regs *ctx)
+{
+	u32 key = 0;
+	u64 value = 0;
+	bpf_map_update_elem(&libc_malloc_calls_total, &key, &value, 0);
+
+	return 0;
+}
+
+SEC("uprobe/benchmark/test:__bench_map_delete")
+int test_delete(struct pt_regs *ctx)
+{
+	u32 key = 0;
+	u64 value = 0;
+	bpf_map_delete_elem(&libc_malloc_calls_total, &key);
+
+	return 0;
+}
+
+SEC("uprobe/benchmark/test:__bench_map_lookup")
+int test_lookup(struct pt_regs *ctx)
+{
+	u32 key = 0;
+	u64 value = 0;
+	bpf_map_lookup_elem(&libc_malloc_calls_total, &key);
+
+	return 0;
+}
+
+
 SEC("uprobe/benchmark/test:__bench_write")
 int BPF_UPROBE(__bench_write, char *a, int b, uint64_t c)
 {

From b9d3c2b72b5b36ff15801b52c09243eea7ec465f Mon Sep 17 00:00:00 2001
From: yunwei37 <1067852565@qq.com>
Date: Fri, 16 Aug 2024 07:59:26 -0700
Subject: [PATCH 5/6] update

---
 benchmark/test.c              |  2 +-
 benchmark/uprobe/uprobe.bpf.c | 26 ++++++++++++++------------
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/benchmark/test.c b/benchmark/test.c
index 79cc3e6b..88bbc98f 100644
--- a/benchmark/test.c
+++ b/benchmark/test.c
@@ -119,8 +119,8 @@ void *run_bench_functions(void *id_ptr)
 	do_benchmark_func(__bench_read, iter, id);
 	do_benchmark_func(__bench_write, iter, id);
 	do_benchmark_func(__bench_map_update, iter, id);
-	do_benchmark_func(__bench_map_delete, iter, id);
 	do_benchmark_func(__bench_map_lookup, iter, id);
+	do_benchmark_func(__bench_map_delete, iter, id);
 	return NULL;
 }
 
diff --git a/benchmark/uprobe/uprobe.bpf.c b/benchmark/uprobe/uprobe.bpf.c
index e4e3696d..d0c8608c 100644
--- a/benchmark/uprobe/uprobe.bpf.c
+++ b/benchmark/uprobe/uprobe.bpf.c
@@ -14,30 +14,32 @@ struct {
 SEC("uprobe/benchmark/test:__bench_map_update")
 int test_update(struct pt_regs *ctx)
 {
-	u32 key = 0;
-	u64 value = 0;
-	bpf_map_update_elem(&libc_malloc_calls_total, &key, &value, 0);
-
+	for (int i = 0; i < 1000; i++) {
+		u32 key = i;
+		u64 value = i;
+		bpf_map_update_elem(&libc_malloc_calls_total, &key, &value, BPF_ANY);
+	}
 	return 0;
 }
 
 SEC("uprobe/benchmark/test:__bench_map_delete")
 int test_delete(struct pt_regs *ctx)
 {
-	u32 key = 0;
-	u64 value = 0;
-	bpf_map_delete_elem(&libc_malloc_calls_total, &key);
-
+	for (int i = 0; i < 1000; i++) {
+		u32 key = i;
+		bpf_map_delete_elem(&libc_malloc_calls_total, &key);
+	}
 	return 0;
 }
 
 SEC("uprobe/benchmark/test:__bench_map_lookup")
 int test_lookup(struct pt_regs *ctx)
 {
-	u32 key = 0;
-	u64 value = 0;
-	bpf_map_lookup_elem(&libc_malloc_calls_total, &key);
-
+	for (int i = 0; i < 1000; i++) {
+		u32 key = i;
+		u64 value = i;
+		bpf_map_lookup_elem(&libc_malloc_calls_total, &key);
+	}
 	return 0;
 }
 

From bfc0447fada19e700e1822976704339fb68221eb Mon Sep 17 00:00:00 2001
From: yunwei37 <1067852565@qq.com>
Date: Fri, 16 Aug 2024 14:03:45 -0700
Subject: [PATCH 6/6] add per cpu map op bench

---
 benchmark/README.md                           | 158 ++++++++++++++++--
 benchmark/test.c                              |  84 ++++------
 benchmark/tools/.gitignore                    |   1 -
 benchmark/tools/Makefile                      |   2 -
 benchmark/tools/driving.py                    | 108 ------------
 benchmark/tools/fig.py                        |  29 ----
 benchmark/tools/readlink.cpp                  |  15 --
 benchmark/uprobe/uprobe.bpf.c                 |  82 +++++----
 benchmark/uprobe/uprobe.c                     |   3 -
 .../bpf_map/userspace/per_cpu_array_map.cpp   |   2 +-
 10 files changed, 228 insertions(+), 256 deletions(-)
 delete mode 100644 benchmark/tools/.gitignore
 delete mode 100644 benchmark/tools/Makefile
 delete mode 100644 benchmark/tools/driving.py
 delete mode 100644 benchmark/tools/fig.py
 delete mode 100644 benchmark/tools/readlink.cpp

diff --git a/benchmark/README.md b/benchmark/README.md
index 9ed517b4..71ad1d1c 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -2,10 +2,13 @@
 
 With userspace eBPF runntime, we can:
 
-- speed up the uprobe and uretprobe by approximate 10x
-- with out any kernel patch or modify the tracing eBPF program
+- Speed up uprobes and uretprobes by approximately `10x`
+- Reading and writing user memory from userspace is approximately `10x` faster than in the kernel (~5ns vs ~50ns)
+- No kernel patch or modification of the tracing eBPF program is required
 - No privilege is needed for running the eBPF tracing program.
 
+Probes:
+
 | Probe/Tracepoint Types | Kernel (ns)  | Userspace (ns) | Insn Count |
 |------------------------|-------------:|---------------:|---------------:|
 | Uprobe                 | 3224.172760  | 314.569110     | 4    |
@@ -13,6 +16,13 @@ With userspace eBPF runntime, we can:
 | Syscall Tracepoint     | 151.82801    | 232.57691      | 4    |
 | Embedding runtime      | Not avaliable |  110.008430   | 4    |
 
+Read and write user memory:
+
+| Probe/Tracepoint Types  | Kernel (ns)     | Userspace (ns) |
+|-------------------------|----------------:|---------------:|
+| bpf_probe_read - uprobe  | 46.820830       | 2.200530       |
+| bpf_probe_write_user - uprobe | 45.004100  | 8.101980       |
+
 ## Suggest build configuration
 
 ```sh
@@ -41,13 +51,6 @@ cd benchmark
 python3 run_benchmark.py
 ```
 
-## test environment
-
-```console
-$ uname -a
-Linux yunwei37server 6.2.0-32-generic #32-Ubuntu SMP PREEMPT_DYNAMIC Mon Aug 14 10:03:50 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux
-```
-
 ## base line
 
 ```sh
@@ -150,7 +153,7 @@ You can use python script to run the benchmark:
 python3 benchmark/tools/driving.py
 ```
 
-## Test syscall trace and untrace
+## Test syscall trace and untrace with syscount
 
 run the test:
 
@@ -175,9 +178,24 @@ Average read() time over 10 runs: 531 ns
 Average sendmsg() time over 10 runs: 3681 ns
 ```
 
+## Results for uprobe, uretprobe, and syscall tracepoint
+
+| Probe/Tracepoint Types | Kernel (ns)  | Userspace (ns) | Insn Count |
+|------------------------|-------------:|---------------:|---------------:|
+| Uprobe                 | 3224.172760  | 314.569110     | 4    |
+| Uretprobe              | 3996.799580  | 381.270270     | 2    |
+| Syscall Tracepoint     | 151.82801    | 232.57691      | 4    |
+| Embedding runtime      | Not available |  110.008430   | 4    |
+
+Tested on `6.2.0-32-generic` kernel and `Intel(R) Core(TM) i7-11800H CPU @ 2.30GHz`.
+
 ## Results on another machine
 
-kernel:
+Tested on `kernel version 6.2` and `Intel(R) Xeon(R) Gold 5418Y` CPU.
+
+### Uprobe and read/write with `bpf_probe_write_user` and `bpf_probe_read_user`
+
+Userspace:
 
 ```txt
 Benchmarking __bench_uprobe_uretprobe in thread 1
@@ -214,3 +232,121 @@ Average time usage 383.135720 ns, iter 100000 times
 Benchmarking __bench_write in thread 1
 Average time usage 389.037170 ns, iter 100000 times
 ```
+
+### Map operations
+
+Each probe runs the map operation 1000 times per function call. In the current version, userspace map operations can also be faster than the kernel (see the results below), and they are about 10x faster than in the previous implementation.
+
+```c
+SEC("uprobe/benchmark/test:__bench_hash_map_lookup")
+int test_lookup(struct pt_regs *ctx)
+{
+    for (int i = 0; i < 1000; i++) {
+        u32 key = i;
+        u64 value = i;
+        bpf_map_lookup_elem(&test_hash_map, &key);
+    }
+    return 0;
+}
+```
+
+Kernel map op cost:
+
+```txt
+
+Benchmarking __bench_hash_map_update in thread 1
+Average time usage 64738.264680 ns, iter 100000 times
+
+Benchmarking __bench_hash_map_lookup in thread 1
+Average time usage 17805.898280 ns, iter 100000 times
+
+Benchmarking __bench_hash_map_delete in thread 1
+Average time usage 21795.665340 ns, iter 100000 times
+
+Benchmarking __bench_array_map_update in thread 1
+Average time usage 11449.295960 ns, iter 100000 times
+
+Benchmarking __bench_array_map_lookup in thread 1
+Average time usage 2093.886500 ns, iter 100000 times
+
+Benchmarking __bench_array_map_delete in thread 1
+Average time usage 2126.820310 ns, iter 100000 times
+
+Benchmarking __bench_per_cpu_hash_map_update in thread 1
+Average time usage 35050.915650 ns, iter 100000 times
+
+Benchmarking __bench_per_cpu_hash_map_lookup in thread 1
+Average time usage 15999.969590 ns, iter 100000 times
+
+Benchmarking __bench_per_cpu_hash_map_delete in thread 1
+Average time usage 21664.294940 ns, iter 100000 times
+
+Benchmarking __bench_per_cpu_array_map_update in thread 1
+Average time usage 10886.969860 ns, iter 100000 times
+
+Benchmarking __bench_per_cpu_array_map_lookup in thread 1
+Average time usage 2749.468760 ns, iter 100000 times
+
+Benchmarking __bench_per_cpu_array_map_delete in thread 1
+Average time usage 2778.679460 ns, iter 100000 times
+```
+
+Userspace map op cost:
+
+```txt
+Benchmarking __bench_hash_map_update in thread 1
+Average time usage 30676.986820 ns, iter 100000 times
+
+Benchmarking __bench_hash_map_lookup in thread 1
+Average time usage 23486.304570 ns, iter 100000 times
+
+Benchmarking __bench_hash_map_delete in thread 1
+Average time usage 13435.901160 ns, iter 100000 times
+
+Benchmarking __bench_array_map_update in thread 1
+Average time usage 7081.922160 ns, iter 100000 times
+
+Benchmarking __bench_array_map_lookup in thread 1
+Average time usage 4685.450360 ns, iter 100000 times
+
+Benchmarking __bench_array_map_delete in thread 1
+Average time usage 6367.443010 ns, iter 100000 times
+
+Benchmarking __bench_per_cpu_hash_map_update in thread 1
+Average time usage 95918.602090 ns, iter 100000 times
+
+Benchmarking __bench_per_cpu_hash_map_lookup in thread 1
+Average time usage 63294.791110 ns, iter 100000 times
+
+Benchmarking __bench_per_cpu_hash_map_delete in thread 1
+Average time usage 460207.364100 ns, iter 100000 times
+
+Benchmarking __bench_per_cpu_array_map_update in thread 1
+Average time usage 26109.863360 ns, iter 100000 times
+
+Benchmarking __bench_per_cpu_array_map_lookup in thread 1
+Average time usage 9139.355980 ns, iter 100000 times
+
+Benchmarking __bench_per_cpu_array_map_delete in thread 1
+Average time usage 5203.339320 ns, iter 100000 times
+```
+
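+Benchmark results without inlining the map operation functions (the plain uprobe cost is subtracted from each measurement, as indicated by "op - uprobe"):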
+
+| Map Operation                      | Kernel (op - uprobe) (ns) | Userspace (op - uprobe) (ns) |
+|------------------------------------|--------------------------:|-----------------------------:|
+| __bench_hash_map_update            | 62827.533320              | 30296.051630                 |
+| __bench_hash_map_lookup            | 15895.166920              | 23005.369380                 |
+| __bench_hash_map_delete            | 19884.933980              | 13054.965970                 |
+| __bench_array_map_update           | 9538.564600               | 6701.987970                  |
+| __bench_array_map_lookup           |  183.155140               | 4305.515170                  |
+| __bench_array_map_delete           |  216.088950               | 5987.507820                  |
+| __bench_per_cpu_hash_map_update    | 33140.184290              | 95537.666900                 |
+| __bench_per_cpu_hash_map_lookup    | 14089.238230              | 62913.855920                 |
+| __bench_per_cpu_hash_map_delete    | 19753.563580              | 459826.428910                |
+| __bench_per_cpu_array_map_update   |  8885.238500              | 25728.928170                 |
+| __bench_per_cpu_array_map_lookup   |  1838.737400              | 8759.420790                  |
+| __bench_per_cpu_array_map_delete   |  1867.948100              | 4802.404130                  |
+
+- Some overhead can be reduced by inlining the map op function.
+- We need to fix the performance issue of the per-cpu map in the userspace runtime.
\ No newline at end of file
diff --git a/benchmark/test.c b/benchmark/test.c
index 88bbc98f..73418af0 100644
--- a/benchmark/test.c
+++ b/benchmark/test.c
@@ -4,54 +4,29 @@
 #include <stdint.h>
 #include <pthread.h>
 
-
-__attribute_noinline__ uint64_t __bench_map_lookup(char *a, int b,
-							   uint64_t c)
-{
-	return a[b] + c;
-}
-
-__attribute_noinline__ uint64_t __bench_map_delete(char *a, int b,
-							   uint64_t c)
-{
-	return a[b] + c;
-}
-
-__attribute_noinline__ uint64_t __bench_map_update(char *a, int b,
-							   uint64_t c)
-{
-	return a[b] + c;
-}
-
-
-__attribute_noinline__ uint64_t __bench_read(char *a, int b,
-							   uint64_t c)
-{
-	return a[b] + c;
-}
-
-__attribute_noinline__ uint64_t __bench_write(char *a, int b,
-							   uint64_t c)
-{
-	return a[b] + c;
-}
-__attribute_noinline__ uint64_t __bench_uprobe(char *a, int b,
-							   uint64_t c)
-{
-	return a[b] + c;
+#define BENCH_FUNC(name) \
+__attribute_noinline__ uint64_t name(char *a, int b, uint64_t c) \
+{ \
+    return a[b] + c; \
 }
 
-__attribute_noinline__ uint64_t __bench_uretprobe(char *a, int b,
-							   uint64_t c)
-{
-	return a[b] + c;
-}
-
-__attribute_noinline__ uint64_t __bench_uprobe_uretprobe(char *a, int b,
-							   uint64_t c)
-{
-	return a[b] + c;
-}
+BENCH_FUNC(__bench_array_map_lookup)
+BENCH_FUNC(__bench_array_map_delete)
+BENCH_FUNC(__bench_array_map_update)
+BENCH_FUNC(__bench_hash_map_lookup)
+BENCH_FUNC(__bench_hash_map_delete)
+BENCH_FUNC(__bench_hash_map_update)
+BENCH_FUNC(__bench_per_cpu_hash_map_lookup)
+BENCH_FUNC(__bench_per_cpu_hash_map_delete)
+BENCH_FUNC(__bench_per_cpu_hash_map_update)
+BENCH_FUNC(__bench_per_cpu_array_map_lookup)
+BENCH_FUNC(__bench_per_cpu_array_map_delete)
+BENCH_FUNC(__bench_per_cpu_array_map_update)
+BENCH_FUNC(__bench_read)
+BENCH_FUNC(__bench_write)
+BENCH_FUNC(__bench_uprobe)
+BENCH_FUNC(__bench_uretprobe)
+BENCH_FUNC(__bench_uprobe_uretprobe)
 
 typedef uint64_t (*benchmark_test_function_t)(char *, int, uint64_t);
 
@@ -104,7 +79,7 @@ void do_benchmark_userspace(benchmark_test_function_t func, const char *name,
 
 #define do_benchmark_func(func, iter, id)                                      \
 	do {                                                                   \
-		do_benchmark_userspace(func, #func, iter, id);                        \
+		do_benchmark_userspace(func, #func, iter, id);                 \
 	} while (0)
 
 int iter = 100 * 1000;
@@ -118,9 +93,18 @@ void *run_bench_functions(void *id_ptr)
 	do_benchmark_func(__bench_uprobe, iter, id);
 	do_benchmark_func(__bench_read, iter, id);
 	do_benchmark_func(__bench_write, iter, id);
-	do_benchmark_func(__bench_map_update, iter, id);
-	do_benchmark_func(__bench_map_lookup, iter, id);
-	do_benchmark_func(__bench_map_delete, iter, id);
+	do_benchmark_func(__bench_hash_map_update, iter, id);
+	do_benchmark_func(__bench_hash_map_lookup, iter, id);
+	do_benchmark_func(__bench_hash_map_delete, iter, id);
+	do_benchmark_func(__bench_array_map_update, iter, id);
+	do_benchmark_func(__bench_array_map_lookup, iter, id);
+	do_benchmark_func(__bench_array_map_delete, iter, id);
+	do_benchmark_func(__bench_per_cpu_hash_map_update, iter, id);
+	do_benchmark_func(__bench_per_cpu_hash_map_lookup, iter, id);
+	do_benchmark_func(__bench_per_cpu_hash_map_delete, iter, id);
+	do_benchmark_func(__bench_per_cpu_array_map_update, iter, id);
+	do_benchmark_func(__bench_per_cpu_array_map_lookup, iter, id);
+	do_benchmark_func(__bench_per_cpu_array_map_delete, iter, id);
 	return NULL;
 }
 
diff --git a/benchmark/tools/.gitignore b/benchmark/tools/.gitignore
deleted file mode 100644
index b66ea85e..00000000
--- a/benchmark/tools/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-readlink
diff --git a/benchmark/tools/Makefile b/benchmark/tools/Makefile
deleted file mode 100644
index 3c81d575..00000000
--- a/benchmark/tools/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-readlink: readlink.cpp
-	g++ readlink.cpp -o readlink
diff --git a/benchmark/tools/driving.py b/benchmark/tools/driving.py
deleted file mode 100644
index fe75c3c1..00000000
--- a/benchmark/tools/driving.py
+++ /dev/null
@@ -1,108 +0,0 @@
-import re
-import json
-import numpy as np
-import subprocess
-import os
-import signal
-import time
-
-def run_command(cmd):
-    """Run a command in the background and return its process."""
-    process = subprocess.Popen(cmd, shell=True)
-    return process
-
-
-def kill_process(process):
-    """Kill a given process."""
-    os.kill(process.pid, signal.SIGKILL)
-    print("Process killed")
-       # Give the process some time to terminate.
-    time.sleep(1)
-
-    # Check if the process has really terminated. If it has, poll() should return the exit code.
-    if process.poll() is None:
-        print(f"Process {process.pid} was not killed, forcing kill.")
-        os.kill(process.pid, signal.SIGKILL)
-    else:
-        print(f"Process {process.pid} was successfully killed.")
-
-
-# Function to run the command and extract the average write time
-def run_command_and_extract_time(name: str, library: str):
-    print("run_command_and_extract_time")
-    try:
-        result = subprocess.check_output(
-            [
-                "sudo",
-                library,
-                name,
-            ],
-            universal_newlines=True,
-        )
-        match = re.search(r"Average time usage (\d+\.\d+)ns,", result)
-        print(float(match.group(1)))
-        if match:
-            return float(match.group(1))
-        else:
-            print("Warning: No match found in the output")
-            return None
-    except Exception as e:
-        print(f"Error during command execution: {e}")
-        return None
-
-
-def save_micro_benchmark_data(name: str, library: str, output_file: str):
-    # Run the command 100 times and collect the average write times
-    times = [run_command_and_extract_time(name, library) for _ in range(20)]
-    times = [time for time in times if time is not None]  # Filter out None values
-
-    # Compute metrics
-    mean_time = np.mean(times)
-    median_time = np.median(times)
-    min_time = np.min(times)
-    max_time = np.max(times)
-    std_dev_time = np.std(times)
-
-    # Prepare the data for the JSON file
-    data = {
-        "raw_times": times,
-        "mean": mean_time,
-        "median": median_time,
-        "min": min_time,
-        "max": max_time,
-        "std_dev": std_dev_time,
-    }
-
-    # Save the data to a JSON file
-    with open(output_file, "w") as f:
-        json.dump(data, f, indent=4)
-
-def run_kernel_syscall_tracepoint_test():
-    server = run_command("sudo benchmark/syscall/syscall")
-    save_micro_benchmark_data(
-        "benchmark/syscall/victim", "A=B", "benchmark/micro-bench/kernel-syscall.json"
-    )
-    kill_process(server)
-    run_command("pkill syscall/syscall")
-
-def run_userspace_syscall_tracepoint_test():
-    server = run_command(
-        "sudo LD_PRELOAD=build/runtime/syscall-server/libbpftime-syscall-server.so  benchmark/syscall/syscall"
-    )
-    save_micro_benchmark_data(
-        "benchmark/syscall/victim",
-        "LD_PRELOAD=build/runtime/agent/libbpftime-agent.so",
-        "benchmark/micro-bench/userspace-syscall.json",
-    )
-    kill_process(server)
-    run_command("pkill syscall/syscall")
-
-def run_syscall_baseline_test():
-    save_micro_benchmark_data(
-        "benchmark/syscall/victim",
-        "LD_PRELOAD=build/runtime/agent/libbpftime-agent.so",
-        "benchmark/micro-bench/baseline-syscall.json",
-    )
-
-
-run_syscall_baseline_test()
\ No newline at end of file
diff --git a/benchmark/tools/fig.py b/benchmark/tools/fig.py
deleted file mode 100644
index 0a33c99b..00000000
--- a/benchmark/tools/fig.py
+++ /dev/null
@@ -1,29 +0,0 @@
-import matplotlib.pyplot as plt
-
-# Categories
-categories = ["Syscall Tracepoint", "Uprobe", "Uretprobe"]
-
-# Time values for Kernel and Userspace
-kernel_times = [1499.47708, 4751.462610, 5899.706820]
-userspace_times = [1489.04251, 445.169770, 472.972220]
-
-bar_width = 0.35
-index = range(len(categories))
-
-plt.figure(figsize=(12, 7))
-
-# Plot bars for Kernel and Userspace
-bar1 = plt.bar(index, kernel_times, bar_width, color='b', label='Kernel')
-bar2 = plt.bar([i + bar_width for i in index], userspace_times, bar_width, color='r', label='Userspace')
-
-# Labeling the figure
-plt.xlabel('Probe/Tracepoint Types')
-plt.ylabel('Avg Time (ns)')
-plt.title('Comparison of Kernel vs. Userspace for Different Probe/Tracepoint Types')
-plt.xticks([i + bar_width/2 for i in index], categories)
-plt.legend()
-
-plt.tight_layout()
-plt.grid(True, which="both", ls="--", c="0.65")
-plt.savefig("trace_overhead.png")
-plt.show()
diff --git a/benchmark/tools/readlink.cpp b/benchmark/tools/readlink.cpp
deleted file mode 100644
index 62bd4bcc..00000000
--- a/benchmark/tools/readlink.cpp
+++ /dev/null
@@ -1,15 +0,0 @@
-#include <iostream>
-#include <unistd.h>
-#include <limits.h>
-
-int main() {
-    char execPath[PATH_MAX];
-    ssize_t len = readlink("/proc/self/exe", execPath, sizeof(execPath) - 1);
-    if (len != -1) {
-        execPath[len] = '\0';  // Null-terminate the string
-        std::cout << "Executable Path: " << execPath << std::endl;
-    } else {
-        std::cerr << "Error retrieving executable path" << std::endl;
-    }
-    return 0;
-}
\ No newline at end of file
diff --git a/benchmark/uprobe/uprobe.bpf.c b/benchmark/uprobe/uprobe.bpf.c
index d0c8608c..a91e0ef9 100644
--- a/benchmark/uprobe/uprobe.bpf.c
+++ b/benchmark/uprobe/uprobe.bpf.c
@@ -3,46 +3,56 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 
-
-struct {
-	__uint(type, BPF_MAP_TYPE_HASH);
-	__uint(max_entries, 1024);
-	__type(key, u32);
-	__type(value, u64);
-} libc_malloc_calls_total SEC(".maps");
-
-SEC("uprobe/benchmark/test:__bench_map_update")
-int test_update(struct pt_regs *ctx)
-{
-	for (int i = 0; i < 1000; i++) {
-		u32 key = i;
-		u64 value = i;
-		bpf_map_update_elem(&libc_malloc_calls_total, &key, &value, BPF_ANY);
-	}
-	return 0;
+#define DEFINE_MAP_OPERATIONS(map_name, map_type) \
+struct { \
+    __uint(type, map_type); \
+    __uint(max_entries, 1024); \
+    __type(key, u32); \
+    __type(value, u64); \
+} map_name SEC(".maps"); \
+\
+SEC("uprobe/benchmark/test:__bench_" #map_name "_update") \
+int map_name##_update(struct pt_regs *ctx) \
+{ \
+    for (int i = 0; i < 1000; i++) { \
+        u32 key = i; \
+        u64 value = i; \
+        bpf_map_update_elem(&map_name, &key, &value, BPF_ANY); \
+    } \
+    return 0; \
+} \
+\
+SEC("uprobe/benchmark/test:__bench_" #map_name "_delete") \
+int map_name##_delete(struct pt_regs *ctx) \
+{ \
+    for (int i = 0; i < 1000; i++) { \
+        u32 key = i; \
+        bpf_map_delete_elem(&map_name, &key); \
+    } \
+    return 0; \
+} \
+\
+SEC("uprobe/benchmark/test:__bench_" #map_name "_lookup") \
+int map_name##_lookup(struct pt_regs *ctx) \
+{ \
+    for (int i = 0; i < 1000; i++) { \
+        u32 key = i; \
+        bpf_map_lookup_elem(&map_name, &key); \
+    } \
+    return 0; \
 }
 
-SEC("uprobe/benchmark/test:__bench_map_delete")
-int test_delete(struct pt_regs *ctx)
-{
-	for (int i = 0; i < 1000; i++) {
-		u32 key = i;
-		bpf_map_delete_elem(&libc_malloc_calls_total, &key);
-	}
-	return 0;
-}
+// Define operations for an array map
+DEFINE_MAP_OPERATIONS(array_map, BPF_MAP_TYPE_ARRAY)
 
-SEC("uprobe/benchmark/test:__bench_map_lookup")
-int test_lookup(struct pt_regs *ctx)
-{
-	for (int i = 0; i < 1000; i++) {
-		u32 key = i;
-		u64 value = i;
-		bpf_map_lookup_elem(&libc_malloc_calls_total, &key);
-	}
-	return 0;
-}
+// Define operations for a hash map
+DEFINE_MAP_OPERATIONS(hash_map, BPF_MAP_TYPE_HASH)
+
+// Define operations for a per-cpu hash map
+DEFINE_MAP_OPERATIONS(per_cpu_hash_map, BPF_MAP_TYPE_PERCPU_HASH)
 
+// Define operations for a per-cpu array map
+DEFINE_MAP_OPERATIONS(per_cpu_array_map, BPF_MAP_TYPE_PERCPU_ARRAY)
 
 SEC("uprobe/benchmark/test:__bench_write")
 int BPF_UPROBE(__bench_write, char *a, int b, uint64_t c)
diff --git a/benchmark/uprobe/uprobe.c b/benchmark/uprobe/uprobe.c
index 9eff05c3..9850abc1 100644
--- a/benchmark/uprobe/uprobe.c
+++ b/benchmark/uprobe/uprobe.c
@@ -57,9 +57,6 @@ int main(int argc, char **argv)
 	}
 
 	printf("Successfully started! Press Ctrl+C to stop.\n");
-	printf("__bench_uprobe_uretprobe is for both uprobe and uretprobe\n");
-	printf("__bench_uretprobe is for uretprobe only\n");
-	printf("__bench_probe is for uprobe only\n");
 	fflush(stdout);
 	while (!exiting) {
 		sleep(1);
diff --git a/runtime/src/bpf_map/userspace/per_cpu_array_map.cpp b/runtime/src/bpf_map/userspace/per_cpu_array_map.cpp
index 7b0f81e0..b4114f9a 100644
--- a/runtime/src/bpf_map/userspace/per_cpu_array_map.cpp
+++ b/runtime/src/bpf_map/userspace/per_cpu_array_map.cpp
@@ -69,7 +69,7 @@ long per_cpu_array_map_impl::elem_update(const void *key, const void *value,
 long per_cpu_array_map_impl::elem_delete(const void *key)
 {
 	errno = ENOTSUP;
-	SPDLOG_ERROR("Deleting of per cpu array is not supported");
+	SPDLOG_DEBUG("Deleting of per cpu array is not supported");
 	return -1;
 }