diff --git a/.github/workflows/kvm_watcher.yml b/.github/workflows/kvm_watcher.yml index e2e4b84fc..c21216604 100644 --- a/.github/workflows/kvm_watcher.yml +++ b/.github/workflows/kvm_watcher.yml @@ -44,6 +44,6 @@ jobs: sudo ./kvm_watcher -e -t 10 -s sudo ./kvm_watcher -n -t 10 sudo ./kvm_watcher -d -t 10 - sudo ./kvm_watcher -f -t 10 + sudo ./kvm_watcher -f -m -t 10 make clean diff --git a/eBPF_Supermarket/kvm_watcher/README.md b/eBPF_Supermarket/kvm_watcher/README.md new file mode 100755 index 000000000..5991795be --- /dev/null +++ b/eBPF_Supermarket/kvm_watcher/README.md @@ -0,0 +1,142 @@ +# kvm_watcher项目 + +## 一、项目简介 + +`kvm_watcher` 是一个基于 eBPF 技术的项目,旨在在宿主机侧监控和提取 KVM 虚拟机的性能指标,同时对宿主机性能影响较小。该项目基于 eBPF 的实时监控方案,通过在宿主机中执行eBPF程序,实时捕获有关 KVM 虚拟机的关键性能数据和性能事件,提供全面的性能数据,帮助管理员优化虚拟化环境,改善虚拟机的运行效率和响应性,并且允许用户根据实际需求选择监控的指标和事件,实现个性化配置。 + +## 二、功能介绍 + +`kvm_watcher`是一款基于eBPF的kvm检测工具,其旨在使用户方便快捷在宿主机侧获取kvm虚拟机中的各种信息。 + +目前,其实现的功能主要包括: + +- **VM Exit 事件分析:** + - 捕获 VM Exit 事件,包括发生的时间戳、原因、次数以及处理时延等信息。 +- **KVM mmu事件分析:** + - 监控 KVM 中的 mmu page fault 和mmio page fault 事件,记录gva、hva、pfn、错误类型和处理时延等关键信息。 + - 实时监控kvm虚拟机中产生的dirty page,记录脏页地址、变脏时间、变脏次数和memslot等相关信息。 +- **vCPU相关指标分析:** + - 记录有关vCPU的性能指标,包括唤醒时的时间戳,halt持续时间,vCPU id等相关信息。 + - 实时监控vCPU的halt-polling时间的变化信息,包括vCPU的线程tid,变化类型,变化前后的halt-polling时间等信息。 + +## 三、使用方法 + +> 测试环境: +> +> Kernel: Linux6.2 +> +> OS: Ubuntu 23.04 + +**安装依赖:** + +``` +sudo apt install clang libelf1 libelf-dev zlib1g-dev libbpf-dev linux-tools-$(uname -r) linux-cloud-tools-$(uname -r) +sudo modprobe kvm && sudo modprobe kvm-intel //加载kvm模块 +``` + +**编译运行:** + +``` +make +sudo ./kvm_watcher [options] +make clean +``` + +**参数介绍:** + +`kvm_watcher`通过一系列命令参数来控制其具体行为: +``` +Usage: kvm_watcher [OPTION...] +BPF program used for monitoring KVM event + + -d, --mark_page_dirty Monitor virtual machine dirty page information. + -e, --vm_exit Monitoring the event of vm exit. + -f, --kvmmmu_page_fault Monitoring the data of kvmmmu page fault. 
+ -m, --mmio Monitoring the data of mmio page fault.(The -f option must be specified.) + -n, --halt_poll_ns Monitoring the variation in vCPU halt-polling time. + -p, --vm_pid=PID Specify the virtual machine pid to monitor. + -s, --stat Display statistical data.(The -e option must be specified.) + -t, --monitoring_time=SEC Time for monitoring. + -w, --vcpu_wakeup Monitoring the wakeup of vcpu. + -?, --help Give this help list + --usage Give a short usage message + -V, --version Print program version +``` + +`-e`:记录vm exit事件信息 + +`-s`:输出最后的vm exit事件统计信息(需要和`-e`一同使用) + +`-f`:记录kvmmmu缺页信息 + +`-m`:记录mmio缺页信息(需要和`-f`一同使用) + +`-d`:记录kvm脏页信息 + +`-n`:记录vcpu的halt-polling相关信息 + +`-w`:记录vcpu唤醒时的相关信息 + +`-p`:指定kvm虚拟机进程pid(必须为虚拟机进程,否则会报错) + +`-t`:监控时间 + +## 四、代码结构 + +``` +├── include +│ ├── kvm_exits.h //vm exit事件相关的内核bpf程序 +│ ├── kvm_mmu.h //kvmmmu相关的内核bpf程序 +│ ├── kvm_vcpu.h //vcpu相关内核bpf程序 +│ └── kvm_watcher.h //项目公用头文件 +├── Makefile //编译脚本 +├── src +│ ├── kvm_watcher.bpf.c //内核态bpf入口程序 +│ └── kvm_watcher.c //用户态bpf程序 +└── temp + └── dirty_temp //脏页临时文件 +``` + +## 五、测试 + +可以按照如下流程测试程序输出: + +- **安装依赖** + + ``` + sudo apt install clang libelf1 libelf-dev zlib1g-dev libbpf-dev linux-tools-$(uname -r) linux-cloud-tools-$(uname -r) + ``` + +- **加载KVM模块** + + ``` + sudo modprobe kvm && sudo modprobe kvm-intel + ``` + +- **下载CirrOS镜像** + + > CirrOS 是一个专门设计用于在云环境中运行的轻量级 Linux 发行版,特别适用于测试和虚拟机环境,[cirros官网](https://download.cirros-cloud.net/)。 + + ``` + wget http://download.cirros-cloud.net/0.5.1/cirros-0.5.1-x86_64-disk.img //Download Cirros image + ``` + +- **使用QEMU启动虚拟机** + + ``` + sudo qemu-system-x86_64 -enable-kvm -cpu host -m 2048 -drive file=cirros-0.5.1-x86_64-disk.img,format=qcow2 -boot c -nographic + ``` + +- **编译&&运行程序** + + ``` + make + sudo ./kvm_watcher -w -t 10 + sudo ./kvm_watcher -e -t 10 -s + sudo ./kvm_watcher -n -t 10 + sudo ./kvm_watcher -d -t 10 + make clean + ``` + + + diff --git a/eBPF_Supermarket/kvm_watcher/include/kvm_mmu.h 
b/eBPF_Supermarket/kvm_watcher/include/kvm_mmu.h
index cc3292eb3..5a7edc5d2 100644
--- a/eBPF_Supermarket/kvm_watcher/include/kvm_mmu.h
+++ b/eBPF_Supermarket/kvm_watcher/include/kvm_mmu.h
@@ -90,4 +90,75 @@ static int trace_direct_page_fault(struct kvm_vcpu *vcpu,
     }
     return 0;
 }
+
+/*
+ * Entry hook for kvm_mmu_page_fault. When the error code has PFERR_RSVD_MASK
+ * set (the MMIO case), the current time is stored in pf_delay under the
+ * faulting address so that trace_handle_mmio_page_fault() can later compute
+ * how long the fault took to handle.
+ *
+ * @vcpu:       faulting vCPU (not used here)
+ * @cr2_or_gpa: faulting address; becomes the pf_delay key
+ * @error_code: page-fault error code
+ * @vm_pid:     passed to CHECK_PID (defined elsewhere; presumably limits
+ *              tracing to the monitored VM process)
+ *
+ * Always returns 0.
+ */
+static int trace_kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+                                    u64 error_code, pid_t vm_pid) {
+    CHECK_PID(vm_pid) {
+        if (error_code & PFERR_RSVD_MASK) {
+            u64 ts = bpf_ktime_get_ns();
+            u64 addr = cr2_or_gpa;
+            bpf_map_update_elem(&pf_delay, &addr, &ts, BPF_ANY);
+        }
+    }
+    return 0;
+}
+
+/*
+ * Exit hook for handle_mmio_page_fault. If trace_kvm_mmu_page_fault() stored
+ * a start time for @addr, publish a page_fault_event on ring buffer @rb with
+ * the address, the elapsed time, a per-address hit count (kept in pf_count)
+ * and the current task's pid and comm.
+ *
+ * @vcpu and @direct are not used. Returns 0 on every path visible here;
+ * RESERVE_RINGBUF_ENTRY is defined elsewhere and may bail out on its own.
+ */
+static int trace_handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
+                                        bool direct, void *rb) {
+    u64 *ts;
+    ts = bpf_map_lookup_elem(&pf_delay, &addr);
+    if (ts) {
+        u32 *count;
+        u32 new_count = 1;
+        /*
+         * Snapshot the timestamp before deleting the entry: ts points into
+         * the pf_delay element, which may be recycled once it is deleted.
+         */
+        u64 start = *ts;
+        u64 delay = bpf_ktime_get_ns() - start;
+        bpf_map_delete_elem(&pf_delay, &addr);
+        struct page_fault_event *e;
+        RESERVE_RINGBUF_ENTRY(rb, e);
+        count = bpf_map_lookup_elem(&pf_count, &addr);
+        if (count) {
+            (*count)++;
+            e->count = *count;
+            bpf_map_update_elem(&pf_count, &addr, count, BPF_ANY);
+        } else {
+            e->count = 1;
+            bpf_map_update_elem(&pf_count, &addr, &new_count, BPF_ANY);
+        }
+        e->delay = delay;
+        e->addr = addr;
+        e->error_code = PFERR_RSVD_MASK;
+        e->process.pid = bpf_get_current_pid_tgid() >> 32;
+        bpf_get_current_comm(&e->process.comm, sizeof(e->process.comm));
+        e->time = start;
+        bpf_ringbuf_submit(e, 0);
+    }
+    return 0;
+}
 #endif /* __KVM_MMU_H */
\ No newline at end of file
diff --git a/eBPF_Supermarket/kvm_watcher/include/kvm_vcpu.h b/eBPF_Supermarket/kvm_watcher/include/kvm_vcpu.h
index 8a7167f7c..6216e8481 100644
--- a/eBPF_Supermarket/kvm_watcher/include/kvm_vcpu.h
+++ b/eBPF_Supermarket/kvm_watcher/include/kvm_vcpu.h
@@ -93,7 +93,9 @@ static int trace_mark_page_dirty_in_slot(struct kvm *kvm,
     struct kvm_memory_slot *slot;
     bpf_probe_read_kernel(&slot, sizeof(memslot), &memslot);
     bpf_probe_read_kernel(&flags, sizeof(memslot->flags), &memslot->flags);
-    if (slot &&
(flags & KVM_MEM_LOG_DIRTY_PAGES)) {  // 检查memslot是否启用了脏页追踪
+    if (slot &&
+        (flags &
+         KVM_MEM_LOG_DIRTY_PAGES)) {  // is dirty-page tracking enabled for this memslot?
         gfn_t gfnum = gfn;
         u32 *count = bpf_map_lookup_elem(&count_dirty_map, &gfnum);
         if (count) {
diff --git a/eBPF_Supermarket/kvm_watcher/include/kvm_watcher.h b/eBPF_Supermarket/kvm_watcher/include/kvm_watcher.h
index 2fc23538b..4c7c32694 100644
--- a/eBPF_Supermarket/kvm_watcher/include/kvm_watcher.h
+++ b/eBPF_Supermarket/kvm_watcher/include/kvm_watcher.h
@@ -30,6 +30,8 @@
 #define PFERR_PK_BIT 5
 #define PFERR_SGX_BIT 15
 
+#define PFERR_RSVD_MASK (1UL << 3)  // reserved-bit fault (bit 3); used to identify MMIO page faults
+
 #define PRINT_USAGE_ERR()                                                \
     do {                                                                 \
         fprintf(stderr, "Use either the -w, -p, -d,-f or -e option.\n"); \
diff --git a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.bpf.c b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.bpf.c
index de5748498..be5294823 100644
--- a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.bpf.c
+++ b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.bpf.c
@@ -36,44 +36,49 @@ struct {
 
 SEC("tp/kvm/kvm_vcpu_wakeup")
 int tp_vcpu_wakeup(struct vcpu_wakeup *ctx) {
-    trace_kvm_vcpu_wakeup(ctx, &rb, vm_pid);
-    return 0;
+    return trace_kvm_vcpu_wakeup(ctx, &rb, vm_pid);  // propagate the helper's result instead of a fixed 0
 }
 
 SEC("tp/kvm/kvm_halt_poll_ns")
 int tp_kvm_halt_poll_ns(struct halt_poll_ns *ctx) {
-    trace_kvm_halt_poll_ns(ctx, &rb, vm_pid);
-    return 0;
+    return trace_kvm_halt_poll_ns(ctx, &rb, vm_pid);
 }
 
 SEC("tp/kvm/kvm_exit")
 int tp_exit(struct exit *ctx) {
-    trace_kvm_exit(ctx, vm_pid);
-    return 0;
+    return trace_kvm_exit(ctx, vm_pid);
 }
 
 SEC("tp/kvm/kvm_entry")
 int tp_entry(struct exit *ctx) {
-    trace_kvm_entry(&rb);
-    return 0;
+    return trace_kvm_entry(&rb);
 }
 
 SEC("kprobe/mark_page_dirty_in_slot")
 int BPF_KPROBE(kp_mark_page_dirty_in_slot, struct kvm *kvm,
                const struct kvm_memory_slot *memslot, gfn_t gfn) {
-    trace_mark_page_dirty_in_slot(kvm, memslot, gfn, &rb, vm_pid);
-    return 0;
+    return trace_mark_page_dirty_in_slot(kvm, memslot, gfn, &rb, vm_pid);
 }
 
 SEC("tp/kvm/kvm_page_fault")
 int tp_page_fault(struct trace_event_raw_kvm_page_fault *ctx) {
-    trace_page_fault(ctx, vm_pid);
-    return 0;
+    return trace_page_fault(ctx, vm_pid);
 }
 
 SEC("fexit/direct_page_fault")
 int BPF_PROG(fexit_direct_page_fault, struct kvm_vcpu *vcpu,
              struct kvm_page_fault *fault) {
-    trace_direct_page_fault(vcpu, fault, &rb);
-    return 0;
+    return trace_direct_page_fault(vcpu, fault, &rb);
 }
+
+SEC("fentry/kvm_mmu_page_fault")  // entry hook: hands the fault address, error code and vm_pid to trace_kvm_mmu_page_fault()
+int BPF_PROG(fentry_kvm_mmu_page_fault, struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+             u64 error_code) {
+    return trace_kvm_mmu_page_fault(vcpu, cr2_or_gpa, error_code, vm_pid);
+}
+
+SEC("fexit/handle_mmio_page_fault")  // exit hook: hands addr/direct and the rb ring buffer to trace_handle_mmio_page_fault()
+int BPF_PROG(fexit_handle_mmio_page_fault, struct kvm_vcpu *vcpu, u64 addr,
+             bool direct) {
+    return trace_handle_mmio_page_fault(vcpu, addr, direct, &rb);
+}
\ No newline at end of file
diff --git a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c
index c96a51c53..af118a208 100644
--- a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c
+++ b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c
@@ -296,6 +296,7 @@ static struct env {
     bool execute_halt_poll_ns;
     bool execute_mark_page_dirty;
     bool execute_page_fault;
+    bool mmio_page_fault;  // set by -m; parse_arg only accepts -m once -f has been seen
     int monitoring_time;
     pid_t vm_pid;
 } env = {
@@ -305,6 +306,7 @@ static struct env {
     .execute_halt_poll_ns = false,
     .execute_mark_page_dirty = false,
     .execute_page_fault = false,
+    .mmio_page_fault = false,
     .monitoring_time = 0,
     .vm_pid = -1,
 };
@@ -322,9 +324,12 @@ static const struct argp_option opts[] = {
     {"mark_page_dirty", 'd', NULL, 0,
      "Monitor virtual machine dirty page information."},
     {"kvmmmu_page_fault", 'f', NULL, 0,
-     "Monitoring the date of kvmmmu page fault."},
+     "Monitoring the data of kvmmmu page fault."},
     {"stat", 's', NULL, 0,
      "Display statistical data.(The -e option must be specified.)"},
+    {"mmio", 'm', NULL, 0,
+     "Monitoring the data of mmio page fault..(The -f option must be "
+     "specified.)"},
     {"vm_pid", 'p', "PID", 0, "Specify the virtual machine pid to monitor."},
{"monitoring_time", 't', "SEC", 0, "Time for monitoring event."}, {}, @@ -358,6 +363,14 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) { argp_usage(state); } break; + case 'm': + if (env.execute_page_fault) { + env.mmio_page_fault = true; + } else { + fprintf(stderr, "The -f option must be specified.\n"); + argp_usage(state); + } + break; case 't': env.monitoring_time = strtol(arg, NULL, 10); if (env.monitoring_time <= 0) { @@ -434,30 +447,34 @@ static int handle_event(void *ctx, void *data, size_t data_sz) { e->rel_gfn, e->npages, e->userspace_addr, e->slot_id); } else if (env.execute_page_fault) { const struct page_fault_event *e = data; - printf( - "%-18llu %-15s %-10u %-12llx %-6u %-10llu %-20llx %-17llx %-10d ", - e->time, e->process.comm, e->process.pid, e->addr, e->count, - e->delay, e->hva, e->pfn, e->memslot_id); + printf("%-18llu %-15s %-10u %-12llx %-6u %-10llu ", e->time, + e->process.comm, e->process.pid, e->addr, e->count, e->delay); + if (e->error_code & (1ULL << PFERR_RSVD_BIT)) { + printf("%-20s %-17s %-10s", "-", "-", "-"); + } else { + printf("%-20llx %-17llx %-10d", e->hva, e->pfn, e->memslot_id); + } if (e->error_code & (1ULL << PFERR_PRESENT_BIT)) { - printf("Present "); + printf(" Present"); } if (e->error_code & (1ULL << PFERR_WRITE_BIT)) { - printf("Write "); + printf(" Write"); } if (e->error_code & (1ULL << PFERR_USER_BIT)) { - printf("User "); + printf(" User"); } if (e->error_code & (1ULL << PFERR_RSVD_BIT)) { - printf("Reserved "); + printf(" Reserved(MMIO)"); + /*IOAPIC 的mmio基址 #define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000*/ } if (e->error_code & (1ULL << PFERR_FETCH_BIT)) { - printf("Exec "); + printf(" Exec"); } if (e->error_code & (1ULL << PFERR_PK_BIT)) { - printf("Protection-Key "); + printf(" Protection-Key"); } if (e->error_code & (1ULL << PFERR_SGX_BIT)) { - printf("SGX "); + printf(" SGX"); } printf("\n"); } @@ -506,6 +523,10 @@ int main(int argc, char **argv) { env.execute_page_fault ? 
true : false); bpf_program__set_autoload(skel->progs.fexit_direct_page_fault, env.execute_page_fault ? true : false); + bpf_program__set_autoload(skel->progs.fentry_kvm_mmu_page_fault, + env.mmio_page_fault ? true : false); + bpf_program__set_autoload(skel->progs.fexit_handle_mmio_page_fault, + env.mmio_page_fault ? true : false); /* Load & verify BPF programs */ err = kvm_watcher_bpf__load(skel); if (err) { @@ -535,14 +556,14 @@ int main(int argc, char **argv) { printf("%-18s %-21s %-18s %-15s %-8s %-13s \n", "TIME", "EXIT_REASON", "COMM", "PID/TID", "COUNT", "DURATION(ns)"); } else if (env.execute_halt_poll_ns) { - printf("%-18s %-15s %-15s %-10s %-7s %-11s %-10s\n", "TIME(ns)", - "COMM", "PID/TID", "TYPE", "VCPU_ID", "OLD(ns)", "NEW(ns)"); + printf("%-18s %-15s %-15s %-10s %-7s %-11s %-10s\n", "TIME(ns)", "COMM", + "PID/TID", "TYPE", "VCPU_ID", "OLD(ns)", "NEW(ns)"); } else if (env.execute_mark_page_dirty) { printf("%-18s %-15s %-15s %-10s %-11s %-10s %-10s %-10s\n", "TIME(ns)", - "COMM", "PID/TID", "GFN", "REL_GFN", "NPAGES", - "USERSPACE_ADDR", "SLOT_ID"); + "COMM", "PID/TID", "GFN", "REL_GFN", "NPAGES", "USERSPACE_ADDR", + "SLOT_ID"); } else if (env.execute_page_fault) { - printf("%-18s %-15s %-10s %-12s %-6s %-10s %-20s %-17s %-10s %-10s\n", + printf("%-18s %-15s %-10s %-12s %-6s %-10s %-20s %-17s %-10s %s\n", "TIMESTAMP", "COMM", "PID", "ADDRESS", "COUNT", "DELAY", "HVA", "PFN", "MEM_SLOTID", "ERROR_TYPE"); }