Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

kvm_watcher:优化代码,更新文档 #775

Merged
merged 14 commits into from
May 6, 2024
10 changes: 6 additions & 4 deletions .github/workflows/ebpf_kvm_watcher.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,12 @@ jobs:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3

- name: Test program execution
- name: Install dependencies
run: |
cd eBPF_Supermarket/kvm_watcher/
make deps
make

- name: Test program execution
continue-on-error: true
run: |
cd eBPF_Supermarket/kvm_watcher/
make
3 changes: 2 additions & 1 deletion eBPF_Supermarket/kvm_watcher/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ default: bpf
deps:
sudo apt-get update
sudo apt-get install -y clang libelf1 libelf-dev zlib1g-dev libbpf-dev \
linux-tools-$$(uname -r) linux-cloud-tools-$$(uname -r)
linux-tools-$$(uname -r) linux-cloud-tools-$$(uname -r) \
libpcap-dev gcc-multilib build-essential
sudo apt-get install -y lolcat qemu-kvm wget
# 生成 vmlinux.h
.PHONY: vmlinux
Expand Down
2 changes: 2 additions & 0 deletions eBPF_Supermarket/kvm_watcher/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

`kvm_watcher`是一款基于eBPF的kvm虚拟机检测工具,其旨在使用户方便快捷地在宿主机侧获取kvm虚拟机中的各种信息,报告所有正在运行的guest行为。

![kvm watcher项目框图](https://gitee.com/nan-shuaibo/image/raw/master/202404251704350.png)

目前,其实现的功能主要包括:

- **[VM Exit 事件分析](./docs/kvm_exit.md)**
Expand Down
5 changes: 3 additions & 2 deletions eBPF_Supermarket/kvm_watcher/docs/kvm_exit.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

考虑到频繁的虚拟机退出事件可能会导致性能问题,kvm_watcher中的kvm_exit子功能通过显示详细的退出原因和在一台主机上运行的所有vm的每个虚拟机的vcpu上的退出计数及处理时延,可以捕获和分析vm exit事件,该工具旨在定位频繁退出的原因(如EPT_VIOLATION、EPT_MISCONFIG、PML_FULL等),在vm exit基础上,如果kvm这个时候因为某些原因,需要退出到用户态的hypervisor(比如qemu),kvm就要设置KVM_EXIT_XXX,此工具包含了这两部分exit reason。

![kvm exit](https://gitee.com/nan-shuaibo/image/raw/master/202404251707665.png)

## 原理介绍

### VMX 操作模式
Expand Down Expand Up @@ -102,5 +104,4 @@ pid tid total_time max_time min_time counts re
- **VM Exit 原因统计**:记录并展示触发 VM Exit 的具体原因,帮助用户理解 VM Exit 发生的上下文和背景。
- **VM Exit 延时分析**:统计每次 VM Exit 处理的最大、最小和总共延时,为性能分析提供量化数据。
- **VM Exit 次数计数**:计算每种类型的 VM Exit 发生的次数,帮助识别最频繁的性能瓶颈。
- **PID、TID号**:其中PID为主机侧的虚拟机进程号,TID为虚拟机内部的vcpu**的进程号**

- **PID、TID号**:其中PID为主机侧的虚拟机进程号,TID为虚拟机内部的vcpu**的进程号**
106 changes: 54 additions & 52 deletions eBPF_Supermarket/kvm_watcher/docs/kvm_irq.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

kvm watcher中的kvm irq子功能模块提供对kvm中虚拟化中断事件的实时监控和分析能力,可以捕获和记录各种中断事件,支持监控传统的PIC中断、高级的IOAPIC中断以及基于消息的MSI中断,覆盖了KVM虚拟化环境中的主要中断类型。对于每个捕获的中断事件,记录详细信息,包括中断类型、中断注入延时、引脚号、触发方式、目标LAPIC的ID、向量号以及是否被屏蔽等关键数据。

![kvm_irq](https://gitee.com/nan-shuaibo/image/raw/master/202404251710847.png)

## 原理介绍

x86平台主要使用的中断类型有pic、apic及msi中断,在多核系统下的apic结构图如下所示,每个cpu有一个lapic,外部中断通过ioapic转发到lapic,如果是msi中断,则绕过了io apic直接发给lapic。
Expand All @@ -14,48 +16,48 @@ KVM_CREATE_IRQCHIP的ioctl用于在虚拟机初始化阶段创建中断请求芯

```
int kvm_set_routing_entry(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
{
switch (ue->type) {
case KVM_IRQ_ROUTING_IRQCHIP: //中断路由芯片
if (irqchip_split(kvm))
return -EINVAL;
e->irqchip.pin = ue->u.irqchip.pin;//设置中断芯片引脚
switch (ue->u.irqchip.irqchip) {
case KVM_IRQCHIP_PIC_SLAVE:
e->irqchip.pin += PIC_NUM_PINS / 2; //从片引脚
fallthrough;
case KVM_IRQCHIP_PIC_MASTER:
if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2)
return -EINVAL;
// 设置处理 PIC 中断的回调函数
e->set = kvm_set_pic_irq;
break;
case KVM_IRQCHIP_IOAPIC:
if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS)
return -EINVAL;
// 设置处理 IOPIC 中断的回调函数
e->set = kvm_set_ioapic_irq;
break;
default:
return -EINVAL;
}
e->irqchip.irqchip = ue->u.irqchip.irqchip;
break;
case KVM_IRQ_ROUTING_MSI:
// 设置处理 MSI 中断的回调函数
e->set = kvm_set_msi;
e->msi.address_lo = ue->u.msi.address_lo;
e->msi.address_hi = ue->u.msi.address_hi;
e->msi.data = ue->u.msi.data;

if (kvm_msi_route_invalid(kvm, e))
return -EINVAL;
break;
switch (ue->type) {
case KVM_IRQ_ROUTING_IRQCHIP: //中断路由芯片
if (irqchip_split(kvm))
return -EINVAL;
e->irqchip.pin = ue->u.irqchip.pin;//设置中断芯片引脚
switch (ue->u.irqchip.irqchip) {
case KVM_IRQCHIP_PIC_SLAVE:
e->irqchip.pin += PIC_NUM_PINS / 2; //从片引脚
fallthrough;
case KVM_IRQCHIP_PIC_MASTER:
if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2)
return -EINVAL;
// 设置处理 PIC 中断的回调函数
e->set = kvm_set_pic_irq;
break;
case KVM_IRQCHIP_IOAPIC:
if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS)
return -EINVAL;
// 设置处理 IOPIC 中断的回调函数
e->set = kvm_set_ioapic_irq;
break;
default:
return -EINVAL;
}
e->irqchip.irqchip = ue->u.irqchip.irqchip;
break;
case KVM_IRQ_ROUTING_MSI:
// 设置处理 MSI 中断的回调函数
e->set = kvm_set_msi;
e->msi.address_lo = ue->u.msi.address_lo;
e->msi.address_hi = ue->u.msi.address_hi;
e->msi.data = ue->u.msi.data;

if (kvm_msi_route_invalid(kvm, e))
return -EINVAL;
break;
.....

return 0;
return 0;
}
```

Expand All @@ -75,22 +77,22 @@ KVM_CREATE_IRQCHIP用于虚拟机向VMM的虚拟apic发送中断请求,再有V
* > 0 中断成功送达的 CPU 数量
*/
int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
bool line_status)
bool line_status)
{
struct kvm_kernel_irq_routing_entry irq_set[KVM_NR_IRQCHIPS];
....
struct kvm_kernel_irq_routing_entry irq_set[KVM_NR_IRQCHIPS];
....

while (i--) {
int r;
r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level,
line_status);
if (r < 0)
continue;
while (i--) {
int r;
r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level,
line_status);
if (r < 0)
continue;

ret = r + ((ret < 0) ? 0 : ret);
}
ret = r + ((ret < 0) ? 0 : ret);
}

return ret;
return ret;
}
```

Expand All @@ -101,7 +103,7 @@ KVM_CREATE_IRQCHIP用于虚拟机向VMM的虚拟apic发送中断请求,再有V
其中ioapic的回调函数kvm_set_ioapic_irq依次调用kvm_ioapic_set_irq、ioapic_set_irq最后调用ioapic_service函数,ioapic_service主要是找到中断的重映射表,然后查找中断的目的地信息并转发到对应vcpu的lapic去处理。然后会调用kvm_irq_delivery_to_apic负责将中断分发给lapic。

> 中断虚拟化详细介绍可以参考:[kvm中断虚拟化 ](https://blog.csdn.net/zgy666/article/details/105456569)
> [内核虚拟化:虚拟中断注入](https://blog.csdn.net/weixin_46324627/article/details/136661252?csdn_share_tail=%7B%22type%22%3A%22blog%22%2C%22rType%22%3A%22article%22%2C%22rId%22%3A%22136661252%22%2C%22source%22%3A%22weixin_46324627%22%7D)
> [内核虚拟化:虚拟中断注入](https://blog.csdn.net/weixin_46324627/article/details/136661252?csdn_share_tail=%7B%22type%22%3A%22blog%22%2C%22rType%22%3A%22article%22%2C%22rId%22%3A%22136661252%22%2C%22source%22%3A%22weixin_46324627%22%7D)

## 挂载点

Expand Down
4 changes: 4 additions & 0 deletions eBPF_Supermarket/kvm_watcher/docs/kvm_vcpu.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ kvm watcher中的kvm vcpu子功能模块是设计用于监控和分析虚拟化

### wakeup、暂停轮询

![kvm vcpu](https://gitee.com/nan-shuaibo/image/raw/master/202404251709557.png)

KVM 暂停轮询系统是 KVM 内的一项功能,其在某些情况下可以通过在vCPU 放弃运行并让出后,在主机中进行一段时间的轮询来降低虚拟机的延迟。简而言之,当vCPU 放弃运行(即执行 cede 操作)或在 PowerPC 中,当一个虚拟核心(vcore)的所有vCPU 都放弃运行时,主机内核会在将 CPU 让给调度程序之前,通过轮询等待唤醒条件。

轮询在某些情况下提供了延迟优势,尤其是在虚拟机可以非常快速地再次运行的情况下。这至少可以通过减少通过调度程序的开销来节省一些时间,通常在几微秒的数量级上,尽管性能优势取决于工作负载。在轮询间隔内,如果没有唤醒源到达,或者运行队列上有其他可运行的任务,则会调用调度程序。因此,在具有非常短唤醒周期的工作负载中,halt轮询特别有用,因为最小化了halt轮询的时间,同时可以避免调用调度程序的时间花费。
Expand All @@ -16,6 +18,8 @@ KVM 暂停轮询系统是 KVM 内的一项功能,其在某些情况下可以

### dirty page

![dirty page](https://gitee.com/nan-shuaibo/image/raw/master/202404251709559.png)

在虚拟化环境中,脏页指的是自上次同步以来已经被修改的内存页。特别是在虚拟机热迁移过程中,源虚拟机上的内存页在复制到目标虚拟机的同时仍然处于活动状态,任何在此过程中对这些页的修改都会导致脏页的产生。监控这些脏页对于优化热迁移过程至关重要,因为过多的脏页生成可能会导致迁移延迟,甚至影响到虚拟机的运行性能。此监控功能特别适用于虚拟机热迁移的场景,其中脏页的精确监控和管理可以显著优化迁移过程。

## 挂载点
Expand Down
2 changes: 1 addition & 1 deletion eBPF_Supermarket/kvm_watcher/include/kvm_exits.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ static int trace_kvm_entry() {
return 0;
}

static int trace_kvm_userspace_entry(struct kvm_vcpu *vcpu) {
static int trace_kvm_vcpu_ioctl() {
pid_t tid = (u32)bpf_get_current_pid_tgid();
u64 ts = bpf_ktime_get_ns();
bpf_map_update_elem(&userspace_exit_times, &tid, &ts, BPF_ANY);
Expand Down
13 changes: 0 additions & 13 deletions eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,22 +21,9 @@

#include "kvm_watcher.h"
#include "vmlinux.h"
#include <asm-generic/ioctl.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_core_read.h>
#include <bpf/bpf_tracing.h>
#include <linux/version.h>

#define KVMIO 0xAE
#define KVM_CREATE_VM _IO(KVMIO, 0x01) /* returns a VM fd */
#define KVM_CREATE_VCPU _IO(KVMIO, 0x41)
#define KVM_GET_VCPU_EVENTS _IOR(KVMIO, 0x9f, struct kvm_vcpu_events)
#define KVM_SET_VCPU_EVENTS _IOW(KVMIO, 0xa0, struct kvm_vcpu_events)
#define KVM_SET_USER_MEMORY_REGION \
_IOW(KVMIO, 0x46, struct kvm_userspace_memory_region)
#define KVM_TRANSLATE _IOWR(KVMIO, 0x85, struct kvm_translation)
#define KVM_INTERRUPT _IOW(KVMIO, 0x86, struct kvm_interrupt)
#define KVM_RUN _IO(KVMIO, 0x80)

static int trace_kvm_ioctl(struct trace_event_raw_sys_enter *args) {
int fd = (int)args->args[0];
Expand Down
46 changes: 46 additions & 0 deletions eBPF_Supermarket/kvm_watcher/include/kvm_watcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,39 @@
#ifndef __KVM_WATCHER_H
#define __KVM_WATCHER_H

// Path of the QEMU binary whose user-space functions the uprobes attach to.
// NOTE(review): hard-coded — assumes qemu-system-x86_64 is installed at /bin;
// confirm this matches the target distribution's install path.
static const char binary_path[] = "/bin/qemu-system-x86_64";
// Attach skel->progs.prog_name to symbol sym_name inside binary_path as a
// uprobe (is_retprobe == false) or uretprobe (is_retprobe == true), scoped
// to the process id in env.vm_pid. Stores the resulting link (or NULL on
// failure) in skel->links.prog_name; does not itself report errors.
#define __ATTACH_UPROBE(skel, sym_name, prog_name, is_retprobe)            \
    do {                                                                   \
        LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts, .func_name = #sym_name,  \
                    .retprobe = is_retprobe);                              \
        skel->links.prog_name = bpf_program__attach_uprobe_opts(           \
            skel->progs.prog_name, env.vm_pid, binary_path, 0, &uprobe_opts); \
    } while (false)

// Verify that the preceding attach produced a link for prog_name; on
// failure print a diagnostic and return -errno from the CALLING function
// (this macro expands a bare `return`, so it is only usable inside a
// function returning int).
#define __CHECK_PROGRAM(skel, prog_name)                    \
    do {                                                    \
        if (!skel->links.prog_name) {                       \
            perror("no program attached for " #prog_name);  \
            return -errno;                                  \
        }                                                   \
    } while (false)

// Attach and immediately verify the attachment in one step.
#define __ATTACH_UPROBE_CHECKED(skel, sym_name, prog_name, is_retprobe)  \
    do {                                                                 \
        __ATTACH_UPROBE(skel, sym_name, prog_name, is_retprobe);         \
        __CHECK_PROGRAM(skel, prog_name);                                \
    } while (false)

// Unchecked convenience wrappers: select entry probe vs. return probe.
#define ATTACH_UPROBE(skel, sym_name, prog_name) \
    __ATTACH_UPROBE(skel, sym_name, prog_name, false)
#define ATTACH_URETPROBE(skel, sym_name, prog_name) \
    __ATTACH_UPROBE(skel, sym_name, prog_name, true)

// Checked convenience wrappers: as above, but fail out of the caller when
// the attachment did not succeed.
#define ATTACH_UPROBE_CHECKED(skel, sym_name, prog_name) \
    __ATTACH_UPROBE_CHECKED(skel, sym_name, prog_name, false)
#define ATTACH_URETPROBE_CHECKED(skel, sym_name, prog_name) \
    __ATTACH_UPROBE_CHECKED(skel, sym_name, prog_name, true)

#define TASK_COMM_LEN 16
#define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0)

Expand Down Expand Up @@ -56,6 +89,19 @@
#define APIC_LVT_TIMER_PERIODIC (1 << 17) // 周期性触发模式
#define APIC_LVT_TIMER_TSCDEADLINE (2 << 17) // TSC 截止模式

// IOCTL
#include <asm-generic/ioctl.h>
#define KVMIO 0xAE
#define KVM_CREATE_VM _IO(KVMIO, 0x01)
#define KVM_CREATE_VCPU _IO(KVMIO, 0x41)
#define KVM_GET_VCPU_EVENTS _IOR(KVMIO, 0x9f, struct kvm_vcpu_events)
#define KVM_SET_VCPU_EVENTS _IOW(KVMIO, 0xa0, struct kvm_vcpu_events)
#define KVM_SET_USER_MEMORY_REGION \
_IOW(KVMIO, 0x46, struct kvm_userspace_memory_region)
#define KVM_TRANSLATE _IOWR(KVMIO, 0x85, struct kvm_translation)
#define KVM_INTERRUPT _IOW(KVMIO, 0x86, struct kvm_interrupt)
#define KVM_RUN _IO(KVMIO, 0x80)

#define PRINT_USAGE_ERR() \
do { \
fprintf(stderr, "Please specify exactly one option from %s.\n", \
Expand Down
21 changes: 12 additions & 9 deletions eBPF_Supermarket/kvm_watcher/src/kvm_watcher.bpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -68,18 +68,18 @@ int tp_entry(struct exit *ctx) {
return trace_kvm_entry();
}
// 记录VCPU调度的信息--进入
SEC("kprobe/vmx_vcpu_load")
int BPF_KPROBE(kp_vmx_vcpu_load, struct kvm_vcpu *vcpu, int cpu) {
SEC("fentry/vmx_vcpu_load")
int BPF_PROG(kp_vmx_vcpu_load, struct kvm_vcpu *vcpu, int cpu) {
CHECK_PID(vm_pid);
return trace_vmx_vcpu_load(vcpu, cpu);
}
// 记录VCPU调度的信息--退出
SEC("kprobe/vmx_vcpu_put")
int BPF_KPROBE(kp_vmx_vcpu_put, struct kvm_vcpu *vcpu) {
SEC("fentry/vmx_vcpu_put")
int BPF_PROG(kp_vmx_vcpu_put, struct kvm_vcpu *vcpu) {
return trace_vmx_vcpu_put();
}
SEC("kprobe/mark_page_dirty_in_slot")
int BPF_KPROBE(kp_mark_page_dirty_in_slot, struct kvm *kvm,
SEC("fentry/mark_page_dirty_in_slot")
int BPF_PROG(kp_mark_page_dirty_in_slot, struct kvm *kvm,
const struct kvm_memory_slot *memslot, gfn_t gfn) {
CHECK_PID(vm_pid);
return trace_mark_page_dirty_in_slot(kvm, memslot, gfn, &rb, e);
Expand Down Expand Up @@ -172,10 +172,13 @@ int tp_ioctl(struct trace_event_raw_sys_enter *args) {
CHECK_PID(vm_pid);
return trace_kvm_ioctl(args);
}
SEC("fentry/kvm_arch_vcpu_ioctl_run")
int BPF_PROG(fentry_kvm_arch_vcpu_ioctl_run, struct kvm_vcpu *vcpu) {

// Uprobe on QEMU's vcpu-ioctl path (attached from user space via the
// ATTACH_UPROBE macros). When the traced ioctl is KVM_RUN, records a
// timestamp into the userspace_exit_times map (via trace_kvm_vcpu_ioctl)
// so that the time spent handling a userspace exit can be measured —
// presumably paired with the kvm_userspace_exit tracepoint below; confirm
// against the consumer of userspace_exit_times.
SEC("uprobe")
int BPF_KPROBE(up_kvm_vcpu_ioctl, void *cpu, int type) {
CHECK_PID(vm_pid);
// Only KVM_RUN re-enters the guest; ignore every other vcpu ioctl.
if (type != KVM_RUN)
return 0;
return trace_kvm_vcpu_ioctl();
}

SEC("tp/kvm/kvm_userspace_exit")
Expand Down
Loading
Loading