-
Notifications
You must be signed in to change notification settings - Fork 3.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
nvmecmd: add nvme command observability tool
The tool observes NVMe commands and checks for LBA and block size alignment. The tool is used as part of the Large block size (LBS) effort [1] in the kernel to validate min order mapping [2]. [1] https://kernelnewbies.org/KernelProjects/large-block-size [2] min order: use of min order: linux-kdevops/linux@563cea7 add min order support: linux-kdevops/linux@27f85d8 upstream RFC: https://lore.kernel.org/all/[email protected]/ Signed-off-by: Daniel Gomez <[email protected]>
- Loading branch information
Showing
5 changed files
with
550 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
.TH nvmecmd 8 "2023-11-06" "USER COMMANDS" | ||
.SH NAME | ||
nvmecmd \- Observes NVMe commands and alignment. | ||
.SH SYNOPSIS | ||
.B nvmecmd.py [\-h] [\-d DISK] [\-o OPS] [\--debug] [\--trace] | ||
.B [\--interval INTERVAL] | ||
.SH DESCRIPTION | ||
nvmecmd observes and traces NVMe commands. The program attaches kprobe on | ||
`nvme_setup_cmd` by default to capture NVMe commands issued to any device. If | ||
disk and/or operation filters are used, the program skips any command that | ||
does not match them. If the tracing option is passed, then all captured events | ||
will be printed in a table with the following columns, sorted from left to | ||
right: | ||
|
||
- DISK: Prints the NVMe node (e.g. 'nvme0n9'). | ||
|
||
- OPS: Prints the NVMe operation (read/write). | ||
|
||
- LEN: Prints the length in bytes. | ||
|
||
- LBA: Prints the Logical Block Address (LBA). | ||
|
||
- PID: Prints the process ID. | ||
|
||
- COMM: Prints the process name (command). | ||
|
||
- ALGN: Prints the maximum alignment possible in power-of-2 bytes. Example: | ||
An alignment value of 16384 (16k) indicates the command is aligned in size and | ||
LBA to 4k, 8k and 16k. | ||
|
||
Since this uses BPF, only the root user can use this tool. | ||
.SH REQUIREMENTS | ||
CONFIG_BPF and bcc | ||
.SH OPTIONS | ||
.TP | ||
\-h, --help | ||
show this help message and exit | ||
.TP | ||
\-d DISK, --disk DISK | ||
If set, the BPF will add a disk name filter to skip NVMe commands that don't | ||
match the given NVMe node. | ||
Example: nvme0n9 | ||
.TP | ||
\-o OPS, --ops OPS | ||
If set, the BPF will add an operation filter to skip NVMe commands that don't | ||
match the given operation. A full list of the operation values can be found at | ||
the 'enum req_op' in the kernel header 'include/linux/blk_types.h'. | ||
.TP | ||
\--debug | ||
Prints BPF code before capturing. | ||
.TP | ||
\--trace | ||
Prints NVMe captured commands in a table form. | ||
|
||
Header: DISK OPS LEN LBA PID COMM ALGN. | ||
.TP | ||
\--interval INTERVAL | ||
Specifies the maximum event polling interval. | ||
.SH EXAMPLES | ||
.TP | ||
Observe all NVMe commands and print a power-of-2 histogram with the block and \ | ||
alignment sizes at the end. | ||
# | ||
.B nvmecmd | ||
.TP | ||
Observe all commands issued to the 9th NVMe node and print a power-of-2 \ | ||
histogram with the block and alignment sizes at the end. | ||
# | ||
.B nvmecmd --disk nvme9n1 | ||
.TP | ||
Observe and trace all write commands issued to the 9th NVMe node. And print a \ | ||
power-of-2 histogram with the block and alignment sizes at the end. | ||
# | ||
.B nvmecmd --disk nvme9n1 --ops write --trace | ||
.TP | ||
Print the eBPF program before observing starts. Observe all write \ | ||
commands issued to the 9th NVMe node. And print a power-of-2 histogram with the \ | ||
block and alignment sizes at the end. | ||
# | ||
.B nvmecmd --disk nvme9n1 --ops write --debug | ||
.TP | ||
Observe all write commands issued to the 9th NVMe node. Poll NVMe \ | ||
events from the data ring buffer every 100 ms. And print a power-of-2 \ | ||
histogram with the block and alignment sizes at the end. | ||
# | ||
.B nvmecmd --disk nvme9n1 --ops write --interval 0.1 | ||
.SH OVERHEAD | ||
This traces all NVMe commands issued to any device. The overhead of this can be | ||
high if the volume of the commands is high. To reduce overhead, add filters | ||
such as disk ('--disk') and/or operation ('--ops'). You can also increase the | ||
polling interval ('--interval') when tracing ('--trace') or if possible, just | ||
disable tracing completely. You should only run this on a system where the | ||
slowdown is acceptable. | ||
.SH SOURCE | ||
This is from bcc. | ||
.IP | ||
https://github.com/iovisor/bcc | ||
.PP | ||
Also look in the bcc distribution for a companion _examples.txt file containing | ||
example usage, output, and commentary for this tool. | ||
.SH OS | ||
Linux | ||
.SH STABILITY | ||
Unstable - in development. | ||
.SH AUTHOR | ||
Daniel Gomez |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,239 @@ | ||
#!/usr/bin/env python | ||
# SPDX-License-Identifier: Apache-2.0 | ||
# | ||
# nvmecmd NVMe command observability tool. | ||
# | ||
# Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved. | ||
# Licensed under the Apache License, Version 2.0 (the "License") | ||
# | ||
# 06-Nov-2023 Daniel Gomez Created this. | ||
from __future__ import ( | ||
absolute_import, division, unicode_literals, print_function | ||
) | ||
from bcc import BPF | ||
import argparse | ||
import time | ||
|
||
# Usage examples shown verbatim at the end of ``--help`` output.
examples = """examples:
    nvmecmd                             # Observe all NVMe commands
    nvmecmd --disk nvme9n1              # Observe all commands on 9th NVMe node
    nvmecmd --ops read                  # Observe read commands on all NVMe
    nvmecmd --ops write                 # Observe write commands on all NVMe
    nvmecmd --ops write --disk nvme9n1  # Observe write commands on 9th NVMe node
    nvmecmd --debug                     # Print eBPF program before observe
    nvmecmd --trace                     # Print NVMe captured events
    nvmecmd --interval 0.1              # Poll data ring buffer every 100 ms
"""

# Command line interface.  RawDescriptionHelpFormatter keeps the column
# layout of the examples epilog intact.
parser = argparse.ArgumentParser(
    description="NVMe commands observer tool",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples,
)
parser.add_argument(
    "-d",
    "--disk",
    type=str,
    help="capture commands for this NVMe disk node only",
)
parser.add_argument(
    "-o",
    "--ops",
    type=str,
    # Only these two names can be translated into an operation code by the
    # script; rejecting anything else here gives a normal usage error instead
    # of a KeyError traceback when the filter is built.
    choices=("read", "write"),
    # Keep the usage line as "-o OPS" rather than "-o {read,write}".
    metavar="OPS",
    help="capture this command operation only",
)
parser.add_argument("--debug", action="store_true", help="debug")
parser.add_argument(
    "--trace",
    action="store_true",
    help="trace NVMe captured commands",
)
parser.add_argument(
    "--interval",
    type=float,
    help="polling interval",
)
|
||
# Parse the command line once; the filter snippets and runtime options
# further down all read from `args`.
args = parser.parse_args() | ||
|
||
# define BPF program | ||
# Shared C prologue of the BPF program.  It contains:
#   - a local, partial declaration of `struct nvme_ns` that stops after
#     `lba_shift` (presumably mirroring the kernel's private NVMe host
#     header -- verify the field order against the running kernel);
#   - `struct data_t`, the per-command record sent to user space through the
#     `events` ring buffer;
#   - two histograms (`block_len`, `algn`) and the `events` ring buffer;
#   - `local_strcmp`, a bounded string compare that inspects at most CMPMAX
#     (16) bytes.
# NOTE(review): `data_t.lba` is a u32 while it is filled from `req->__sector`
# in the probe; this looks like it can truncate on large namespaces -- confirm.
# NOTE(review): the `counts` array is declared but not referenced anywhere
# else in the visible script.
bpf_text = """ | ||
#include <uapi/linux/ptrace.h> | ||
#include <linux/blk-mq.h> | ||
#include <linux/blk_types.h> | ||
#include <linux/nvme.h> | ||
struct nvme_ns { | ||
struct list_head list; | ||
struct nvme_ctrl *ctrl; | ||
struct request_queue *queue; | ||
struct gendisk *disk; | ||
#ifdef CONFIG_NVME_MULTIPATH | ||
enum nvme_ana_state ana_state; | ||
u32 ana_grpid; | ||
#endif | ||
struct list_head siblings; | ||
struct kref kref; | ||
struct nvme_ns_head *head; | ||
int lba_shift; | ||
// [...] | ||
}; | ||
struct data_t { | ||
u32 pid; | ||
char comm[TASK_COMM_LEN]; | ||
char disk[DISK_NAME_LEN]; | ||
u32 op; | ||
u32 len; | ||
u32 lba; | ||
u32 algn; | ||
}; | ||
BPF_HISTOGRAM(block_len, u32, 64); | ||
BPF_HISTOGRAM(algn, u32, 64); | ||
BPF_ARRAY(counts, u64, 1); | ||
BPF_RINGBUF_OUTPUT(events, 8); | ||
/* local strcmp function, max length 16 to protect instruction loops */ | ||
#define CMPMAX 16 | ||
static int local_strcmp(const char *cs, const char *ct) | ||
{ | ||
int len = 0; | ||
unsigned char c1, c2; | ||
while (len++ < CMPMAX) { | ||
c1 = *cs++; | ||
c2 = *ct++; | ||
if (c1 != c2) | ||
return c1 < c2 ? -1 : 1; | ||
if (!c1) | ||
break; | ||
} | ||
return 0; | ||
} | ||
""" | ||
|
||
# Optional C snippet spliced into the probe: when --disk is given, the probe
# returns early for requests whose disk name differs from the requested one.
# Without --disk the snippet is empty and no disk filtering happens.
if args.disk:
    bpf_text_disk_filter = """
    if (local_strcmp(req->q->disk->disk_name, "{disk}"))
        return;
""".format(disk=args.disk)
else:
    bpf_text_disk_filter = ""
|
||
# Two-way operation table: numeric code -> name (used when printing events)
# and name -> numeric code (used to build the --ops filter).  The full list
# of operations lives in the Linux kernel header 'include/linux/blk_types.h'.
nvme_ops = {
    0: "read",
    1: "write",
    "read": 0,
    "write": 1,
}

# Optional C snippet spliced into the probe: when --ops is given, the probe
# returns early for requests whose operation code (low byte of cmd_flags)
# differs from the requested one.
if args.ops:
    operation = nvme_ops[args.ops]
    bpf_text_ops_filter = """
    if ((req->cmd_flags & 0xff) != {ops})
        return;
""".format(ops=operation)
else:
    bpf_text_ops_filter = ""
|
||
# Append the kprobe handler for nvme_setup_cmd to the BPF program.
# The text below is a str.format() template: literal C braces are doubled
# ({{ and }}), while {disk_filter} and {ops_filter} are replaced with the
# optional early-return snippets built above (empty strings when the
# corresponding option was not given).
# For every request that passes the filters the handler:
#   - fills a data_t record (pid, comm, disk name, operation, length, LBA);
#   - tries eight alignment sizes, starting at 4096 bytes and doubling each
#     step, and keeps the largest one for which both the length and the LBA
#     are aligned;
#   - pushes the record to the `events` ring buffer and bumps the
#     `block_len` and `algn` log2 histograms.
# NOTE(review): lba_len is computed as algn_size / 4096, which presumably
# assumes a 4096-byte logical block -- confirm behaviour for other
# lba_shift values.
bpf_text += """ | ||
void kprobe__nvme_setup_cmd(struct pt_regs *ctx, struct nvme_ns *ns, | ||
struct request *req) | ||
{{ | ||
struct data_t data = {{}}; | ||
u32 max_algn_size = 4096, algn_size = 4096; | ||
u32 lba_len = algn_size / 4096; | ||
bool is_algn = false; | ||
u8 i; | ||
{disk_filter} | ||
{ops_filter} | ||
data.pid = bpf_get_current_pid_tgid() >> 32; | ||
bpf_get_current_comm(&data.comm, sizeof(data.comm)); | ||
bpf_probe_read_kernel(&data.disk, sizeof(data.disk), | ||
req->q->disk->disk_name); | ||
data.op = req->cmd_flags & 0xff; | ||
data.len = req->__data_len; | ||
data.lba = req->__sector >> (ns->lba_shift - SECTOR_SHIFT); | ||
for (i=0; i<8; i++) {{ | ||
is_algn = !(data.len % algn_size) && !(data.lba % lba_len); | ||
if (is_algn) {{ | ||
max_algn_size = algn_size; | ||
}} | ||
algn_size = algn_size << 1; | ||
lba_len = algn_size / 4096; | ||
}} | ||
data.algn = max_algn_size; | ||
events.ringbuf_output(&data, sizeof(data), 0); | ||
block_len.increment(bpf_log2l(req->__data_len)); | ||
algn.increment(bpf_log2l(max_algn_size)); | ||
}} | ||
""".format( | ||
disk_filter=bpf_text_disk_filter, ops_filter=bpf_text_ops_filter | ||
) | ||
|
||
# With --debug, show the parsed options and the generated BPF source before
# it is handed to bcc.
if args.debug:
    print(args)
    print(bpf_text)

# Build and load the BPF program.
bpf = BPF(text=bpf_text)

# With --trace, print the banner and the column header of the event table;
# the row format must stay in sync with print_event().
if args.trace:
    print("Tracing NVMe commands... Hit Ctrl-C to end.")
    header_columns = ("DISK", "OPS", "LEN", "LBA", "PID", "COMM", "ALGN")
    print("%-10s %-8s %-8s %-10s %-10s %-16s %-8s" % header_columns)
|
||
|
||
def capture_event(ctx, data, size):
    """Ring buffer callback: decode one record and print it when tracing.

    ``ctx`` and ``size`` are part of the callback signature but are not used
    here; ``data`` is the raw record handed over by the ring buffer.
    """
    record = bpf["events"].event(data)
    if not args.trace:
        return
    print_event(record)
|
||
|
||
def print_event(event):
    """Print one captured NVMe command as a row of the trace table.

    The columns are, left to right: DISK, OPS, LEN, LBA, PID, COMM, ALGN.

    ``event`` is a decoded ring buffer record exposing the ``disk``, ``op``,
    ``len``, ``lba``, ``pid``, ``comm`` and ``algn`` fields.
    """
    # Without an --ops filter the probe reports every request operation, but
    # nvme_ops only names read and write.  Fall back to the numeric code so
    # that other operations (e.g. flush or discard) are printed instead of
    # raising KeyError inside the ring buffer callback.
    op_name = nvme_ops.get(event.op, event.op)
    print(
        "%-10s %-8s %-8s %-10s %-10s %-16s %-8s"
        % (
            event.disk.decode("utf-8", "replace"),
            op_name,
            event.len,
            event.lba,
            event.pid,
            event.comm.decode("utf-8", "replace"),
            event.algn,
        ),
    )
|
||
|
||
# Route every ring buffer record through capture_event() and keep handles to
# the two histograms for the final report.
bpf["events"].open_ring_buffer(capture_event)
block_len = bpf["block_len"]
algn = bpf["algn"]

# Poll until the user hits Ctrl-C.  When --interval is given, sleep for that
# many seconds (sign ignored) between polls.
try:
    while True:
        bpf.ring_buffer_poll(30)
        if args.interval:
            time.sleep(abs(args.interval))
except KeyboardInterrupt:
    pass

# Consume whatever is still queued in the ring buffer, then print and reset
# both histograms before exiting.
bpf.ring_buffer_consume()
print()
block_len.print_log2_hist(
    "Block size", "operation", section_print_fn=bytes.decode
)
block_len.clear()
print()
algn.print_log2_hist(
    "Algn size", "operation", section_print_fn=bytes.decode
)
algn.clear()
exit()
Oops, something went wrong.