blob: 882835f6395c77eba2c8886d8d2bfbc764657502 [file] [log] [blame]
Alexey Ivanovcc01a9c2019-01-16 09:50:46 -08001#!/usr/bin/python
Brendan Gregg6f075b92016-02-07 00:46:34 -08002# @lint-avoid-python-3-compatibility-imports
3#
4# biotop block device (disk) I/O by process.
5# For Linux, uses BCC, eBPF.
6#
Nathan Scottf5fb9af2018-01-17 09:39:59 +11007# USAGE: biotop.py [-h] [-C] [-r MAXROWS] [interval] [count]
Brendan Gregg6f075b92016-02-07 00:46:34 -08008#
9# This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O
10# request, as well as a starting timestamp for calculating I/O latency.
11#
12# Copyright 2016 Netflix, Inc.
13# Licensed under the Apache License, Version 2.0 (the "License")
14#
15# 06-Feb-2016 Brendan Gregg Created this.
16
17from __future__ import print_function
18from bcc import BPF
19from time import sleep, strftime
20import argparse
Brendan Gregg6f075b92016-02-07 00:46:34 -080021from subprocess import call
22
# arguments
#
# Command line: optional positional [interval] [count], plus -C/--noclear and
# -r/--maxrows. The hidden --ebpf flag is suppressed from --help.
examples = """examples:
    ./biotop            # block device I/O top, 1 second refresh
    ./biotop -C         # don't clear the screen
    ./biotop 5          # 5 second summaries
    ./biotop 5 10       # 5 second summaries, 10 times only
"""
parser = argparse.ArgumentParser(
    description="Block device (disk) I/O by process",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-C", "--noclear", action="store_true",
    help="don't clear the screen")
# type=int makes argparse reject non-numeric values with a usage error
# instead of letting a later int() call raise an uncaught ValueError.
parser.add_argument("-r", "--maxrows", type=int, default=20,
    help="maximum rows to print, default 20")
parser.add_argument("interval", nargs="?", type=int, default=1,
    help="output interval, in seconds")
parser.add_argument("count", nargs="?", type=int, default=99999999,
    help="number of outputs")
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()
interval = args.interval    # seconds slept between summaries
countdown = args.count      # summaries left to print before exiting
maxrows = args.maxrows      # row limit per summary
clear = not args.noclear    # clear the screen before each summary

# linux stats
loadavg = "/proc/loadavg"
diskstats = "/proc/diskstats"
53
# load BPF program
#
# C source of the BPF program, kept as text so it can be patched before it is
# compiled. The token __RQ_DISK__ is a placeholder that is replaced elsewhere
# in this script with the struct request member that leads to the gendisk.
#
# Maps:
#   start    - request pointer -> issue timestamp and __data_len
#   whobyreq - request pointer -> PID and comm cached by trace_pid_start
#   counts   - (pid, rwflag, major, minor, comm) -> bytes, latency us, I/O count
#
# Handlers (the script attaches them as kprobes by name):
#   trace_pid_start      - caches the current PID and comm for the request
#   trace_req_start      - records the timestamp and size of the request
#   trace_req_completion - folds latency and bytes into counts, then drops
#                          the per-request entries
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/blk-mq.h>

// for saving the timestamp and __data_len of each request
struct start_req_t {
    u64 ts;
    u64 data_len;
};

// for saving process info by request
struct who_t {
    u32 pid;
    char name[TASK_COMM_LEN];
};

// the key for the output summary
struct info_t {
    u32 pid;
    int rwflag;
    int major;
    int minor;
    char name[TASK_COMM_LEN];
};

// the value of the output summary
struct val_t {
    u64 bytes;
    u64 us;
    u32 io;
};

BPF_HASH(start, struct request *, struct start_req_t);
BPF_HASH(whobyreq, struct request *, struct who_t);
BPF_HASH(counts, struct info_t, struct val_t);

// cache PID and comm by-req
int trace_pid_start(struct pt_regs *ctx, struct request *req)
{
    struct who_t who = {};

    if (bpf_get_current_comm(&who.name, sizeof(who.name)) == 0) {
        who.pid = bpf_get_current_pid_tgid() >> 32;
        whobyreq.update(&req, &who);
    }

    return 0;
}

// time block I/O
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
    struct start_req_t start_req = {
        .ts = bpf_ktime_get_ns(),
        .data_len = req->__data_len
    };
    start.update(&req, &start_req);
    return 0;
}

// output
int trace_req_completion(struct pt_regs *ctx, struct request *req)
{
    struct start_req_t *startp;

    // fetch timestamp and calculate delta
    startp = start.lookup(&req);
    if (startp == 0) {
        // drop any cached who entry so it does not linger for this request
        whobyreq.delete(&req);
        return 0;    // missed tracing issue
    }

    struct who_t *whop;
    struct val_t *valp, zero = {};
    u64 delta_us = (bpf_ktime_get_ns() - startp->ts) / 1000;

    // setup info_t key
    struct info_t info = {};
    info.major = req->__RQ_DISK__->major;
    info.minor = req->__RQ_DISK__->first_minor;
/*
 * The following deals with a kernel version change (in mainline 4.7, although
 * it may be backported to earlier kernels) with how block request write flags
 * are tested. We handle both pre- and post-change versions here. Please avoid
 * kernel version tests like this as much as possible: they inflate the code,
 * test, and maintenance burden.
 */
#ifdef REQ_WRITE
    info.rwflag = !!(req->cmd_flags & REQ_WRITE);
#elif defined(REQ_OP_SHIFT)
    info.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
#else
    info.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
#endif

    whop = whobyreq.lookup(&req);
    if (whop == 0) {
        // missed pid who, save stats as pid 0
        valp = counts.lookup_or_try_init(&info, &zero);
    } else {
        info.pid = whop->pid;
        __builtin_memcpy(&info.name, whop->name, sizeof(info.name));
        valp = counts.lookup_or_try_init(&info, &zero);
    }

    if (valp) {
        // save stats
        valp->us += delta_us;
        valp->bytes += startp->data_len;
        valp->io++;
    }

    start.delete(&req);
    whobyreq.delete(&req);

    return 0;
}
"""
172
# --ebpf: print the BPF program text and quit without loading anything.
# The text is printed as-is, i.e. before the __RQ_DISK__ placeholder below
# has been substituted.
if args.ebpf:
    print(bpf_text)
    exit()

# Pick the struct request member used to reach the disk: 'rq_disk' when the
# running kernel's struct request has that field, 'q->disk' otherwise.
# Compare against 1 explicitly rather than testing truthiness: per the BCC
# implementation the helper reports "could not determine" with a negative
# value, which is truthy and would wrongly select 'rq_disk'.
if BPF.kernel_struct_has_field(b'request', b'rq_disk') == 1:
    bpf_text = bpf_text.replace('__RQ_DISK__', 'rq_disk')
else:
    bpf_text = bpf_text.replace('__RQ_DISK__', 'q->disk')
181
# Compile the program and hook its handlers onto the block layer.
b = BPF(text=bpf_text)

# PID/comm caching: use the double-underscore accounting function when it is
# available as a kprobe target, otherwise the plain name.
pid_start_event = "blk_account_io_start"
if BPF.get_kprobe_functions(b'__blk_account_io_start'):
    pid_start_event = "__blk_account_io_start"
b.attach_kprobe(event=pid_start_event, fn_name="trace_pid_start")

# Request issue: blk_start_request is hooked only when present;
# blk_mq_start_request is always hooked.
if BPF.get_kprobe_functions(b'blk_start_request'):
    b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")

# Request completion: same preferred/fallback selection as for the start hook.
done_event = "blk_account_io_done"
if BPF.get_kprobe_functions(b'__blk_account_io_done'):
    done_event = "__blk_account_io_done"
b.attach_kprobe(event=done_event, fn_name="trace_req_completion")

print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval)
196
# cache disk major,minor -> diskname
#
# Each diskstats line is split on whitespace; the first two fields form the
# "major,minor" key and the third field is the device name.
with open(diskstats) as stats:
    disklookup = {
        ",".join(fields[:2]): fields[2]
        for fields in (entry.split() for entry in stats)
    }
203
# output
#
# Every `interval` seconds: print a header, then up to `maxrows` rows from the
# counts table, largest byte total first, and reset the table. The loop ends
# after `countdown` summaries or after Ctrl-C.
exiting = False
while True:
    try:
        sleep(interval)
    except KeyboardInterrupt:
        # still print the summary for this (partial) interval, then quit
        exiting = True

    # header
    if clear:
        call("clear")
    else:
        print()
    with open(loadavg) as stats:
        print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read()))
    print("%-6s %-16s %1s %-3s %-3s %-8s %5s %7s %6s" % ("PID", "COMM",
        "D", "MAJ", "MIN", "DISK", "I/O", "Kbytes", "AVGms"))

    # by-PID output
    counts = b.get_table("counts")
    # ascending sort followed by reversed() keeps the original tie ordering
    by_bytes = sorted(counts.items(), key=lambda entry: entry[1].bytes)
    for shown, (k, v) in enumerate(reversed(by_bytes), 1):

        # lookup disk; "?" when the major,minor pair was not in diskstats
        diskname = disklookup.get("%s,%s" % (k.major, k.minor), "?")

        # print line: total latency is kept in us, shown as per-I/O average ms
        avg_ms = (float(v.us) / 1000) / v.io
        print("%-6d %-16s %1s %-3d %-3d %-8s %5s %7s %6.2f" % (k.pid,
            k.name.decode('utf-8', 'replace'), "W" if k.rwflag else "R",
            k.major, k.minor, diskname, v.io, v.bytes / 1024, avg_ms))

        if shown >= maxrows:
            break
    counts.clear()

    countdown -= 1
    if exiting or countdown == 0:
        print("Detaching...")
        exit()