blob: 3c9c071cf75c09d4e9eaf00861f549575b262756 [file] [log] [blame]
#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# biotop  block device (disk) I/O by process.
#         For Linux, uses BCC, eBPF.
#
# USAGE: biotop.py [-h] [-C] [-r MAXROWS] [-p PID] [interval] [count]
#
# This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O
# request, as well as a starting timestamp for calculating I/O latency.
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 06-Feb-2016   Brendan Gregg   Created this.
# 17-Mar-2022   Rocky Xing      Added PID filter support.
Brendan Gregg6f075b92016-02-07 00:46:34 -080017
18from __future__ import print_function
19from bcc import BPF
20from time import sleep, strftime
21import argparse
Brendan Gregg6f075b92016-02-07 00:46:34 -080022from subprocess import call
23
# arguments
#
# The numeric options are declared with type=int so that argparse itself
# rejects non-numeric input with a usage message, instead of the script
# failing later inside a bare int() conversion.
examples = """examples:
    ./biotop            # block device I/O top, 1 second refresh
    ./biotop -C         # don't clear the screen
    ./biotop -p 181     # only trace PID 181
    ./biotop 5          # 5 second summaries
    ./biotop 5 10       # 5 second summaries, 10 times only
"""
parser = argparse.ArgumentParser(
    description="Block device (disk) I/O by process",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-C", "--noclear", action="store_true",
    help="don't clear the screen")
parser.add_argument("-r", "--maxrows", type=int, default=20,
    help="maximum rows to print, default 20")
parser.add_argument("-p", "--pid", type=int, metavar="PID",
    help="trace this PID only")
parser.add_argument("interval", nargs="?", type=int, default=1,
    help="output interval, in seconds")
parser.add_argument("count", nargs="?", type=int, default=99999999,
    help="number of outputs")
# hidden option: dump the BPF program text and exit
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()
interval = args.interval    # seconds between summaries
countdown = args.count      # summaries left to print
maxrows = args.maxrows      # row limit per summary
clear = not args.noclear    # clear the screen before each summary

# linux stats
loadavg = "/proc/loadavg"
diskstats = "/proc/diskstats"
57
Brendan Gregg6f075b92016-02-07 00:46:34 -080058# load BPF program
Nathan Scottca4ba552018-01-16 11:02:58 +110059bpf_text = """
Brendan Gregg6f075b92016-02-07 00:46:34 -080060#include <uapi/linux/ptrace.h>
Jerome Marchandee810722022-02-23 16:04:30 +010061#include <linux/blk-mq.h>
Brendan Gregg6f075b92016-02-07 00:46:34 -080062
// for saving the timestamp and __data_len of each request
struct start_req_t {
    u64 ts;         // bpf_ktime_get_ns() value taken in trace_req_start (ns)
    u64 data_len;   // req->__data_len as read in trace_req_start
};

// for saving process info by request
struct who_t {
    u32 pid;                    // upper 32 bits of bpf_get_current_pid_tgid()
    char name[TASK_COMM_LEN];   // filled by bpf_get_current_comm()
};

// the key for the output summary
struct info_t {
    u32 pid;                    // left as 0 when no who_t was cached for the request
    int rwflag;                 // non-zero for a write; user space prints "W" or "R"
    int major;                  // major number read from the request's disk
    int minor;                  // first_minor read from the request's disk
    char name[TASK_COMM_LEN];   // copied from who_t.name when available
};

// the value of the output summary
struct val_t {
    u64 bytes;  // sum of start_req_t.data_len over completed requests
    u64 us;     // sum of start-to-completion time, in microseconds
    u32 io;     // number of completed requests accounted
};

// All three maps are filled by the probes below; start and whobyreq are keyed
// by the request pointer, counts is the per-process summary read by user space.
BPF_HASH(start, struct request *, struct start_req_t);
BPF_HASH(whobyreq, struct request *, struct who_t);
BPF_HASH(counts, struct info_t, struct val_t);
94
// cache PID and comm by-req
//
// Records which process is current when this probe fires for a request, so
// that the completion probe can attribute the I/O to it. Nothing is stored if
// the comm cannot be read or if the PID filter rejects the process.
// Always returns 0.
int trace_pid_start(struct pt_regs *ctx, struct request *req)
{
    struct who_t who = {};
    u32 pid;

    if (bpf_get_current_comm(&who.name, sizeof(who.name)) == 0) {
        // the upper 32 bits of the helper's return value are used as the PID
        pid = bpf_get_current_pid_tgid() >> 32;
        // the condition below is a placeholder: the Python loader swaps it for
        // either a comparison of pid against the requested PID, or constant 0
        if (FILTER_PID)
            return 0;

        who.pid = pid;
        whobyreq.update(&req, &who);
    }

    return 0;
}
112
// time block I/O
//
// Stores the current timestamp and the request's __data_len in the start map,
// keyed by the request pointer, for use by the completion probe.
// Always returns 0.
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
    struct start_req_t start_req = {
        .ts = bpf_ktime_get_ns(),
        // the length is captured here and is what completion later accounts
        .data_len = req->__data_len
    };
    start.update(&req, &start_req);
    return 0;
}
123
// output
//
// On request completion: looks up the start record and the cached process
// info for this request, computes the elapsed time, and adds latency, bytes
// and an I/O count to the counts map entry keyed by (pid, rwflag, major,
// minor, name). Both per-request map entries are removed afterwards.
// Always returns 0.
int trace_req_completion(struct pt_regs *ctx, struct request *req)
{
    struct start_req_t *startp;

    // fetch timestamp and calculate delta
    startp = start.lookup(&req);
    if (startp == 0) {
        return 0;    // missed tracing issue
    }

    struct who_t *whop;
    u32 pid;

    whop = whobyreq.lookup(&req);
    // requests with no cached process info are treated as pid 0
    pid = whop != 0 ? whop->pid : 0;
    // the condition below is a placeholder: the Python loader swaps it for
    // either a comparison of pid against the requested PID, or constant 0
    if (FILTER_PID) {
        // filtered out: drop the per-request state without accounting it
        start.delete(&req);
        if (whop != 0) {
            whobyreq.delete(&req);
        }
        return 0;
    }

    struct val_t *valp, zero = {};
    // elapsed time since trace_req_start, converted from ns to us
    u64 delta_us = (bpf_ktime_get_ns() - startp->ts) / 1000;

    // setup info_t key
    struct info_t info = {};
    // the member name between req-> and ->major / ->first_minor is a
    // placeholder that the Python loader replaces with rq_disk or q->disk
    info.major = req->__RQ_DISK__->major;
    info.minor = req->__RQ_DISK__->first_minor;
/*
 * The following deals with a kernel version change (in mainline 4.7, although
 * it may be backported to earlier kernels) with how block request write flags
 * are tested. We handle both pre- and post-change versions here. Please avoid
 * kernel version tests like this as much as possible: they inflate the code,
 * test, and maintenance burden.
 */
#ifdef REQ_WRITE
    info.rwflag = !!(req->cmd_flags & REQ_WRITE);
#elif defined(REQ_OP_SHIFT)
    info.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
#else
    info.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
#endif

    if (whop == 0) {
        // missed pid who, save stats as pid 0
        // (info.pid and info.name keep their zero initialisation)
        valp = counts.lookup_or_try_init(&info, &zero);
    } else {
        info.pid = whop->pid;
        __builtin_memcpy(&info.name, whop->name, sizeof(info.name));
        valp = counts.lookup_or_try_init(&info, &zero);
    }

    // valp is checked because the lookup-or-init call may yield no entry
    if (valp) {
        // save stats
        valp->us += delta_us;
        valp->bytes += startp->data_len;
        valp->io++;
    }

    // per-request state is no longer needed once the request is accounted
    start.delete(&req);
    whobyreq.delete(&req);

    return 0;
}
Nathan Scottca4ba552018-01-16 11:02:58 +1100191"""
192
# fill in the placeholders left in the BPF program text

# the request's disk is reached via rq_disk when struct request has that
# field, and via q->disk otherwise
# NOTE(review): only the truthiness of the result is used here; confirm how
# kernel_struct_has_field() reports a failed lookup.
if BPF.kernel_struct_has_field(b'request', b'rq_disk'):
    bpf_text = bpf_text.replace('__RQ_DISK__', 'rq_disk')
else:
    bpf_text = bpf_text.replace('__RQ_DISK__', 'q->disk')

# with -p, skip every PID but the requested one; otherwise filter nothing
if args.pid is not None:
    bpf_text = bpf_text.replace('FILTER_PID', 'pid != %d' % args.pid)
else:
    bpf_text = bpf_text.replace('FILTER_PID', '0')

# dump the program only after substitution, so that what is printed is the
# text that would actually be compiled rather than one with placeholders
if args.ebpf:
    print(bpf_text)
    exit()

# load BPF program and attach the probes; where a function exists under two
# names, the double-underscore variant is preferred when it can be probed
b = BPF(text=bpf_text)
if BPF.get_kprobe_functions(b'__blk_account_io_start'):
    b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_pid_start")
else:
    b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
# blk_start_request is attached only if present; blk_mq_start_request always
if BPF.get_kprobe_functions(b'blk_start_request'):
    b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
if BPF.get_kprobe_functions(b'__blk_account_io_done'):
    b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_completion")
else:
    b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_completion")
Brendan Gregg6f075b92016-02-07 00:46:34 -0800219
print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval)

# cache disk major,minor -> diskname
# (the first three whitespace-separated fields of each diskstats line are
# used as major, minor and device name)
disklookup = {}
with open(diskstats) as stats:
    for entry in stats:
        fields = entry.split()
        disklookup["%s,%s" % (fields[0], fields[1])] = fields[2]
228
# output
#
# Every interval: print a header (time and load averages), then up to maxrows
# rows from the counts map ordered by bytes, largest first, then clear the
# map. Ends after countdown summaries, or after the summary that follows a
# Ctrl-C.
exiting = 0
while 1:
    try:
        sleep(interval)
    except KeyboardInterrupt:
        # still print one last summary before detaching
        exiting = 1

    # header
    if clear:
        call("clear")
    else:
        print()
    with open(loadavg) as stats:
        print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read()))
    print("%-7s %-16s %1s %-3s %-3s %-8s %5s %7s %6s" % ("PID", "COMM",
        "D", "MAJ", "MIN", "DISK", "I/O", "Kbytes", "AVGms"))

    # by-PID output
    counts = b.get_table("counts")
    line = 0
    for k, v in reversed(sorted(counts.items(),
                                key=lambda item: item[1].bytes)):

        # lookup disk
        diskname = disklookup.get("%d,%d" % (k.major, k.minor), "?")

        # average latency per I/O in ms; a zero count is guarded defensively
        # so that a single odd entry cannot abort the whole summary
        avg_ms = (float(v.us) / 1000) / v.io if v.io else 0.0

        # print line
        # floor division keeps Kbytes an integer on Python 3 as well, where
        # "/" would produce a float and print e.g. "4.0"
        print("%-7d %-16s %1s %-3d %-3d %-8s %5s %7s %6.2f" % (k.pid,
            k.name.decode('utf-8', 'replace'), "W" if k.rwflag else "R",
            k.major, k.minor, diskname, v.io, v.bytes // 1024, avg_ms))

        line += 1
        if line >= maxrows:
            break
    counts.clear()

    countdown -= 1
    if exiting or countdown == 0:
        print("Detaching...")
        exit()