blob: c6e1ca2677b131b14ab7ee4bffcc817826bb7e82 [file] [log] [blame]
#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# biotop  block device (disk) I/O by process.
#         For Linux, uses BCC, eBPF.
#
# USAGE: biotop.py [-h] [-C] [-r MAXROWS] [interval] [count]
#
# This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O
# request, as well as a starting timestamp for calculating I/O latency.
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 06-Feb-2016   Brendan Gregg   Created this.
16
17from __future__ import print_function
18from bcc import BPF
19from time import sleep, strftime
20import argparse
21import signal
22from subprocess import call
23
# arguments
#
# Numeric options are converted by argparse itself (type=int), so a bad value
# such as "biotop abc" produces a normal usage error instead of a ValueError
# traceback from a later int() call. The set of accepted inputs is unchanged.
examples = """examples:
    ./biotop            # block device I/O top, 1 second refresh
    ./biotop -C         # don't clear the screen
    ./biotop 5          # 5 second summaries
    ./biotop 5 10       # 5 second summaries, 10 times only
"""
parser = argparse.ArgumentParser(
    description="Block device (disk) I/O by process",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-C", "--noclear", action="store_true",
    help="don't clear the screen")
parser.add_argument("-r", "--maxrows", type=int, default=20,
    help="maximum rows to print, default 20")
parser.add_argument("interval", nargs="?", type=int, default=1,
    help="output interval, in seconds")
parser.add_argument("count", nargs="?", type=int, default=99999999,
    help="number of outputs")
# hidden flag: print the BPF program text and exit (handled after the
# program text is defined)
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()

# settings consumed by the rest of the script
interval = args.interval        # seconds between summaries
countdown = args.count          # summaries left to print
maxrows = args.maxrows          # row limit per summary
clear = not args.noclear        # clear the screen before each summary?
50
# procfs files read by this tool: the load average shown in each summary
# header, and the block device table used to map major,minor to a disk name
loadavg, diskstats = "/proc/loadavg", "/proc/diskstats"
54
# signal handler
def signal_ignore(signal, frame):
    """Signal handler that swallows the signal and only emits a newline.

    Both arguments (signal number and current stack frame, as passed to any
    handler registered via the signal module) are ignored.
    """
    print("")
58
# load BPF program
#
# The C program below defines three handlers that are attached as kprobes
# further down in this script:
#   trace_pid_start       remembers which process (PID, comm) created a request
#   trace_req_start       timestamps the request
#   trace_req_completion  adds bytes, latency and an I/O count to the "counts"
#                         map, keyed by process, direction and device
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/blkdev.h>

// for saving process info by request
struct who_t {
    u32 pid;
    char name[TASK_COMM_LEN];
};

// the key for the output summary
struct info_t {
    u32 pid;
    int rwflag;
    int major;
    int minor;
    char name[TASK_COMM_LEN];
};

// the value of the output summary
struct val_t {
    u64 bytes;
    u64 us;
    u32 io;
};

BPF_HASH(start, struct request *);
BPF_HASH(whobyreq, struct request *, struct who_t);
BPF_HASH(counts, struct info_t, struct val_t);

// cache PID and comm by-req
int trace_pid_start(struct pt_regs *ctx, struct request *req)
{
    struct who_t who = {};

    if (bpf_get_current_comm(&who.name, sizeof(who.name)) == 0) {
        // the upper 32 bits hold the process ID (tgid); the lower 32 bits
        // are the thread ID, which is not what the PID column should show
        who.pid = bpf_get_current_pid_tgid() >> 32;
        whobyreq.update(&req, &who);
    }

    return 0;
}

// time block I/O
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
    u64 ts;

    ts = bpf_ktime_get_ns();
    start.update(&req, &ts);

    return 0;
}

// output
int trace_req_completion(struct pt_regs *ctx, struct request *req)
{
    u64 *tsp;

    // fetch timestamp and calculate delta
    tsp = start.lookup(&req);
    if (tsp == 0) {
        return 0;    // missed tracing issue
    }

    struct who_t *whop;
    struct val_t *valp, zero = {};
    u64 delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;

    // setup info_t key
    struct info_t info = {};
    info.major = req->rq_disk->major;
    info.minor = req->rq_disk->first_minor;
/*
 * The following deals with a kernel version change (in mainline 4.7, although
 * it may be backported to earlier kernels) with how block request write flags
 * are tested. We handle both pre- and post-change versions here. Please avoid
 * kernel version tests like this as much as possible: they inflate the code,
 * test, and maintenance burden.
 */
#ifdef REQ_WRITE
    info.rwflag = !!(req->cmd_flags & REQ_WRITE);
#elif defined(REQ_OP_SHIFT)
    info.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
#else
    info.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
#endif

    whop = whobyreq.lookup(&req);
    if (whop == 0) {
        // missed pid who, save stats as pid 0
        valp = counts.lookup_or_init(&info, &zero);
    } else {
        info.pid = whop->pid;
        __builtin_memcpy(&info.name, whop->name, sizeof(info.name));
        valp = counts.lookup_or_init(&info, &zero);
    }

    // save stats
    valp->us += delta_us;
    valp->bytes += req->__data_len;
    valp->io++;

    start.delete(&req);
    whobyreq.delete(&req);

    return 0;
}
"""
169
# hidden --ebpf flag: dump the BPF program text and quit without tracing
if args.ebpf:
    print(bpf_text)
    exit()

# compile the program and hook its handlers into the block layer
b = BPF(text=bpf_text)
b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
# blk_start_request() belongs to the legacy single-queue request path and is
# absent from kernels that dropped it (around Linux 5.0). Attaching a kprobe
# to a missing symbol raises, so only attach when the kernel exports it.
if BPF.get_kprobe_functions(b'blk_start_request'):
    b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_account_io_completion",
    fn_name="trace_req_completion")

print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval)
182
# Build the "major,minor" -> disk name map once, up front. The first three
# whitespace-separated fields of each diskstats line are used as major,
# minor and name respectively.
disklookup = {}
with open(diskstats) as diskstats_file:
    for fields in map(str.split, diskstats_file):
        disklookup[",".join(fields[:2])] = fields[2]
189
# output
#
# Every `interval` seconds: print a header (time, load average, column
# titles), then up to `maxrows` rows from the BPF "counts" map ordered by
# bytes (largest first), then reset the map. Stops after `countdown`
# summaries, or after the summary that follows a Ctrl-C.
exiting = 0
while 1:
    try:
        sleep(interval)
    except KeyboardInterrupt:
        # still print one final summary below before detaching
        exiting = 1

    # header
    if clear:
        call("clear")
    else:
        print()
    with open(loadavg) as stats:
        print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read()))
    print("%-6s %-16s %1s %-3s %-3s %-8s %5s %7s %6s" % ("PID", "COMM",
        "D", "MAJ", "MIN", "DISK", "I/O", "Kbytes", "AVGms"))

    # by-PID output
    counts = b.get_table("counts")
    by_bytes = sorted(counts.items(), key=lambda kv: kv[1].bytes,
        reverse=True)
    line = 0
    for k, v in by_bytes:

        # lookup disk
        disk = str(k.major) + "," + str(k.minor)
        diskname = disklookup.get(disk, "?")

        # print line
        avg_ms = (float(v.us) / 1000) / v.io
        # Floor division keeps the Kbytes column an integer on Python 3 as
        # well; true division would print long floats through %7s and break
        # the column alignment.
        print("%-6d %-16s %1s %-3d %-3d %-8s %5s %7s %6.2f" % (k.pid,
            k.name.decode('utf-8', 'replace'), "W" if k.rwflag else "R",
            k.major, k.minor, diskname, v.io, v.bytes // 1024, avg_ms))

        line += 1
        if line >= maxrows:
            break
    # start the next interval from zero
    counts.clear()

    countdown -= 1
    if exiting or countdown == 0:
        print("Detaching...")
        exit()
235 exit()