| Brendan Gregg | 6f075b9 | 2016-02-07 00:46:34 -0800 | [diff] [blame] | 1 | #!/usr/bin/python | 
|  | 2 | # @lint-avoid-python-3-compatibility-imports | 
|  | 3 | # | 
|  | 4 | # biotop  block device (disk) I/O by process. | 
|  | 5 | #         For Linux, uses BCC, eBPF. | 
|  | 6 | # | 
|  | 7 | # USAGE: biotop.py [-h] [-C] [-r MAXROWS] [interval] [count] | 
|  | 8 | # | 
|  | 9 | # This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O | 
|  | 10 | # request, as well as a starting timestamp for calculating I/O latency. | 
|  | 11 | # | 
|  | 12 | # Copyright 2016 Netflix, Inc. | 
|  | 13 | # Licensed under the Apache License, Version 2.0 (the "License") | 
|  | 14 | # | 
|  | 15 | # 06-Feb-2016   Brendan Gregg   Created this. | 
|  | 16 |  | 
|  | 17 | from __future__ import print_function | 
|  | 18 | from bcc import BPF | 
|  | 19 | from time import sleep, strftime | 
|  | 20 | import argparse | 
|  | 21 | import signal | 
|  | 22 | from subprocess import call | 
|  | 23 |  | 
# arguments
examples = """examples:
./biotop            # block device I/O top, 1 second refresh
./biotop -C         # don't clear the screen
./biotop 5          # 5 second summaries
./biotop 5 10       # 5 second summaries, 10 times only
"""
parser = argparse.ArgumentParser(
    description="Block device (disk) I/O by process",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-C", "--noclear", action="store_true",
    help="don't clear the screen")
# type=int lets argparse reject non-numeric values with a usage error
# instead of the script dying later with a ValueError traceback.
parser.add_argument("-r", "--maxrows", type=int, default=20,
    help="maximum rows to print, default 20")
parser.add_argument("interval", nargs="?", type=int, default=1,
    help="output interval, in seconds")
parser.add_argument("count", nargs="?", type=int, default=99999999,
    help="number of outputs")
args = parser.parse_args()
interval = args.interval    # seconds to sleep between summaries
countdown = args.count      # summaries left to print before exiting
maxrows = args.maxrows      # row cap per summary
clear = not args.noclear    # clear the screen before each summary unless -C

# linux stats
loadavg = "/proc/loadavg"
diskstats = "/proc/diskstats"
|  | 52 |  | 
# signal handler
def signal_ignore(signal, frame):
    """Emit a single empty line; both handler arguments are ignored."""
    print("")
|  | 56 |  | 
# load BPF program
#
# Three maps are keyed by the request pointer or by the summary key:
#   start    - request -> issue timestamp (ns)
#   whobyreq - request -> PID and comm cached when the I/O was accounted
#   counts   - (pid, rwflag, major, minor, comm) -> bytes, latency, I/O count
b = BPF(text="""
#include <uapi/linux/ptrace.h>
#include <linux/blkdev.h>

// for saving process info by request
struct who_t {
    u32 pid;
    char name[TASK_COMM_LEN];
};

// the key for the output summary
struct info_t {
    u32 pid;
    int rwflag;
    int major;
    int minor;
    char name[TASK_COMM_LEN];
};

// the value of the output summary
struct val_t {
    u64 bytes;
    u64 us;
    u32 io;
};

BPF_HASH(start, struct request *);
BPF_HASH(whobyreq, struct request *, struct who_t);
BPF_HASH(counts, struct info_t, struct val_t);

// cache PID and comm by-req
int trace_pid_start(struct pt_regs *ctx, struct request *req)
{
    struct who_t who = {};

    if (bpf_get_current_comm(&who.name, sizeof(who.name)) == 0) {
        // bpf_get_current_pid_tgid() packs the tgid (the user-visible
        // process ID) in the upper 32 bits and the kernel pid (thread ID)
        // in the lower 32 bits. Keep the tgid so that the PID column
        // reports processes rather than individual threads.
        who.pid = bpf_get_current_pid_tgid() >> 32;
        whobyreq.update(&req, &who);
    }

    return 0;
}

// time block I/O
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
    u64 ts;

    ts = bpf_ktime_get_ns();
    start.update(&req, &ts);

    return 0;
}

// output
int trace_req_completion(struct pt_regs *ctx, struct request *req)
{
    u64 *tsp;

    // fetch timestamp and calculate delta
    tsp = start.lookup(&req);
    if (tsp == 0) {
        return 0;    // missed tracing issue
    }

    struct who_t *whop;
    struct val_t *valp, zero = {};
    u64 delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;

    // setup info_t key
    struct info_t info = {};
    info.major = req->rq_disk->major;
    info.minor = req->rq_disk->first_minor;
/*
 * The following deals with a kernel version change (in mainline 4.7, although
 * it may be backported to earlier kernels) with how block request write flags
 * are tested. We handle both pre- and post-change versions here. Please avoid
 * kernel version tests like this as much as possible: they inflate the code,
 * test, and maintenance burden.
 */
#ifdef REQ_WRITE
    info.rwflag = !!(req->cmd_flags & REQ_WRITE);
#elif defined(REQ_OP_SHIFT)
    info.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
#else
    info.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
#endif

    whop = whobyreq.lookup(&req);
    if (whop == 0) {
        // missed pid who, save stats as pid 0
        valp = counts.lookup_or_init(&info, &zero);
    } else {
        info.pid = whop->pid;
        __builtin_memcpy(&info.name, whop->name, sizeof(info.name));
        valp = counts.lookup_or_init(&info, &zero);
    }

    // save stats
    valp->us += delta_us;
    valp->bytes += req->__data_len;
    valp->io++;

    // the request is finished: drop its per-request bookkeeping
    start.delete(&req);
    whobyreq.delete(&req);

    return 0;
}
""", debug=0)
# NOTE(review): these kprobe targets are kernel-internal symbols; confirm they
# all exist on the kernels this tool is expected to run on.
b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_account_io_completion",
    fn_name="trace_req_completion")
|  | 172 |  | 
print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval)

# cache disk major,minor -> diskname
# Each diskstats line is split on whitespace; the first two fields form the
# "major,minor" key and the third field is stored as the disk name.
with open(diskstats) as stats:
    split_lines = (entry.split() for entry in stats)
    disklookup = dict(
        (fields[0] + "," + fields[1], fields[2]) for fields in split_lines)
|  | 181 |  | 
# output
# Every `interval` seconds: print a header (time and load averages), then up
# to `maxrows` rows from the BPF "counts" map ordered by bytes transferred,
# then reset the map so each summary covers one interval only.
exiting = 0
while 1:
    try:
        sleep(interval)
    except KeyboardInterrupt:
        # Ctrl-C during the wait: still print one last summary, then exit
        exiting = 1

    # header
    if clear:
        call("clear")
    else:
        print()
    with open(loadavg) as stats:
        print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read()))
    print("%-6s %-16s %1s %-3s %-3s %-8s %5s %7s %6s" % ("PID", "COMM",
        "D", "MAJ", "MIN", "DISK", "I/O", "Kbytes", "AVGms"))

    # by-PID output, largest byte totals first
    counts = b.get_table("counts")
    by_bytes = sorted(counts.items(), key=lambda kv: kv[1].bytes,
        reverse=True)
    for row, (k, v) in enumerate(by_bytes, 1):

        # lookup disk; "?" when major,minor was not listed in diskstats
        diskname = disklookup.get(str(k.major) + "," + str(k.minor), "?")

        # print line
        avg_ms = (float(v.us) / 1000) / v.io
        # comm is raw bytes from the kernel and need not be valid UTF-8, so
        # decode leniently; use floor division so Kbytes stays an integer on
        # Python 3 as well as Python 2.
        print("%-6d %-16s %1s %-3d %-3d %-8s %5s %7s %6.2f" % (k.pid,
            k.name.decode('utf-8', 'replace'), "W" if k.rwflag else "R",
            k.major, k.minor, diskname, v.io, v.bytes // 1024, avg_ms))

        if row >= maxrows:
            break
    counts.clear()

    countdown -= 1
    if exiting or countdown == 0:
        print("Detaching...")
        exit()