Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 1 | #!/usr/bin/python |
| 2 | # @lint-avoid-python-3-compatibility-imports |
| 3 | # |
Brendan Gregg | 75d3e9d | 2016-02-07 18:48:20 -0800 | [diff] [blame] | 4 | # fileslower Trace slow synchronous file reads and writes. |
| 5 | # For Linux, uses BCC, eBPF. |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 6 | # |
Mark Drayton | 7434731 | 2016-08-25 20:46:35 +0100 | [diff] [blame] | 7 | # USAGE: fileslower [-h] [-p PID] [-a] [min_ms] |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 8 | # |
| 9 | # This script uses kernel dynamic tracing of synchronous reads and writes |
Brendan Gregg | 75d3e9d | 2016-02-07 18:48:20 -0800 | [diff] [blame] | 10 | # at the VFS interface, to identify slow file reads and writes for any file |
| 11 | # system. |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 12 | # |
| 13 | # This works by tracing __vfs_read() and __vfs_write(), and filtering for |
| 14 | # synchronous I/O (the path to new_sync_read() and new_sync_write()), and |
| 15 | # for I/O with filenames. This approach provides a view of just two file |
| 16 | # system request types. There are typically many others: asynchronous I/O, |
| 17 | # directory operations, file handle operations, etc, that this tool does not |
Brendan Gregg | 75d3e9d | 2016-02-07 18:48:20 -0800 | [diff] [blame] | 18 | # instrument. |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 19 | # |
| 20 | # WARNING: This traces VFS reads and writes, which can be extremely frequent, |
| 21 | # and so the overhead of this tool can become severe depending on the |
| 22 | # workload. |
| 23 | # |
| 24 | # By default, a minimum millisecond threshold of 10 is used. |
| 25 | # |
| 26 | # Copyright 2016 Netflix, Inc. |
| 27 | # Licensed under the Apache License, Version 2.0 (the "License") |
| 28 | # |
| 29 | # 06-Feb-2016 Brendan Gregg Created this. |
| 30 | |
| 31 | from __future__ import print_function |
| 32 | from bcc import BPF |
| 33 | import argparse |
Mark Drayton | 266d6f6 | 2016-05-24 07:01:01 -0700 | [diff] [blame] | 34 | import ctypes as ct |
Mark Drayton | 266d6f6 | 2016-05-24 07:01:01 -0700 | [diff] [blame] | 35 | import time |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 36 | |
| 37 | # arguments |
| 38 | examples = """examples: |
Brendan Gregg | 75d3e9d | 2016-02-07 18:48:20 -0800 | [diff] [blame] | 39 | ./fileslower # trace sync file I/O slower than 10 ms (default) |
| 40 | ./fileslower 1 # trace sync file I/O slower than 1 ms |
| 41 | ./fileslower -p 185 # trace PID 185 only |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 42 | """ |
| 43 | parser = argparse.ArgumentParser( |
Brendan Gregg | 75d3e9d | 2016-02-07 18:48:20 -0800 | [diff] [blame] | 44 | description="Trace slow synchronous file reads and writes", |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 45 | formatter_class=argparse.RawDescriptionHelpFormatter, |
| 46 | epilog=examples) |
Mark Drayton | 7434731 | 2016-08-25 20:46:35 +0100 | [diff] [blame] | 47 | parser.add_argument("-p", "--pid", type=int, metavar="PID", dest="tgid", |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 48 | help="trace this PID only") |
Mark Drayton | 7434731 | 2016-08-25 20:46:35 +0100 | [diff] [blame] | 49 | parser.add_argument("-a", "--all-files", action="store_true", |
| 50 | help="include non-regular file types (sockets, FIFOs, etc)") |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 51 | parser.add_argument("min_ms", nargs="?", default='10', |
| 52 | help="minimum I/O duration to trace, in ms (default 10)") |
Nathan Scott | cf0792f | 2018-02-02 16:56:50 +1100 | [diff] [blame] | 53 | parser.add_argument("--ebpf", action="store_true", |
| 54 | help=argparse.SUPPRESS) |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 55 | args = parser.parse_args() |
| 56 | min_ms = int(args.min_ms) |
Mark Drayton | 7434731 | 2016-08-25 20:46:35 +0100 | [diff] [blame] | 57 | tgid = args.tgid |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 58 | debug = 0 |
| 59 | |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 60 | # define BPF program |
| 61 | bpf_text = """ |
| 62 | #include <uapi/linux/ptrace.h> |
| 63 | #include <linux/fs.h> |
Mark Drayton | 266d6f6 | 2016-05-24 07:01:01 -0700 | [diff] [blame] | 64 | #include <linux/sched.h> |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 65 | |
Mark Drayton | 266d6f6 | 2016-05-24 07:01:01 -0700 | [diff] [blame] | 66 | enum trace_mode { |
| 67 | MODE_READ, |
| 68 | MODE_WRITE |
| 69 | }; |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 70 | |
| 71 | struct val_t { |
| 72 | u32 sz; |
| 73 | u64 ts; |
Mark Drayton | 7434731 | 2016-08-25 20:46:35 +0100 | [diff] [blame] | 74 | u32 name_len; |
| 75 | // de->d_name.name may point to de->d_iname so limit len accordingly |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 76 | char name[DNAME_INLINE_LEN]; |
Mark Drayton | 266d6f6 | 2016-05-24 07:01:01 -0700 | [diff] [blame] | 77 | char comm[TASK_COMM_LEN]; |
| 78 | }; |
| 79 | |
| 80 | struct data_t { |
| 81 | enum trace_mode mode; |
| 82 | u32 pid; |
| 83 | u32 sz; |
| 84 | u64 delta_us; |
Mark Drayton | 7434731 | 2016-08-25 20:46:35 +0100 | [diff] [blame] | 85 | u32 name_len; |
Mark Drayton | 266d6f6 | 2016-05-24 07:01:01 -0700 | [diff] [blame] | 86 | char name[DNAME_INLINE_LEN]; |
| 87 | char comm[TASK_COMM_LEN]; |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 88 | }; |
| 89 | |
| 90 | BPF_HASH(entryinfo, pid_t, struct val_t); |
Mark Drayton | 266d6f6 | 2016-05-24 07:01:01 -0700 | [diff] [blame] | 91 | BPF_PERF_OUTPUT(events); |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 92 | |
| 93 | // store timestamp and size on entry |
| 94 | static int trace_rw_entry(struct pt_regs *ctx, struct file *file, |
| 95 | char __user *buf, size_t count) |
| 96 | { |
Mark Drayton | 7434731 | 2016-08-25 20:46:35 +0100 | [diff] [blame] | 97 | u32 tgid = bpf_get_current_pid_tgid() >> 32; |
| 98 | if (TGID_FILTER) |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 99 | return 0; |
| 100 | |
Mark Drayton | 7434731 | 2016-08-25 20:46:35 +0100 | [diff] [blame] | 101 | u32 pid = bpf_get_current_pid_tgid(); |
| 102 | |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 103 | // skip I/O lacking a filename |
| 104 | struct dentry *de = file->f_path.dentry; |
Mark Drayton | 7434731 | 2016-08-25 20:46:35 +0100 | [diff] [blame] | 105 | int mode = file->f_inode->i_mode; |
| 106 | if (de->d_name.len == 0 || TYPE_FILTER) |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 107 | return 0; |
| 108 | |
| 109 | // store size and timestamp by pid |
| 110 | struct val_t val = {}; |
| 111 | val.sz = count; |
| 112 | val.ts = bpf_ktime_get_ns(); |
Mark Drayton | 7434731 | 2016-08-25 20:46:35 +0100 | [diff] [blame] | 113 | |
| 114 | val.name_len = de->d_name.len; |
| 115 | bpf_probe_read(&val.name, sizeof(val.name), (void *)de->d_name.name); |
Mark Drayton | 266d6f6 | 2016-05-24 07:01:01 -0700 | [diff] [blame] | 116 | bpf_get_current_comm(&val.comm, sizeof(val.comm)); |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 117 | entryinfo.update(&pid, &val); |
| 118 | |
| 119 | return 0; |
| 120 | } |
| 121 | |
| 122 | int trace_read_entry(struct pt_regs *ctx, struct file *file, |
| 123 | char __user *buf, size_t count) |
| 124 | { |
| 125 | // skip non-sync I/O; see kernel code for __vfs_read() |
| 126 | if (!(file->f_op->read_iter)) |
| 127 | return 0; |
| 128 | return trace_rw_entry(ctx, file, buf, count); |
| 129 | } |
| 130 | |
| 131 | int trace_write_entry(struct pt_regs *ctx, struct file *file, |
| 132 | char __user *buf, size_t count) |
| 133 | { |
| 134 | // skip non-sync I/O; see kernel code for __vfs_write() |
| 135 | if (!(file->f_op->write_iter)) |
| 136 | return 0; |
| 137 | return trace_rw_entry(ctx, file, buf, count); |
| 138 | } |
| 139 | |
| 140 | // output |
| 141 | static int trace_rw_return(struct pt_regs *ctx, int type) |
| 142 | { |
| 143 | struct val_t *valp; |
| 144 | u32 pid = bpf_get_current_pid_tgid(); |
| 145 | |
| 146 | valp = entryinfo.lookup(&pid); |
| 147 | if (valp == 0) { |
| 148 | // missed tracing issue or filtered |
| 149 | return 0; |
| 150 | } |
| 151 | u64 delta_us = (bpf_ktime_get_ns() - valp->ts) / 1000; |
| 152 | entryinfo.delete(&pid); |
| 153 | if (delta_us < MIN_US) |
| 154 | return 0; |
| 155 | |
Mark Drayton | 266d6f6 | 2016-05-24 07:01:01 -0700 | [diff] [blame] | 156 | struct data_t data = {}; |
| 157 | data.mode = type; |
| 158 | data.pid = pid; |
| 159 | data.sz = valp->sz; |
| 160 | data.delta_us = delta_us; |
Mark Drayton | 7434731 | 2016-08-25 20:46:35 +0100 | [diff] [blame] | 161 | data.name_len = valp->name_len; |
Mark Drayton | 266d6f6 | 2016-05-24 07:01:01 -0700 | [diff] [blame] | 162 | bpf_probe_read(&data.name, sizeof(data.name), valp->name); |
| 163 | bpf_probe_read(&data.comm, sizeof(data.comm), valp->comm); |
| 164 | events.perf_submit(ctx, &data, sizeof(data)); |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 165 | |
| 166 | return 0; |
| 167 | } |
| 168 | |
| 169 | int trace_read_return(struct pt_regs *ctx) |
| 170 | { |
Mark Drayton | 266d6f6 | 2016-05-24 07:01:01 -0700 | [diff] [blame] | 171 | return trace_rw_return(ctx, MODE_READ); |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 172 | } |
| 173 | |
| 174 | int trace_write_return(struct pt_regs *ctx) |
| 175 | { |
Mark Drayton | 266d6f6 | 2016-05-24 07:01:01 -0700 | [diff] [blame] | 176 | return trace_rw_return(ctx, MODE_WRITE); |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 177 | } |
| 178 | |
| 179 | """ |
| 180 | bpf_text = bpf_text.replace('MIN_US', str(min_ms * 1000)) |
Mark Drayton | 7434731 | 2016-08-25 20:46:35 +0100 | [diff] [blame] | 181 | if args.tgid: |
| 182 | bpf_text = bpf_text.replace('TGID_FILTER', 'tgid != %d' % tgid) |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 183 | else: |
Mark Drayton | 7434731 | 2016-08-25 20:46:35 +0100 | [diff] [blame] | 184 | bpf_text = bpf_text.replace('TGID_FILTER', '0') |
| 185 | if args.all_files: |
| 186 | bpf_text = bpf_text.replace('TYPE_FILTER', '0') |
| 187 | else: |
| 188 | bpf_text = bpf_text.replace('TYPE_FILTER', '!S_ISREG(mode)') |
| 189 | |
Nathan Scott | cf0792f | 2018-02-02 16:56:50 +1100 | [diff] [blame] | 190 | if debug or args.ebpf: |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 191 | print(bpf_text) |
Nathan Scott | cf0792f | 2018-02-02 16:56:50 +1100 | [diff] [blame] | 192 | if args.ebpf: |
| 193 | exit() |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 194 | |
| 195 | # initialize BPF |
Nathan Scott | cf0792f | 2018-02-02 16:56:50 +1100 | [diff] [blame] | 196 | b = BPF(text=bpf_text) |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 197 | |
| 198 | # I'd rather trace these via new_sync_read/new_sync_write (which used to be |
| 199 | # do_sync_read/do_sync_write), but those became static. So trace these from |
| 200 | # the parent functions, at the cost of more overhead, instead. |
| 201 | # Ultimately, we should be using [V]FS tracepoints. |
| 202 | b.attach_kprobe(event="__vfs_read", fn_name="trace_read_entry") |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 203 | b.attach_kretprobe(event="__vfs_read", fn_name="trace_read_return") |
Mark Drayton | 32a4fd3 | 2016-07-13 18:24:56 +0100 | [diff] [blame] | 204 | try: |
| 205 | b.attach_kprobe(event="__vfs_write", fn_name="trace_write_entry") |
| 206 | b.attach_kretprobe(event="__vfs_write", fn_name="trace_write_return") |
| 207 | except: |
| 208 | # older kernels don't have __vfs_write so try vfs_write instead |
| 209 | b.attach_kprobe(event="vfs_write", fn_name="trace_write_entry") |
| 210 | b.attach_kretprobe(event="vfs_write", fn_name="trace_write_return") |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 211 | |
Mark Drayton | 266d6f6 | 2016-05-24 07:01:01 -0700 | [diff] [blame] | 212 | TASK_COMM_LEN = 16 # linux/sched.h |
| 213 | DNAME_INLINE_LEN = 32 # linux/dcache.h |
| 214 | |
| 215 | class Data(ct.Structure): |
| 216 | _fields_ = [ |
| 217 | ("mode", ct.c_int), |
| 218 | ("pid", ct.c_uint), |
| 219 | ("sz", ct.c_uint), |
| 220 | ("delta_us", ct.c_ulonglong), |
Mark Drayton | 7434731 | 2016-08-25 20:46:35 +0100 | [diff] [blame] | 221 | ("name_len", ct.c_uint), |
Mark Drayton | 266d6f6 | 2016-05-24 07:01:01 -0700 | [diff] [blame] | 222 | ("name", ct.c_char * DNAME_INLINE_LEN), |
| 223 | ("comm", ct.c_char * TASK_COMM_LEN), |
| 224 | ] |
| 225 | |
| 226 | mode_s = { |
| 227 | 0: 'R', |
| 228 | 1: 'W', |
| 229 | } |
| 230 | |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 231 | # header |
| 232 | print("Tracing sync read/writes slower than %d ms" % min_ms) |
Mark Drayton | 7434731 | 2016-08-25 20:46:35 +0100 | [diff] [blame] | 233 | print("%-8s %-14s %-6s %1s %-7s %7s %s" % ("TIME(s)", "COMM", "TID", "D", |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 234 | "BYTES", "LAT(ms)", "FILENAME")) |
| 235 | |
Mark Drayton | 266d6f6 | 2016-05-24 07:01:01 -0700 | [diff] [blame] | 236 | start_ts = time.time() |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 237 | |
Mark Drayton | 266d6f6 | 2016-05-24 07:01:01 -0700 | [diff] [blame] | 238 | def print_event(cpu, data, size): |
| 239 | event = ct.cast(data, ct.POINTER(Data)).contents |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 240 | |
Mark Drayton | 266d6f6 | 2016-05-24 07:01:01 -0700 | [diff] [blame] | 241 | ms = float(event.delta_us) / 1000 |
Rafael F | 78948e4 | 2017-03-26 14:54:25 +0200 | [diff] [blame] | 242 | name = event.name.decode() |
Mark Drayton | 7434731 | 2016-08-25 20:46:35 +0100 | [diff] [blame] | 243 | if event.name_len > DNAME_INLINE_LEN: |
| 244 | name = name[:-3] + "..." |
Brendan Gregg | a32fbaf | 2016-02-07 12:14:37 -0800 | [diff] [blame] | 245 | |
| 246 | print("%-8.3f %-14.14s %-6s %1s %-7s %7.2f %s" % ( |
Rafael F | 78948e4 | 2017-03-26 14:54:25 +0200 | [diff] [blame] | 247 | time.time() - start_ts, event.comm.decode(), event.pid, |
| 248 | mode_s[event.mode], event.sz, ms, name)) |
Mark Drayton | 266d6f6 | 2016-05-24 07:01:01 -0700 | [diff] [blame] | 249 | |
Mark Drayton | 5f5687e | 2017-02-20 18:13:03 +0000 | [diff] [blame] | 250 | b["events"].open_perf_buffer(print_event, page_cnt=64) |
Mark Drayton | 266d6f6 | 2016-05-24 07:01:01 -0700 | [diff] [blame] | 251 | while 1: |
Teng Qin | dbf0029 | 2018-02-28 21:47:50 -0800 | [diff] [blame] | 252 | b.perf_buffer_poll() |