ext4dist
diff --git a/tools/ext4dist.py b/tools/ext4dist.py
new file mode 100755
index 0000000..b59a227
--- /dev/null
+++ b/tools/ext4dist.py
@@ -0,0 +1,203 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# ext4dist Summarize ext4 operation latency.
+# For Linux, uses BCC, eBPF.
+#
+# USAGE: ext4dist [-h] [-T] [-m] [-p PID] [interval] [count]
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 12-Feb-2016 Brendan Gregg Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep, strftime
+import argparse
+
+# symbols
+kallsyms = "/proc/kallsyms"
+
+# arguments
+examples = """examples:
+ ./ext4dist # show operation latency as a histogram
+ ./ext4dist -p 181 # trace PID 181 only
+ ./ext4dist 1 10 # print 1 second summaries, 10 times
+ ./ext4dist -m 5 # 5s summaries, milliseconds
+"""
+parser = argparse.ArgumentParser(
+ description="Summarize ext4 operation latency",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog=examples)
+parser.add_argument("-T", "--notimestamp", action="store_true",
+ help="don't include timestamp on interval output")
+parser.add_argument("-m", "--milliseconds", action="store_true",
+ help="output in milliseconds")
+parser.add_argument("-p", "--pid",
+ help="trace this PID only")
+parser.add_argument("interval", nargs="?",
+ help="output interval, in seconds")
+parser.add_argument("count", nargs="?", default=99999999,
+ help="number of outputs")
+args = parser.parse_args()
+pid = args.pid
+countdown = int(args.count)
+if args.milliseconds:
+ factor = 1000000
+ label = "msecs"
+else:
+ factor = 1000
+ label = "usecs"
+if args.interval and int(args.interval) == 0:
+ print("ERROR: interval 0. Exiting.")
+ exit()
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+
+#define OP_NAME_LEN 8
+typedef struct dist_key {
+ char op[OP_NAME_LEN];
+ u64 slot;
+} dist_key_t;
+BPF_HASH(start, u32);
+BPF_HISTOGRAM(dist, dist_key_t);
+
+// time operation
+int trace_entry(struct pt_regs *ctx)
+{
+ u32 pid = bpf_get_current_pid_tgid();
+ if (FILTER_PID)
+ return 0;
+ u64 ts = bpf_ktime_get_ns();
+ start.update(&pid, &ts);
+ return 0;
+}
+
+// The current ext4 (Linux 4.5) uses generic_file_read_iter(), instead of it's
+// own function, for reads. So we need to trace that and then filter on ext4,
+// which I do by checking file->f_op.
+int trace_read_entry(struct pt_regs *ctx, struct kiocb *iocb)
+{
+ u32 pid = bpf_get_current_pid_tgid();
+ if (FILTER_PID)
+ return 0;
+
+ // ext4 filter on file->f_op == ext4_file_operations
+ struct file *fp = iocb->ki_filp;
+ if ((u64)fp->f_op != EXT4_FILE_OPERATIONS)
+ return 0;
+
+ u64 ts = bpf_ktime_get_ns();
+ start.update(&pid, &ts);
+ return 0;
+}
+
+static int trace_return(struct pt_regs *ctx, const char *op)
+{
+ u64 *tsp;
+ u32 pid = bpf_get_current_pid_tgid();
+
+ // fetch timestamp and calculate delta
+ tsp = start.lookup(&pid);
+ if (tsp == 0) {
+ return 0; // missed start or filtered
+ }
+ u64 delta = (bpf_ktime_get_ns() - *tsp) / FACTOR;
+
+ // store as histogram
+ dist_key_t key = {.slot = bpf_log2l(delta)};
+ __builtin_memcpy(&key.op, op, sizeof(key.op));
+ dist.increment(key);
+
+ start.delete(&pid);
+ return 0;
+}
+
+int trace_read_return(struct pt_regs *ctx)
+{
+ char *op = "read";
+ return trace_return(ctx, op);
+}
+
+int trace_write_return(struct pt_regs *ctx)
+{
+ char *op = "write";
+ return trace_return(ctx, op);
+}
+
+int trace_open_return(struct pt_regs *ctx)
+{
+ char *op = "open";
+ return trace_return(ctx, op);
+}
+
+int trace_fsync_return(struct pt_regs *ctx)
+{
+ char *op = "fsync";
+ return trace_return(ctx, op);
+}
+"""
+
+# code replacements
+with open(kallsyms) as syms:
+ ops = ''
+ for line in syms:
+ (addr, size, name) = line.rstrip().split(" ", 2)
+ if name == "ext4_file_operations":
+ ops = "0x" + addr
+ break
+ if ops == '':
+ print("ERROR: no ext4_file_operations in /proc/kallsyms. Exiting.")
+ exit()
+ bpf_text = bpf_text.replace('EXT4_FILE_OPERATIONS', ops)
+bpf_text = bpf_text.replace('FACTOR', str(factor))
+if args.pid:
+ bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid)
+else:
+ bpf_text = bpf_text.replace('FILTER_PID', '0')
+if debug:
+ print(bpf_text)
+
+# load BPF program
+b = BPF(text=bpf_text)
+
+# Common file functions. See earlier comment about generic_file_read_iter().
+b.attach_kprobe(event="generic_file_read_iter", fn_name="trace_read_entry")
+b.attach_kprobe(event="ext4_file_write_iter", fn_name="trace_entry")
+b.attach_kprobe(event="ext4_file_open", fn_name="trace_entry")
+b.attach_kprobe(event="ext4_sync_file", fn_name="trace_entry")
+b.attach_kretprobe(event="generic_file_read_iter", fn_name="trace_read_return")
+b.attach_kretprobe(event="ext4_file_write_iter", fn_name="trace_write_return")
+b.attach_kretprobe(event="ext4_file_open", fn_name="trace_open_return")
+b.attach_kretprobe(event="ext4_sync_file", fn_name="trace_fsync_return")
+
+print("Tracing ext4 operation latency... Hit Ctrl-C to end.")
+
+# output
+exiting = 0
+dist = b.get_table("dist")
+while (1):
+ try:
+ if args.interval:
+ sleep(int(args.interval))
+ else:
+ sleep(99999999)
+ except KeyboardInterrupt:
+ exiting = 1
+
+ print()
+ if args.interval and (not args.notimestamp):
+ print(strftime("%H:%M:%S:"))
+
+ dist.print_log2_hist(label, "operation")
+ dist.clear()
+
+ countdown -= 1
+ if exiting or countdown == 0:
+ exit()
diff --git a/tools/ext4dist_example.txt b/tools/ext4dist_example.txt
new file mode 100644
index 0000000..def8e8b
--- /dev/null
+++ b/tools/ext4dist_example.txt
@@ -0,0 +1,193 @@
+Demonstrations of ext4dist, the Linux eBPF/bcc version.
+
+
+ext4dist traces ext4 reads, writes, opens, and fsyncs, and summarizes their
+latency as a power-of-2 histogram. For example:
+
+# ./ext4dist
+Tracing ext4 operation latency... Hit Ctrl-C to end.
+^C
+
+operation = 'read'
+ usecs : count distribution
+ 0 -> 1 : 1210 |****************************************|
+ 2 -> 3 : 126 |**** |
+ 4 -> 7 : 376 |************ |
+ 8 -> 15 : 86 |** |
+ 16 -> 31 : 9 | |
+ 32 -> 63 : 47 |* |
+ 64 -> 127 : 6 | |
+ 128 -> 255 : 24 | |
+ 256 -> 511 : 137 |**** |
+ 512 -> 1023 : 66 |** |
+ 1024 -> 2047 : 13 | |
+ 2048 -> 4095 : 7 | |
+ 4096 -> 8191 : 13 | |
+ 8192 -> 16383 : 3 | |
+
+operation = 'write'
+ usecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 0 | |
+ 8 -> 15 : 75 |****************************************|
+ 16 -> 31 : 5 |** |
+
+operation = 'open'
+ usecs : count distribution
+ 0 -> 1 : 1278 |****************************************|
+ 2 -> 3 : 40 |* |
+ 4 -> 7 : 4 | |
+ 8 -> 15 : 1 | |
+ 16 -> 31 : 1 | |
+
+This output shows a bi-modal distribution for read latency, with a faster
+mode of less than 7 microseconds, and a slower mode of between 256 and 1023
+microseconds. The count column shows how many events fell into that latency
+range. It's likely that the faster mode was a hit from the in-memory file
+system cache, and the slower mode is a read from a storage device (disk).
+
+This "latency" is measured from when the operation was issued from the VFS
+interface to the file system, to when it completed. This spans everything:
+block device I/O (disk I/O), file system CPU cycles, file system locks, run
+queue latency, etc. This is a better measure of the latency suffered by
+applications reading from the file system than measuring this down at the
+block device interface.
+
+Note that this only traces the common file system operations previously
+listed: other file system operations (eg, inode operations including
+getattr()) are not traced.
+
+
+An optional interval and a count can be provided, as well as -m to show the
+distributions in milliseconds. For example:
+
+# ./ext4dist -m 1 5
+Tracing ext4 operation latency... Hit Ctrl-C to end.
+
+10:19:00:
+
+operation = 'read'
+ msecs : count distribution
+ 0 -> 1 : 576 |****************************************|
+ 2 -> 3 : 5 | |
+ 4 -> 7 : 6 | |
+ 8 -> 15 : 13 | |
+ 16 -> 31 : 17 |* |
+ 32 -> 63 : 5 | |
+ 64 -> 127 : 1 | |
+
+operation = 'write'
+ msecs : count distribution
+ 0 -> 1 : 20 |****************************************|
+
+operation = 'open'
+ msecs : count distribution
+ 0 -> 1 : 346 |****************************************|
+
+10:19:01:
+
+operation = 'read'
+ msecs : count distribution
+ 0 -> 1 : 584 |****************************************|
+ 2 -> 3 : 10 | |
+ 4 -> 7 : 11 | |
+ 8 -> 15 : 16 |* |
+ 16 -> 31 : 6 | |
+ 32 -> 63 : 4 | |
+ 64 -> 127 : 2 | |
+ 128 -> 255 : 1 | |
+
+operation = 'write'
+ msecs : count distribution
+ 0 -> 1 : 20 |****************************************|
+
+operation = 'open'
+ msecs : count distribution
+ 0 -> 1 : 336 |****************************************|
+
+10:19:02:
+
+operation = 'read'
+ msecs : count distribution
+ 0 -> 1 : 678 |****************************************|
+ 2 -> 3 : 7 | |
+ 4 -> 7 : 9 | |
+ 8 -> 15 : 25 |* |
+ 16 -> 31 : 10 | |
+ 32 -> 63 : 3 | |
+
+operation = 'write'
+ msecs : count distribution
+ 0 -> 1 : 19 |****************************************|
+ 2 -> 3 : 1 |** |
+
+operation = 'open'
+ msecs : count distribution
+ 0 -> 1 : 390 |****************************************|
+
+10:19:03:
+
+operation = 'read'
+ msecs : count distribution
+ 0 -> 1 : 567 |****************************************|
+ 2 -> 3 : 7 | |
+ 4 -> 7 : 9 | |
+ 8 -> 15 : 20 |* |
+ 16 -> 31 : 15 |* |
+ 32 -> 63 : 5 | |
+ 64 -> 127 : 2 | |
+
+operation = 'write'
+ msecs : count distribution
+ 0 -> 1 : 20 |****************************************|
+
+operation = 'open'
+ msecs : count distribution
+ 0 -> 1 : 417 |****************************************|
+
+10:19:04:
+
+operation = 'read'
+ msecs : count distribution
+ 0 -> 1 : 762 |****************************************|
+ 2 -> 3 : 9 | |
+ 4 -> 7 : 9 | |
+ 8 -> 15 : 11 | |
+ 16 -> 31 : 20 |* |
+ 32 -> 63 : 4 | |
+ 64 -> 127 : 1 | |
+
+operation = 'write'
+ msecs : count distribution
+ 0 -> 1 : 20 |****************************************|
+
+operation = 'open'
+ msecs : count distribution
+ 0 -> 1 : 427 |****************************************|
+
+This shows a mixed read/write workload.
+
+
+USAGE message:
+
+# ./ext4dist -h
+usage: ext4dist [-h] [-T] [-m] [-p PID] [interval] [count]
+
+Summarize ext4 operation latency
+
+positional arguments:
+ interval output interval, in seconds
+ count number of outputs
+
+optional arguments:
+ -h, --help show this help message and exit
+ -T, --notimestamp don't include timestamp on interval output
+ -m, --milliseconds output in milliseconds
+ -p PID, --pid PID trace this PID only
+
+examples:
+ ./ext4dist # show operation latency as a histogram
+ ./ext4dist -p 181 # trace PID 181 only
+ ./ext4dist 1 10 # print 1 second summaries, 10 times
+ ./ext4dist -m 5 # 5s summaries, milliseconds