runqlat
diff --git a/tools/runqlat.py b/tools/runqlat.py
new file mode 100755
index 0000000..558981d
--- /dev/null
+++ b/tools/runqlat.py
@@ -0,0 +1,167 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# runqlat Run queue (scheduler) latency as a histogram.
+# For Linux, uses BCC, eBPF.
+#
+# USAGE: runqlat [-h] [-T] [-m] [-P] [-p PID] [interval] [count]
+#
+# This measures the time a task spends waiting on a run queue for a turn
+# on-CPU, and shows this time as a histogram. This time should be small, but a
+# task may need to wait its turn due to CPU load.
+#
+# This measures two types of run queue latency:
+# 1. The time from a task being enqueued on a run queue to its context switch
+# and execution. This traces enqueue_task_*() -> finish_task_switch(),
+# and instruments the run queue latency after a voluntary context switch.
+# 2. The time from when a task was involuntary context switched and still
+# in the runnable state, to when it next executed. This is instrumented
+# from finish_task_switch() alone.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 07-Feb-2016 Brendan Gregg Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep, strftime
+import argparse
+
+# arguments
+examples = """examples:
+ ./runqlat # summarize run queue latency as a histogram
+ ./runqlat 1 10 # print 1 second summaries, 10 times
+ ./runqlat -mT 1 # 1s summaries, milliseconds, and timestamps
+ ./runqlat -P # show each PID separately
+ ./runqlat -p 185 # trace PID 185 only
+"""
+parser = argparse.ArgumentParser(
+    description="Summarize run queue (scheduler) latency as a histogram",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog=examples)
+parser.add_argument("-T", "--timestamp", action="store_true",
+ help="include timestamp on output")
+parser.add_argument("-m", "--milliseconds", action="store_true",
+ help="millisecond histogram")
+parser.add_argument("-P", "--pids", action="store_true",
+ help="print a histogram per process ID")
+parser.add_argument("-p", "--pid",
+ help="trace this PID only")
+parser.add_argument("interval", nargs="?", default=99999999,
+ help="output interval, in seconds")
+parser.add_argument("count", nargs="?", default=99999999,
+ help="number of outputs")
+args = parser.parse_args()
+countdown = int(args.count)
+debug = 0
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include "kernel/sched/sched.h"
+
+typedef struct pid_key {
+ u64 pid; // work around
+ u64 slot;
+} pid_key_t;
+BPF_HASH(start, u32);
+STORAGE
+
+// record enqueue timestamp
+int trace_enqueue(struct pt_regs *ctx, struct rq *rq, struct task_struct *p,
+ int flags)
+{
+ u32 pid = p->pid;
+ if (FILTER)
+ return 0;
+ u64 ts = bpf_ktime_get_ns();
+ start.update(&pid, &ts);
+ return 0;
+}
+
+// calculate latency
+int trace_run(struct pt_regs *ctx, struct task_struct *prev)
+{
+ u32 pid;
+
+ // ivcsw: treat like an enqueue event and store timestamp
+ if (prev->state == TASK_RUNNING) {
+ pid = prev->pid;
+ if (!(FILTER)) {
+ u64 ts = bpf_ktime_get_ns();
+ start.update(&pid, &ts);
+ }
+ }
+
+ pid = bpf_get_current_pid_tgid();
+ if (FILTER)
+ return 0;
+ u64 *tsp, delta;
+
+ // fetch timestamp and calculate delta
+ tsp = start.lookup(&pid);
+ if (tsp == 0) {
+ return 0; // missed enqueue
+ }
+ delta = bpf_ktime_get_ns() - *tsp;
+ FACTOR
+
+ // store as histogram
+ STORE
+
+ start.delete(&pid);
+ return 0;
+}
+"""
+
+# code substitutions
+if args.pid:
+ bpf_text = bpf_text.replace('FILTER', 'pid != %s' % args.pid)
+else:
+ bpf_text = bpf_text.replace('FILTER', '0')
+if args.milliseconds:
+ bpf_text = bpf_text.replace('FACTOR', 'delta /= 1000000;')
+ label = "msecs"
+else:
+ bpf_text = bpf_text.replace('FACTOR', 'delta /= 1000;')
+ label = "usecs"
+if args.pids:
+ bpf_text = bpf_text.replace('STORAGE',
+ 'BPF_HISTOGRAM(dist, pid_key_t);')
+ bpf_text = bpf_text.replace('STORE',
+ 'pid_key_t key = {.pid = pid, .slot = bpf_log2l(delta)}; ' +
+ 'dist.increment(key);')
+else:
+ bpf_text = bpf_text.replace('STORAGE', 'BPF_HISTOGRAM(dist);')
+ bpf_text = bpf_text.replace('STORE',
+ 'dist.increment(bpf_log2l(delta));')
+if debug:
+ print(bpf_text)
+
+# load BPF program
+b = BPF(text=bpf_text)
+b.attach_kprobe(event_re="enqueue_task_*", fn_name="trace_enqueue")
+b.attach_kprobe(event="finish_task_switch", fn_name="trace_run")
+
+print("Tracing run queue latency... Hit Ctrl-C to end.")
+
+# output
+exiting = 0 if args.interval else 1
+dist = b.get_table("dist")
+while (1):
+ try:
+ sleep(int(args.interval))
+ except KeyboardInterrupt:
+ exiting = 1
+
+ print()
+ if args.timestamp:
+ print("%-8s\n" % strftime("%H:%M:%S"), end="")
+
+ dist.print_log2_hist(label, "pid", section_print_fn=int)
+ dist.clear()
+
+ countdown -= 1
+ if exiting or countdown == 0:
+ exit()
diff --git a/tools/runqlat_example.txt b/tools/runqlat_example.txt
new file mode 100644
index 0000000..9999dd3
--- /dev/null
+++ b/tools/runqlat_example.txt
@@ -0,0 +1,486 @@
+Demonstrations of runqlat, the Linux eBPF/bcc version.
+
+
+This program summarizes scheduler run queue latency as a histogram, showing
+how long tasks spent waiting their turn to run on-CPU.
+
+Here is a heavily loaded system:
+
+# ./runqlat
+Tracing run queue latency... Hit Ctrl-C to end.
+^C
+ usecs : count distribution
+ 0 -> 1 : 233 |*********** |
+ 2 -> 3 : 742 |************************************ |
+ 4 -> 7 : 203 |********** |
+ 8 -> 15 : 173 |******** |
+ 16 -> 31 : 24 |* |
+ 32 -> 63 : 0 | |
+ 64 -> 127 : 30 |* |
+ 128 -> 255 : 6 | |
+ 256 -> 511 : 3 | |
+ 512 -> 1023 : 5 | |
+ 1024 -> 2047 : 27 |* |
+ 2048 -> 4095 : 30 |* |
+ 4096 -> 8191 : 20 | |
+ 8192 -> 16383 : 29 |* |
+ 16384 -> 32767 : 809 |****************************************|
+ 32768 -> 65535 : 64 |*** |
+
+The distribution is bimodal, with one mode between 0 and 15 microseconds,
+and another between 16 and 65 milliseconds. These modes are visible as the
+spikes in the ASCII distribution (which is merely a visual representation
+of the "count" column). As an example of reading one line: 809 events fell
+into the 16384 to 32767 microsecond range (16 to 32 ms) while tracing.
+
+I would expect the two modes to be due to the workload: 16 hot CPU-bound threads,
+and many other mostly idle threads doing occasional work. I suspect the mostly
+idle threads will run with a higher priority when they wake up, and are
+the reason for the low latency mode. The high latency mode will be the
+CPU-bound threads. More analysis with this and other tools can confirm.
+
+
+A -m option can be used to show milliseconds instead, as well as an interval
+and a count. For example, showing three five-second summaries in milliseconds:
+
+# ./runqlat -m 5 3
+Tracing run queue latency... Hit Ctrl-C to end.
+
+ msecs : count distribution
+ 0 -> 1 : 3818 |****************************************|
+ 2 -> 3 : 39 | |
+ 4 -> 7 : 39 | |
+ 8 -> 15 : 62 | |
+ 16 -> 31 : 2214 |*********************** |
+ 32 -> 63 : 226 |** |
+
+ msecs : count distribution
+ 0 -> 1 : 3775 |****************************************|
+ 2 -> 3 : 52 | |
+ 4 -> 7 : 37 | |
+ 8 -> 15 : 65 | |
+ 16 -> 31 : 2230 |*********************** |
+ 32 -> 63 : 212 |** |
+
+ msecs : count distribution
+ 0 -> 1 : 3816 |****************************************|
+ 2 -> 3 : 49 | |
+ 4 -> 7 : 40 | |
+ 8 -> 15 : 53 | |
+ 16 -> 31 : 2228 |*********************** |
+ 32 -> 63 : 221 |** |
+
+This shows a similar distribution across the three summaries.
+
+
+A -p option can be used to show one PID only, which is filtered in kernel for
+efficiency. For example, PID 4505, and one second summaries:
+
+# ./runqlat -mp 4505 1
+Tracing run queue latency... Hit Ctrl-C to end.
+
+ msecs : count distribution
+ 0 -> 1 : 1 |* |
+ 2 -> 3 : 2 |*** |
+ 4 -> 7 : 1 |* |
+ 8 -> 15 : 0 | |
+ 16 -> 31 : 25 |****************************************|
+ 32 -> 63 : 3 |**** |
+
+ msecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 2 |** |
+ 4 -> 7 : 0 | |
+ 8 -> 15 : 1 |* |
+ 16 -> 31 : 30 |****************************************|
+ 32 -> 63 : 1 |* |
+
+ msecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 0 | |
+ 8 -> 15 : 0 | |
+ 16 -> 31 : 28 |****************************************|
+ 32 -> 63 : 2 |** |
+
+ msecs : count distribution
+ 0 -> 1 : 1 |* |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 0 | |
+ 8 -> 15 : 0 | |
+ 16 -> 31 : 27 |****************************************|
+ 32 -> 63 : 4 |***** |
+[...]
+
+For comparison, here is pidstat(1) for that process:
+
+# pidstat -p 4505 1
+Linux 4.4.0-virtual (bgregg-xxxxxxxx) 02/08/2016 _x86_64_ (8 CPU)
+
+08:56:11 AM UID PID %usr %system %guest %CPU CPU Command
+08:56:12 AM 0 4505 9.00 3.00 0.00 12.00 0 bash
+08:56:13 AM 0 4505 7.00 5.00 0.00 12.00 0 bash
+08:56:14 AM 0 4505 10.00 2.00 0.00 12.00 0 bash
+08:56:15 AM 0 4505 11.00 2.00 0.00 13.00 0 bash
+08:56:16 AM 0 4505 9.00 3.00 0.00 12.00 0 bash
+[...]
+
+This is a synthetic workload that is CPU bound. It's only spending 12% on-CPU
+each second because of high CPU demand on this server: the remaining time
+is spent waiting on a run queue, as visualized by runqlat.
+
+
+Here is the same system, but when it is CPU idle:
+
+# ./runqlat 5 1
+Tracing run queue latency... Hit Ctrl-C to end.
+
+ usecs : count distribution
+ 0 -> 1 : 2250 |******************************** |
+ 2 -> 3 : 2340 |********************************** |
+ 4 -> 7 : 2746 |****************************************|
+ 8 -> 15 : 418 |****** |
+ 16 -> 31 : 93 |* |
+ 32 -> 63 : 28 | |
+ 64 -> 127 : 119 |* |
+ 128 -> 255 : 9 | |
+ 256 -> 511 : 4 | |
+ 512 -> 1023 : 20 | |
+ 1024 -> 2047 : 22 | |
+ 2048 -> 4095 : 5 | |
+ 4096 -> 8191 : 2 | |
+
+Back to a microsecond scale, this time there is little run queue latency past 1
+millisecond, as would be expected.
+
+
+Now 16 threads are performing heavy disk I/O:
+
+# ./runqlat 5 1
+Tracing run queue latency... Hit Ctrl-C to end.
+
+ usecs : count distribution
+ 0 -> 1 : 204 | |
+ 2 -> 3 : 944 |* |
+ 4 -> 7 : 16315 |********************* |
+ 8 -> 15 : 29897 |****************************************|
+ 16 -> 31 : 1044 |* |
+ 32 -> 63 : 23 | |
+ 64 -> 127 : 128 | |
+ 128 -> 255 : 24 | |
+ 256 -> 511 : 5 | |
+ 512 -> 1023 : 13 | |
+ 1024 -> 2047 : 15 | |
+ 2048 -> 4095 : 13 | |
+ 4096 -> 8191 : 10 | |
+
+The distribution hasn't changed too much. While the disks are 100% busy, there
+is still plenty of CPU headroom, and threads still don't spend much time
+waiting their turn.
+
+
+A -P option will print a distribution for each PID:
+
+# ./runqlat -P
+Tracing run queue latency... Hit Ctrl-C to end.
+^C
+
+pid = 0
+ usecs : count distribution
+ 0 -> 1 : 351 |******************************** |
+ 2 -> 3 : 96 |******** |
+ 4 -> 7 : 437 |****************************************|
+ 8 -> 15 : 12 |* |
+ 16 -> 31 : 10 | |
+ 32 -> 63 : 0 | |
+ 64 -> 127 : 16 |* |
+ 128 -> 255 : 0 | |
+ 256 -> 511 : 0 | |
+ 512 -> 1023 : 0 | |
+ 1024 -> 2047 : 0 | |
+ 2048 -> 4095 : 0 | |
+ 4096 -> 8191 : 0 | |
+ 8192 -> 16383 : 1 | |
+
+pid = 12929
+ usecs : count distribution
+ 0 -> 1 : 1 |****************************************|
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 1 |****************************************|
+
+pid = 12930
+ usecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 0 | |
+ 8 -> 15 : 0 | |
+ 16 -> 31 : 1 |****************************************|
+ 32 -> 63 : 0 | |
+ 64 -> 127 : 1 |****************************************|
+
+pid = 12931
+ usecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 1 |******************** |
+ 8 -> 15 : 0 | |
+ 16 -> 31 : 0 | |
+ 32 -> 63 : 0 | |
+ 64 -> 127 : 0 | |
+ 128 -> 255 : 0 | |
+ 256 -> 511 : 0 | |
+ 512 -> 1023 : 2 |****************************************|
+
+pid = 12932
+ usecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 0 | |
+ 8 -> 15 : 0 | |
+ 16 -> 31 : 0 | |
+ 32 -> 63 : 0 | |
+ 64 -> 127 : 0 | |
+ 128 -> 255 : 1 |****************************************|
+ 256 -> 511 : 0 | |
+ 512 -> 1023 : 1 |****************************************|
+
+pid = 7
+ usecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 426 |************************************* |
+ 4 -> 7 : 457 |****************************************|
+ 8 -> 15 : 16 |* |
+
+pid = 9
+ usecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 425 |****************************************|
+ 8 -> 15 : 16 |* |
+
+pid = 11
+ usecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 10 |****************************************|
+
+pid = 14
+ usecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 8 |****************************************|
+ 4 -> 7 : 2 |********** |
+
+pid = 18
+ usecs : count distribution
+ 0 -> 1 : 414 |****************************************|
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 20 |* |
+ 8 -> 15 : 8 | |
+
+pid = 12928
+ usecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 1 |****************************************|
+ 8 -> 15 : 0 | |
+ 16 -> 31 : 0 | |
+ 32 -> 63 : 0 | |
+ 64 -> 127 : 1 |****************************************|
+
+pid = 1867
+ usecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 0 | |
+ 8 -> 15 : 15 |****************************************|
+ 16 -> 31 : 1 |** |
+ 32 -> 63 : 0 | |
+ 64 -> 127 : 0 | |
+ 128 -> 255 : 4 |********** |
+
+pid = 1871
+ usecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 0 | |
+ 8 -> 15 : 2 |****************************************|
+ 16 -> 31 : 0 | |
+ 32 -> 63 : 0 | |
+ 64 -> 127 : 0 | |
+ 128 -> 255 : 0 | |
+ 256 -> 511 : 0 | |
+ 512 -> 1023 : 1 |******************** |
+
+pid = 1876
+ usecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 0 | |
+ 8 -> 15 : 1 |****************************************|
+ 16 -> 31 : 0 | |
+ 32 -> 63 : 0 | |
+ 64 -> 127 : 0 | |
+ 128 -> 255 : 0 | |
+ 256 -> 511 : 1 |****************************************|
+
+pid = 1878
+ usecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 0 | |
+ 8 -> 15 : 0 | |
+ 16 -> 31 : 3 |****************************************|
+
+pid = 1880
+ usecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 0 | |
+ 8 -> 15 : 3 |****************************************|
+
+pid = 9307
+ usecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 0 | |
+ 8 -> 15 : 1 |****************************************|
+
+pid = 1886
+ usecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 1 |******************** |
+ 8 -> 15 : 2 |****************************************|
+
+pid = 1888
+ usecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 0 | |
+ 8 -> 15 : 3 |****************************************|
+
+pid = 3297
+ usecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 0 | |
+ 8 -> 15 : 1 |****************************************|
+
+pid = 1892
+ usecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 0 | |
+ 8 -> 15 : 0 | |
+ 16 -> 31 : 1 |******************** |
+ 32 -> 63 : 0 | |
+ 64 -> 127 : 0 | |
+ 128 -> 255 : 0 | |
+ 256 -> 511 : 0 | |
+ 512 -> 1023 : 2 |****************************************|
+
+pid = 7024
+ usecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 0 | |
+ 8 -> 15 : 4 |****************************************|
+
+pid = 16468
+ usecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 0 | |
+ 8 -> 15 : 3 |****************************************|
+
+pid = 12922
+ usecs : count distribution
+ 0 -> 1 : 1 |****************************************|
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 0 | |
+ 8 -> 15 : 1 |****************************************|
+ 16 -> 31 : 1 |****************************************|
+ 32 -> 63 : 0 | |
+ 64 -> 127 : 1 |****************************************|
+
+pid = 12923
+ usecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 1 |******************** |
+ 8 -> 15 : 0 | |
+ 16 -> 31 : 0 | |
+ 32 -> 63 : 0 | |
+ 64 -> 127 : 2 |****************************************|
+ 128 -> 255 : 0 | |
+ 256 -> 511 : 0 | |
+ 512 -> 1023 : 1 |******************** |
+ 1024 -> 2047 : 1 |******************** |
+
+pid = 12924
+ usecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 2 |******************** |
+ 8 -> 15 : 4 |****************************************|
+ 16 -> 31 : 1 |********** |
+ 32 -> 63 : 0 | |
+ 64 -> 127 : 0 | |
+ 128 -> 255 : 0 | |
+ 256 -> 511 : 0 | |
+ 512 -> 1023 : 0 | |
+ 1024 -> 2047 : 1 |********** |
+
+pid = 12925
+ usecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 0 | |
+ 8 -> 15 : 0 | |
+ 16 -> 31 : 0 | |
+ 32 -> 63 : 0 | |
+ 64 -> 127 : 1 |****************************************|
+
+pid = 12926
+ usecs : count distribution
+ 0 -> 1 : 0 | |
+ 2 -> 3 : 1 |****************************************|
+ 4 -> 7 : 0 | |
+ 8 -> 15 : 1 |****************************************|
+ 16 -> 31 : 0 | |
+ 32 -> 63 : 0 | |
+ 64 -> 127 : 0 | |
+ 128 -> 255 : 0 | |
+ 256 -> 511 : 0 | |
+ 512 -> 1023 : 1 |****************************************|
+
+pid = 12927
+ usecs : count distribution
+ 0 -> 1 : 1 |****************************************|
+ 2 -> 3 : 0 | |
+ 4 -> 7 : 1 |****************************************|
+
+
+USAGE message:
+
+# ./runqlat -h
+usage: runqlat [-h] [-T] [-m] [-P] [-p PID] [interval] [count]
+
+Summarize run queue (scheduler) latency as a histogram
+
+positional arguments:
+ interval output interval, in seconds
+ count number of outputs
+
+optional arguments:
+ -h, --help show this help message and exit
+ -T, --timestamp include timestamp on output
+ -m, --milliseconds millisecond histogram
+ -P, --pids print a histogram per process ID
+ -p PID, --pid PID trace this PID only
+
+examples:
+ ./runqlat # summarize run queue latency as a histogram
+ ./runqlat 1 10 # print 1 second summaries, 10 times
+ ./runqlat -mT 1 # 1s summaries, milliseconds, and timestamps
+ ./runqlat -P # show each PID separately
+ ./runqlat -p 185 # trace PID 185 only