Add lockstat tool
Adding lockstat tool to trace kernel mutex lock events and
display locks statistics and displays following data:
Caller Avg Spin Count Max spin Total spin
psi_avgs_work+0x2e 3675 5 5468 18379
flush_to_ldisc+0x22 2833 2 4210 5667
n_tty_write+0x30c 3914 1 3914 3914
isig+0x5d 2390 1 2390 2390
tty_buffer_flush+0x2a 1604 1 1604 1604
commit_echoes+0x22 1400 1 1400 1400
n_tty_receive_buf_common+0x3b9 1399 1 1399 1399
Caller Avg Hold Count Max hold Total hold
flush_to_ldisc+0x22 42558 2 76135 85116
psi_avgs_work+0x2e 14821 5 20446 74106
n_tty_receive_buf_common+0x3b9 12300 1 12300 12300
n_tty_write+0x30c 10712 1 10712 10712
isig+0x5d 3362 1 3362 3362
tty_buffer_flush+0x2a 3078 1 3078 3078
commit_echoes+0x22 3017 1 3017 3017
Every caller to using kernel's mutex is displayed on every line.
First portion of lines show the lock acquiring data, showing the amount
of time it took to acquired given lock.
'Caller' - symbol acquiring the mutex
'Average Spin' - average time to acquire the mutex
'Count' - number of times mutex was acquired
'Max spin' - maximum time to acquire the mutex
'Total spin' - total time spent in acquiring the mutex
Second portion of lines show the lock holding data, showing the amount
of time it took to hold given lock.
'Caller' - symbol holding the mutex
'Average Hold' - average time mutex was held
'Count' - number of times mutex was held
'Max hold' - maximum time mutex was held
'Total hold' - total time spent in holding the mutex
This works by tracing mutex_lock/unlock kprobes, udating the lock stats
in maps and processing them in the python part.
Examples:
lockstats # trace system wide
lockstats -d 5 # trace for 5 seconds only
lockstats -i 5 # display stats every 5 seconds
lockstats -p 123 # trace locks for PID 123
lockstats -t 321 # trace locks for PID 321
lockstats -c pipe_ # display stats only for lock callers with 'pipe_' substring
lockstats -S acq_count # sort lock acquired results on acquired count
lockstats -S hld_total # sort lock held results on total held time
lockstats -S acq_count,hld_total # combination of above
lockstats -n 3 # display 3 locks
lockstats -s 3 # display 3 levels of stack
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
diff --git a/tools/lockstat.py b/tools/lockstat.py
new file mode 100755
index 0000000..07bff3e
--- /dev/null
+++ b/tools/lockstat.py
@@ -0,0 +1,452 @@
+#!/usr/bin/python
+#
+# lockstats traces lock events and display locks statistics.
+#
+# USAGE: lockstats
+#
+
+from __future__ import print_function
+from bcc import BPF, USDT
+import argparse
+import subprocess
+import ctypes as ct
+from time import sleep, strftime
+from datetime import datetime, timedelta
+import errno
+from sys import stderr
+
+examples = """
+ lockstats # trace system wide
+ lockstats -d 5 # trace for 5 seconds only
+ lockstats -i 5 # display stats every 5 seconds
+ lockstats -p 123 # trace locks for PID 123
+ lockstats -t 321 # trace locks for PID 321
+ lockstats -c pipe_ # display stats only for lock callers with 'pipe_' substring
+ lockstats -S acq_count # sort lock acquired results on acquired count
+ lockstats -S hld_total # sort lock held results on total held time
+ lockstats -S acq_count,hld_total # combination of above
+ lockstats -n 3 # display 3 locks
+ lockstats -s 3 # display 3 levels of stack
+"""
+
+# arg validation
+def positive_int(val):
+ try:
+ ival = int(val)
+ except ValueError:
+ raise argparse.ArgumentTypeError("must be an integer")
+
+ if ival < 0:
+ raise argparse.ArgumentTypeError("must be positive")
+ return ival
+
+def positive_nonzero_int(val):
+ ival = positive_int(val)
+ if ival == 0:
+ raise argparse.ArgumentTypeError("must be nonzero")
+ return ival
+
+def stack_id_err(stack_id):
+ # -EFAULT in get_stackid normally means the stack-trace is not availible,
+ # Such as getting kernel stack trace in userspace code
+ return (stack_id < 0) and (stack_id != -errno.EFAULT)
+
+parser = argparse.ArgumentParser(
+ description="",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog=examples)
+
+time_group = parser.add_mutually_exclusive_group()
+time_group.add_argument("-d", "--duration", type=int,
+ help="total duration of trace in seconds")
+time_group.add_argument("-i", "--interval", type=int,
+ help="print summary at this interval (seconds)")
+parser.add_argument("-n", "--locks", type=int, default=999999999,
+ help="print given number of locks")
+parser.add_argument("-s", "--stacks", type=int, default=1,
+ help="print given number of stack entries")
+parser.add_argument("-c", "--caller",
+ help="print locks taken by given caller")
+parser.add_argument("-S", "--sort",
+ help="sort data on <aq_field,hd_field>, fields: acq_[max|total|count] hld_[max|total|count]")
+parser.add_argument("-p", "--pid",
+ help="trace this PID only")
+parser.add_argument("-t", "--tid",
+ help="trace this TID only")
+parser.add_argument("--stack-storage-size", default=16384,
+ type=positive_nonzero_int,
+ help="the number of unique stack traces that can be stored and "
+ "displayed (default 16384)")
+
+args = parser.parse_args()
+
+program = """
+#include <uapi/linux/ptrace.h>
+
+struct depth_id {
+ u64 id;
+ u64 depth;
+};
+
+BPF_ARRAY(enabled, u64, 1);
+BPF_HASH(track, u64, u64);
+BPF_HASH(time_aq, u64, u64);
+BPF_HASH(lock_depth, u64, u64);
+BPF_HASH(time_held, struct depth_id, u64);
+BPF_HASH(stack, struct depth_id, int);
+
+BPF_HASH(aq_report_count, int, u64);
+BPF_HASH(aq_report_max, int, u64);
+BPF_HASH(aq_report_total, int, u64);
+
+BPF_HASH(hl_report_count, int, u64);
+BPF_HASH(hl_report_max, int, u64);
+BPF_HASH(hl_report_total, int, u64);
+
+BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE);
+
+static bool is_enabled(void)
+{
+ int key = 0;
+ u64 *ret;
+
+ ret = enabled.lookup(&key);
+ return ret && *ret == 1;
+}
+
+static bool allow_pid(u64 id)
+{
+ u32 pid = id >> 32; // PID is higher part
+ u32 tid = id; // Cast and get the lower part
+
+ FILTER
+
+ return 1;
+}
+
+int mutex_lock_enter(struct pt_regs *ctx)
+{
+ if (!is_enabled())
+ return 0;
+
+ u64 id = bpf_get_current_pid_tgid();
+
+ if (!allow_pid(id))
+ return 0;
+
+ u64 one = 1, zero = 0;
+
+ track.update(&id, &one);
+
+ u64 *depth = lock_depth.lookup(&id);
+
+ if (!depth) {
+ lock_depth.update(&id, &zero);
+
+ depth = lock_depth.lookup(&id);
+ /* something is wrong.. */
+ if (!depth)
+ return 0;
+ }
+
+ int stackid = stack_traces.get_stackid(ctx, 0);
+ struct depth_id did = {
+ .id = id,
+ .depth = *depth,
+ };
+ stack.update(&did, &stackid);
+
+ u64 ts = bpf_ktime_get_ns();
+ time_aq.update(&id, &ts);
+
+ *depth += 1;
+ return 0;
+}
+
+static void update_aq_report_count(int *stackid)
+{
+ u64 *count, one = 1;
+
+ count = aq_report_count.lookup(stackid);
+ if (!count) {
+ aq_report_count.update(stackid, &one);
+ } else {
+ *count += 1;
+ }
+}
+
+static void update_hl_report_count(int *stackid)
+{
+ u64 *count, one = 1;
+
+ count = hl_report_count.lookup(stackid);
+ if (!count) {
+ hl_report_count.update(stackid, &one);
+ } else {
+ *count += 1;
+ }
+}
+
+static void update_aq_report_max(int *stackid, u64 time)
+{
+ u64 *max;
+
+ max = aq_report_max.lookup(stackid);
+ if (!max || *max < time)
+ aq_report_max.update(stackid, &time);
+}
+
+static void update_hl_report_max(int *stackid, u64 time)
+{
+ u64 *max;
+
+ max = hl_report_max.lookup(stackid);
+ if (!max || *max < time)
+ hl_report_max.update(stackid, &time);
+}
+
+static void update_aq_report_total(int *stackid, u64 delta)
+{
+ u64 *count, *time;
+
+ count = aq_report_count.lookup(stackid);
+ if (!count)
+ return;
+
+ time = aq_report_total.lookup(stackid);
+ if (!time) {
+ aq_report_total.update(stackid, &delta);
+ } else {
+ *time = *time + delta;
+ }
+}
+
+static void update_hl_report_total(int *stackid, u64 delta)
+{
+ u64 *count, *time;
+
+ count = hl_report_count.lookup(stackid);
+ if (!count)
+ return;
+
+ time = hl_report_total.lookup(stackid);
+ if (!time) {
+ hl_report_total.update(stackid, &delta);
+ } else {
+ *time = *time + delta;
+ }
+}
+
+int mutex_lock_return(struct pt_regs *ctx)
+{
+ if (!is_enabled())
+ return 0;
+
+ u64 id = bpf_get_current_pid_tgid();
+
+ if (!allow_pid(id))
+ return 0;
+
+ u64 *one = track.lookup(&id);
+
+ if (!one)
+ return 0;
+
+ track.delete(&id);
+
+ u64 *depth = lock_depth.lookup(&id);
+ if (!depth)
+ return 0;
+
+ struct depth_id did = {
+ .id = id,
+ .depth = *depth - 1,
+ };
+
+ u64 *aq = time_aq.lookup(&id);
+ if (!aq)
+ return 0;
+
+ int *stackid = stack.lookup(&did);
+ if (!stackid)
+ return 0;
+
+ u64 cur = bpf_ktime_get_ns();
+
+ if (cur > *aq) {
+ int val = cur - *aq;
+ update_aq_report_count(stackid);
+ update_aq_report_max(stackid, val);
+ update_aq_report_total(stackid, val);
+ }
+
+ time_held.update(&did, &cur);
+ return 0;
+}
+
+int mutex_unlock_enter(struct pt_regs *ctx)
+{
+ if (!is_enabled())
+ return 0;
+
+ u64 id = bpf_get_current_pid_tgid();
+
+ if (!allow_pid(id))
+ return 0;
+
+ u64 *depth = lock_depth.lookup(&id);
+
+ if (!depth || *depth == 0)
+ return 0;
+
+ *depth -= 1;
+
+ struct depth_id did = {
+ .id = id,
+ .depth = *depth,
+ };
+
+ u64 *held = time_held.lookup(&did);
+ if (!held)
+ return 0;
+
+ int *stackid = stack.lookup(&did);
+ if (!stackid)
+ return 0;
+
+ u64 cur = bpf_ktime_get_ns();
+
+ if (cur > *held) {
+ u64 val = cur - *held;
+ update_hl_report_count(stackid);
+ update_hl_report_max(stackid, val);
+ update_hl_report_total(stackid, val);
+ }
+
+ stack.delete(&did);
+ time_held.delete(&did);
+ return 0;
+}
+
+"""
+
+def sort_list(maxs, totals, counts):
+ if (not args.sort):
+ return maxs;
+
+ for field in args.sort.split(','):
+ if (field == "acq_max" or field == "hld_max"):
+ return maxs
+ if (field == "acq_total" or field == "hld_total"):
+ return totals
+ if (field == "acq_count" or field == "hld_count"):
+ return counts
+
+ print("Wrong sort argument: %s", args.sort)
+ exit(-1)
+
+def display(sort, maxs, totals, counts):
+ global missing_stacks
+ global has_enomem
+
+ for k, v in sorted(sort.items(), key=lambda sort: sort[1].value, reverse=True)[:args.locks]:
+ missing_stacks += int(stack_id_err(k.value))
+ has_enomem = has_enomem or (k.value == -errno.ENOMEM)
+
+ caller = "[Missed Kernel Stack]"
+ stack = []
+
+ if (k.value >= 0):
+ stack = list(stack_traces.walk(k.value))
+ caller = b.ksym(stack[1], show_offset=True)
+
+ if (args.caller and caller.find(args.caller)):
+ continue
+
+ avg = totals[k].value / counts[k].value
+
+ print("%40s %10lu %6lu %10lu %10lu" % (caller, avg, counts[k].value, maxs[k].value, totals[k].value))
+
+ for addr in stack[1:args.stacks]:
+ print("%40s" % b.ksym(addr, show_offset=True))
+
+
+if args.tid: # TID trumps PID
+ program = program.replace('FILTER',
+ 'if (tid != %s) { return 0; }' % args.tid)
+elif args.pid:
+ program = program.replace('FILTER',
+ 'if (pid != %s) { return 0; }' % args.pid)
+else:
+ program = program.replace('FILTER', '')
+
+program = program.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size))
+
+b = BPF(text=program)
+
+b.attach_kprobe(event="mutex_unlock", fn_name="mutex_unlock_enter")
+b.attach_kretprobe(event="mutex_lock", fn_name="mutex_lock_return")
+b.attach_kprobe(event="mutex_lock", fn_name="mutex_lock_enter")
+
+enabled = b.get_table("enabled");
+
+stack_traces = b.get_table("stack_traces")
+aq_counts = b.get_table("aq_report_count")
+aq_maxs = b.get_table("aq_report_max")
+aq_totals = b.get_table("aq_report_total")
+
+hl_counts = b.get_table("hl_report_count")
+hl_maxs = b.get_table("hl_report_max")
+hl_totals = b.get_table("hl_report_total")
+
+aq_sort = sort_list(aq_maxs, aq_totals, aq_counts)
+hl_sort = sort_list(hl_maxs, hl_totals, hl_counts)
+
+print("Tracing lock events... Hit Ctrl-C to end.")
+
+# duration and interval are mutualy exclusive
+exiting = 0 if args.interval else 1
+exiting = 1 if args.duration else 0
+
+seconds = 999999999
+if args.interval:
+ seconds = args.interval
+if args.duration:
+ seconds = args.duration
+
+missing_stacks = 0
+has_enomem = False
+
+while (1):
+ enabled[ct.c_int(0)] = ct.c_int(1)
+
+ try:
+ sleep(seconds)
+ except KeyboardInterrupt:
+ exiting = 1
+
+ enabled[ct.c_int(0)] = ct.c_int(0)
+
+ print("\n%40s %10s %6s %10s %10s" % ("Caller", "Avg Spin", "Count", "Max spin", "Total spin"))
+ display(aq_sort, aq_maxs, aq_totals, aq_counts)
+
+
+ print("\n%40s %10s %6s %10s %10s" % ("Caller", "Avg Hold", "Count", "Max hold", "Total hold"))
+ display(hl_sort, hl_maxs, hl_totals, hl_counts)
+
+ if exiting:
+ break;
+
+ stack_traces.clear()
+ aq_counts.clear()
+ aq_maxs.clear()
+ aq_totals.clear()
+ hl_counts.clear()
+ hl_maxs.clear()
+ hl_totals.clear()
+
+if missing_stacks > 0:
+ enomem_str = " Consider increasing --stack-storage-size."
+ print("WARNING: %d stack traces lost and could not be displayed.%s" %
+ (missing_stacks, (enomem_str if has_enomem else "")),
+ file=stderr)