Add lockstat tool

Adding lockstat tool to trace kernel mutex lock events and
display locks statistics and displays following data:

                                  Caller   Avg Spin  Count   Max spin Total spin
                      psi_avgs_work+0x2e       3675      5       5468      18379
                     flush_to_ldisc+0x22       2833      2       4210       5667
                       n_tty_write+0x30c       3914      1       3914       3914
                               isig+0x5d       2390      1       2390       2390
                   tty_buffer_flush+0x2a       1604      1       1604       1604
                      commit_echoes+0x22       1400      1       1400       1400
          n_tty_receive_buf_common+0x3b9       1399      1       1399       1399

                                  Caller   Avg Hold  Count   Max hold Total hold
                     flush_to_ldisc+0x22      42558      2      76135      85116
                      psi_avgs_work+0x2e      14821      5      20446      74106
          n_tty_receive_buf_common+0x3b9      12300      1      12300      12300
                       n_tty_write+0x30c      10712      1      10712      10712
                               isig+0x5d       3362      1       3362       3362
                   tty_buffer_flush+0x2a       3078      1       3078       3078
                      commit_echoes+0x22       3017      1       3017       3017

Every caller to using kernel's mutex is displayed on every line.

First portion of lines show the lock acquiring data, showing the amount
of time it took to acquired given lock.

  'Caller'       - symbol acquiring the mutex
  'Average Spin' - average time to acquire the mutex
  'Count'        - number of times mutex was acquired
  'Max spin'     - maximum time to acquire the mutex
  'Total spin'   - total time spent in acquiring the mutex

Second portion of lines show the lock holding data, showing the amount
of time it took to hold given lock.

  'Caller'       - symbol holding the mutex
  'Average Hold' - average time mutex was held
  'Count'        - number of times mutex was held
  'Max hold'     - maximum time mutex was held
  'Total hold'   - total time spent in holding the mutex

This works by tracing mutex_lock/unlock kprobes, udating the lock stats
in maps and processing them in the python part.

Examples:
    lockstats                           # trace system wide
    lockstats -d 5                      # trace for 5 seconds only
    lockstats -i 5                      # display stats every 5 seconds
    lockstats -p 123                    # trace locks for PID 123
    lockstats -t 321                    # trace locks for PID 321
    lockstats -c pipe_                  # display stats only for lock callers with 'pipe_' substring
    lockstats -S acq_count              # sort lock acquired results on acquired count
    lockstats -S hld_total              # sort lock held results on total held time
    lockstats -S acq_count,hld_total    # combination of above
    lockstats -n 3                      # display 3 locks
    lockstats -s 3                      # display 3 levels of stack

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
diff --git a/tools/lockstat.py b/tools/lockstat.py
new file mode 100755
index 0000000..07bff3e
--- /dev/null
+++ b/tools/lockstat.py
@@ -0,0 +1,452 @@
+#!/usr/bin/python
+#
+# lockstats traces lock events and display locks statistics.
+#
+# USAGE: lockstats
+#
+
+from __future__ import print_function
+from bcc import BPF, USDT
+import argparse
+import subprocess
+import ctypes as ct
+from time import sleep, strftime
+from datetime import datetime, timedelta
+import errno
+from sys import stderr
+
+examples = """
+    lockstats                           # trace system wide
+    lockstats -d 5                      # trace for 5 seconds only
+    lockstats -i 5                      # display stats every 5 seconds
+    lockstats -p 123                    # trace locks for PID 123
+    lockstats -t 321                    # trace locks for PID 321
+    lockstats -c pipe_                  # display stats only for lock callers with 'pipe_' substring
+    lockstats -S acq_count              # sort lock acquired results on acquired count
+    lockstats -S hld_total              # sort lock held results on total held time
+    lockstats -S acq_count,hld_total    # combination of above
+    lockstats -n 3                      # display 3 locks
+    lockstats -s 3                      # display 3 levels of stack
+"""
+
+# arg validation
+def positive_int(val):
+    try:
+        ival = int(val)
+    except ValueError:
+        raise argparse.ArgumentTypeError("must be an integer")
+
+    if ival < 0:
+        raise argparse.ArgumentTypeError("must be positive")
+    return ival
+
+def positive_nonzero_int(val):
+    ival = positive_int(val)
+    if ival == 0:
+        raise argparse.ArgumentTypeError("must be nonzero")
+    return ival
+
+def stack_id_err(stack_id):
+    # -EFAULT in get_stackid normally means the stack-trace is not availible,
+    # Such as getting kernel stack trace in userspace code
+    return (stack_id < 0) and (stack_id != -errno.EFAULT)
+
+parser = argparse.ArgumentParser(
+    description="",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+
+time_group = parser.add_mutually_exclusive_group()
+time_group.add_argument("-d", "--duration", type=int,
+    help="total duration of trace in seconds")
+time_group.add_argument("-i", "--interval", type=int,
+    help="print summary at this interval (seconds)")
+parser.add_argument("-n", "--locks", type=int, default=999999999,
+    help="print given number of locks")
+parser.add_argument("-s", "--stacks", type=int, default=1,
+    help="print given number of stack entries")
+parser.add_argument("-c", "--caller",
+    help="print locks taken by given caller")
+parser.add_argument("-S", "--sort",
+    help="sort data on <aq_field,hd_field>, fields: acq_[max|total|count] hld_[max|total|count]")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("-t", "--tid",
+    help="trace this TID only")
+parser.add_argument("--stack-storage-size", default=16384,
+    type=positive_nonzero_int,
+    help="the number of unique stack traces that can be stored and "
+         "displayed (default 16384)")
+
+args = parser.parse_args()
+
+program = """
+#include <uapi/linux/ptrace.h>
+
+struct depth_id {
+  u64 id;
+  u64 depth;
+};
+
+BPF_ARRAY(enabled,   u64, 1);
+BPF_HASH(track,      u64, u64);
+BPF_HASH(time_aq,    u64, u64);
+BPF_HASH(lock_depth, u64, u64);
+BPF_HASH(time_held,  struct depth_id, u64);
+BPF_HASH(stack,      struct depth_id, int);
+
+BPF_HASH(aq_report_count, int, u64);
+BPF_HASH(aq_report_max,   int, u64);
+BPF_HASH(aq_report_total, int, u64);
+
+BPF_HASH(hl_report_count, int, u64);
+BPF_HASH(hl_report_max,   int, u64);
+BPF_HASH(hl_report_total, int, u64);
+
+BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE);
+
+static bool is_enabled(void)
+{
+    int key = 0;
+    u64 *ret;
+
+    ret = enabled.lookup(&key);
+    return ret && *ret == 1;
+}
+
+static bool allow_pid(u64 id)
+{
+    u32 pid = id >> 32; // PID is higher part
+    u32 tid = id;       // Cast and get the lower part
+
+    FILTER
+
+    return 1;
+}
+
+int mutex_lock_enter(struct pt_regs *ctx)
+{
+    if (!is_enabled())
+        return 0;
+
+    u64 id = bpf_get_current_pid_tgid();
+
+    if (!allow_pid(id))
+        return 0;
+
+    u64 one = 1, zero = 0;
+
+    track.update(&id, &one);
+
+    u64 *depth = lock_depth.lookup(&id);
+
+    if (!depth) {
+        lock_depth.update(&id, &zero);
+
+        depth = lock_depth.lookup(&id);
+        /* something is wrong.. */
+        if (!depth)
+            return 0;
+    }
+
+    int stackid = stack_traces.get_stackid(ctx, 0);
+    struct depth_id did = {
+      .id    = id,
+      .depth = *depth,
+    };
+    stack.update(&did, &stackid);
+
+    u64 ts = bpf_ktime_get_ns();
+    time_aq.update(&id, &ts);
+
+    *depth += 1;
+    return 0;
+}
+
+static void update_aq_report_count(int *stackid)
+{
+    u64 *count, one = 1;
+
+    count = aq_report_count.lookup(stackid);
+    if (!count) {
+        aq_report_count.update(stackid, &one);
+    } else {
+        *count += 1;
+    }
+}
+
+static void update_hl_report_count(int *stackid)
+{
+    u64 *count, one = 1;
+
+    count = hl_report_count.lookup(stackid);
+    if (!count) {
+        hl_report_count.update(stackid, &one);
+    } else {
+        *count += 1;
+    }
+}
+
+static void update_aq_report_max(int *stackid, u64 time)
+{
+    u64 *max;
+
+    max = aq_report_max.lookup(stackid);
+    if (!max || *max < time)
+        aq_report_max.update(stackid, &time);
+}
+
+static void update_hl_report_max(int *stackid, u64 time)
+{
+    u64 *max;
+
+    max = hl_report_max.lookup(stackid);
+    if (!max || *max < time)
+        hl_report_max.update(stackid, &time);
+}
+
+static void update_aq_report_total(int *stackid, u64 delta)
+{
+    u64 *count, *time;
+
+    count = aq_report_count.lookup(stackid);
+    if (!count)
+        return;
+
+    time = aq_report_total.lookup(stackid);
+    if (!time) {
+        aq_report_total.update(stackid, &delta);
+    } else {
+        *time = *time + delta;
+    }
+}
+
+static void update_hl_report_total(int *stackid, u64 delta)
+{
+    u64 *count, *time;
+
+    count = hl_report_count.lookup(stackid);
+    if (!count)
+        return;
+
+    time = hl_report_total.lookup(stackid);
+    if (!time) {
+        hl_report_total.update(stackid, &delta);
+    } else {
+        *time = *time + delta;
+    }
+}
+
+int mutex_lock_return(struct pt_regs *ctx)
+{
+    if (!is_enabled())
+        return 0;
+
+    u64 id = bpf_get_current_pid_tgid();
+
+    if (!allow_pid(id))
+        return 0;
+
+    u64 *one = track.lookup(&id);
+
+    if (!one)
+        return 0;
+
+    track.delete(&id);
+
+    u64 *depth = lock_depth.lookup(&id);
+    if (!depth)
+        return 0;
+
+    struct depth_id did = {
+      .id    = id,
+      .depth = *depth - 1,
+    };
+
+    u64 *aq = time_aq.lookup(&id);
+    if (!aq)
+        return 0;
+
+    int *stackid = stack.lookup(&did);
+    if (!stackid)
+        return 0;
+
+    u64 cur = bpf_ktime_get_ns();
+
+    if (cur > *aq) {
+        int val = cur - *aq;
+        update_aq_report_count(stackid);
+        update_aq_report_max(stackid, val);
+        update_aq_report_total(stackid, val);
+    }
+
+    time_held.update(&did, &cur);
+    return 0;
+}
+
+int mutex_unlock_enter(struct pt_regs *ctx)
+{
+    if (!is_enabled())
+        return 0;
+
+    u64 id = bpf_get_current_pid_tgid();
+
+    if (!allow_pid(id))
+        return 0;
+
+    u64 *depth = lock_depth.lookup(&id);
+
+    if (!depth || *depth == 0)
+        return 0;
+
+    *depth -= 1;
+
+    struct depth_id did = {
+      .id    = id,
+      .depth = *depth,
+    };
+
+    u64 *held = time_held.lookup(&did);
+    if (!held)
+        return 0;
+
+    int *stackid = stack.lookup(&did);
+    if (!stackid)
+        return 0;
+
+    u64 cur = bpf_ktime_get_ns();
+
+    if (cur > *held) {
+        u64 val = cur - *held;
+        update_hl_report_count(stackid);
+        update_hl_report_max(stackid, val);
+        update_hl_report_total(stackid, val);
+    }
+
+    stack.delete(&did);
+    time_held.delete(&did);
+    return 0;
+}
+
+"""
+
+def sort_list(maxs, totals, counts):
+    if (not args.sort):
+        return maxs;
+
+    for field in args.sort.split(','):
+        if (field == "acq_max" or field == "hld_max"):
+            return maxs
+        if (field == "acq_total" or field == "hld_total"):
+            return totals
+        if (field == "acq_count" or field == "hld_count"):
+            return counts
+
+    print("Wrong sort argument: %s", args.sort)
+    exit(-1)
+
+def display(sort, maxs, totals, counts):
+    global missing_stacks
+    global has_enomem
+
+    for k, v in sorted(sort.items(), key=lambda sort: sort[1].value, reverse=True)[:args.locks]:
+        missing_stacks += int(stack_id_err(k.value))
+        has_enomem      = has_enomem or (k.value == -errno.ENOMEM)
+
+        caller = "[Missed Kernel Stack]"
+        stack  = []
+
+        if (k.value >= 0):
+            stack  = list(stack_traces.walk(k.value))
+            caller = b.ksym(stack[1], show_offset=True)
+
+            if (args.caller and caller.find(args.caller)):
+                continue
+
+        avg = totals[k].value / counts[k].value
+
+        print("%40s %10lu %6lu %10lu %10lu" % (caller, avg, counts[k].value, maxs[k].value, totals[k].value))
+
+        for addr in stack[1:args.stacks]:
+            print("%40s" %  b.ksym(addr, show_offset=True))
+
+
+if args.tid:  # TID trumps PID
+    program = program.replace('FILTER',
+        'if (tid != %s) { return 0; }' % args.tid)
+elif args.pid:
+    program = program.replace('FILTER',
+        'if (pid != %s) { return 0; }' % args.pid)
+else:
+    program = program.replace('FILTER', '')
+
+program = program.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size))
+
+b = BPF(text=program)
+
+b.attach_kprobe(event="mutex_unlock", fn_name="mutex_unlock_enter")
+b.attach_kretprobe(event="mutex_lock", fn_name="mutex_lock_return")
+b.attach_kprobe(event="mutex_lock", fn_name="mutex_lock_enter")
+
+enabled = b.get_table("enabled");
+
+stack_traces = b.get_table("stack_traces")
+aq_counts = b.get_table("aq_report_count")
+aq_maxs   = b.get_table("aq_report_max")
+aq_totals = b.get_table("aq_report_total")
+
+hl_counts = b.get_table("hl_report_count")
+hl_maxs   = b.get_table("hl_report_max")
+hl_totals = b.get_table("hl_report_total")
+
+aq_sort = sort_list(aq_maxs, aq_totals, aq_counts)
+hl_sort = sort_list(hl_maxs, hl_totals, hl_counts)
+
+print("Tracing lock events... Hit Ctrl-C to end.")
+
+# duration and interval are mutualy exclusive
+exiting = 0 if args.interval else 1
+exiting = 1 if args.duration else 0
+
+seconds = 999999999
+if args.interval:
+    seconds = args.interval
+if args.duration:
+    seconds = args.duration
+
+missing_stacks = 0
+has_enomem     = False
+
+while (1):
+    enabled[ct.c_int(0)] = ct.c_int(1)
+
+    try:
+        sleep(seconds)
+    except KeyboardInterrupt:
+        exiting = 1
+
+    enabled[ct.c_int(0)] = ct.c_int(0)
+
+    print("\n%40s %10s %6s %10s %10s" % ("Caller", "Avg Spin", "Count", "Max spin", "Total spin"))
+    display(aq_sort, aq_maxs, aq_totals, aq_counts)
+
+
+    print("\n%40s %10s %6s %10s %10s" % ("Caller", "Avg Hold", "Count", "Max hold", "Total hold"))
+    display(hl_sort, hl_maxs, hl_totals, hl_counts)
+
+    if exiting:
+        break;
+
+    stack_traces.clear()
+    aq_counts.clear()
+    aq_maxs.clear()
+    aq_totals.clear()
+    hl_counts.clear()
+    hl_maxs.clear()
+    hl_totals.clear()
+
+if missing_stacks > 0:
+    enomem_str = " Consider increasing --stack-storage-size."
+    print("WARNING: %d stack traces lost and could not be displayed.%s" %
+        (missing_stacks, (enomem_str if has_enomem else "")),
+        file=stderr)