rename tool lockstat.py to klockstat.py

The current lockstat.py is tracing three kernel functions
  mutex_lock_enter(), mutex_unlock_enter(), mutex_lock_return()
for kernel locking statistics.

There are some other efforts trying to get user lock stats by
tracing e.g. pthread locking primitives. For example,
Sasha Goldshtein's linux-tracing-workshop
   https://github.com/goldshtn/linux-tracing-workshop
is referenced in bcc/docs/tutorial_bcc_python_developer.md.
It has a tool called lockstat.py which traces pthread_mutex_init
to collect some lock statistics for userspace locks.

In bcc, in the past, we also had an effort to gather userspace
lock statistics with the same name lockstat.py.
   https://github.com/iovisor/bcc/pull/1268

In the future, bcc could have a lockstat tool tracing userspace
locks. So let us rename the current lockstat.py to klockstat.py
to clearly express its scope.
diff --git a/tools/klockstat.py b/tools/klockstat.py
new file mode 100755
index 0000000..343d436
--- /dev/null
+++ b/tools/klockstat.py
@@ -0,0 +1,455 @@
+#!/usr/bin/python
+#
+# klockstat traces lock events and display locks statistics.
+#
+# USAGE: klockstat
+#
+
+from __future__ import print_function
+from bcc import BPF, USDT
+import argparse
+import subprocess
+import ctypes as ct
+from time import sleep, strftime
+from datetime import datetime, timedelta
+import errno
+from sys import stderr
+
+examples = """
+    klockstat                           # trace system wide
+    klockstat -d 5                      # trace for 5 seconds only
+    klockstat -i 5                      # display stats every 5 seconds
+    klockstat -p 123                    # trace locks for PID 123
+    klockstat -t 321                    # trace locks for PID 321
+    klockstat -c pipe_                  # display stats only for lock callers with 'pipe_' substring
+    klockstat -S acq_count              # sort lock acquired results on acquired count
+    klockstat -S hld_total              # sort lock held results on total held time
+    klockstat -S acq_count,hld_total    # combination of above
+    klockstat -n 3                      # display 3 locks
+    klockstat -s 3                      # display 3 levels of stack
+"""
+
+# arg validation
+def positive_int(val):
+    try:
+        ival = int(val)
+    except ValueError:
+        raise argparse.ArgumentTypeError("must be an integer")
+
+    if ival < 0:
+        raise argparse.ArgumentTypeError("must be positive")
+    return ival
+
+def positive_nonzero_int(val):
+    ival = positive_int(val)
+    if ival == 0:
+        raise argparse.ArgumentTypeError("must be nonzero")
+    return ival
+
+def stack_id_err(stack_id):
+    # -EFAULT in get_stackid normally means the stack-trace is not availible,
+    # Such as getting kernel stack trace in userspace code
+    return (stack_id < 0) and (stack_id != -errno.EFAULT)
+
+parser = argparse.ArgumentParser(
+    description="",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+
+time_group = parser.add_mutually_exclusive_group()
+time_group.add_argument("-d", "--duration", type=int,
+    help="total duration of trace in seconds")
+time_group.add_argument("-i", "--interval", type=int,
+    help="print summary at this interval (seconds)")
+parser.add_argument("-n", "--locks", type=int, default=999999999,
+    help="print given number of locks")
+parser.add_argument("-s", "--stacks", type=int, default=1,
+    help="print given number of stack entries")
+parser.add_argument("-c", "--caller",
+    help="print locks taken by given caller")
+parser.add_argument("-S", "--sort",
+    help="sort data on <aq_field,hd_field>, fields: acq_[max|total|count] hld_[max|total|count]")
+parser.add_argument("-p", "--pid",
+    help="trace this PID only")
+parser.add_argument("-t", "--tid",
+    help="trace this TID only")
+parser.add_argument("--stack-storage-size", default=16384,
+    type=positive_nonzero_int,
+    help="the number of unique stack traces that can be stored and "
+         "displayed (default 16384)")
+
+args = parser.parse_args()
+
+program = """
+#include <uapi/linux/ptrace.h>
+
+struct depth_id {
+  u64 id;
+  u64 depth;
+};
+
+BPF_ARRAY(enabled,   u64, 1);
+BPF_HASH(track,      u64, u64);
+BPF_HASH(time_aq,    u64, u64);
+BPF_HASH(lock_depth, u64, u64);
+BPF_HASH(time_held,  struct depth_id, u64);
+BPF_HASH(stack,      struct depth_id, int);
+
+BPF_HASH(aq_report_count, int, u64);
+BPF_HASH(aq_report_max,   int, u64);
+BPF_HASH(aq_report_total, int, u64);
+
+BPF_HASH(hl_report_count, int, u64);
+BPF_HASH(hl_report_max,   int, u64);
+BPF_HASH(hl_report_total, int, u64);
+
+BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE);
+
+static bool is_enabled(void)
+{
+    int key = 0;
+    u64 *ret;
+
+    ret = enabled.lookup(&key);
+    return ret && *ret == 1;
+}
+
+static bool allow_pid(u64 id)
+{
+    u32 pid = id >> 32; // PID is higher part
+    u32 tid = id;       // Cast and get the lower part
+
+    FILTER
+
+    return 1;
+}
+
+int mutex_lock_enter(struct pt_regs *ctx)
+{
+    if (!is_enabled())
+        return 0;
+
+    u64 id = bpf_get_current_pid_tgid();
+
+    if (!allow_pid(id))
+        return 0;
+
+    u64 one = 1, zero = 0;
+
+    track.update(&id, &one);
+
+    u64 *depth = lock_depth.lookup(&id);
+
+    if (!depth) {
+        lock_depth.update(&id, &zero);
+
+        depth = lock_depth.lookup(&id);
+        /* something is wrong.. */
+        if (!depth)
+            return 0;
+    }
+
+    int stackid = stack_traces.get_stackid(ctx, 0);
+    struct depth_id did = {
+      .id    = id,
+      .depth = *depth,
+    };
+    stack.update(&did, &stackid);
+
+    u64 ts = bpf_ktime_get_ns();
+    time_aq.update(&id, &ts);
+
+    *depth += 1;
+    return 0;
+}
+
+static void update_aq_report_count(int *stackid)
+{
+    u64 *count, one = 1;
+
+    count = aq_report_count.lookup(stackid);
+    if (!count) {
+        aq_report_count.update(stackid, &one);
+    } else {
+        *count += 1;
+    }
+}
+
+static void update_hl_report_count(int *stackid)
+{
+    u64 *count, one = 1;
+
+    count = hl_report_count.lookup(stackid);
+    if (!count) {
+        hl_report_count.update(stackid, &one);
+    } else {
+        *count += 1;
+    }
+}
+
+static void update_aq_report_max(int *stackid, u64 time)
+{
+    u64 *max;
+
+    max = aq_report_max.lookup(stackid);
+    if (!max || *max < time)
+        aq_report_max.update(stackid, &time);
+}
+
+static void update_hl_report_max(int *stackid, u64 time)
+{
+    u64 *max;
+
+    max = hl_report_max.lookup(stackid);
+    if (!max || *max < time)
+        hl_report_max.update(stackid, &time);
+}
+
+static void update_aq_report_total(int *stackid, u64 delta)
+{
+    u64 *count, *time;
+
+    count = aq_report_count.lookup(stackid);
+    if (!count)
+        return;
+
+    time = aq_report_total.lookup(stackid);
+    if (!time) {
+        aq_report_total.update(stackid, &delta);
+    } else {
+        *time = *time + delta;
+    }
+}
+
+static void update_hl_report_total(int *stackid, u64 delta)
+{
+    u64 *count, *time;
+
+    count = hl_report_count.lookup(stackid);
+    if (!count)
+        return;
+
+    time = hl_report_total.lookup(stackid);
+    if (!time) {
+        hl_report_total.update(stackid, &delta);
+    } else {
+        *time = *time + delta;
+    }
+}
+
+int mutex_lock_return(struct pt_regs *ctx)
+{
+    if (!is_enabled())
+        return 0;
+
+    u64 id = bpf_get_current_pid_tgid();
+
+    if (!allow_pid(id))
+        return 0;
+
+    u64 *one = track.lookup(&id);
+
+    if (!one)
+        return 0;
+
+    track.delete(&id);
+
+    u64 *depth = lock_depth.lookup(&id);
+    if (!depth)
+        return 0;
+
+    struct depth_id did = {
+      .id    = id,
+      .depth = *depth - 1,
+    };
+
+    u64 *aq = time_aq.lookup(&id);
+    if (!aq)
+        return 0;
+
+    int *stackid = stack.lookup(&did);
+    if (!stackid)
+        return 0;
+
+    int stackid_ = *stackid;
+    u64 cur = bpf_ktime_get_ns();
+
+    if (cur > *aq) {
+        int val = cur - *aq;
+        update_aq_report_count(&stackid_);
+        update_aq_report_max(&stackid_, val);
+        update_aq_report_total(&stackid_, val);
+    }
+
+    time_held.update(&did, &cur);
+    return 0;
+}
+
+int mutex_unlock_enter(struct pt_regs *ctx)
+{
+    if (!is_enabled())
+        return 0;
+
+    u64 id = bpf_get_current_pid_tgid();
+
+    if (!allow_pid(id))
+        return 0;
+
+    u64 *depth = lock_depth.lookup(&id);
+
+    if (!depth || *depth == 0)
+        return 0;
+
+    *depth -= 1;
+
+    struct depth_id did = {
+      .id    = id,
+      .depth = *depth,
+    };
+
+    u64 *held = time_held.lookup(&did);
+    if (!held)
+        return 0;
+
+    int *stackid = stack.lookup(&did);
+    if (!stackid)
+        return 0;
+
+
+    int stackid_ = *stackid;
+    u64 cur = bpf_ktime_get_ns();
+
+    if (cur > *held) {
+        u64 val = cur - *held;
+        update_hl_report_count(&stackid_);
+        update_hl_report_max(&stackid_, val);
+        update_hl_report_total(&stackid_, val);
+    }
+
+    stack.delete(&did);
+    time_held.delete(&did);
+    return 0;
+}
+
+"""
+
+def sort_list(maxs, totals, counts):
+    if (not args.sort):
+        return maxs;
+
+    for field in args.sort.split(','):
+        if (field == "acq_max" or field == "hld_max"):
+            return maxs
+        if (field == "acq_total" or field == "hld_total"):
+            return totals
+        if (field == "acq_count" or field == "hld_count"):
+            return counts
+
+    print("Wrong sort argument: %s", args.sort)
+    exit(-1)
+
+def display(sort, maxs, totals, counts):
+    global missing_stacks
+    global has_enomem
+
+    for k, v in sorted(sort.items(), key=lambda sort: sort[1].value, reverse=True)[:args.locks]:
+        missing_stacks += int(stack_id_err(k.value))
+        has_enomem      = has_enomem or (k.value == -errno.ENOMEM)
+
+        caller = "[Missed Kernel Stack]"
+        stack  = []
+
+        if (k.value >= 0):
+            stack  = list(stack_traces.walk(k.value))
+            caller = b.ksym(stack[1], show_offset=True)
+
+            if (args.caller and caller.find(args.caller)):
+                continue
+
+        avg = totals[k].value / counts[k].value
+
+        print("%40s %10lu %6lu %10lu %10lu" % (caller, avg, counts[k].value, maxs[k].value, totals[k].value))
+
+        for addr in stack[1:args.stacks]:
+            print("%40s" %  b.ksym(addr, show_offset=True))
+
+
+if args.tid:  # TID trumps PID
+    program = program.replace('FILTER',
+        'if (tid != %s) { return 0; }' % args.tid)
+elif args.pid:
+    program = program.replace('FILTER',
+        'if (pid != %s) { return 0; }' % args.pid)
+else:
+    program = program.replace('FILTER', '')
+
+program = program.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size))
+
+b = BPF(text=program)
+
+b.attach_kprobe(event="mutex_unlock", fn_name="mutex_unlock_enter")
+b.attach_kretprobe(event="mutex_lock", fn_name="mutex_lock_return")
+b.attach_kprobe(event="mutex_lock", fn_name="mutex_lock_enter")
+
+enabled = b.get_table("enabled");
+
+stack_traces = b.get_table("stack_traces")
+aq_counts = b.get_table("aq_report_count")
+aq_maxs   = b.get_table("aq_report_max")
+aq_totals = b.get_table("aq_report_total")
+
+hl_counts = b.get_table("hl_report_count")
+hl_maxs   = b.get_table("hl_report_max")
+hl_totals = b.get_table("hl_report_total")
+
+aq_sort = sort_list(aq_maxs, aq_totals, aq_counts)
+hl_sort = sort_list(hl_maxs, hl_totals, hl_counts)
+
+print("Tracing lock events... Hit Ctrl-C to end.")
+
+# duration and interval are mutualy exclusive
+exiting = 0 if args.interval else 1
+exiting = 1 if args.duration else 0
+
+seconds = 999999999
+if args.interval:
+    seconds = args.interval
+if args.duration:
+    seconds = args.duration
+
+missing_stacks = 0
+has_enomem     = False
+
+while (1):
+    enabled[ct.c_int(0)] = ct.c_int(1)
+
+    try:
+        sleep(seconds)
+    except KeyboardInterrupt:
+        exiting = 1
+
+    enabled[ct.c_int(0)] = ct.c_int(0)
+
+    print("\n%40s %10s %6s %10s %10s" % ("Caller", "Avg Spin", "Count", "Max spin", "Total spin"))
+    display(aq_sort, aq_maxs, aq_totals, aq_counts)
+
+
+    print("\n%40s %10s %6s %10s %10s" % ("Caller", "Avg Hold", "Count", "Max hold", "Total hold"))
+    display(hl_sort, hl_maxs, hl_totals, hl_counts)
+
+    if exiting:
+        break;
+
+    stack_traces.clear()
+    aq_counts.clear()
+    aq_maxs.clear()
+    aq_totals.clear()
+    hl_counts.clear()
+    hl_maxs.clear()
+    hl_totals.clear()
+
+if missing_stacks > 0:
+    enomem_str = " Consider increasing --stack-storage-size."
+    print("WARNING: %d stack traces lost and could not be displayed.%s" %
+        (missing_stacks, (enomem_str if has_enomem else "")),
+        file=stderr)