Merge pull request #1797 from dpayne/feat/add_stack_frames_to_funcslower

Feat/add stack frames to funcslower
diff --git a/man/man8/funcslower.8 b/man/man8/funcslower.8
index 4432191..06f1793 100644
--- a/man/man8/funcslower.8
+++ b/man/man8/funcslower.8
@@ -1,8 +1,8 @@
 .TH funcslower 8  "2017-03-30" "USER COMMANDS"
 .SH NAME
-fileslower \- Trace slow kernel or user function calls.
+funcslower \- Trace slow kernel or user function calls.
 .SH SYNOPSIS
-.B fileslower [\-h] [\-p PID] [-m MIN_MS] [-u MIN_US] [-a ARGUMENTS] [-T] [-t] [-v] function [function ...]
+.B funcslower [\-hf] [\-p PID] [\-U | \-K] [-m MIN_MS] [-u MIN_US] [-a ARGUMENTS] [-T] [-t] [-v] function [function ...]
 .SH DESCRIPTION
 This script traces a kernel or user function's entry and return points, and
 prints a message when the function's latency exceeded the specified threshold.
@@ -36,6 +36,15 @@
 \-t
 Print a seconds timestamp with each entry, at microsecond resolution.
 .TP
+\-f
+Print output in folded stack format.
+.TP
+\-U
+Show stacks from user space only (no kernel space stacks).
+.TP
+\-K
+Show stacks from kernel space only (no user space stacks).
+.TP
 \-v
 Print the resulting BPF program, for debugging purposes.
 .TP
@@ -61,6 +70,10 @@
 Trace the write syscall and print its first 4 arguments:
 #
 .B funcslower -a 4 SyS_write
+.TP
+Trace opens from libc and print the user and kernel stack frames:
+#
+.B funcslower -UK c:open
 .SH FIELDS
 .TP
 TIME
diff --git a/tools/funcslower.py b/tools/funcslower.py
index 959a5af..93fb846 100755
--- a/tools/funcslower.py
+++ b/tools/funcslower.py
@@ -28,12 +28,14 @@
 import time
 
 examples = """examples:
-  ./funcslower vfs_write       # trace vfs_write calls slower than 1ms
-  ./funcslower -m 10 vfs_write # same, but slower than 10ms
-  ./funcslower -u 10 c:open    # trace open calls slower than 10us
-  ./funcslower -p 135 c:open   # trace pid 135 only
-  ./funcslower c:malloc c:free # trace both malloc and free slower than 1ms
-  ./funcslower -a 2 c:open     # show first two arguments to open
+  ./funcslower vfs_write        # trace vfs_write calls slower than 1ms
+  ./funcslower -m 10 vfs_write  # same, but slower than 10ms
+  ./funcslower -u 10 c:open     # trace open calls slower than 10us
+  ./funcslower -p 135 c:open    # trace pid 135 only
+  ./funcslower c:malloc c:free  # trace both malloc and free slower than 1ms
+  ./funcslower -a 2 c:open      # show first two arguments to open
+  ./funcslower -UK -m 10 c:open # Show user and kernel stack frame of open calls slower than 10ms
+  ./funcslower -f -UK c:open    # Output in folded format for flame graphs
 """
 parser = argparse.ArgumentParser(
     description="Trace slow kernel or user function calls.",
@@ -57,6 +59,12 @@
     help="function(s) to trace")
 parser.add_argument("--ebpf", action="store_true",
     help=argparse.SUPPRESS)
+parser.add_argument("-f", "--folded", action="store_true",
+    help="output folded format, one line per stack (for flame graphs)")
+parser.add_argument("-U", "--user-stack",
+  action="store_true", help="output user stack trace")
+parser.add_argument("-K", "--kernel-stack",
+  action="store_true", help="output kernel stack trace")
 
 args = parser.parse_args()
 # fractions are allowed, but rounded to an integer nanosecond
@@ -86,6 +94,13 @@
     u64 duration_ns;
     u64 retval;
     char comm[TASK_COMM_LEN];
+#ifdef USER_STACKS
+    int user_stack_id;
+#endif
+#ifdef KERNEL_STACKS
+    int kernel_stack_id;
+    u64 kernel_ip;
+#endif
 #ifdef GRAB_ARGS
     u64 args[6];
 #endif
@@ -94,6 +109,10 @@
 BPF_HASH(entryinfo, u64, struct entry_t);
 BPF_PERF_OUTPUT(events);
 
+#if defined(USER_STACKS) || defined(KERNEL_STACKS)
+BPF_STACK_TRACE(stacks, 2048);
+#endif
+
 static int trace_entry(struct pt_regs *ctx, int id)
 {
     u64 tgid_pid = bpf_get_current_pid_tgid();
@@ -142,6 +161,41 @@
     data.start_ns = entryp->start_ns;
     data.duration_ns = delta_ns;
     data.retval = PT_REGS_RC(ctx);
+
+#ifdef USER_STACKS
+    data.user_stack_id = stacks.get_stackid(ctx, BPF_F_USER_STACK);
+#endif
+
+#ifdef KERNEL_STACKS
+    data.kernel_stack_id = stacks.get_stackid(ctx, 0);
+
+    if (data.kernel_stack_id >= 0) {
+        u64 ip = PT_REGS_IP(ctx);
+        u64 page_offset;
+
+        // if ip isn't sane, leave key ips as zero for later checking
+#if defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE)
+        // x64, 4.16, ..., 4.11, etc., but some earlier kernel didn't have it
+        page_offset = __PAGE_OFFSET_BASE;
+#elif defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE_L4)
+        // x64, 4.17, and later
+#if defined(CONFIG_DYNAMIC_MEMORY_LAYOUT) && defined(CONFIG_X86_5LEVEL)
+        page_offset = __PAGE_OFFSET_BASE_L5;
+#else
+        page_offset = __PAGE_OFFSET_BASE_L4;
+#endif
+#else
+        // earlier x86_64 kernels, e.g., 4.6, comes here
+        // arm64, s390, powerpc, x86_32
+        page_offset = PAGE_OFFSET;
+#endif
+
+        if (ip > page_offset) {
+            data.kernel_ip = ip;
+        }
+    }
+#endif
+
 #ifdef GRAB_ARGS
     bpf_probe_read(&data.args[0], sizeof(data.args), entryp->args);
 #endif
@@ -155,6 +209,10 @@
 bpf_text = bpf_text.replace('DURATION_NS', str(duration_ns))
 if args.arguments:
     bpf_text = "#define GRAB_ARGS\n" + bpf_text
+if args.user_stack:
+    bpf_text = "#define USER_STACKS\n" + bpf_text
+if args.kernel_stack:
+    bpf_text = "#define KERNEL_STACKS\n" + bpf_text
 if args.tgid:
     bpf_text = bpf_text.replace('TGID_FILTER', 'tgid != %d' % args.tgid)
 else:
@@ -193,17 +251,22 @@
         ("duration_ns", ct.c_ulonglong),
         ("retval", ct.c_ulonglong),
         ("comm", ct.c_char * TASK_COMM_LEN)
-    ] + ([("args", ct.c_ulonglong * 6)] if args.arguments else [])
+    ] + ([("args", ct.c_ulonglong * 6)] if args.arguments else []) + \
+            ([("user_stack_id", ct.c_int)] if args.user_stack else []) + \
+            ([("kernel_stack_id", ct.c_int),("kernel_ip", ct.c_ulonglong)] if args.kernel_stack else [])
 
 time_designator = "us" if args.min_us else "ms"
 time_value = args.min_us or args.min_ms or 1
 time_multiplier = 1000 if args.min_us else 1000000
 time_col = args.time or args.timestamp
-print("Tracing function calls slower than %g %s... Ctrl+C to quit." %
-      (time_value, time_designator))
-print((("%-10s " % "TIME" if time_col else "") + "%-14s %-6s %7s %16s %s") %
-    ("COMM", "PID", "LAT(%s)" % time_designator, "RVAL",
-    "FUNC" + (" ARGS" if args.arguments else "")))
+
+# Do not print header when folded
+if not args.folded:
+    print("Tracing function calls slower than %g %s... Ctrl+C to quit." %
+          (time_value, time_designator))
+    print((("%-10s " % "TIME" if time_col else "") + "%-14s %-6s %7s %16s %s") %
+        ("COMM", "PID", "LAT(%s)" % time_designator, "RVAL",
+        "FUNC" + (" ARGS" if args.arguments else "")))
 
 earliest_ts = 0
 
@@ -222,12 +285,48 @@
         return ""
     return str.join(" ", ["0x%x" % arg for arg in event.args[:args.arguments]])
 
+def print_stack(event):
+    user_stack = []
+    stack_traces = b.get_table("stacks")
+
+    if args.user_stack and event.user_stack_id > 0:
+        user_stack = stack_traces.walk(event.user_stack_id)
+
+    kernel_stack = []
+    if args.kernel_stack and event.kernel_stack_id > 0:
+        kernel_tmp = stack_traces.walk(event.kernel_stack_id)
+
+        # fix kernel stack
+        for addr in kernel_tmp:
+            kernel_stack.append(addr)
+
+    do_delimiter = user_stack and kernel_stack
+
+    if args.folded:
+        # print folded stack output
+        user_stack = list(user_stack)
+        kernel_stack = list(kernel_stack)
+        line = [event.comm.decode()] + \
+            [b.sym(addr, event.tgid_pid) for addr in reversed(user_stack)] + \
+            (do_delimiter and ["-"] or []) + \
+            [b.ksym(addr) for addr in reversed(kernel_stack)]
+        print("%s %d" % (";".join(line), 1))
+    else:
+        # print default multi-line stack output.
+        for addr in kernel_stack:
+            print("    %s" % b.ksym(addr))
+        for addr in user_stack:
+            print("    %s" % b.sym(addr, event.tgid_pid))
+
 def print_event(cpu, data, size):
     event = ct.cast(data, ct.POINTER(Data)).contents
     ts = float(event.duration_ns) / time_multiplier
-    print((time_str(event) + "%-14.14s %-6s %7.2f %16x %s %s") %
-        (event.comm.decode(), event.tgid_pid >> 32,
-         ts, event.retval, args.functions[event.id], args_str(event)))
+    if not args.folded:
+        print((time_str(event) + "%-14.14s %-6s %7.2f %16x %s %s") %
+            (event.comm.decode(), event.tgid_pid >> 32,
+             ts, event.retval, args.functions[event.id], args_str(event)))
+    if args.user_stack or args.kernel_stack:
+        print_stack(event)
 
 b["events"].open_perf_buffer(print_event, page_cnt=64)
 while True:
diff --git a/tools/funcslower_example.txt b/tools/funcslower_example.txt
index ab50439..86524c2 100644
--- a/tools/funcslower_example.txt
+++ b/tools/funcslower_example.txt
@@ -75,9 +75,46 @@
 of the requested allocation. The return value is also shown (null return values
 would indicate a failure).
 
+# ./funcslower -U -m 30 '/usr/sbin/nginx:database_write'
+Tracing function calls slower than 30 ms... Ctrl+C to quit.
+COMM           PID    LAT(ms)             RVAL FUNC
+nginx          1617     30.15                9 /usr/sbin/nginx:database_write
+    DataBaseProvider::setData(std::string const&, record_s&)
+    UserDataProvider::saveRecordData(RecordData const&)
+    RequestProcessor::writeResponse(int)
+    RequestProcessor::processRequest()
+    RequestRouter::processRequest(RequestWrapper*, ResponseWrapper*)
+    ngx_http_core_content_phase
+    ngx_http_core_run_phases
+    ngx_http_process_request
+    ngx_process_events_and_timers
+    ngx_spawn_process
+    ngx_master_process_cycle
+    main
+    __libc_start_main
+    [unknown]
+nginx          1629     30.14                9 /usr/sbin/nginx:database_write
+    DataBaseProvider::setData(std::string const&, record_s&)
+    UserDataProvider::saveRecordData(RecordData const&)
+    RequestProcessor::writeResponse(int)
+    RequestProcessor::processRequest()
+    RequestRouter::processRequest(RequestWrapper*, ResponseWrapper*)
+    ngx_http_core_content_phase
+    ngx_http_core_run_phases
+    ngx_http_process_request
+    ngx_process_events_and_timers
+    ngx_spawn_process
+    ngx_master_process_cycle
+    main
+    __libc_start_main
+    [unknown]
+^C
+
+Shows the user space stack trace of calls to the user space function call open taking longer than 30 ms.
+
 USAGE message:
 
-usage: funcslower.py [-h] [-p PID] [-m MIN_MS] [-u MIN_US] [-a ARGUMENTS] [-T]
+usage: funcslower.py [-hf] [-p PID] [-U | -K] [-m MIN_MS] [-u MIN_US] [-a ARGUMENTS] [-T]
                      [-t] [-v]
                      function [function ...]
 
@@ -93,6 +130,11 @@
                         minimum duration to trace (ms)
   -u MIN_US, --min-us MIN_US
                         minimum duration to trace (us)
+  -U, --user-stack
+                        show stacks from user space
+  -K, --kernel-stack
+                        show stacks from kernel space
+  -f                    print output in folded stack format.
   -a ARGUMENTS, --arguments ARGUMENTS
                         print this many entry arguments, as hex
   -T, --time            show HH:MM:SS timestamp