tools/trace.py: aggregate trace events by '-A' option

Orignally we can save the backtrace of a specified function into a
text file, then try to analyze the messy trace to guess why the
function is called.

To make the work easier, aggregate amount of trace by -A with -M EVENTS.
A typical example:
1, if we find that the sys CPU utilization is higher by 'top' command
2, then find that the timer interrupt is more normal by 'irqtop' command
3, to confirm kernel timer setting frequence by 'funccount -i 1 clockevents_program_event'
4, to trace timer setting by 'trace clockevents_program_event -K -A -M 1000'

1294576 1294584 CPU 0/KVM       clockevents_program_event
        clockevents_program_event+0x1 [kernel]
        hrtimer_start_range_ns+0x209 [kernel]
        start_sw_timer+0x173 [kvm]
        restart_apic_timer+0x6c [kvm]
        kvm_set_msr_common+0x442 [kvm]
        __kvm_set_msr+0xa2 [kvm]
        kvm_emulate_wrmsr+0x36 [kvm]
        vcpu_enter_guest+0x326 [kvm]
        kvm_arch_vcpu_ioctl_run+0xcc [kvm]
        kvm_vcpu_ioctl+0x22f [kvm]
        do_vfs_ioctl+0xa1 [kernel]
        ksys_ioctl+0x60 [kernel]
        __x64_sys_ioctl+0x16 [kernel]
        do_syscall_64+0x59 [kernel]
        entry_SYSCALL_64_after_hwframe+0x44 [kernel]
-->COUNT 271

...
Then we can know that 271 timer setting in recent 1000(~27%) occurs in
KVM code path.

To avoid too large size of a dictionary in python, -A works always with
-M.

Also fix several minor changes to keep 'trace -h' aligned with trace_example.txt.

Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
diff --git a/tools/trace.py b/tools/trace.py
index 0f6d90e..b51cccf 100755
--- a/tools/trace.py
+++ b/tools/trace.py
@@ -5,6 +5,7 @@
 #
 # usage: trace [-h] [-p PID] [-L TID] [-v] [-Z STRING_SIZE] [-S] [-c cgroup_path]
 #              [-M MAX_EVENTS] [-s SYMBOLFILES] [-T] [-t] [-K] [-U] [-a] [-I header]
+#              [-A]
 #              probe [probe ...]
 #
 # Licensed under the Apache License, Version 2.0 (the "License")
@@ -40,6 +41,8 @@
         uid = -1
         page_cnt = None
         build_id_enabled = False
+        aggregate = False
+        symcount = {}
 
         @classmethod
         def configure(cls, args):
@@ -58,6 +61,10 @@
                 cls.page_cnt = args.buffer_pages
                 cls.bin_cmp = args.bin_cmp
                 cls.build_id_enabled = args.sym_file_list is not None
+                cls.aggregate = args.aggregate
+                if cls.aggregate and cls.max_events is None:
+                        raise ValueError("-M/--max-events should be specified"
+                                         " with -A/--aggregate")
 
         def __init__(self, probe, string_size, kernel_stack, user_stack,
                      cgroup_map_name, name, msg_filter):
@@ -584,18 +591,20 @@
                 else:   # self.probe_type == 't'
                         return self.tp_event
 
-        def print_stack(self, bpf, stack_id, tgid):
+        def _stack_to_string(self, bpf, stack_id, tgid):
             if stack_id < 0:
-                print("        %d" % stack_id)
-                return
+                return ("        %d" % stack_id)
 
+            stackstr = ''
             stack = list(bpf.get_table(self.stacks_name).walk(stack_id))
             for addr in stack:
-                print("        ", end="")
+                stackstr += '        '
                 if Probe.print_address:
-                    print("%16x " % addr, end="")
-                print("%s" % (bpf.sym(addr, tgid,
-                                     show_module=True, show_offset=True)))
+                    stackstr += ("%16x " % addr)
+                symstr = bpf.sym(addr, tgid, show_module=True, show_offset=True)
+                stackstr += ('%s\n' % (symstr.decode('utf-8')))
+
+            return stackstr
 
         def _format_message(self, bpf, tgid, values):
                 # Replace each %K with kernel sym and %U with user sym in tgid
@@ -610,6 +619,11 @@
                                            show_module=True, show_offset=True)
                 return self.python_format % tuple(values)
 
+        def print_aggregate_events(self):
+                for k, v in sorted(self.symcount.items(), key=lambda item: \
+                                   item[1], reverse=True):
+                    print("%s-->COUNT %d\n\n" % (k, v), end="")
+
         def print_event(self, bpf, cpu, data, size):
                 # Cast as the generated structure type and display
                 # according to the format string in the probe.
@@ -621,32 +635,43 @@
                 msg = self._format_message(bpf, event.tgid, values)
                 if self.msg_filter and self.msg_filter not in msg:
                     return
+                eventstr = ''
                 if Probe.print_time:
                     time = strftime("%H:%M:%S") if Probe.use_localtime else \
                            Probe._time_off_str(event.timestamp_ns)
                     if Probe.print_unix_timestamp:
-                        print("%-17s " % time[:17], end="")
+                        eventstr += ("%-17s " % time[:17])
                     else:
-                        print("%-8s " % time[:8], end="")
+                        eventstr += ("%-8s " % time[:8])
                 if Probe.print_cpu:
-                    print("%-3s " % event.cpu, end="")
-                print("%-7d %-7d %-15s %-16s %s" %
+                    eventstr += ("%-3s " % event.cpu)
+                eventstr += ("%-7d %-7d %-15s %-16s %s\n" %
                       (event.tgid, event.pid,
                        event.comm.decode('utf-8', 'replace'),
                        self._display_function(), msg))
 
                 if self.kernel_stack:
-                        self.print_stack(bpf, event.kernel_stack_id, -1)
+                        eventstr += self._stack_to_string(bpf, event.kernel_stack_id, -1)
                 if self.user_stack:
-                        self.print_stack(bpf, event.user_stack_id, event.tgid)
-                if self.user_stack or self.kernel_stack:
+                        eventstr += self._stack_to_string(bpf, event.user_stack_id, event.tgid)
+
+                if self.aggregate is False:
+                    print(eventstr, end="")
+                    if self.kernel_stack or self.user_stack:
                         print("")
+                else:
+                    if eventstr in self.symcount:
+                        self.symcount[eventstr] += 1
+                    else:
+                        self.symcount[eventstr] = 1
 
                 Probe.event_count += 1
                 if Probe.max_events is not None and \
                    Probe.event_count >= Probe.max_events:
-                        exit()
-                sys.stdout.flush()
+                    if self.aggregate:
+                        self.print_aggregate_events()
+                    sys.stdout.flush()
+                    exit()
 
         def attach(self, bpf, verbose):
                 if len(self.library) == 0:
@@ -700,7 +725,7 @@
 trace kfree_skb+0x12
         Trace the kfree_skb kernel function after the instruction on the 0x12 offset
 trace 'do_sys_open "%s", arg2@user'
-        Trace the open syscall and print the filename. being opened @user is
+        Trace the open syscall and print the filename being opened @user is
         added to arg2 in kprobes to ensure that char * should be copied from
         the userspace stack to the bpf stack. If not specified, previous
         behaviour is expected.
@@ -752,6 +777,9 @@
         to 53 (DNS; 13568 in big endian order)
 trace -I 'linux/fs_struct.h' 'mntns_install "users = %d", $task->fs->users'
         Trace the number of users accessing the file system of the current task
+trace -s /lib/x86_64-linux-gnu/libc.so.6,/bin/ping 'p:c:inet_pton' -U
+        Trace inet_pton system call and use the specified libraries/executables for
+        symbol resolution.
 """
 
         def __init__(self):
@@ -815,6 +843,8 @@
                        "as either full path, "
                        "or relative to current working directory, "
                        "or relative to default kernel header search path")
+                parser.add_argument("-A", "--aggregate", action="store_true",
+                  help="aggregate amount of each trace")
                 parser.add_argument("--ebpf", action="store_true",
                   help=argparse.SUPPRESS)
                 self.args = parser.parse_args()