trace: add pid/tid filtering, fix symbolizing, misc nits (#798)

* support filtering by process ID (-p) or thread ID (-t); previously -p
  actually filtered on thread ID (aka "pid" in kernel-speak)
* include process and thread ID in output
* flip order of user and kernel stacks to flow more naturally
* resolve symbols using process ID instead of thread ID so only one symbol
  cache is instantiated per process
* misc aesthetic fixes here and there
diff --git a/tools/trace_example.txt b/tools/trace_example.txt
index f1e953d..08b9061 100644
--- a/tools/trace_example.txt
+++ b/tools/trace_example.txt
@@ -171,8 +171,9 @@
 USAGE message:
 
 # trace -h
-usage: trace.py [-h] [-p PID] [-v] [-Z STRING_SIZE] [-S] [-M MAX_EVENTS] [-o]
-                probe [probe ...]
+usage: trace [-h] [-p PID] [-t TID] [-v] [-Z STRING_SIZE] [-S]
+             [-M MAX_EVENTS] [-o] [-K] [-U] [-I header]
+             probe [probe ...]
 
 Attach to functions and print trace messages.
 
@@ -182,6 +183,7 @@
 optional arguments:
   -h, --help            show this help message and exit
   -p PID, --pid PID     id of the process to trace (optional)
+  -t TID, --tid TID     id of the thread to trace (optional)
   -v, --verbose         print resulting BPF program code before executing
   -Z STRING_SIZE, --string-size STRING_SIZE
                         maximum size to read from strings
@@ -202,8 +204,8 @@
         Trace the open syscall and print the filename being opened
 trace 'sys_read (arg3 > 20000) "read %d bytes", arg3'
         Trace the read syscall and print a message for reads >20000 bytes
-trace r::do_sys_return
-        Trace the return from the open syscall
+trace 'r::do_sys_open "%llx", retval'
+        Trace the return from the open syscall and print the return value
 trace 'c:open (arg2 == 42) "%s %d", arg1, arg2'
         Trace the open() call from libc only if the flags (arg2) argument is 42
 trace 'c:malloc "size = %d", arg1'
@@ -218,4 +220,3 @@
         Trace the block_rq_complete kernel tracepoint and print # of tx sectors
 trace 'u:pthread:pthread_create (arg4 != 0)'
         Trace the USDT probe pthread_create when its 4th argument is non-zero
-