Finalized $entry, $latency, and $retval implementation including examples and man
diff --git a/man/man8/argdist.8 b/man/man8/argdist.8
index 2f74597..63669b8 100644
--- a/man/man8/argdist.8
+++ b/man/man8/argdist.8
@@ -2,7 +2,7 @@
 .SH NAME
 argdist \- Trace a function and display a histogram or frequency count of its parameter values. Uses Linux eBPF/bcc.
 .SH SYNOPSIS
-.B argdist [-h] [-p PID] [-z STRING_SIZE] [-i INTERVAL] [-n COUNT] [-H specifier [specifier ...]] [-C specifier [specifier ...]]
+.B argdist [-h] [-p PID] [-z STRING_SIZE] [-i INTERVAL] [-n COUNT] [-v] [-T TOP] [-H specifier [specifier ...]] [-C specifier [specifier ...]]
 .SH DESCRIPTION
 argdist attaches to function entry and exit points, collects specified parameter
 values, and stores them in a histogram or a frequency collection that counts
@@ -30,6 +30,12 @@
 -n NUMBER
 Print the collected data COUNT times and then exit.
 .TP
+\-v
+Display the generated BPF program, for debugging purposes.
+.TP
+\-T TOP
+When collecting frequency counts, display only the top TOP entries.
+.TP
 -H SPECIFIER, -C SPECIFIER
 One or more probe specifications that instruct argdist which functions to
 probe, which parameters to collect, how to aggregate them, and whether to perform
@@ -47,8 +53,7 @@
 probes will collect the number of times every parameter value was observed,
 whereas histogram probes will collect the parameter values into a histogram.
 Only integral types can be used with histogram probes; there is no such limitation
-for counting probes. Return probes can only use the pseudo-variable $retval, which
-is an alias for the function's return value.
+for counting probes.
 .TP
 .B [library]
 Library containing the probe.
@@ -73,11 +78,20 @@
 These are the values that are assigned to the histogram or raw event collection.
 You may use the parameters directly, or valid C expressions that involve the
 parameters, such as "size % 10".
+Return probes can use the argument values received by the
+function when it was entered, through the $entry(paramname) special variable.
+Return probes can also access the function's return value in $retval, and the
+function's execution time in nanoseconds in $latency. Note that adding the
+$latency or $entry(paramname) variables to the expression will introduce an
+additional probe at the function's entry to collect this data, and therefore
+introduce additional overhead.
 .TP
 .B [filter]
 The filter applied to the captured data.
 Only parameter values that pass the filter will be collected. This is any valid
 C expression that refers to the parameter values, such as "fd == 1 && length > 16".
+The $entry, $retval, and $latency variables can be used here as well, in return
+probes.
 .TP
 .B [label]
 The label that will be displayed when printing the probed values. By default,
@@ -96,6 +110,10 @@
 #
 .B argdist.py -C 'r:c:gets():char*:$retval'
 .TP
+Print a histogram of read sizes that were longer than 1ms:
+#
+.B argdist.py -H 'r::__vfs_read(void *file, void *buf, size_t count):size_t:$entry(count):$latency > 1000000'
+.TP
 Print frequency counts of how many times writes were issued to a particular file descriptor number, in process 1005:
 #
 .B argdist.py -p 1005 -C 'p:c:write(int fd):int:fd'
@@ -114,7 +132,7 @@
 .TP
 Print histograms of sleep() and nanosleep() parameter values:
 #
-.B argdist.py -H 'p:c:sleep(u32 seconds):u32:seconds' -H 'p:c:nanosleep(struct timespec { time_t tv_sec; long tv_nsec; } *req):long:req->tv_nsec'
+.B argdist.py -H 'p:c:sleep(u32 seconds):u32:seconds' 'p:c:nanosleep(struct timespec { time_t tv_sec; long tv_nsec; } *req):long:req->tv_nsec'
 .TP
 Spy on writes to STDOUT performed by process 2780, up to a string size of 120 characters:
 #
diff --git a/tools/argdist.py b/tools/argdist.py
index b258827..dd111f7 100755
--- a/tools/argdist.py
+++ b/tools/argdist.py
@@ -74,22 +74,30 @@
                 text = text.replace("PID_FILTER", pid_filter)
                 collect = ""
                 for pname in self.args_to_probe:
-                        collect += "%s.update(&pid, &%s);\n" % \
-                                   (self.hashname_prefix + pname, pname)
+                        param_hash = self.hashname_prefix + pname
+                        if pname == "__latency":
+                                collect += """
+u64 __time = bpf_ktime_get_ns();
+%s.update(&pid, &__time);
+"""                             % param_hash
+                        else:
+                                collect += "%s.update(&pid, &%s);\n" % \
+                                           (param_hash, pname)
                 text = text.replace("COLLECT", collect)
                 return text
 
         def _generate_entry_probe(self):
-                # TODO $latency as a special keyword that should be traced
                 # Any $entry(name) expressions result in saving that argument
                 # when entering the function.
                 self.args_to_probe = set()
                 regex = r"\$entry\((\w+)\)"
-                for arg in re.finditer(regex, self.expr or ""):
+                for arg in re.finditer(regex, self.expr):
                         self.args_to_probe.add(arg.group(1))
-                for arg in re.finditer(regex, self.filter or ""):
+                for arg in re.finditer(regex, self.filter):
                         self.args_to_probe.add(arg.group(1))
-
+                if "$latency" in self.expr or "$latency" in self.filter:
+                        self.args_to_probe.add("__latency")
+                        self.param_types["__latency"] = "u64"    # nanoseconds
                 for pname in self.args_to_probe:
                         if pname not in self.param_types:
                                 raise ValueError("$entry(%s): no such param" \
@@ -124,12 +132,15 @@
 
         def _replace_entry_exprs(self):
                 for pname, vname in self.param_val_names.items():
-                        entry_expr = "$entry(%s)" % pname
-                        val_expr = "*" + vname   # dereference the pointer
+                        if pname == "__latency":
+                                entry_expr = "$latency"
+                                val_expr = "(bpf_ktime_get_ns() - *%s)" % vname
+                        else:
+                                entry_expr = "$entry(%s)" % pname
+                                val_expr = "(*%s)" % vname
                         self.expr = self.expr.replace(entry_expr, val_expr)
-                        if self.filter is not None:
-                                self.filter = self.filter.replace(entry_expr,
-                                                                  val_expr)
+                        self.filter = self.filter.replace(entry_expr,
+                                                          val_expr)
 
         def _attach_entry_probe(self):
                 if self.is_user:
@@ -177,14 +188,14 @@
                         self.expr_type = \
                                 "u64" if not self.is_ret_probe else "int"
                         self.expr = "1" if not self.is_ret_probe else "$retval"
-                self.filter = None if len(parts) != 6 else parts[5]
+                self.filter = "" if len(parts) != 6 else parts[5]
                 self._substitute_exprs()
 
                 # Do we need to attach an entry probe so that we can collect an 
                 # argument that is required for an exit (return) probe?
                 self.entry_probe_required = self.is_ret_probe and \
-                       ("$entry" in self.expr or \
-                        "$entry" in (self.filter or ""))
+                       ("$entry" in self.expr or "$entry" in self.filter or
+                        "$latency" in self.expr or "$latency" in self.filter)
 
                 self.pid = pid
                 # Generating unique names for probes means we can attach
@@ -198,9 +209,8 @@
         def _substitute_exprs(self):
                 self.expr = self.expr.replace("$retval",
                                               "(%s)ctx->ax" % self.expr_type)
-                if self.filter is not None:
-                        self.filter = self.filter.replace("$retval",
-                                "(%s)ctx->ax" % self.expr_type)
+                self.filter = self.filter.replace("$retval",
+                                              "(%s)ctx->ax" % self.expr_type)
                 self.expr = self._substitute_aliases(self.expr)
                 self.filter = self._substitute_aliases(self.filter)
 
@@ -264,7 +274,8 @@
                                    (self.expr_type, self.expr)
                 program = program.replace("DATA_DECL", decl)
                 program = program.replace("KEY_EXPR", key_expr) 
-                program = program.replace("FILTER", self.filter or "1") 
+                program = program.replace("FILTER",
+                        "1" if len(self.filter) == 0 else self.filter) 
                 program = program.replace("COLLECT", collect)
                 program = program.replace("PREFIX", prefix)
                 return program
@@ -293,9 +304,9 @@
                         self._attach_entry_probe()
 
         def display(self, top):
-                print(self.label or self.raw_spec)
                 data = self.bpf.get_table(self.probe_hash_name)
                 if self.type == "freq":
+                        print(self.label or self.raw_spec)
                         print("\t%-10s %s" % ("COUNT", "EVENT"))
                         data = sorted(data.items(), key=lambda kv: kv[1].value)
                         if top is not None:
@@ -317,8 +328,9 @@
                                 print("\t%-10s %s" % \
                                       (str(value.value), key_str))
                 elif self.type == "hist":
-                        label = self.expr if not self.is_default_expr \
-                                          else "retval"
+                        label = self.label or \
+                                (self.expr if not self.is_default_expr \
+                                           else "retval")
                         data.print_log2_hist(val_type=label)
 
 examples = """
@@ -326,7 +338,7 @@
         {p,r}:[library]:function(signature)[:type:expr[:filter]][;label]
 Where:
         p,r        -- probe at function entry or at function exit
-                      in exit probes, only $retval is accessible
+                      in exit probes: can use $retval, $entry(param), $latency
         library    -- the library that contains the function
                       (leave empty for kernel functions)
         function   -- the function name to trace
@@ -348,6 +360,9 @@
 argdist.py -C 'r:c:gets():char*:$retval;snooped strings'
         Snoop on all strings returned by gets()
 
+argdist.py -H 'r::__kmalloc(size_t size):u64:$latency/$entry(size);ns per byte'
+        Print a histogram of nanoseconds per byte from kmalloc allocations
+
 argdist.py -p 1005 -C 'p:c:write(int fd):int:fd' -T 5
         Print frequency counts of how many times writes were issued to a
         particular file descriptor number, in process 1005, but only show
@@ -356,6 +371,12 @@
 argdist.py -p 1005 -H 'r:c:read()'
         Print a histogram of error codes returned by read() in process 1005
 
+argdist.py -C 'r::__vfs_read():u32:$PID:$latency > 100000'
+        Print frequency of reads by process where the latency was >0.1ms
+
+argdist.py -H 'r::__vfs_read(void *file, void *buf, size_t count):size_t:$entry(count):$latency > 1000000' 
+        Print a histogram of read sizes that were longer than 1ms
+
 argdist.py -H \\
         'p:c:write(int fd, const void *buf, size_t count):size_t:count:fd==1'
         Print a histogram of buffer sizes passed to write() across all
diff --git a/tools/argdist_examples.txt b/tools/argdist_examples.txt
index 46cb8d9..999b927 100644
--- a/tools/argdist_examples.txt
+++ b/tools/argdist_examples.txt
@@ -183,6 +183,54 @@
       1024 -> 2047       : 0        |                                        |
       2048 -> 4095       : 2        |**                                      |
 
+In return probes, you can also trace the latency of the function (unless it is
+recursive) and the parameters it had on entry. For example, we can identify
+which processes are performing slow synchronous filesystem reads -- say,
+longer than 0.1ms (100,000ns):
+
+# ./argdist.py -C 'r::__vfs_read():u32:$PID:$latency > 100000'
+[01:08:48]
+r::__vfs_read():u32:$PID:$latency > 100000
+        COUNT      EVENT
+        1          bpf_get_current_pid_tgid() = 10457
+        21         bpf_get_current_pid_tgid() = 2780
+[01:08:49]
+r::__vfs_read():u32:$PID:$latency > 100000
+        COUNT      EVENT
+        1          bpf_get_current_pid_tgid() = 10457
+        21         bpf_get_current_pid_tgid() = 2780
+^C
+
+As you see, the $PID alias is expanded to the BPF function bpf_get_current_pid_tgid(),
+which returns the current process' pid. It looks like process 2780 performed
+21 slow reads.
+
+Occasionally, entry parameter values are also interesting. For example, you
+might be curious how long it takes malloc() to allocate memory -- nanoseconds
+per byte allocated. Let's go:
+
+# ./argdist.py -H 'r:c:malloc(size_t size):u64:$latency/$entry(size);ns per byte' -n 1 -i 10
+[01:11:13]
+     ns per byte         : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 4        |*****************                       |
+         4 -> 7          : 3        |*************                           |
+         8 -> 15         : 2        |********                                |
+        16 -> 31         : 1        |****                                    |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 7        |*******************************         |
+       128 -> 255        : 1        |****                                    |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 1        |****                                    |
+      1024 -> 2047       : 1        |****                                    |
+      2048 -> 4095       : 9        |****************************************|
+      4096 -> 8191       : 1        |****                                    | 
+
+It looks like a tri-modal distribution. Some allocations are extremely cheap,
+and take 2-15 nanoseconds per byte. Other allocations are slower, and take
+64-127 nanoseconds per byte. And some allocations are slower still, and take
+multiple microseconds per byte.
+
 Here's a final example that finds how many write() system calls are performed
 by each process on the system:
 
@@ -200,15 +248,13 @@
         23         bpf_get_current_pid_tgid() = 7615
         23         bpf_get_current_pid_tgid() = 2480
 
-As you see, the $PID alias is expanded to the BPF function bpf_get_current_pid_tgid(),
-which returns the current process' pid.
-
 
 USAGE message:
 
-usage: argdist.py [-h] [-p PID] [-z STRING_SIZE] [-i INTERVAL] [-n COUNT]
-                  [-H [HISTSPECIFIER [HISTSPECIFIER ...]]]
-                  [-C [COUNTSPECIFIER [COUNTSPECIFIER ...]]] [-v]
+# argdist.py -h
+usage: argdist.py [-h] [-p PID] [-z STRING_SIZE] [-i INTERVAL] [-n COUNT] [-v]
+                  [-T TOP] [-H [HISTSPECIFIER [HISTSPECIFIER ...]]]
+                  [-C [COUNTSPECIFIER [COUNTSPECIFIER ...]]]
 
 Trace a function and display a summary of its parameter values.
 
@@ -221,19 +267,21 @@
                         output interval, in seconds
   -n COUNT, --number COUNT
                         number of outputs
+  -v, --verbose         print resulting BPF program code before executing
+  -T TOP, --top TOP     number of top results to show (not applicable to
+                        histograms)
   -H [HISTSPECIFIER [HISTSPECIFIER ...]], --histogram [HISTSPECIFIER [HISTSPECIFIER ...]]
                         probe specifier to capture histogram of (see examples
                         below)
   -C [COUNTSPECIFIER [COUNTSPECIFIER ...]], --count [COUNTSPECIFIER [COUNTSPECIFIER ...]]
                         probe specifier to capture count of (see examples
                         below)
-  -v, --verbose         print resulting BPF program code before executing
 
 Probe specifier syntax:
         {p,r}:[library]:function(signature)[:type:expr[:filter]][;label]
 Where:
         p,r        -- probe at function entry or at function exit
-                      in exit probes, only $retval is accessible
+                      in exit probes: can use $retval, $entry(param), $latency
         library    -- the library that contains the function
                       (leave empty for kernel functions)
         function   -- the function name to trace
@@ -255,13 +303,23 @@
 argdist.py -C 'r:c:gets():char*:$retval;snooped strings'
         Snoop on all strings returned by gets()
 
-argdist.py -p 1005 -C 'p:c:write(int fd):int:fd'
+argdist.py -H 'r::__kmalloc(size_t size):u64:$latency/$entry(size);ns per byte'
+        Print a histogram of nanoseconds per byte from kmalloc allocations
+
+argdist.py -p 1005 -C 'p:c:write(int fd):int:fd' -T 5
         Print frequency counts of how many times writes were issued to a
-        particular file descriptor number, in process 1005
+        particular file descriptor number, in process 1005, but only show
+        the top 5 busiest fds
 
 argdist.py -p 1005 -H 'r:c:read()'
         Print a histogram of error codes returned by read() in process 1005
 
+argdist.py -C 'r::__vfs_read():u32:$PID:$latency > 100000'
+        Print frequency of reads by process where the latency was >0.1ms
+
+argdist.py -H 'r::__vfs_read(void *file, void *buf, size_t count):size_t:$entry(count):$latency > 1000000' 
+        Print a histogram of read sizes that were longer than 1ms
+
 argdist.py -H \
         'p:c:write(int fd, const void *buf, size_t count):size_t:count:fd==1'
         Print a histogram of buffer sizes passed to write() across all
@@ -269,15 +327,15 @@
 
 argdist.py -C 'p:c:fork();fork calls'
         Count fork() calls in libc across all processes
-        Can also use funccount.py, which is easier and more flexible
+        Can also use funccount.py, which is easier and more flexible 
 
-argdist.py \
-        -H 'p:c:sleep(u32 seconds):u32:seconds' \
-        -H 'p:c:nanosleep(struct timespec { time_t tv_sec; long tv_nsec; } *req):long:req->tv_nsec'
+argdist.py -H \
+        'p:c:sleep(u32 seconds):u32:seconds' \
+        'p:c:nanosleep(struct timespec { time_t tv_sec; long tv_nsec; } *req):long:req->tv_nsec'
         Print histograms of sleep() and nanosleep() parameter values
 
 argdist.py -p 2780 -z 120 \
         -C 'p:c:write(int fd, char* buf, size_t len):char*:buf:fd==1'
         Spy on writes to STDOUT performed by process 2780, up to a string size
-        of 120 characters
+        of 120 characters