Merge pull request #835 from ColinIanKing/master

Add snapcraft script to package up bcc as a snap
diff --git a/README.md b/README.md
index 719ee12..3fe4dc9 100644
--- a/README.md
+++ b/README.md
@@ -129,6 +129,12 @@
 - tools/[tplist](tools/tplist.py): Display kernel tracepoints or USDT probes and their formats. [Examples](tools/tplist_example.txt).
 - tools/[trace](tools/trace.py): Trace arbitrary functions, with filters. [Examples](tools/trace_example.txt)
 - tools/[ttysnoop](tools/ttysnoop.py): Watch live output from a tty or pts device. [Examples](tools/ttysnoop_example.txt)
+- tools/[ucalls](tools/ucalls.py): Summarize method calls or Linux syscalls in high-level languages. [Examples](tools/ucalls_example.txt)
+- tools/[uflow](tools/uflow.py): Print a method flow graph in high-level languages. [Examples](tools/uflow_example.txt)
+- tools/[ugc](tools/ugc.py): Trace garbage collection events in high-level languages. [Examples](tools/ugc_example.txt)
+- tools/[uobjnew](tools/uobjnew.py): Summarize object allocation events by object type and number of bytes allocated. [Examples](tools/uobjnew_example.txt)
+- tools/[ustat](tools/ustat.py): Collect events such as GCs, thread creations, object allocations, exceptions and more in high-level languages. [Examples](tools/ustat_example.txt)
+- tools/[uthreads](tools/uthreads.py): Trace thread creation events in Java and raw pthreads. [Examples](tools/uthreads_example.txt)
 - tools/[vfscount](tools/vfscount.py) tools/[vfscount.c](tools/vfscount.c): Count VFS calls. [Examples](tools/vfscount_example.txt).
 - tools/[vfsstat](tools/vfsstat.py) tools/[vfsstat.c](tools/vfsstat.c): Count some VFS calls, with column output. [Examples](tools/vfsstat_example.txt).
 - tools/[wakeuptime](tools/wakeuptime.py): Summarize sleep to wakeup time by waker kernel stack. [Examples](tools/wakeuptime_example.txt).
diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt
index b720d5f..998315c 100644
--- a/examples/cpp/CMakeLists.txt
+++ b/examples/cpp/CMakeLists.txt
@@ -26,3 +26,7 @@
 add_executable(LLCStat LLCStat.cc)
 target_link_libraries(LLCStat bcc-static)
 install (TARGETS LLCStat DESTINATION share/bcc/examples/cpp)
+
+add_executable(FollyRequestContextSwitch FollyRequestContextSwitch.cc)
+target_link_libraries(FollyRequestContextSwitch bcc-static)
+install (TARGETS FollyRequestContextSwitch DESTINATION share/bcc/examples/cpp)
diff --git a/examples/cpp/FollyRequestContextSwitch.cc b/examples/cpp/FollyRequestContextSwitch.cc
new file mode 100644
index 0000000..bf3493e
--- /dev/null
+++ b/examples/cpp/FollyRequestContextSwitch.cc
@@ -0,0 +1,105 @@
+/*
+ * FollyRequestContextSwitch Monitor RequestContext switch events for any binary
+ *                           that uses the class from [folly](http://bit.ly/2h6S1yx).
+ *                           For Linux, uses BCC, eBPF. Embedded C.
+ *
+ * Basic example of using USDT with BCC.
+ *
+ * USAGE: FollyRequestContextSwitch PATH_TO_BINARY
+ *
+ * Copyright (c) Facebook, Inc.
+ * Licensed under the Apache License, Version 2.0 (the "License")
+ */
+
+#include <signal.h>
+#include <iostream>
+#include <vector>
+
+#include "BPF.h"
+
+const std::string BPF_PROGRAM = R"(
+#include <linux/sched.h>
+#include <uapi/linux/ptrace.h>
+
+struct event_t {
+  int pid;
+  char name[16];
+  uint64_t old_addr;
+  uint64_t new_addr;
+};
+
+BPF_PERF_OUTPUT(events);
+
+int on_context_switch(struct pt_regs *ctx) {
+  struct event_t event = {};
+
+  event.pid = bpf_get_current_pid_tgid();
+  bpf_get_current_comm(&event.name, sizeof(event.name));
+  
+  bpf_usdt_readarg(1, ctx, &event.old_addr);
+  bpf_usdt_readarg(2, ctx, &event.new_addr);
+
+  events.perf_submit(ctx, &event, sizeof(event));
+  return 0;
+}
+)";  // BPF C source text; main() passes it to BPF::init()
+
+// Mirror of BPF_PROGRAM's event_t for user space; layouts must stay in sync.
+struct event_t {
+  int pid;            // from bpf_get_current_pid_tgid(), narrowed to int
+  char name[16];      // filled by bpf_get_current_comm()
+  uint64_t old_addr;  // USDT argument 1
+  uint64_t new_addr;  // USDT argument 2
+};
+
+void handle_output(void* cb_cookie, void* data, int data_size) {
+  const event_t& rec = *static_cast<event_t*>(data);  // one perf record
+  std::cout << "PID " << rec.pid << " (" << rec.name << ") "
+            << "folly::RequestContext switch from " << rec.old_addr << " to "
+            << rec.new_addr << std::endl;
+}
+
+ebpf::BPF* bpf;  // global so signal_handler can delete it; allocated in main()
+
+void signal_handler(int s) {  // installed for SIGINT by main()
+  std::cerr << "Terminating..." << std::endl;
+  delete bpf;  // NOTE(review): presumably ~BPF detaches the probes -- confirm
+  exit(0);  // NOTE(review): iostream/delete/exit in a handler -- check safety
+}
+
+int main(int argc, char** argv) {
+  if (argc != 2) {
+    std::cout << "USAGE: FollyRequestContextSwitch PATH_TO_BINARY" << std::endl;
+    exit(1);
+  }
+  const std::string target(argv[1]);
+  // Owned by the global pointer so signal_handler can delete it on SIGINT.
+  bpf = new ebpf::BPF();
+  std::vector<ebpf::USDT> probes;
+  probes.emplace_back(target, "folly", "request_context_switch_before",
+                      "on_context_switch");
+  auto init_status = bpf->init(BPF_PROGRAM, {}, probes);
+  if (init_status.code() != 0) {
+    std::cerr << init_status.msg() << std::endl;
+    return 1;
+  }
+  // The probe handed to init() above is passed again to select what to attach.
+  auto attach_status = bpf->attach_usdt(probes.front());
+  if (attach_status.code() != 0) {
+    std::cerr << attach_status.msg() << std::endl;
+    return 1;
+  }
+  // handle_output is registered as the callback for the "events" perf buffer.
+  auto open_status = bpf->open_perf_buffer("events", &handle_output);
+  if (open_status.code() != 0) {
+    std::cerr << open_status.msg() << std::endl;
+    return 1;
+  }
+  // The poll loop never returns; SIGINT terminates via signal_handler.
+  signal(SIGINT, signal_handler);
+  std::cout << "Started tracing, hit Ctrl-C to terminate." << std::endl;
+  for (;;)
+    bpf->poll_perf_buffer("events");
+
+  return 0;
+}
diff --git a/man/man8/trace.8 b/man/man8/trace.8
index c65f849..536bbb8 100644
--- a/man/man8/trace.8
+++ b/man/man8/trace.8
@@ -2,8 +2,8 @@
 .SH NAME
 trace \- Trace a function and print its arguments or return value, optionally evaluating a filter. Uses Linux eBPF/bcc.
 .SH SYNOPSIS
-.B trace [-h] [-p PID] [-t TID] [-v] [-Z STRING_SIZE] [-S]
-         [-M MAX_EVENTS] [-o] [-K] [-U] [-I header]
+.B trace [-h] [-p PID] [-L TID] [-v] [-Z STRING_SIZE] [-S]
+         [-M MAX_EVENTS] [-t] [-T] [-K] [-U] [-I header]
          probe [probe ...]
 .SH DESCRIPTION
 trace probes functions you specify and displays trace messages if a particular
@@ -21,7 +21,7 @@
 \-p PID
 Trace only functions in the process PID.
 .TP
-\-t TID
+\-L TID
 Trace only functions in the thread TID.
 .TP
 \-v
@@ -39,9 +39,11 @@
 \-M MAX_EVENTS
 Print up to MAX_EVENTS trace messages and then exit.
 .TP
-\-o
-Print times relative to the beginning of the trace (offsets), in seconds. The
-default is to print absolute time.
+\-t
+Print times relative to the beginning of the trace (offsets), in seconds.
+.TP
+\-T
+Print the time column.
 .TP
 \-K
 Print the kernel stack for each event.
diff --git a/man/man8/ucalls.8 b/man/man8/ucalls.8
new file mode 100644
index 0000000..b1f4710
--- /dev/null
+++ b/man/man8/ucalls.8
@@ -0,0 +1,84 @@
+.TH ucalls 8  "2016-11-07" "USER COMMANDS"
+.SH NAME
+ucalls \- Summarize method calls from high-level languages and Linux syscalls.
+.SH SYNOPSIS
+.B ucalls [-l {java,python,ruby}] [-h] [-T TOP] [-L] [-S] [-v] [-m] pid [interval]
+.SH DESCRIPTION
+This tool summarizes method calls from high-level languages such as Python, 
+Java, and Ruby. It can also trace Linux system calls. Whenever a method is 
+invoked, ucalls records the call count and optionally the method's execution
+time (latency) and displays a summary.
+
+This uses in-kernel eBPF maps to store per process summaries for efficiency.
+
+This tool relies on USDT probes embedded in many high-level languages, such as
+Node, Java, Python, and Ruby. It requires a runtime instrumented with these 
+probes, which in some cases requires building from source with a USDT-specific
+flag, such as "--enable-dtrace" or "--with-dtrace". For Java, method probes are
+not enabled by default, and can be turned on by running the Java process with
+the "-XX:+ExtendedDTraceProbes" flag.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-l {java,python,ruby}
+The language to trace. If not provided, only syscalls are traced (when the \-S
+option is used).
+.TP
+\-T TOP
+Print only the top methods by frequency or latency.
+.TP
+\-L
+Collect method invocation latency (duration).
+.TP
+\-S
+Collect Linux syscalls frequency and timing.
+.TP
+\-v
+Print the resulting BPF program, for debugging purposes.
+.TP
+\-m
+Print times in milliseconds (the default is microseconds).
+.TP
+pid
+The process id to trace.
+.TP
+interval
+Print summary after this number of seconds and then exit. By default, wait for
+Ctrl+C to terminate.
+.SH EXAMPLES
+.TP
+Trace the top 10 Ruby method calls:
+#
+.B ucalls -T 10 -l ruby 1344
+.TP
+Trace Python method calls and Linux syscalls including latency in milliseconds:
+#
+.B ucalls -l python -mSL 2020
+.TP
+Trace only syscalls and print a summary after 10 seconds:
+#
+.B ucalls -S 788 10
+.SH OVERHEAD
+Tracing individual method calls will produce a considerable overhead in all
+high-level languages. For languages with just-in-time compilation, such as 
+Java, the overhead can be more considerable than for interpreted languages. 
+On the other hand, syscall tracing will typically be tolerable for most 
+processes, unless they have a very unusual rate of system calls.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _example.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Sasha Goldshtein
+.SH SEE ALSO
+ustat(8), argdist(8)
diff --git a/man/man8/uflow.8 b/man/man8/uflow.8
new file mode 100644
index 0000000..35daff2
--- /dev/null
+++ b/man/man8/uflow.8
@@ -0,0 +1,84 @@
+.TH uflow 8  "2016-11-07" "USER COMMANDS"
+.SH NAME
+uflow \- Print a flow graph of method calls in high-level languages.
+.SH SYNOPSIS
+.B uflow [-h] [-M METHOD] [-C CLAZZ] [-v] {java,python,ruby} pid
+.SH DESCRIPTION
+uflow traces method calls and prints them in a flow graph that can facilitate
+debugging and diagnostics by following the program's execution (method flow).
+
+This tool relies on USDT probes embedded in many high-level languages, such as
+Node, Java, Python, and Ruby. It requires a runtime instrumented with these 
+probes, which in some cases requires building from source with a USDT-specific
+flag, such as "--enable-dtrace" or "--with-dtrace". For Java processes, the
+startup flag "-XX:+ExtendedDTraceProbes" is required.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-M METHOD
+Print only method calls where the method name begins with this string.
+.TP
+\-C CLAZZ
+Print only method calls where the class name begins with this string. The class
+name interpretation strongly depends on the language. For example, in Java use
+"package/subpackage/ClassName" to refer to classes.
+.TP
+\-v
+Print the resulting BPF program, for debugging purposes.
+.TP
+{java,python,ruby}
+The language to trace.
+.TP
+pid
+The process id to trace.
+.SH EXAMPLES
+.TP
+Follow method flow in a Ruby process:
+#
+.B uflow ruby 148
+.TP
+Follow method flow in a Java process where the class name is java.lang.Thread:
+#
+.B uflow -C java/lang/Thread java 1802
+.SH FIELDS
+.TP
+CPU
+The CPU number on which the method was invoked. This is useful to easily see
+where the output skips to a different CPU.
+.TP
+PID
+The process id.
+.TP
+TID
+The thread id.
+.TP
+TIME
+The duration of the method call.
+.TP
+METHOD
+The method name.
+.SH OVERHEAD
+This tool has extremely high overhead because it prints every method call. For
+some scenarios, you might see lost samples in the output as the tool is unable
+to keep up with the rate of data coming from the kernel. Filtering by class 
+or method prefix can help reduce the amount of data printed, but there is still
+a very high overhead in the collection mechanism. Do not use for
+performance-sensitive production scenarios, and always test first.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _example.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Sasha Goldshtein
+.SH SEE ALSO
+trace(8), ustat(8)
diff --git a/man/man8/ugc.8 b/man/man8/ugc.8
new file mode 100644
index 0000000..2629fd9
--- /dev/null
+++ b/man/man8/ugc.8
@@ -0,0 +1,71 @@
+.TH ugc 8  "2016-11-07" "USER COMMANDS"
+.SH NAME
+ugc \- Trace garbage collection events in high-level languages.
+.SH SYNOPSIS
+.B ugc [-h] [-v] [-m] {java,python,ruby,node} pid
+.SH DESCRIPTION
+This traces garbage collection events as they occur, including their duration
+and any additional information (such as generation collected or type of GC)
+provided by the respective language's runtime.
+
+This tool relies on USDT probes embedded in many high-level languages, such as
+Node, Java, Python, and Ruby. It requires a runtime instrumented with these 
+probes, which in some cases requires building from source with a USDT-specific
+flag, such as "--enable-dtrace" or "--with-dtrace".
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-v
+Print the resulting BPF program, for debugging purposes.
+.TP
+\-m
+Print times in milliseconds. The default is microseconds.
+.TP
+{java,python,ruby,node}
+The language to trace.
+.TP
+pid
+The process id to trace.
+.SH EXAMPLES
+.TP
+Trace garbage collections in a specific Node process:
+#
+.B ugc node 148
+.TP
+Trace garbage collections in a specific Java process, and print GC times in
+milliseconds:
+#
+.B ugc -m java 6004
+.SH FIELDS
+.TP
+START
+The start time of the GC, in seconds from the beginning of the trace.
+.TP
+DESCRIPTION
+The runtime-provided description of this garbage collection event.
+.TP
+TIME
+The duration of the garbage collection event.
+.SH OVERHEAD
+Garbage collection events, even if frequent, should not produce a considerable
+overhead when traced because they are still not very common. Even hundreds of 
+GCs per second (which is a very high rate) will still produce a fairly 
+negligible overhead.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _example.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Sasha Goldshtein
+.SH SEE ALSO
+trace(8), ustat(8), uobjnew(8)
diff --git a/man/man8/uobjnew.8 b/man/man8/uobjnew.8
new file mode 100644
index 0000000..1abaec4
--- /dev/null
+++ b/man/man8/uobjnew.8
@@ -0,0 +1,79 @@
+.TH uobjnew 8  "2016-11-07" "USER COMMANDS"
+.SH NAME
+uobjnew \- Summarize object allocations in high-level languages.
+.SH SYNOPSIS
+.B uobjnew [-h] [-C TOP_COUNT] [-S TOP_SIZE] [-v] {java,ruby,c} pid [interval]
+.SH DESCRIPTION
+uobjnew traces object allocations in high-level languages (including "malloc")
+and prints summaries of the most frequently allocated types by number of 
+objects or number of bytes.
+
+This tool relies on USDT probes embedded in many high-level languages, such as
+Node, Java, Python, and Ruby. It requires a runtime instrumented with these 
+probes, which in some cases requires building from source with a USDT-specific
+flag, such as "--enable-dtrace" or "--with-dtrace". For Java, the Java process
+must be started with the "-XX:+ExtendedDTraceProbes" flag.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-C TOP_COUNT
+Print the top object types sorted by number of instances.
+.TP
+\-S TOP_SIZE
+Print the top object types sorted by size.
+.TP
+\-v
+Print the resulting BPF program, for debugging purposes.
+.TP
+{java,ruby,c}
+The language to trace.
+.TP
+pid
+The process id to trace.
+.TP
+interval
+Wait this many seconds and then print the summary and exit. By default, wait
+for Ctrl+C to exit.
+.SH EXAMPLES
+.TP
+Trace object allocations in a Ruby process:
+#
+.B uobjnew ruby 148
+.TP
+Trace object allocations from "malloc" and print the top 10 by total size:
+#
+.B uobjnew -S 10 c 1788
+.SH FIELDS
+.TP
+TYPE
+The object type being allocated. For C (malloc), this is the block size.
+.TP
+ALLOCS
+The number of objects allocated.
+.TP
+BYTES
+The number of bytes allocated.
+.SH OVERHEAD
+Object allocation events are quite frequent, and therefore the overhead from
+running this tool can be considerable. Use with caution and make sure to 
+test before using in a production environment. Nonetheless, even thousands of
+allocations per second will likely produce a reasonable overhead when 
+investigating a problem.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _example.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Sasha Goldshtein
+.SH SEE ALSO
+ustat(8), ugc(8), memleak(8)
diff --git a/man/man8/ustat.8 b/man/man8/ustat.8
new file mode 100644
index 0000000..a55ee09
--- /dev/null
+++ b/man/man8/ustat.8
@@ -0,0 +1,116 @@
+.TH ustat 8  "2016-11-07" "USER COMMANDS"
+.SH NAME
+ustat \- Activity stats from high-level languages.
+.SH SYNOPSIS
+.B ustat [-l {java,python,ruby,node}] [-C] [-S {cload,excp,gc,method,objnew,thread}] [-r MAXROWS] [-d] [interval [count]]
+.SH DESCRIPTION
+This is "top" for high-level language events, such as garbage collections,
+exceptions, thread creations, object allocations, method calls, and more. The
+events are aggregated for each process and printed in a top-like table, which
+can be sorted by various fields.
+
+This uses in-kernel eBPF maps to store per process summaries for efficiency.
+
+This tool relies on USDT probes embedded in many high-level languages, such as
+Node, Java, Python, and Ruby. It requires a runtime instrumented with these 
+probes, which in some cases requires building from source with a USDT-specific
+flag, such as "--enable-dtrace" or "--with-dtrace". For Java, some probes are
+not enabled by default, and can be turned on by running the Java process with
+the "-XX:+ExtendedDTraceProbes" flag.
+
+Newly-created processes will only be traced at the next interval. If you run
+this tool with a short interval (say, 1-5 seconds), this should be virtually
+unnoticeable. For longer intervals, you might miss processes that were started
+and terminated during the interval window.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-l {java,python,ruby,node}
+The language to trace. By default, all languages are traced.
+.TP
+\-C
+Do not clear the screen between updates.
+.TP
+\-S {cload,excp,gc,method,objnew,thread}
+Sort the output by the specified field.
+.TP
+\-r MAXROWS
+Do not print more than this number of rows.
+.TP
+\-d
+Print the resulting BPF program, for debugging purposes.
+.TP
+interval
+Interval between updates, seconds.
+.TP
+count
+Number of interval summaries.
+.SH EXAMPLES
+.TP
+Summarize activity in high-level languages, 1 second refresh:
+#
+.B ustat
+.TP
+Don't clear the screen, and top 8 rows only:
+#
+.B ustat -Cr 8
+.TP
+5 second summaries, 10 times only:
+#
+.B ustat 5 10
+.SH FIELDS
+.TP
+loadavg
+The contents of /proc/loadavg
+.TP
+PID
+Process ID.
+.TP
+CMDLINE
+Process command line (often the second and following arguments will give you a
+hint as to which application is being run).
+.TP
+METHOD/s
+Count of method invocations during interval.
+.TP
+GC/s
+Count of garbage collections during interval.
+.TP
+OBJNEW/s
+Count of objects allocated during interval.
+.TP
+CLOAD/s
+Count of classes loaded during interval.
+.TP
+EXC/s
+Count of exceptions thrown during interval.
+.TP
+THR/s
+Count of threads created during interval.
+.SH OVERHEAD
+When using this tool with high-frequency events, such as method calls, a very
+significant slow-down can be expected. However, many of the high-level 
+languages covered by this tool already have a fairly high per-method invocation
+cost, especially when running in interpreted mode. For the lower-frequency 
+events, such as garbage collections or thread creations, the overhead should 
+not be significant. Specifically, when probing Java processes and not using the
+"-XX:+ExtendedDTraceProbes" flag, the most expensive probes are not emitted,
+and the overhead should be acceptable.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _example.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Sasha Goldshtein
+.SH SEE ALSO
+trace(8), argdist(8), tplist(8)
diff --git a/man/man8/uthreads.8 b/man/man8/uthreads.8
new file mode 100644
index 0000000..8d4d2bb
--- /dev/null
+++ b/man/man8/uthreads.8
@@ -0,0 +1,64 @@
+.TH uthreads 8  "2016-11-07" "USER COMMANDS"
+.SH NAME
+uthreads \- Trace thread creation events in Java or pthreads.
+.SH SYNOPSIS
+.B uthreads [-h] [-l {java}] [-v] pid
+.SH DESCRIPTION
+This traces thread creation events in Java processes, or pthread creation
+events in any process. When a thread is created, its name or start address
+is printed.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-l {java}
+The language to trace (currently only Java is supported). When no language is
+specified, only pthread creations are traced.
+.TP
+\-v
+Print the resulting BPF program, for debugging purposes.
+.TP
+pid
+The process id to trace.
+.SH EXAMPLES
+.TP
+Trace Java thread creations:
+#
+.B uthreads -l java 148
+.TP
+Trace pthread creations:
+#
+.B uthreads 1802
+.SH FIELDS
+.TP
+TIME
+The event's time in seconds from the beginning of the trace.
+.TP
+ID
+The thread's ID. The information in this column depends on the runtime.
+.TP
+TYPE
+Event type -- thread start, stop, or pthread event.
+.TP
+DESCRIPTION
+The thread's name, or the name of the function at its start address.
+.SH OVERHEAD
+Thread start and stop events are usually not very frequent, which makes this
+tool's overhead negligible.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _example.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Sasha Goldshtein
+.SH SEE ALSO
+ustat(8), trace(8)
diff --git a/src/cc/BPF.cc b/src/cc/BPF.cc
index 265b4df..4a7ca2c 100644
--- a/src/cc/BPF.cc
+++ b/src/cc/BPF.cc
@@ -30,6 +30,7 @@
 #include "bpf_module.h"
 #include "libbpf.h"
 #include "perf_reader.h"
+#include "usdt.h"
 
 #include "BPF.h"
 
@@ -50,13 +51,25 @@
 }
 
 StatusTuple BPF::init(const std::string& bpf_program,
-                      std::vector<std::string> cflags) {
+                      std::vector<std::string> cflags, std::vector<USDT> usdt) {
+  std::string all_bpf_program;  // USDT program text first, then bpf_program
+
+  for (auto u : usdt) {  // u is a copy of each element; it is moved into usdt_
+    if (!u.initialized_)
+      TRY2(u.init());  // NOTE(review): TRY2 presumably returns on error
+    all_bpf_program += u.program_text_;
+    usdt_.push_back(std::move(u));  // looked up by attach_usdt()/detach_usdt()
+  }
+
   auto flags_len = cflags.size();
   const char* flags[flags_len];
   for (size_t i = 0; i < flags_len; i++)
     flags[i] = cflags[i].c_str();
-  if (bpf_module_->load_string(bpf_program, flags, flags_len) != 0)
+
+  all_bpf_program += bpf_program;
+  if (bpf_module_->load_string(all_bpf_program, flags, flags_len) != 0)
     return StatusTuple(-1, "Unable to initialize BPF program");
+
   return StatusTuple(0);
 };
 
@@ -206,6 +219,37 @@
   return StatusTuple(0);
 }
 
+// Attach the USDT's probe function at each resolved address. On failure the
+// uprobes attached so far are detached and the combined error is returned.
+StatusTuple BPF::attach_usdt(const USDT& usdt, pid_t pid, int cpu,
+                             int group_fd) {
+  for (auto& u : usdt_) {
+    if (!(u == usdt))
+      continue;
+    size_t attached = 0;
+    for (; attached < u.addresses_.size(); attached++) {
+      auto addr = u.addresses_[attached];
+      auto res =
+          attach_uprobe(u.binary_path_, std::string(), u.probe_func_, addr);
+      if (res.code() == 0)
+        continue;
+      std::string err_msg =
+          "USDT " + u.print_name() + " at " + std::to_string(addr);
+      err_msg += ": " + res.msg() + "\n";
+      // Roll back; only a detach that itself fails adds to the message.
+      for (size_t i = 0; i < attached; i++) {
+        auto undo =
+            detach_uprobe(u.binary_path_, std::string(), u.addresses_[i]);
+        if (undo.code() != 0)
+          err_msg += "During clean up: " + undo.msg() + "\n";
+      }
+      return StatusTuple(-1, err_msg);
+    }
+    return StatusTuple(0);
+  }
+  return StatusTuple(-1, "USDT %s not found", usdt.print_name().c_str());
+}
+
 StatusTuple BPF::attach_tracepoint(const std::string& tracepoint,
                                    const std::string& probe_func,
                                    pid_t pid, int cpu, int group_fd,
@@ -311,6 +355,27 @@
   return StatusTuple(0);
 }
 
+// Detach every uprobe belonging to a USDT previously registered via init().
+// All addresses are tried; failures are collected into a single error text.
+StatusTuple BPF::detach_usdt(const USDT& usdt) {
+  for (auto& probe : usdt_) {
+    if (!(probe == usdt))
+      continue;
+    std::string errors;
+    bool ok = true;
+    for (auto addr : probe.addresses_) {
+      auto status = detach_uprobe(probe.binary_path_, std::string(), addr);
+      if (status.code() == 0)
+        continue;
+      ok = false;
+      errors += "USDT " + probe.print_name() + " at " + std::to_string(addr);
+      errors += ": " + status.msg() + "\n";
+    }
+    return ok ? StatusTuple(0) : StatusTuple(-1, errors);
+  }
+  return StatusTuple(-1, "USDT %s not found", usdt.print_name().c_str());
+}
+
 StatusTuple BPF::detach_tracepoint(const std::string& tracepoint) {
   auto it = tracepoints_.find(tracepoint);
   if (it == tracepoints_.end())
@@ -383,7 +448,7 @@
 StatusTuple BPF::unload_func(const std::string& func_name) {
   auto it = funcs_.find(func_name);
   if (it == funcs_.end())
-    return StatusTuple(-1, "Probe function %s not loaded", func_name.c_str());
+    return StatusTuple(0);
 
   int res = close(it->second);
   if (res != 0)
@@ -478,4 +543,28 @@
   return StatusTuple(0);
 }
 
+// Resolve this probe in binary_path_: enable it for probe_func_, build its
+// program text via usdt_getarg() and record the address of every location.
+StatusTuple USDT::init() {
+  std::unique_ptr<::USDT::Context> context(new ::USDT::Context(binary_path_));
+  if (!context->loaded())
+    return StatusTuple(-1, "Unable to load USDT " + print_name());
+  auto target = context->get(name_);
+  if (target == nullptr)
+    return StatusTuple(-1, "Unable to find USDT " + print_name());
+  if (!target->enable(probe_func_))
+    return StatusTuple(-1, "Failed to enable USDT " + print_name());
+
+  std::ostringstream text;
+  if (!target->usdt_getarg(text))
+    return StatusTuple(
+        -1, "Unable to generate program text for USDT " + print_name());
+  program_text_ = ::USDT::USDT_PROGRAM_HEADER + text.str();
+
+  for (size_t idx = 0; idx < target->num_locations(); ++idx)
+    addresses_.push_back(target->address(idx));
+  initialized_ = true;
+  return StatusTuple(0);
+}
+
 }  // namespace ebpf
diff --git a/src/cc/BPF.h b/src/cc/BPF.h
index 420dd6b..b96c66f 100644
--- a/src/cc/BPF.h
+++ b/src/cc/BPF.h
@@ -40,13 +40,16 @@
   std::map<int, int>* per_cpu_fd;
 };
 
+class USDT;
+
 class BPF {
 public:
   static const int BPF_MAX_STACK_DEPTH = 127;
 
   explicit BPF(unsigned int flag = 0) : bpf_module_(new BPFModule(flag)) {}
   StatusTuple init(const std::string& bpf_program,
-                   std::vector<std::string> cflags = {});
+                   std::vector<std::string> cflags = {},
+                   std::vector<USDT> usdt = {});
 
   ~BPF();
   StatusTuple detach_all();
@@ -70,6 +73,9 @@
       const std::string& binary_path, const std::string& symbol,
       uint64_t symbol_addr = 0,
       bpf_attach_type attach_type = bpf_attach_type::probe_entry);
+  StatusTuple attach_usdt(const USDT& usdt, pid_t pid = -1, int cpu = 0,
+                          int group_fd = -1);
+  StatusTuple detach_usdt(const USDT& usdt);
 
   StatusTuple attach_tracepoint(const std::string& tracepoint,
                                 const std::string& probe_func,
@@ -151,6 +157,8 @@
 
   std::map<std::string, int> funcs_;
 
+  std::vector<USDT> usdt_;
+
   std::map<std::string, open_probe_t> kprobes_;
   std::map<std::string, open_probe_t> uprobes_;
   std::map<std::string, open_probe_t> tracepoints_;
@@ -158,4 +166,40 @@
   std::map<std::pair<uint32_t, uint32_t>, open_probe_t> perf_events_;
 };
 
+// Describes one USDT probe (provider:name in a binary) together with the
+// BPF function to run when it fires. BPF resolves it through init().
+class USDT {
+public:
+  USDT(const std::string& binary_path, const std::string& provider,
+       const std::string& name, const std::string& probe_func)
+      : initialized_(false), binary_path_(binary_path), provider_(provider),
+        name_(name), probe_func_(probe_func) {}
+
+  // Equal when binary, provider, probe name and probe function all match;
+  // state filled in by init() is not compared.
+  bool operator==(const USDT& other) const {
+    if (provider_ != other.provider_ || name_ != other.name_)
+      return false;
+    return binary_path_ == other.binary_path_ &&
+           probe_func_ == other.probe_func_;
+  }
+
+  // Human-readable identifier used in error messages.
+  std::string print_name() const {
+    return provider_ + ":" + name_ + " from " + binary_path_;
+  }
+
+private:
+  friend class BPF;  // BPF drives init() and reads the resolved fields
+
+  StatusTuple init();
+  bool initialized_;  // set once init() has succeeded
+  std::string binary_path_;
+  std::string provider_;
+  std::string name_;
+  std::string probe_func_;
+  std::vector<intptr_t> addresses_;  // one entry per probe location
+  std::string program_text_;         // header + generated text from init()
+};
+
 }  // namespace ebpf
diff --git a/src/cc/CMakeLists.txt b/src/cc/CMakeLists.txt
index febcee2..fed6d3a 100644
--- a/src/cc/CMakeLists.txt
+++ b/src/cc/CMakeLists.txt
@@ -67,7 +67,7 @@
 
 install(TARGETS bcc-shared LIBRARY COMPONENT libbcc
   DESTINATION ${CMAKE_INSTALL_LIBDIR})
-install(FILES bpf_common.h bpf_module.h bcc_syms.h bcc_exception.h libbpf.h perf_reader.h BPF.h BPFTable.h COMPONENT libbcc
+install(FILES bpf_common.h bpf_module.h bcc_syms.h bcc_exception.h libbpf.h perf_reader.h BPF.h BPFTable.h shared_table.h COMPONENT libbcc
   DESTINATION include/bcc)
 install(DIRECTORY compat/linux/ COMPONENT libbcc
   DESTINATION include/bcc/compat/linux
diff --git a/src/cc/bpf_module.cc b/src/cc/bpf_module.cc
index be0a524..ee39c00 100644
--- a/src/cc/bpf_module.cc
+++ b/src/cc/bpf_module.cc
@@ -118,10 +118,11 @@
   ctx_.reset();
   if (tables_) {
     for (auto table : *tables_) {
-      if (table.is_shared)
+      if (table.is_shared) {
         SharedTables::instance()->remove_fd(table.name);
-      else
+      } else if (!table.is_extern) {
         close(table.fd);
+      }
     }
   }
 }
diff --git a/src/cc/frontends/clang/b_frontend_action.cc b/src/cc/frontends/clang/b_frontend_action.cc
index 9370386..397ecc6 100644
--- a/src/cc/frontends/clang/b_frontend_action.cc
+++ b/src/cc/frontends/clang/b_frontend_action.cc
@@ -336,6 +336,12 @@
 // to:
 //  bpf_table_foo_elem(bpf_pseudo_fd(table), &key [,&leaf])
 bool BTypeVisitor::VisitCallExpr(CallExpr *Call) {
+  // Get rewritten text given a source range, w/ expansion range applied
+  auto getRewrittenText = [this] (SourceRange R) {
+    auto r = rewriter_.getSourceMgr().getExpansionRange(R);
+    return rewriter_.getRewrittenText(r);
+  };
+
   // make sure node is a reference to a bpf table, which is assured by the
   // presence of the section("maps/<typename>") GNU __attribute__
   if (MemberExpr *Memb = dyn_cast<MemberExpr>(Call->getCallee()->IgnoreImplicit())) {
@@ -345,9 +351,8 @@
         if (!A->getName().startswith("maps"))
           return true;
 
-        SourceRange argRange(Call->getArg(0)->getLocStart(),
-                             Call->getArg(Call->getNumArgs()-1)->getLocEnd());
-        string args = rewriter_.getRewrittenText(argRange);
+        string args = getRewrittenText(SourceRange(Call->getArg(0)->getLocStart(),
+                                                   Call->getArg(Call->getNumArgs() - 1)->getLocEnd()));
 
         // find the table fd, which was opened at declaration time
         auto table_it = tables_.begin();
@@ -366,10 +371,8 @@
         if (memb_name == "lookup_or_init") {
           map_update_policy = "BPF_NOEXIST";
           string name = Ref->getDecl()->getName();
-          string arg0 = rewriter_.getRewrittenText(SourceRange(Call->getArg(0)->getLocStart(),
-                                                               Call->getArg(0)->getLocEnd()));
-          string arg1 = rewriter_.getRewrittenText(SourceRange(Call->getArg(1)->getLocStart(),
-                                                               Call->getArg(1)->getLocEnd()));
+          string arg0 = getRewrittenText(Call->getArg(0)->getSourceRange());
+          string arg1 = getRewrittenText(Call->getArg(1)->getSourceRange());
           string lookup = "bpf_map_lookup_elem_(bpf_pseudo_fd(1, " + fd + ")";
           string update = "bpf_map_update_elem_(bpf_pseudo_fd(1, " + fd + ")";
           txt  = "({typeof(" + name + ".leaf) *leaf = " + lookup + ", " + arg0 + "); ";
@@ -381,8 +384,7 @@
           txt += "leaf;})";
         } else if (memb_name == "increment") {
           string name = Ref->getDecl()->getName();
-          string arg0 = rewriter_.getRewrittenText(SourceRange(Call->getArg(0)->getLocStart(),
-                                                               Call->getArg(0)->getLocEnd()));
+          string arg0 = getRewrittenText(Call->getArg(0)->getSourceRange());
           string lookup = "bpf_map_lookup_elem_(bpf_pseudo_fd(1, " + fd + ")";
           string update = "bpf_map_update_elem_(bpf_pseudo_fd(1, " + fd + ")";
           txt  = "({ typeof(" + name + ".key) _key = " + arg0 + "; ";
@@ -394,21 +396,16 @@
           txt += "if (_leaf) (*_leaf)++; })";
         } else if (memb_name == "perf_submit") {
           string name = Ref->getDecl()->getName();
-          string arg0 = rewriter_.getRewrittenText(SourceRange(Call->getArg(0)->getLocStart(),
-                                                               Call->getArg(0)->getLocEnd()));
-          string args_other = rewriter_.getRewrittenText(SourceRange(Call->getArg(1)->getLocStart(),
-                                                                     Call->getArg(2)->getLocEnd()));
+          string arg0 = getRewrittenText(Call->getArg(0)->getSourceRange());
+          string args_other = getRewrittenText(SourceRange(Call->getArg(1)->getLocStart(),
+                                                           Call->getArg(2)->getLocEnd()));
           txt = "bpf_perf_event_output(" + arg0 + ", bpf_pseudo_fd(1, " + fd + ")";
           txt += ", bpf_get_smp_processor_id(), " + args_other + ")";
         } else if (memb_name == "perf_submit_skb") {
-          string skb = rewriter_.getRewrittenText(SourceRange(Call->getArg(0)->getLocStart(),
-                                                               Call->getArg(0)->getLocEnd()));
-          string skb_len = rewriter_.getRewrittenText(SourceRange(Call->getArg(1)->getLocStart(),
-                                                                  Call->getArg(1)->getLocEnd()));
-          string meta = rewriter_.getRewrittenText(SourceRange(Call->getArg(2)->getLocStart(),
-                                                               Call->getArg(2)->getLocEnd()));
-          string meta_len = rewriter_.getRewrittenText(SourceRange(Call->getArg(3)->getLocStart(),
-                                                                   Call->getArg(3)->getLocEnd()));
+          string skb = getRewrittenText(Call->getArg(0)->getSourceRange());
+          string skb_len = getRewrittenText(Call->getArg(1)->getSourceRange());
+          string meta = getRewrittenText(Call->getArg(2)->getSourceRange());
+          string meta_len = getRewrittenText(Call->getArg(3)->getSourceRange());
           txt = "bpf_perf_event_output(" +
             skb + ", " +
             "bpf_pseudo_fd(1, " + fd + "), " +
@@ -417,8 +414,7 @@
             meta_len + ");";
         } else if (memb_name == "get_stackid") {
             if (table_it->type == BPF_MAP_TYPE_STACK_TRACE) {
-              string arg0 = rewriter_.getRewrittenText(SourceRange(Call->getArg(0)->getLocStart(),
-                                                                   Call->getArg(0)->getLocEnd()));
+              string arg0 = getRewrittenText(Call->getArg(0)->getSourceRange());
               txt = "bpf_get_stackid(";
               txt += "bpf_pseudo_fd(1, " + fd + "), " + arg0;
               rewrite_end = Call->getArg(0)->getLocEnd();
@@ -474,7 +470,7 @@
 
         vector<string> args;
         for (auto arg : Call->arguments())
-          args.push_back(rewriter_.getRewrittenText(SourceRange(arg->getLocStart(), arg->getLocEnd())));
+          args.push_back(getRewrittenText(arg->getSourceRange()));
 
         string text;
         if (Decl->getName() == "incr_cksum_l3") {
@@ -635,7 +631,6 @@
       ++i;
     }
 
-    bool is_extern = false;
     bpf_map_type map_type = BPF_MAP_TYPE_UNSPEC;
     if (A->getName() == "maps/hash") {
       map_type = BPF_MAP_TYPE_HASH;
@@ -670,8 +665,9 @@
     } else if (A->getName() == "maps/stacktrace") {
       map_type = BPF_MAP_TYPE_STACK_TRACE;
     } else if (A->getName() == "maps/extern") {
-      is_extern = true;
+      table.is_extern = true;
       table.fd = SharedTables::instance()->lookup_fd(table.name);
+      table.type = SharedTables::instance()->lookup_type(table.name);
     } else if (A->getName() == "maps/export") {
       if (table.name.substr(0, 2) == "__")
         table.name = table.name.substr(2);
@@ -682,7 +678,7 @@
         error(Decl->getLocStart(), "reference to undefined table");
         return false;
       }
-      if (!SharedTables::instance()->insert_fd(table.name, table_it->fd)) {
+      if (!SharedTables::instance()->insert_fd(table.name, table_it->fd, table_it->type)) {
         error(Decl->getLocStart(), "could not export bpf map %0: %1") << table.name << "already in use";
         return false;
       }
@@ -690,7 +686,7 @@
       return true;
     }
 
-    if (!is_extern) {
+    if (!table.is_extern) {
       if (map_type == BPF_MAP_TYPE_UNSPEC) {
         error(Decl->getLocStart(), "unsupported map type: %0") << A->getName();
         return false;
diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c
index 24967b9..4d34c62 100644
--- a/src/cc/libbpf.c
+++ b/src/cc/libbpf.c
@@ -631,3 +631,32 @@
   // callers to detach anything they attach.
   return 0;
 }
+
+// Issues the BPF_OBJ_PIN command of the bpf syscall for the given fd and
+// pathname. Returns the raw syscall() result.
+int bpf_obj_pin(int fd, const char *pathname)
+{
+  union bpf_attr attr;
+
+  // Zero the whole union before filling it in. A designated initializer only
+  // guarantees the named members, and the kernel is expected to reject the
+  // command when attr bytes it does not use are non-zero.
+  memset(&attr, 0, sizeof(attr));
+  attr.pathname = ptr_to_u64((void *)pathname);
+  attr.bpf_fd = fd;
+
+  return syscall(__NR_bpf, BPF_OBJ_PIN, &attr, sizeof(attr));
+}
+
+// Issues the BPF_OBJ_GET command of the bpf syscall for the given pathname.
+// Returns the raw syscall() result.
+int bpf_obj_get(const char *pathname)
+{
+  union bpf_attr attr;
+
+  // Zero every byte of the union; see bpf_obj_pin for the rationale.
+  memset(&attr, 0, sizeof(attr));
+  attr.pathname = ptr_to_u64((void *)pathname);
+
+  return syscall(__NR_bpf, BPF_OBJ_GET, &attr, sizeof(attr));
+}
diff --git a/src/cc/libbpf.h b/src/cc/libbpf.h
index cc4e0f3..b4499ec 100644
--- a/src/cc/libbpf.h
+++ b/src/cc/libbpf.h
@@ -71,6 +71,9 @@
                           pid_t pid, int cpu, int group_fd);
 int bpf_detach_perf_event(uint32_t ev_type, uint32_t ev_config);
 
+int bpf_obj_pin(int fd, const char *pathname);
+int bpf_obj_get(const char *pathname);
+
 #define LOG_BUF_SIZE 65536
 
 // Put non-static/inline functions in their own section with this prefix +
diff --git a/src/cc/shared_table.cc b/src/cc/shared_table.cc
index c27f768..f389fad 100644
--- a/src/cc/shared_table.cc
+++ b/src/cc/shared_table.cc
@@ -17,6 +17,7 @@
 #include <unistd.h>
 
 #include "shared_table.h"
+#include "compat/linux/bpf.h"
 
 namespace ebpf {
 
@@ -35,13 +36,20 @@
   auto table = tables_.find(name);
   if (table == tables_.end())
     return -1;
-  return table->second;
+  return table->second.first;
 }
 
-bool SharedTables::insert_fd(const string &name, int fd) {
+// Map type recorded for the shared table `name`; BPF_MAP_TYPE_UNSPEC when
+// no table with that name has been inserted.
+int SharedTables::lookup_type(const string &name) const {
+  auto entry = tables_.find(name);
+  return entry == tables_.end() ? BPF_MAP_TYPE_UNSPEC : entry->second.second;
+}
+
+bool SharedTables::insert_fd(const string &name, int fd, int type) {
   if (tables_.find(name) != tables_.end())
     return false;
-  tables_[name] = fd;
+  tables_[name] = std::make_pair(fd, type);
   return true;
 }
 
@@ -49,7 +57,7 @@
   auto table = tables_.find(name);
   if (table == tables_.end())
     return false;
-  close(table->second);
+  close(table->second.first);
   tables_.erase(table);
   return true;
 }
diff --git a/src/cc/shared_table.h b/src/cc/shared_table.h
index 051dfbd..7b92914 100644
--- a/src/cc/shared_table.h
+++ b/src/cc/shared_table.h
@@ -27,14 +27,16 @@
  public:
   static SharedTables * instance();
   // add an fd to the shared table, return true if successfully inserted
-  bool insert_fd(const std::string &name, int fd);
+  bool insert_fd(const std::string &name, int fd, int type);
   // lookup an fd in the shared table, or -1 if not found
   int lookup_fd(const std::string &name) const;
+  // look up the map type in the shared table, or BPF_MAP_TYPE_UNSPEC if not found
+  int lookup_type(const std::string &name) const;
   // close and remove a shared fd. return true if the value was found
   bool remove_fd(const std::string &name);
  private:
   static SharedTables *instance_;
-  std::map<std::string, int> tables_;
+  std::map<std::string, std::pair<int, int>> tables_;
 };
 
 }
diff --git a/src/cc/table_desc.h b/src/cc/table_desc.h
index a5196e2..d299f5d 100644
--- a/src/cc/table_desc.h
+++ b/src/cc/table_desc.h
@@ -40,6 +40,7 @@
   llvm::Function *key_snprintf;
   llvm::Function *leaf_snprintf;
   bool is_shared;
+  bool is_extern;
 };
 
 }  // namespace ebpf
diff --git a/src/cc/usdt.cc b/src/cc/usdt.cc
index 0bbc9dc..4f1b00a 100644
--- a/src/cc/usdt.cc
+++ b/src/cc/usdt.cc
@@ -239,7 +239,7 @@
 }
 
 bool Context::generate_usdt_args(std::ostream &stream) {
-  stream << "#include <uapi/linux/ptrace.h>\n";
+  stream << USDT_PROGRAM_HEADER;
   for (auto &p : probes_) {
     if (p->enabled() && !p->usdt_getarg(stream))
       return false;
diff --git a/src/cc/usdt.h b/src/cc/usdt.h
index bdf9412..49251f6 100644
--- a/src/cc/usdt.h
+++ b/src/cc/usdt.h
@@ -31,6 +31,9 @@
 using std::experimental::nullopt;
 class ArgumentParser;
 
+static const std::string USDT_PROGRAM_HEADER =
+    "#include <uapi/linux/ptrace.h>\n";
+
 class Argument {
 private:
   optional<int> arg_size_;
diff --git a/src/python/bcc/__init__.py b/src/python/bcc/__init__.py
index d95dc77..347f491 100644
--- a/src/python/bcc/__init__.py
+++ b/src/python/bcc/__init__.py
@@ -221,7 +221,7 @@
                                     "possible cause is missing pid when a " +
                                     "probe in a shared object has multiple " +
                                     "locations")
-                text = usdt_context.get_text() + text
+                text = usdt_text + text
 
         if text:
             self.module = lib.bpf_module_create_c_from_string(text.encode("ascii"),
@@ -1058,5 +1058,11 @@
             lib.bpf_module_destroy(self.module)
             self.module = None
 
+    def __enter__(self):  # lets "with BPF(...) as b:" bind b to this object
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):  # runs when "with" exits
+        self.cleanup()  # no return value, so a pending exception propagates
+
 
 from .usdt import USDT
diff --git a/src/python/bcc/usdt.py b/src/python/bcc/usdt.py
index adcd3d7..19a3a98 100644
--- a/src/python/bcc/usdt.py
+++ b/src/python/bcc/usdt.py
@@ -145,13 +145,15 @@
     # This is called by the BPF module's __init__ when it realizes that there
     # is a USDT context and probes need to be attached.
     def attach_uprobes(self, bpf):
+        probes = self.enumerate_active_probes()
+        for (binpath, fn_name, addr, pid) in probes:
+            bpf.attach_uprobe(name=binpath, fn_name=fn_name,
+                              addr=addr, pid=pid)
+
+    def enumerate_active_probes(self):
         probes = []
         def _add_probe(binpath, fn_name, addr, pid):
             probes.append((binpath, fn_name, addr, pid))
 
         lib.bcc_usdt_foreach_uprobe(self.context, _USDT_PROBE_CB(_add_probe))
-
-        for (binpath, fn_name, addr, pid) in probes:
-            bpf.attach_uprobe(name=binpath, fn_name=fn_name,
-                              addr=addr, pid=pid)
-
+        return probes
diff --git a/tests/python/test_clang.py b/tests/python/test_clang.py
index 2d6e5bf..4725a84 100755
--- a/tests/python/test_clang.py
+++ b/tests/python/test_clang.py
@@ -352,5 +352,45 @@
         with self.assertRaises(Exception):
             b = BPF(text=text)
 
+    def test_call_macro_arg(self):
+        # jmp.call() is given an enum constant and then a macro as its index.
+        text = """
+BPF_TABLE("prog", u32, u32, jmp, 32);
+
+#define JMP_IDX_PIPE (1U << 1)
+
+enum action {
+    ACTION_PASS
+};
+
+int process(struct xdp_md *ctx) {
+    jmp.call((void *)ctx, ACTION_PASS);
+    jmp.call((void *)ctx, JMP_IDX_PIPE);
+    return XDP_PASS;
+}
+        """
+        b = BPF(text=text)
+        self.assertEqual(len(b["jmp"]), 32)
+
+    def test_update_macro_arg(self):
+        # act.increment() is given an enum constant and then a macro as key.
+        text = """
+BPF_TABLE("array", u32, u32, act, 32);
+
+#define JMP_IDX_PIPE (1U << 1)
+
+enum action {
+    ACTION_PASS
+};
+
+int process(struct xdp_md *ctx) {
+    act.increment(ACTION_PASS);
+    act.increment(JMP_IDX_PIPE);
+    return XDP_PASS;
+}
+        """
+        b = BPF(text=text)
+        self.assertEqual(len(b["act"]), 32)
+
 if __name__ == "__main__":
     main()
diff --git a/tests/python/test_shared_table.py b/tests/python/test_shared_table.py
new file mode 100644
index 0000000..10dd63f
--- /dev/null
+++ b/tests/python/test_shared_table.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+# Copyright (c) 2016 Facebook, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+
+import ctypes as ct
+import unittest
+from bcc import BPF
+
+class TestSharedTable(unittest.TestCase):
+    def test_close_extern(self):
+        b1 = BPF(text="""BPF_TABLE_PUBLIC("array", int, int, table1, 10);""")
+        # b2 declares table1 as "extern"; b2 is cleaned up when "with" exits.
+        with BPF(text="""BPF_TABLE("extern", int, int, table1, 10);""") as b2:
+            t2 = b2["table1"]
+            t2[ct.c_int(1)] = ct.c_int(10)
+            self.assertEqual(len(t2), 10)
+        # After b2 is gone, b1 must still read the value written through b2.
+        t1 = b1["table1"]
+        self.assertEqual(t1[ct.c_int(1)].value, 10)
+        self.assertEqual(len(t1), 10)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tools/trace.py b/tools/trace.py
index ba93998..cd211fd 100755
--- a/tools/trace.py
+++ b/tools/trace.py
@@ -3,8 +3,8 @@
 # trace         Trace a function and print a trace message based on its
 #               parameters, with an optional filter.
 #
-# usage: trace [-h] [-p PID] [-t TID] [-v] [-Z STRING_SIZE] [-S]
-#              [-M MAX_EVENTS] [-o] [-K] [-U] [-I header]
+# usage: trace [-h] [-p PID] [-L TID] [-v] [-Z STRING_SIZE] [-S]
+#              [-M MAX_EVENTS] [-t] [-T] [-K] [-U] [-I header]
 #              probe [probe ...]
 #
 # Licensed under the Apache License, Version 2.0 (the "License")
@@ -58,7 +58,8 @@
         @classmethod
         def configure(cls, args):
                 cls.max_events = args.max_events
-                cls.use_localtime = not args.offset
+                cls.print_time = args.timestamp or args.time
+                cls.use_localtime = not args.timestamp
                 cls.first_ts = Time.monotonic_time()
                 cls.tgid = args.tgid or -1
                 cls.pid = args.pid or -1
@@ -485,11 +486,16 @@
                 values = map(lambda i: getattr(event, "v%d" % i),
                              range(0, len(self.values)))
                 msg = self._format_message(bpf, event.tgid, values)
-                time = strftime("%H:%M:%S") if Probe.use_localtime else \
-                       Probe._time_off_str(event.timestamp_ns)
-                print("%-8s %-6d %-6d %-12s %-16s %s" %
-                    (time[:8], event.tgid, event.pid, event.comm,
-                     self._display_function(), msg))
+                if not Probe.print_time:
+                    print("%-6d %-6d %-12s %-16s %s" %
+                          (event.tgid, event.pid, event.comm,
+                           self._display_function(), msg))
+                else:
+                    time = strftime("%H:%M:%S") if Probe.use_localtime else \
+                           Probe._time_off_str(event.timestamp_ns)
+                    print("%-8s %-6d %-6d %-12s %-16s %s" %
+                          (time[:8], event.tgid, event.pid, event.comm,
+                           self._display_function(), msg))
 
                 if self.kernel_stack:
                         self.print_stack(bpf, event.kernel_stack_id, -1)
@@ -579,7 +585,7 @@
                 # their kernel names -- tgid and pid -- inside the script
                 parser.add_argument("-p", "--pid", type=int, metavar="PID",
                   dest="tgid", help="id of the process to trace (optional)")
-                parser.add_argument("-t", "--tid", type=int, metavar="TID",
+                parser.add_argument("-L", "--tid", type=int, metavar="TID",
                   dest="pid", help="id of the thread to trace (optional)")
                 parser.add_argument("-v", "--verbose", action="store_true",
                   help="print resulting BPF program code before executing")
@@ -590,8 +596,10 @@
                   help="do not filter trace's own pid from the trace")
                 parser.add_argument("-M", "--max-events", type=int,
                   help="number of events to print before quitting")
-                parser.add_argument("-o", "--offset", action="store_true",
-                  help="use relative time from first traced message")
+                parser.add_argument("-t", "--timestamp", action="store_true",
+                  help="print timestamp column (offset from trace start)")
+                parser.add_argument("-T", "--time", action="store_true",
+                  help="print time column")
                 parser.add_argument("-K", "--kernel-stack",
                   action="store_true", help="output kernel stack trace")
                 parser.add_argument("-U", "--user-stack",
@@ -653,9 +661,14 @@
                                              self.probes))
 
                 # Print header
-                print("%-8s %-6s %-6s %-12s %-16s %s" %
-                      ("TIME", "PID", "TID", "COMM", "FUNC",
-                      "-" if not all_probes_trivial else ""))
+                if self.args.timestamp or self.args.time:
+                    print("%-8s %-6s %-6s %-12s %-16s %s" %
+                          ("TIME", "PID", "TID", "COMM", "FUNC",
+                          "-" if not all_probes_trivial else ""))
+                else:
+                    print("%-6s %-6s %-12s %-16s %s" %
+                          ("PID", "TID", "COMM", "FUNC",
+                          "-" if not all_probes_trivial else ""))
 
                 while True:
                         self.bpf.kprobe_poll()
diff --git a/tools/trace_example.txt b/tools/trace_example.txt
index 08b9061..46dc843 100644
--- a/tools/trace_example.txt
+++ b/tools/trace_example.txt
@@ -9,20 +9,20 @@
 system:
 
 # trace 'sys_execve "%s", arg1'
-TIME     PID    COMM         FUNC             -
-05:11:51 4402   bash         sys_execve       /usr/bin/man
-05:11:51 4411   man          sys_execve       /usr/local/bin/less
-05:11:51 4411   man          sys_execve       /usr/bin/less
-05:11:51 4410   man          sys_execve       /usr/local/bin/nroff
-05:11:51 4410   man          sys_execve       /usr/bin/nroff
-05:11:51 4409   man          sys_execve       /usr/local/bin/tbl
-05:11:51 4409   man          sys_execve       /usr/bin/tbl
-05:11:51 4408   man          sys_execve       /usr/local/bin/preconv
-05:11:51 4408   man          sys_execve       /usr/bin/preconv
-05:11:51 4415   nroff        sys_execve       /usr/bin/locale
-05:11:51 4416   nroff        sys_execve       /usr/bin/groff
-05:11:51 4418   groff        sys_execve       /usr/bin/grotty
-05:11:51 4417   groff        sys_execve       /usr/bin/troff
+PID    COMM         FUNC             -
+4402   bash         sys_execve       /usr/bin/man
+4411   man          sys_execve       /usr/local/bin/less
+4411   man          sys_execve       /usr/bin/less
+4410   man          sys_execve       /usr/local/bin/nroff
+4410   man          sys_execve       /usr/bin/nroff
+4409   man          sys_execve       /usr/local/bin/tbl
+4409   man          sys_execve       /usr/bin/tbl
+4408   man          sys_execve       /usr/local/bin/preconv
+4408   man          sys_execve       /usr/bin/preconv
+4415   nroff        sys_execve       /usr/bin/locale
+4416   nroff        sys_execve       /usr/bin/groff
+4418   groff        sys_execve       /usr/bin/grotty
+4417   groff        sys_execve       /usr/bin/troff
 ^C
 
 The ::sys_execve syntax specifies that you want an entry probe (which is the
@@ -38,11 +38,11 @@
 bytes to be read:
 
 # trace 'sys_read (arg3 > 20000) "read %d bytes", arg3'
-TIME     PID    COMM         FUNC             -
-05:18:23 4490   dd           sys_read         read 1048576 bytes
-05:18:23 4490   dd           sys_read         read 1048576 bytes
-05:18:23 4490   dd           sys_read         read 1048576 bytes
-05:18:23 4490   dd           sys_read         read 1048576 bytes
+PID    COMM         FUNC             -
+4490   dd           sys_read         read 1048576 bytes
+4490   dd           sys_read         read 1048576 bytes
+4490   dd           sys_read         read 1048576 bytes
+4490   dd           sys_read         read 1048576 bytes
 ^C
 
 During the trace, I executed "dd if=/dev/zero of=/dev/null bs=1M count=4".
@@ -55,9 +55,9 @@
 value, effectively snooping all bash shell input across the system:
 
 # trace 'r:bash:readline "%s", retval'
-TIME     PID    COMM         FUNC             -
-05:24:50 2740   bash         readline         echo hi!
-05:24:53 2740   bash         readline         man ls
+PID    COMM         FUNC             -
+2740   bash         readline         echo hi!
+2740   bash         readline         man ls
 ^C
 
 The special retval keywords stands for the function's return value, and can
@@ -67,10 +67,10 @@
 can specify the full path to the executable (e.g. "/usr/bin/bash").
 
 Multiple probes can be combined on the same command line. For example, let's
-trace failed read and write calls on the libc level:
+trace failed read and write calls on the libc level, and include a time column:
 
 # trace 'r:c:read ((int)retval < 0) "read failed: %d", retval' \
-        'r:c:write ((int)retval < 0) "write failed: %d", retval'
+        'r:c:write ((int)retval < 0) "write failed: %d", retval' -T
 TIME     PID    COMM         FUNC             -
 05:31:57 3388   bash         write            write failed: -1
 05:32:00 3388   bash         write            write failed: -1
@@ -84,7 +84,7 @@
 trace the block:block_rq_complete tracepoint and print out the number of sectors
 transferred:
 
-# trace 't:block:block_rq_complete "sectors=%d", args->nr_sector'
+# trace 't:block:block_rq_complete "sectors=%d", args->nr_sector' -T
 TIME     PID    COMM         FUNC             -
 01:23:51 0      swapper/0    block_rq_complete sectors=8
 01:23:55 10017  kworker/u64: block_rq_complete sectors=1
@@ -110,7 +110,7 @@
 These probes can be traced by trace just like kernel tracepoints. For example,
 trace new threads being created and their function name:
 
-# trace 'u:pthread:pthread_create "%U", arg3'
+# trace 'u:pthread:pthread_create "%U", arg3' -T
 TIME     PID    COMM         FUNC             -
 02:07:29 4051   contentions  pthread_create   primes_thread+0x0
 02:07:29 4051   contentions  pthread_create   primes_thread+0x0
@@ -125,7 +125,7 @@
 trace Ruby methods being called (this requires a version of Ruby built with 
 the --enable-dtrace configure flag):
 
-# trace 'u:ruby:method__entry "%s.%s", arg1, arg2' -p $(pidof irb)
+# trace 'u:ruby:method__entry "%s.%s", arg1, arg2' -p $(pidof irb) -T
 TIME     PID    COMM         FUNC             -
 12:08:43 18420  irb          method__entry    IRB::Context.verbose?
 12:08:43 18420  irb          method__entry    RubyLex.ungetc
@@ -139,7 +139,7 @@
 Occasionally, it can be useful to filter specific strings. For example, you
 might be interested in open() calls that open a specific file:
 
-# trace 'p:c:open (STRCMP("test.txt", arg1)) "opening %s", arg1'
+# trace 'p:c:open (STRCMP("test.txt", arg1)) "opening %s", arg1' -T
 TIME     PID    COMM         FUNC             -
 01:43:15 10938  cat          open             opening test.txt
 01:43:20 10939  cat          open             opening test.txt
@@ -149,7 +149,7 @@
 As a final example, let's trace open syscalls for a specific process. By 
 default, tracing is system-wide, but the -p switch overrides this:
 
-# trace -p 2740 'do_sys_open "%s", arg2'
+# trace -p 2740 'do_sys_open "%s", arg2' -T
 TIME     PID    COMM         FUNC             -
 05:36:16 15872  ls           do_sys_open      /etc/ld.so.cache
 05:36:16 15872  ls           do_sys_open      /lib64/libselinux.so.1
@@ -171,8 +171,8 @@
 USAGE message:
 
 # trace -h
-usage: trace [-h] [-p PID] [-t TID] [-v] [-Z STRING_SIZE] [-S]
-             [-M MAX_EVENTS] [-o] [-K] [-U] [-I header]
+usage: trace [-h] [-p PID] [-L TID] [-v] [-Z STRING_SIZE] [-S]
+             [-M MAX_EVENTS] [-t] [-T] [-K] [-U] [-I header]
              probe [probe ...]
 
 Attach to functions and print trace messages.
@@ -183,14 +183,15 @@
 optional arguments:
   -h, --help            show this help message and exit
   -p PID, --pid PID     id of the process to trace (optional)
-  -t TID, --tid TID     id of the thread to trace (optional)
+  -L TID, --tid TID     id of the thread to trace (optional)
   -v, --verbose         print resulting BPF program code before executing
   -Z STRING_SIZE, --string-size STRING_SIZE
                         maximum size to read from strings
   -S, --include-self    do not filter trace's own pid from the trace
   -M MAX_EVENTS, --max-events MAX_EVENTS
                         number of events to print before quitting
-  -o, --offset          use relative time from first traced message
+  -t, --timestamp       print timestamp column (offset from trace start)
+  -T, --time            print time column
   -K, --kernel-stack    output kernel stack trace
   -U, --user-stack      output user stack trace
   -I header, --include header
diff --git a/tools/ucalls.py b/tools/ucalls.py
new file mode 100755
index 0000000..ed476cd
--- /dev/null
+++ b/tools/ucalls.py
@@ -0,0 +1,300 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# ucalls  Summarize method calls in high-level languages and/or system calls.
+#         For Linux, uses BCC, eBPF.
+#
+# USAGE: ucalls [-l {java,python,ruby}] [-h] [-T TOP] [-L] [-S] [-v] [-m]
+#        pid [interval]
+#
+# Copyright 2016 Sasha Goldshtein
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 19-Oct-2016   Sasha Goldshtein   Created this.
+
+from __future__ import print_function
+import argparse
+from bcc import BPF, USDT
+from time import sleep
+
+examples = """examples:
+    ./ucalls -l java 185        # trace Java calls and print statistics on ^C
+    ./ucalls -l python 2020 1   # trace Python calls and print every second
+    ./ucalls -l java 185 -S     # trace Java calls and syscalls
+    ./ucalls 6712 -S            # trace only syscall counts
+    ./ucalls -l ruby 1344 -T 10 # trace top 10 Ruby method calls
+    ./ucalls -l ruby 1344 -L    # trace Ruby calls including latency
+    ./ucalls -l ruby 1344 -LS   # trace Ruby calls and syscalls with latency
+    ./ucalls -l python 2020 -mL # trace Python calls including latency in ms
+"""
+parser = argparse.ArgumentParser(
+    description="Summarize method calls in high-level languages.",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("pid", type=int, help="process id to attach to")
+parser.add_argument("interval", type=int, nargs='?',
+    help="print every specified number of seconds")
+parser.add_argument("-l", "--language", choices=["java", "python", "ruby"],
+    help="language to trace (if none, trace syscalls only)")
+parser.add_argument("-T", "--top", type=int,
+    help="number of most frequent/slow calls to print")
+parser.add_argument("-L", "--latency", action="store_true",
+    help="record method latency from enter to exit (except recursive calls)")
+parser.add_argument("-S", "--syscalls", action="store_true",
+    help="record syscall latency (adds overhead)")
+parser.add_argument("-v", "--verbose", action="store_true",
+    help="verbose mode: print the BPF program (for debugging purposes)")
+parser.add_argument("-m", "--milliseconds", action="store_true",
+    help="report times in milliseconds (default is microseconds)")
+args = parser.parse_args()
+
+# We assume that the entry and return probes have the same arguments. This is
+# the case for Java, Python, and Ruby. If there's a language where it's not the
+# case, we will need to build a custom correlator from entry to exit.
+if args.language == "java":
+    # TODO for JVM entries, we actually have the real length of the class
+    #      and method strings in arg3 and arg5 respectively, so we can insert
+    #      the null terminator in its proper position.
+    entry_probe = "method__entry"
+    return_probe = "method__return"
+    read_class = "bpf_usdt_readarg(2, ctx, &clazz);"
+    read_method = "bpf_usdt_readarg(4, ctx, &method);"
+elif args.language == "python":
+    entry_probe = "function__entry"
+    return_probe = "function__return"
+    read_class = "bpf_usdt_readarg(1, ctx, &clazz);"    # filename really
+    read_method = "bpf_usdt_readarg(2, ctx, &method);"
+elif args.language == "ruby":
+    # TODO Also probe cmethod__entry and cmethod__return with same arguments
+    entry_probe = "method__entry"
+    return_probe = "method__return"
+    read_class = "bpf_usdt_readarg(1, ctx, &clazz);"
+    read_method = "bpf_usdt_readarg(2, ctx, &method);"
+elif not args.language:
+    if not args.syscalls:
+        print("Nothing to do; use -S to trace syscalls.")
+        exit(1)
+    entry_probe, return_probe, read_class, read_method = ("", "", "", "")
+
+program = """
+#include <linux/ptrace.h>
+
+#define MAX_STRING_LENGTH 80
+DEFINE_NOLANG
+DEFINE_LATENCY
+DEFINE_SYSCALLS
+
+struct method_t {
+    char clazz[MAX_STRING_LENGTH];
+    char method[MAX_STRING_LENGTH];
+};
+struct entry_t {
+    u64 pid;
+    struct method_t method;
+};
+struct info_t {
+    u64 num_calls;
+    u64 total_ns;
+};
+struct syscall_entry_t {
+    u64 timestamp;
+    u64 ip;
+};
+
+#ifndef LATENCY
+  BPF_HASH(counts, struct method_t, u64);            // number of calls
+  #ifdef SYSCALLS
+    BPF_HASH(syscounts, u64, u64);                   // number of calls per IP
+  #endif  // SYSCALLS
+#else
+  BPF_HASH(times, struct method_t, struct info_t);
+  BPF_HASH(entry, struct entry_t, u64);              // timestamp at entry
+  #ifdef SYSCALLS
+    BPF_HASH(systimes, u64, struct info_t);          // latency per IP
+    BPF_HASH(sysentry, u64, struct syscall_entry_t); // ts + IP at entry
+  #endif  // SYSCALLS
+#endif
+
+#ifndef NOLANG
+int trace_entry(struct pt_regs *ctx) {
+    u64 clazz = 0, method = 0, val = 0;
+    u64 *valp;
+    struct entry_t data = {0};
+#ifdef LATENCY
+    u64 timestamp = bpf_ktime_get_ns();
+    data.pid = bpf_get_current_pid_tgid();
+#endif
+    READ_CLASS
+    READ_METHOD
+    bpf_probe_read(&data.method.clazz, sizeof(data.method.clazz),
+                   (void *)clazz);
+    bpf_probe_read(&data.method.method, sizeof(data.method.method),
+                   (void *)method);
+#ifndef LATENCY
+    valp = counts.lookup_or_init(&data.method, &val);
+    ++(*valp);
+#endif
+#ifdef LATENCY
+    entry.update(&data, &timestamp);
+#endif
+    return 0;
+}
+
+#ifdef LATENCY
+int trace_return(struct pt_regs *ctx) {
+    u64 *entry_timestamp, clazz = 0, method = 0;
+    struct info_t *info, zero = {};
+    struct entry_t data = {};
+    data.pid = bpf_get_current_pid_tgid();
+    READ_CLASS
+    READ_METHOD
+    bpf_probe_read(&data.method.clazz, sizeof(data.method.clazz),
+                   (void *)clazz);
+    bpf_probe_read(&data.method.method, sizeof(data.method.method),
+                   (void *)method);
+    entry_timestamp = entry.lookup(&data);
+    if (!entry_timestamp) {
+        return 0;   // missed the entry event
+    }
+    info = times.lookup_or_init(&data.method, &zero);
+    info->num_calls += 1;
+    info->total_ns += bpf_ktime_get_ns() - *entry_timestamp;
+    entry.delete(&data);
+    return 0;
+}
+#endif  // LATENCY
+#endif  // NOLANG
+
+#ifdef SYSCALLS
+int syscall_entry(struct pt_regs *ctx) {
+    u64 pid = bpf_get_current_pid_tgid();
+    u64 *valp, ip = ctx->ip, val = 0;
+    PID_FILTER
+#ifdef LATENCY
+    struct syscall_entry_t data = {};
+    data.timestamp = bpf_ktime_get_ns();
+    data.ip = ip;
+#endif
+#ifndef LATENCY
+    valp = syscounts.lookup_or_init(&ip, &val);
+    ++(*valp);
+#endif
+#ifdef LATENCY
+    sysentry.update(&pid, &data);
+#endif
+    return 0;
+}
+
+#ifdef LATENCY
+int syscall_return(struct pt_regs *ctx) {
+    struct syscall_entry_t *e;
+    struct info_t *info, zero = {};
+    u64 pid = bpf_get_current_pid_tgid(), ip;
+    PID_FILTER
+    e = sysentry.lookup(&pid);
+    if (!e) {
+        return 0;   // missed the entry event
+    }
+    ip = e->ip;
+    info = systimes.lookup_or_init(&ip, &zero);
+    info->num_calls += 1;
+    info->total_ns += bpf_ktime_get_ns() - e->timestamp;
+    sysentry.delete(&pid);
+    return 0;
+}
+#endif  // LATENCY
+#endif  // SYSCALLS
+""".replace("READ_CLASS", read_class) \
+   .replace("READ_METHOD", read_method) \
+   .replace("PID_FILTER", "if ((pid >> 32) != %d) { return 0; }" % args.pid) \
+   .replace("DEFINE_NOLANG", "#define NOLANG" if not args.language else "") \
+   .replace("DEFINE_LATENCY", "#define LATENCY" if args.latency else "") \
+   .replace("DEFINE_SYSCALLS", "#define SYSCALLS" if args.syscalls else "")
+
+if args.language:
+    usdt = USDT(pid=args.pid)
+    usdt.enable_probe(entry_probe, "trace_entry")
+    if args.latency:
+        usdt.enable_probe(return_probe, "trace_return")
+else:
+    usdt = None
+
+if args.verbose:
+    if usdt:
+        print(usdt.get_text())
+    print(program)
+
+bpf = BPF(text=program, usdt_contexts=[usdt] if usdt else [])
+if args.syscalls:
+    syscall_regex = "^[Ss]y[Ss]_.*"
+    bpf.attach_kprobe(event_re=syscall_regex, fn_name="syscall_entry")
+    if args.latency:
+        bpf.attach_kretprobe(event_re=syscall_regex, fn_name="syscall_return")
+    print("Attached %d kernel probes for syscall tracing." %
+          bpf.num_open_kprobes())
+
+def get_data():
+    """Return [(name, (num_calls, total_ns))], ascending by the shown metric."""
+    # Method tables are simply empty when no language was specified.
+    if args.latency:
+        data = [(k.clazz + "." + k.method, (v.num_calls, v.total_ns))
+                for k, v in bpf["times"].items()]
+    else:
+        data = [(k.clazz + "." + k.method, (v.value, 0))
+                for k, v in bpf["counts"].items()]
+
+    if args.syscalls:
+        # Syscall tables are keyed by the probed kernel IP; resolve to a name.
+        if args.latency:
+            data.extend((bpf.ksym(k.value), (v.num_calls, v.total_ns))
+                        for k, v in bpf["systimes"].items())
+        else:
+            data.extend((bpf.ksym(k.value), (v.value, 0))
+                        for k, v in bpf["syscounts"].items())
+
+    # Sort by total latency with -L, otherwise by call count.
+    sort_field = 1 if args.latency else 0
+    return sorted(data, key=lambda entry: entry[1][sort_field])
+
+def clear_data():
+    """Empty every table that get_data() reads for the current options."""
+    if args.latency:
+        tables = ["times", "systimes"]
+    else:
+        tables = ["counts", "syscounts"]
+    if not args.syscalls:
+        tables.pop()    # the syscall table is only defined with -S
+
+    for name in tables:
+        bpf[name].clear()
+
+exit_signaled = False
+print("Tracing calls in process %d (language: %s)... Ctrl-C to quit." %
+      (args.pid, args.language or "none"))
+while True:
+    try:
+        sleep(args.interval or 99999999)    # no interval: block until Ctrl-C
+    except KeyboardInterrupt:
+        exit_signaled = True                # still print a final report below
+    print()
+    data = get_data()   # [(function, (num calls, latency in ns))]
+    if args.latency:
+        time_col = "TIME (ms)" if args.milliseconds else "TIME (us)"
+        print("%-50s %8s %8s" % ("METHOD", "# CALLS", time_col))
+    else:
+        print("%-50s %8s" % ("METHOD", "# CALLS"))
+    if args.top:
+        data = data[-args.top:]     # get_data() sorts ascending: top are last
+    for key, value in data:
+        if args.latency:
+            time = value[1]/1000000.0 if args.milliseconds else \
+                   value[1]/1000.0
+            print("%-50s %8d %6.2f" % (key, value[0], time))
+        else:
+            print("%-50s %8d" % (key, value[0]))
+    if args.interval and not exit_signaled:
+        clear_data()                # each interval reports only its own calls
+    else:
+        if args.syscalls:
+            print("Detaching kernel probes, please wait...")
+        exit()
diff --git a/tools/ucalls_example.txt b/tools/ucalls_example.txt
new file mode 100644
index 0000000..7410f88
--- /dev/null
+++ b/tools/ucalls_example.txt
@@ -0,0 +1,92 @@
+Demonstrations of ucalls.
+
+
+ucalls summarizes method calls in high-level languages (Java, Python, and Ruby)
+and, optionally, Linux system calls. It displays statistics on the most
+frequently called methods, as well as the latency (duration) of these methods.
+
+Through the syscalls support, ucalls can provide basic information on a 
+process' interaction with the system including syscall counts and latencies. 
+This can then be used for further exploration with other BCC tools like trace,
+argdist, biotop, fileslower, and others.
+
+For example, to trace method call latency in a Java application:
+
+# ucalls -L -l java $(pidof java)
+Tracing calls in process 26877 (language: java)... Ctrl-C to quit.
+
+METHOD                                              # CALLS TIME (us)
+java/io/BufferedInputStream.getBufIfOpen                  1 7.00
+slowy/App.isSimplePrime                                8970 8858.35
+slowy/App.isDivisible                               3228196 3076985.12
+slowy/App.isPrime                                      8969 4841017.64
+^C
+
+
+To trace only syscalls in a particular process and print the top 10 most 
+frequently-invoked ones:
+
+# ucalls -ST 10 3018
+Attached 375 kernel probes for syscall tracing.
+Tracing calls in process 3018 (language: none)... Ctrl-C to quit.
+
+METHOD                                              # CALLS
+sys_rt_sigaction                                          4
+SyS_rt_sigprocmask                                        4
+sys_mprotect                                              5
+sys_read                                                 22
+SyS_write                                                39
+SyS_epoll_wait                                           42
+sys_futex                                               177
+SyS_mmap                                                180
+sys_mmap_pgoff                                          181
+sys_munmap                                              817
+^C
+Detaching kernel probes, please wait...
+
+
+To print only the top 5 methods (-m would report times in milliseconds rather
+than microseconds, but the time column only appears together with -L):
+
+# ucalls -l python -mT 5 $(pidof python)
+Tracing calls in process 26914 (language: python)... Ctrl-C to quit.
+
+METHOD                                              # CALLS
+<stdin>.<module>                                          1
+<stdin>.fibo                                       14190928
+^C
+
+
+USAGE message:
+
+# ./ucalls.py -h
+usage: ucalls.py [-h] [-l {java,python,ruby}] [-T TOP] [-L] [-S] [-v] [-m]
+                 pid [interval]
+
+Summarize method calls in high-level languages.
+
+positional arguments:
+  pid                   process id to attach to
+  interval              print every specified number of seconds
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -l {java,python,ruby}, --language {java,python,ruby}
+                        language to trace (if none, trace syscalls only)
+  -T TOP, --top TOP     number of most frequent/slow calls to print
+  -L, --latency         record method latency from enter to exit (except
+                        recursive calls)
+  -S, --syscalls        record syscall latency (adds overhead)
+  -v, --verbose         verbose mode: print the BPF program (for debugging
+                        purposes)
+  -m, --milliseconds    report times in milliseconds (default is microseconds)
+
+examples:
+    ./ucalls -l java 185        # trace Java calls and print statistics on ^C
+    ./ucalls -l python 2020 1   # trace Python calls and print every second
+    ./ucalls -l java 185 -S     # trace Java calls and syscalls
+    ./ucalls 6712 -S            # trace only syscall counts
+    ./ucalls -l ruby 1344 -T 10 # trace top 10 Ruby method calls
+    ./ucalls -l ruby 1344 -L    # trace Ruby calls including latency
+    ./ucalls -l ruby 1344 -LS   # trace Ruby calls and syscalls with latency
+    ./ucalls -l python 2020 -mL # trace Python calls including latency in ms
diff --git a/tools/uflow.py b/tools/uflow.py
new file mode 100755
index 0000000..6bf8b53
--- /dev/null
+++ b/tools/uflow.py
@@ -0,0 +1,174 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# uflow  Trace method execution flow in high-level languages.
+#        For Linux, uses BCC, eBPF.
+#
+# USAGE: uflow [-C CLASS] [-M METHOD] [-v] {java,python,ruby} pid
+#
+# Copyright 2016 Sasha Goldshtein
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 27-Oct-2016   Sasha Goldshtein   Created this.
+
+from __future__ import print_function
+import argparse
+from bcc import BPF, USDT
+import ctypes as ct
+import time
+
+examples = """examples:
+    ./uflow java 185                # trace Java method calls in process 185
+    ./uflow ruby 1344               # trace Ruby method calls in process 1344
+    ./uflow -M indexOf java 185     # trace only 'indexOf'-prefixed methods
+    ./uflow -C '<stdin>' python 180 # trace only REPL-defined methods
+"""
+parser = argparse.ArgumentParser(
+    description="Trace method execution flow in high-level languages.",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("language", choices=["java", "python", "ruby"],
+    help="language to trace")
+parser.add_argument("pid", type=int, help="process id to attach to")
+parser.add_argument("-M", "--method",
+    help="trace only calls to methods starting with this prefix")
+parser.add_argument("-C", "--class", dest="clazz",
+    help="trace only calls to classes starting with this prefix")
+parser.add_argument("-v", "--verbose", action="store_true",
+    help="verbose mode: print the BPF program (for debugging purposes)")
+args = parser.parse_args()
+
+usdt = USDT(pid=args.pid)
+
+program = """
+struct call_t {
+    u64 depth;                  // first bit is direction (0 entry, 1 return)
+    u64 pid;                    // (tgid << 32) + pid from bpf_get_current...
+    u64 timestamp;              // ns
+    char clazz[80];
+    char method[80];
+};
+
+BPF_PERF_OUTPUT(calls);
+BPF_HASH(entry, u64, u64);
+"""
+
+prefix_template = """
+static inline bool prefix_%s(char *actual) {
+    char expected[] = "%s";
+    for (int i = 0; i < sizeof(expected) - 1; ++i) {
+        if (expected[i] != actual[i]) {
+            return false;
+        }
+    }
+    return true;
+}
+"""
+
+if args.clazz:
+    program += prefix_template % ("class", args.clazz)
+if args.method:
+    program += prefix_template % ("method", args.method)
+
+trace_template = """
+int NAME(struct pt_regs *ctx) {
+    u64 *depth, zero = 0, clazz = 0, method = 0 ;
+    struct call_t data = {};
+
+    READ_CLASS
+    READ_METHOD
+    bpf_probe_read(&data.clazz, sizeof(data.clazz), (void *)clazz);
+    bpf_probe_read(&data.method, sizeof(data.method), (void *)method);
+
+    FILTER_CLASS
+    FILTER_METHOD
+
+    data.pid = bpf_get_current_pid_tgid();
+    data.timestamp = bpf_ktime_get_ns();
+    depth = entry.lookup_or_init(&data.pid, &zero);
+    data.depth = DEPTH;
+    UPDATE
+
+    calls.perf_submit(ctx, &data, sizeof(data));
+    return 0;
+}
+"""
+
+def enable_probe(probe_name, func_name, read_class, read_method, is_return):
+    """Generate the func_name handler and enable USDT probe probe_name."""
+    global program
+    if is_return:    # flag bit 63 in the reported depth, then unwind a level
+        depth, update = "*depth | (1ULL << 63)", "if (*depth) --(*depth);"
+    else:
+        depth, update = "*depth + 1", "++(*depth);"
+    filter_class = "if (!prefix_class(data.clazz)) { return 0; }"
+    filter_method = "if (!prefix_method(data.method)) { return 0; }"
+    text = trace_template.replace("NAME", func_name)
+    for placeholder, value in [
+            ("READ_CLASS", read_class), ("READ_METHOD", read_method),
+            ("FILTER_CLASS", filter_class if args.clazz else ""),
+            ("FILTER_METHOD", filter_method if args.method else ""),
+            ("DEPTH", depth), ("UPDATE", update)]:
+        text = text.replace(placeholder, value)
+    program += text
+    usdt.enable_probe(probe_name, func_name)   # usdt: made after parse_args()
+
+if args.language == "java":
+    enable_probe("method__entry", "java_entry",
+                 "bpf_usdt_readarg(2, ctx, &clazz);",
+                 "bpf_usdt_readarg(4, ctx, &method);", is_return=False)
+    enable_probe("method__return", "java_return",
+                 "bpf_usdt_readarg(2, ctx, &clazz);",
+                 "bpf_usdt_readarg(4, ctx, &method);", is_return=True)
+elif args.language == "python":
+    enable_probe("function__entry", "python_entry",
+                 "bpf_usdt_readarg(1, ctx, &clazz);",   # filename really
+                 "bpf_usdt_readarg(2, ctx, &method);", is_return=False)
+    enable_probe("function__return", "python_return",
+                 "bpf_usdt_readarg(1, ctx, &clazz);",   # filename really
+                 "bpf_usdt_readarg(2, ctx, &method);", is_return=True)
+elif args.language == "ruby":
+    enable_probe("method__entry", "ruby_entry",
+                 "bpf_usdt_readarg(1, ctx, &clazz);",
+                 "bpf_usdt_readarg(2, ctx, &method);", is_return=False)
+    enable_probe("method__return", "ruby_return",
+                 "bpf_usdt_readarg(1, ctx, &clazz);",
+                 "bpf_usdt_readarg(2, ctx, &method);", is_return=True)
+    enable_probe("cmethod__entry", "ruby_centry",
+                 "bpf_usdt_readarg(1, ctx, &clazz);",
+                 "bpf_usdt_readarg(2, ctx, &method);", is_return=False)
+    enable_probe("cmethod__return", "ruby_creturn",
+                 "bpf_usdt_readarg(1, ctx, &clazz);",
+                 "bpf_usdt_readarg(2, ctx, &method);", is_return=True)
+
+if args.verbose:
+    print(usdt.get_text())
+    print(program)
+
+bpf = BPF(text=program, usdt_contexts=[usdt])
+print("Tracing method calls in %s process %d... Ctrl-C to quit." %
+      (args.language, args.pid))
+print("%-3s %-6s %-6s %-8s %s" % ("CPU", "PID", "TID", "TIME(us)", "METHOD"))
+
+class CallEvent(ct.Structure):     # layout matches struct call_t above
+    _fields_ = [
+        ("depth", ct.c_ulonglong),      # nesting depth; bit 63 set on return
+        ("pid", ct.c_ulonglong),        # (tgid << 32) + pid
+        ("timestamp", ct.c_ulonglong),  # ns, from bpf_ktime_get_ns()
+        ("clazz", ct.c_char * 80),
+        ("method", ct.c_char * 80)
+        ]
+
+start_ts = time.time()
+
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(CallEvent)).contents
+    arrow = "<- " if event.depth & (1 << 63) else "-> "   # bit 63: return
+    indent = "  " * ((event.depth & ~(1 << 63)) - 1)
+    print("%-3d %-6d %-6d %-8.3f %-40s" % (
+        cpu, event.pid >> 32, event.pid & 0xFFFFFFFF, time.time() - start_ts,
+        indent + arrow + event.clazz + "." + event.method))
+
+bpf["calls"].open_perf_buffer(print_event)
+while 1:
+    bpf.kprobe_poll()
diff --git a/tools/uflow_example.txt b/tools/uflow_example.txt
new file mode 100644
index 0000000..34dd533
--- /dev/null
+++ b/tools/uflow_example.txt
@@ -0,0 +1,112 @@
+Demonstrations of uflow.
+
+
+uflow traces method entry and exit events and prints a visual flow graph that
+shows how methods are entered and exited, similar to a tracing debugger with
+breakpoints. This can be useful for understanding program flow in high-level
+languages such as Java, Python, and Ruby, which provide USDT probes for method
+invocations.
+
+
+For example, trace all Ruby method calls in a specific process:
+
+# ./uflow ruby 27245
+Tracing method calls in ruby process 27245... Ctrl-C to quit.
+CPU PID    TID    TIME(us) METHOD
+3   27245  27245  4.536    <- IO.gets                              
+3   27245  27245  4.536    <- IRB::StdioInputMethod.gets           
+3   27245  27245  4.536    -> IRB::Context.verbose?                
+3   27245  27245  4.536      -> NilClass.nil?                      
+3   27245  27245  4.536      <- NilClass.nil?                      
+3   27245  27245  4.536      -> IO.tty?                            
+3   27245  27245  4.536      <- IO.tty?                            
+3   27245  27245  4.536      -> Kernel.kind_of?                    
+3   27245  27245  4.536      <- Kernel.kind_of?                    
+3   27245  27245  4.536    <- IRB::Context.verbose?                
+3   27245  27245  4.536    <- IRB::Irb.signal_status               
+3   27245  27245  4.536    -> String.chars                         
+3   27245  27245  4.536    <- String.chars                         
+^C
+
+In the preceding output, indentation indicates the depth of the flow graph,
+and the <- and -> arrows indicate the direction of the event (exit or entry).
+
+Often, the amount of output can be overwhelming. You can filter specific 
+classes or methods. For example, trace only methods from the Thread class:
+
+# ./uflow -C java/lang/Thread java $(pidof java)
+Tracing method calls in java process 27722... Ctrl-C to quit.
+CPU PID    TID    TIME(us) METHOD
+3   27722  27731  3.144    -> java/lang/Thread.<init>              
+3   27722  27731  3.144      -> java/lang/Thread.init              
+3   27722  27731  3.144        -> java/lang/Thread.init            
+3   27722  27731  3.144          -> java/lang/Thread.currentThread 
+3   27722  27731  3.144          <- java/lang/Thread.currentThread 
+3   27722  27731  3.144          -> java/lang/Thread.getThreadGroup
+3   27722  27731  3.144          <- java/lang/Thread.getThreadGroup
+3   27722  27731  3.144          -> java/lang/ThreadGroup.checkAccess
+3   27722  27731  3.144          <- java/lang/ThreadGroup.checkAccess
+3   27722  27731  3.144          -> java/lang/ThreadGroup.addUnstarted
+3   27722  27731  3.144          <- java/lang/ThreadGroup.addUnstarted
+3   27722  27731  3.145          -> java/lang/Thread.isDaemon     
+3   27722  27731  3.145          <- java/lang/Thread.isDaemon     
+3   27722  27731  3.145          -> java/lang/Thread.getPriority   
+3   27722  27731  3.145          <- java/lang/Thread.getPriority   
+3   27722  27731  3.145          -> java/lang/Thread.getContextClassLoader
+3   27722  27731  3.145          <- java/lang/Thread.getContextClassLoader
+3   27722  27731  3.145          -> java/lang/Thread.setPriority   
+3   27722  27731  3.145            -> java/lang/Thread.checkAccess 
+3   27722  27731  3.145            <- java/lang/Thread.checkAccess 
+3   27722  27731  3.145            -> java/lang/Thread.getThreadGroup
+3   27722  27731  3.145            <- java/lang/Thread.getThreadGroup
+3   27722  27731  3.145            -> java/lang/ThreadGroup.getMaxPriority
+3   27722  27731  3.145            <- java/lang/ThreadGroup.getMaxPriority
+3   27722  27731  3.145            -> java/lang/Thread.setPriority0
+3   27722  27731  3.145            <- java/lang/Thread.setPriority0
+3   27722  27731  3.145          <- java/lang/Thread.setPriority   
+3   27722  27731  3.145          -> java/lang/Thread.nextThreadID  
+3   27722  27731  3.145          <- java/lang/Thread.nextThreadID  
+3   27722  27731  3.145        <- java/lang/Thread.init            
+3   27722  27731  3.145      <- java/lang/Thread.init              
+3   27722  27731  3.145    <- java/lang/Thread.<init>              
+3   27722  27731  3.145    -> java/lang/Thread.start               
+3   27722  27731  3.145      -> java/lang/ThreadGroup.add          
+3   27722  27731  3.145      <- java/lang/ThreadGroup.add          
+3   27722  27731  3.145      -> java/lang/Thread.start0            
+3   27722  27731  3.145      <- java/lang/Thread.start0            
+3   27722  27731  3.146    <- java/lang/Thread.start               
+2   27722  27742  3.146    -> java/lang/Thread.run                 
+^C
+
+The reason that the CPU number is printed in the first column is that events
+from different threads can be reordered when running on different CPUs, and
+produce non-sensible output. By looking for changes in the CPU column, you can
+easily see if the events you're following make sense and belong to the same
+thread running on the same CPU.
+
+
+USAGE message:
+
+# ./uflow -h
+usage: uflow.py [-h] [-M METHOD] [-C CLAZZ] [-v] {java,python,ruby} pid
+
+Trace method execution flow in high-level languages.
+
+positional arguments:
+  {java,python,ruby}    language to trace
+  pid                   process id to attach to
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -M METHOD, --method METHOD
+                        trace only calls to methods starting with this prefix
+  -C CLAZZ, --class CLAZZ
+                        trace only calls to classes starting with this prefix
+  -v, --verbose         verbose mode: print the BPF program (for debugging
+                        purposes)
+
+examples:
+    ./uflow java 185                # trace Java method calls in process 185
+    ./uflow ruby 1344               # trace Ruby method calls in process 1344
+    ./uflow -M indexOf java 185     # trace only 'indexOf'-prefixed methods
+    ./uflow -C '<stdin>' python 180 # trace only REPL-defined methods
diff --git a/tools/ugc.py b/tools/ugc.py
new file mode 100755
index 0000000..8638a25
--- /dev/null
+++ b/tools/ugc.py
@@ -0,0 +1,216 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# ugc  Summarize garbage collection events in high-level languages.
+#      For Linux, uses BCC, eBPF.
+#
+# USAGE: ugc [-v] [-m] {java,python,ruby,node} pid
+#
+# Copyright 2016 Sasha Goldshtein
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 19-Oct-2016   Sasha Goldshtein   Created this.
+
+from __future__ import print_function
+import argparse
+from bcc import BPF, USDT
+import ctypes as ct
+import time
+
+examples = """examples:
+    ./ugc java 185           # trace Java GCs in process 185
+    ./ugc ruby 1344 -m       # trace Ruby GCs reporting in ms
+"""
+parser = argparse.ArgumentParser(
+    description="Summarize garbage collection events in high-level languages.",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("language", choices=["java", "python", "ruby", "node"],
+    help="language to trace")
+parser.add_argument("pid", type=int, help="process id to attach to")
+parser.add_argument("-v", "--verbose", action="store_true",
+    help="verbose mode: print the BPF program (for debugging purposes)")
+parser.add_argument("-m", "--milliseconds", action="store_true",
+    help="report times in milliseconds (default is microseconds)")
+args = parser.parse_args()
+
+usdt = USDT(pid=args.pid)
+
+program = """
+struct gc_event_t {
+    u64 probe_index;
+    u64 elapsed_ns;
+    u64 field1;
+    u64 field2;
+    u64 field3;
+    u64 field4;
+    char string1[32];
+    char string2[32];
+};
+struct entry_t {
+    u64 start_ns;
+    u64 field1;
+    u64 field2;
+};
+
+BPF_PERF_OUTPUT(gcs);
+BPF_HASH(entry, u64, struct entry_t);
+"""
+
+class Probe(object):
+    """A begin/end USDT probe pair that is timed and reported as one GC event."""
+    def __init__(self, begin, end, begin_save, end_save, formatter):
+        self.begin = begin              # name of the USDT probe at GC start
+        self.end = end                  # name of the USDT probe at GC end
+        self.begin_save = begin_save    # C code spliced into the begin handler
+        self.end_save = end_save        # C code spliced into the end handler
+        self.formatter = formatter      # callable: event -> description text
+
+    def generate(self):
+        return """
+int trace_%s(struct pt_regs *ctx) {
+    u64 pid = bpf_get_current_pid_tgid();
+    struct entry_t e = {};
+    e.start_ns = bpf_ktime_get_ns();
+    %s
+    entry.update(&pid, &e);
+    return 0;
+}
+int trace_%s(struct pt_regs *ctx) {
+    struct entry_t *e;
+    struct gc_event_t event = {};
+    u64 pid = bpf_get_current_pid_tgid();
+    e = entry.lookup(&pid);
+    if (!e) {
+        return 0;   // missed the entry event on this thread
+    }
+    event.probe_index = %d;     // position of this probe in the probes list
+    event.elapsed_ns = bpf_ktime_get_ns() - e->start_ns;
+    %s
+    gcs.perf_submit(ctx, &event, sizeof(event));
+    return 0;
+}
+        """ % (self.begin, self.begin_save, self.end,
+               probes.index(self), self.end_save)
+
+    def attach(self):
+        usdt.enable_probe(self.begin, "trace_%s" % self.begin)
+        usdt.enable_probe(self.end, "trace_%s" % self.end)
+
+    def format(self, data):
+        return self.formatter(data)
+
+probes = []
+
+#
+# Java
+#
+if args.language == "java":
+    # Oddly, the gc__begin/gc__end probes don't really have any useful
+    # information, while the mem__pool* ones do. There's also a bunch of
+    # probes described in the hotspot_gc*.stp file which aren't there
+    # when looking at a live Java process.
+    begin_save = """
+    bpf_usdt_readarg(6, ctx, &e.field1);    // used bytes
+    bpf_usdt_readarg(8, ctx, &e.field2);    // max bytes
+    """
+    end_save = """
+    event.field1 = e->field1;                  // used bytes at start
+    event.field2 = e->field2;                  // max bytes at start
+    bpf_usdt_readarg(6, ctx, &event.field3);   // used bytes at end
+    bpf_usdt_readarg(8, ctx, &event.field4);   // max bytes at end
+    u64 manager = 0, pool = 0;
+    bpf_usdt_readarg(1, ctx, &manager);        // ptr to manager name
+    bpf_usdt_readarg(3, ctx, &pool);           // ptr to pool name
+    bpf_probe_read(&event.string1, sizeof(event.string1), (void *)manager);
+    bpf_probe_read(&event.string2, sizeof(event.string2), (void *)pool);
+    """
+    formatter = lambda e: "%s %s used=%d->%d max=%d->%d" % \
+                (e.string1, e.string2, e.field1, e.field3, e.field2, e.field4)
+    probes.append(Probe("mem__pool__gc__begin", "mem__pool__gc__end",
+                        begin_save, end_save, formatter))
+    probes.append(Probe("gc__begin", "gc__end",
+                        "", "", lambda _: "no additional info available"))
+#
+# Python
+#
+elif args.language == "python":
+    begin_save = """
+    int gen = 0;
+    bpf_usdt_readarg(1, ctx, &gen);
+    e.field1 = gen;
+    """
+    end_save = """
+    long objs = 0;
+    bpf_usdt_readarg(1, ctx, &objs);
+    event.field1 = e->field1;
+    event.field2 = objs;
+    """
+    formatter = lambda event: "gen %d GC collected %d objects" % \
+                              (event.field1, event.field2)
+    probes.append(Probe("gc__start", "gc__done",
+                        begin_save, end_save, formatter))
+#
+# Ruby
+#
+elif args.language == "ruby":
+    # Ruby GC probes do not have any additional information available.
+    probes.append(Probe("gc__mark__begin", "gc__mark__end",
+                        "", "", lambda _: "GC mark stage"))
+    probes.append(Probe("gc__sweep__begin", "gc__sweep__end",
+                        "", "", lambda _: "GC sweep stage"))
+#
+# Node
+#
+elif args.language == "node":
+    end_save = """
+    u32 gc_type = 0;
+    bpf_usdt_readarg(1, ctx, &gc_type);
+    event.field1 = gc_type;
+    """
+    descs = {"GC scavenge": 1, "GC mark-sweep-compact": 2,
+             "GC incremental mark": 4, "GC weak callbacks": 8}
+    probes.append(Probe("gc__start", "gc__done", "", end_save,
+                  lambda e: str.join(", ",
+                                     [desc for desc, val in descs.items()
+                                      if e.field1 & val != 0])))
+
+for probe in probes:
+    program += probe.generate()
+    probe.attach()
+
+if args.verbose:
+    print(usdt.get_text())
+    print(program)
+
+bpf = BPF(text=program, usdt_contexts=[usdt])
+print("Tracing garbage collections in %s process %d... Ctrl-C to quit." %
+      (args.language, args.pid))
+time_col = "TIME (ms)" if args.milliseconds else "TIME (us)"
+print("%-8s %-40s %-8s" % ("START", "DESCRIPTION", time_col))
+
+class GCEvent(ct.Structure):       # layout matches struct gc_event_t above
+    _fields_ = [
+        ("probe_index", ct.c_ulonglong),    # print_event indexes probes by it
+        ("elapsed_ns", ct.c_ulonglong),     # end time minus begin time, in ns
+        ("field1", ct.c_ulonglong),         # field1..field4 and string1/2 hold
+        ("field2", ct.c_ulonglong),         # whatever the probe's end_save
+        ("field3", ct.c_ulonglong),         # snippet stores; their meaning
+        ("field4", ct.c_ulonglong),         # depends on the traced language
+        ("string1", ct.c_char * 32),
+        ("string2", ct.c_char * 32)
+        ]
+
+start_ts = time.time()
+
+def print_event(cpu, data, size):
+    # Perf buffer callback: print one GC event with its duration in us or ms.
+    event = ct.cast(data, ct.POINTER(GCEvent)).contents
+    # Float divisors: int / int truncates on Python 2 (3.9 ms printed as 3.00).
+    elapsed = event.elapsed_ns / (1000000.0 if args.milliseconds else 1000.0)
+    print("%-8.3f %-40s %-8.2f" % (time.time() - start_ts,
+        probes[event.probe_index].format(event), elapsed))
+
+bpf["gcs"].open_perf_buffer(print_event)
+while 1:
+    bpf.kprobe_poll()
diff --git a/tools/ugc_example.txt b/tools/ugc_example.txt
new file mode 100644
index 0000000..27f1e51
--- /dev/null
+++ b/tools/ugc_example.txt
@@ -0,0 +1,66 @@
+Demonstrations of ugc.
+
+
+ugc traces garbage collection events in high-level languages, including Java,
+Python, Ruby, and Node. Each GC event is printed with some additional 
+information provided by that language's runtime, if available. The duration of
+the GC event is also provided.
+
+For example, to trace all garbage collection events in a specific Node process:
+
+# ./ugc node $(pidof node)
+Tracing garbage collections in node process 3018... Ctrl-C to quit.
+START    DESCRIPTION                              TIME (us)
+3.864    GC mark-sweep-compact                    3189.00 
+4.937    GC scavenge                              1254.00 
+4.940    GC scavenge                              1657.00 
+4.943    GC scavenge                              1171.00 
+4.949    GC scavenge                              2216.00 
+4.954    GC scavenge                              2515.00 
+4.960    GC scavenge                              2243.00 
+4.966    GC scavenge                              2410.00 
+4.976    GC scavenge                              3003.00 
+4.986    GC scavenge                              4174.00 
+4.994    GC scavenge                              1508.00 
+5.003    GC scavenge                              1966.00 
+5.010    GC scavenge                              1636.00 
+5.022    GC scavenge                              3564.00 
+5.035    GC scavenge                              3275.00 
+5.045    GC incremental mark                      157.00  
+5.049    GC mark-sweep-compact                    3248.00 
+5.060    GC scavenge                              4785.00 
+5.081    GC scavenge                              6616.00 
+5.094    GC scavenge                              8570.00 
+5.144    GC scavenge                              456.00  
+7.188    GC scavenge                              2345.00 
+7.227    GC scavenge                              12054.00
+7.253    GC scavenge                              15626.00
+7.304    GC scavenge                              15329.00
+7.384    GC scavenge                              7168.00 
+7.411    GC scavenge                              3794.00 
+7.414    GC incremental mark                      123.00  
+7.430    GC mark-sweep-compact                    7110.00 
+^C
+
+
+USAGE message:
+
+# ./ugc -h
+usage: ugc.py [-h] [-v] [-m] {java,python,ruby,node} pid
+
+Summarize garbage collection events in high-level languages.
+
+positional arguments:
+  {java,python,ruby,node}
+                        language to trace
+  pid                   process id to attach to
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -v, --verbose         verbose mode: print the BPF program (for debugging
+                        purposes)
+  -m, --milliseconds    report times in milliseconds (default is microseconds)
+
+examples:
+    ./ugc java 185           # trace Java GCs in process 185
+    ./ugc ruby 1344 -m       # trace Ruby GCs reporting in ms
diff --git a/tools/uobjnew.py b/tools/uobjnew.py
new file mode 100755
index 0000000..993bca8
--- /dev/null
+++ b/tools/uobjnew.py
@@ -0,0 +1,168 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# uobjnew  Summarize object allocations in high-level languages.
+#          For Linux, uses BCC, eBPF.
+#
+# USAGE: uobjnew [-h] [-C TOP_COUNT] [-S TOP_SIZE] [-v] {java,ruby,c} pid [interval]
+#
+# Copyright 2016 Sasha Goldshtein
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 25-Oct-2016   Sasha Goldshtein   Created this.
+
+from __future__ import print_function
+import argparse
+from bcc import BPF, USDT
+from time import sleep
+
+examples = """examples:
+    ./uobjnew java 145         # summarize Java allocations in process 145
+    ./uobjnew c 2020 1         # grab malloc() sizes and print every second
+    ./uobjnew ruby 6712 -C 10  # top 10 Ruby types by number of allocations
+    ./uobjnew ruby 6712 -S 10  # top 10 Ruby types by total size
+"""
+parser = argparse.ArgumentParser(
+    description="Summarize object allocations in high-level languages.",
+    formatter_class=argparse.RawDescriptionHelpFormatter,  # keep epilog layout
+    epilog=examples)
+parser.add_argument("language", choices=["java", "ruby", "c"],
+    help="language to trace")
+parser.add_argument("pid", type=int, help="process id to attach to")
+parser.add_argument("interval", type=int, nargs='?',  # omitted: one report
+    help="print every specified number of seconds")
+parser.add_argument("-C", "--top-count", type=int,
+    help="number of most frequently allocated types to print")
+parser.add_argument("-S", "--top-size", type=int,     # ignored if -C given
+    help="number of largest types by allocated bytes to print")
+parser.add_argument("-v", "--verbose", action="store_true",
+    help="verbose mode: print the BPF program (for debugging purposes)")
+args = parser.parse_args()
+
+program = """
+#include <linux/ptrace.h>
+
+struct key_t {
+#if MALLOC_TRACING
+    u64 size;
+#else
+    char name[50];
+#endif
+};
+
+struct val_t {
+    u64 total_size;
+    u64 num_allocs;
+};
+
+BPF_HASH(allocs, struct key_t, struct val_t);
+""".replace("MALLOC_TRACING", "1" if args.language == "c" else "0")
+# The substitution above keys the map on size for "c", on a name otherwise.
+usdt = USDT(pid=args.pid)   # probes are enabled per language below
+
+#
+# Java: key on the class name read via arg 2; arg 4 supplies the size
+#
+if args.language == "java":
+    program += """
+int alloc_entry(struct pt_regs *ctx) {
+    struct key_t key = {};
+    struct val_t *valp, zero = {};
+    u64 classptr = 0, size = 0;
+    bpf_usdt_readarg(2, ctx, &classptr);
+    bpf_usdt_readarg(4, ctx, &size);
+    bpf_probe_read(&key.name, sizeof(key.name), (void *)classptr);
+    valp = allocs.lookup_or_init(&key, &zero);
+    valp->total_size += size;
+    valp->num_allocs += 1;
+    return 0;
+}
+    """
+    usdt.enable_probe("object__alloc", "alloc_entry")
+# Ruby: object__create keys on the name arg 1 points to (size is unknown);
+# string/hash/array __create each get a handler keyed by that fixed name,
+# which adds the value of arg 1 to total_size.
+elif args.language == "ruby":
+    create_template = """
+int THETHING_alloc_entry(struct pt_regs *ctx) {
+    struct key_t key = { .name = "THETHING" };
+    struct val_t *valp, zero = {};
+    u64 size = 0;
+    bpf_usdt_readarg(1, ctx, &size);
+    valp = allocs.lookup_or_init(&key, &zero);
+    valp->total_size += size;
+    valp->num_allocs += 1;
+    return 0;
+}
+    """
+    program += """
+int object_alloc_entry(struct pt_regs *ctx) {
+    struct key_t key = {};
+    struct val_t *valp, zero = {};
+    u64 classptr = 0;
+    bpf_usdt_readarg(1, ctx, &classptr);
+    bpf_probe_read(&key.name, sizeof(key.name), (void *)classptr);
+    valp = allocs.lookup_or_init(&key, &zero);
+    valp->num_allocs += 1;  // We don't know the size, unfortunately
+    return 0;
+}
+    """
+    usdt.enable_probe("object__create", "object_alloc_entry")
+    for thing in ["string", "hash", "array"]:
+        program += create_template.replace("THETHING", thing)
+        usdt.enable_probe("%s__create" % thing, "%s_alloc_entry" % thing)
+#
+# C: handler for malloc(), keyed by the requested size (attached below)
+#
+elif args.language == "c":
+    program += """
+int alloc_entry(struct pt_regs *ctx, size_t size) {
+    struct key_t key = {};
+    struct val_t *valp, zero = {};
+    key.size = size;
+    valp = allocs.lookup_or_init(&key, &zero);
+    valp->total_size += size;
+    valp->num_allocs += 1;
+    return 0;
+}
+    """
+
+if args.verbose:
+    print(usdt.get_text())
+    print(program)
+
+bpf = BPF(text=program, usdt_contexts=[usdt])
+if args.language == "c":    # no USDT probe for C: uprobe malloc in lib "c"
+    bpf.attach_uprobe(name="c", sym="malloc", fn_name="alloc_entry",
+                      pid=args.pid)
+
+exit_signaled = False
+print("Tracing allocations in process %d (language: %s)... Ctrl-C to quit." %
+      (args.pid, args.language or "none"))
+while True:
+    try:
+        sleep(args.interval or 99999999)    # no interval: wait for Ctrl-C
+    except KeyboardInterrupt:
+        exit_signaled = True
+    print()
+    # Sort ascending so the biggest entries print last; -C wins over -S.
+    # One-argument lambdas: tuple parameters are Python 2-only (PEP 3113).
+    if args.top_count:
+        data = sorted(bpf["allocs"].items(), key=lambda kv: kv[1].num_allocs)
+        data = data[-args.top_count:]
+    else:
+        data = sorted(bpf["allocs"].items(), key=lambda kv: kv[1].total_size)
+        if args.top_size:
+            data = data[-args.top_size:]
+    print("%-30s %8s %12s" % ("TYPE", "# ALLOCS", "# BYTES"))
+    for key, value in data:
+        if args.language == "c":
+            obj_type = "block size %d" % key.size
+        else:
+            obj_type = key.name
+        print("%-30s %8d %12d" %
+              (obj_type, value.num_allocs, value.total_size))
+    if args.interval and not exit_signaled:
+        bpf["allocs"].clear()
+    else:
+        exit()
diff --git a/tools/uobjnew_example.txt b/tools/uobjnew_example.txt
new file mode 100644
index 0000000..61d2afb
--- /dev/null
+++ b/tools/uobjnew_example.txt
@@ -0,0 +1,74 @@
+Demonstrations of uobjnew.
+
+
+uobjnew summarizes new object allocation events and prints out statistics on
+which object type has been allocated frequently, and how many bytes of that
+type have been allocated. This helps diagnose common allocation paths, which
+can in turn cause heavy garbage collection.
+
+For example, trace Ruby object allocations when running some simple commands
+in irb (the Ruby REPL):
+
+# ./uobjnew ruby 27245
+Tracing allocations in process 27245 (language: ruby)... Ctrl-C to quit.
+
+TYPE                           # ALLOCS      # BYTES
+NameError                             1            0
+RubyToken::TkSPACE                    1            0
+RubyToken::TkSTRING                   1            0
+String                                7            0
+RubyToken::TkNL                       2            0
+RubyToken::TkIDENTIFIER               2            0
+array                                55          129
+string                              344         1348
+^C
+
+
+Plain C/C++ allocations (through "malloc") are also supported. We can't report
+the type being allocated, but we can report the object sizes at least. Also,
+print only the top 10 rows by number of bytes allocated:
+
+# ./uobjnew -S 10 c 27245
+Tracing allocations in process 27245 (language: c)... Ctrl-C to quit.
+
+TYPE                           # ALLOCS      # BYTES
+block size 64                        22         1408
+block size 992                        2         1984
+block size 32                        68         2176
+block size 48                        48         2304
+block size 944                        4         3776
+block size 1104                       4         4416
+block size 160                       32         5120
+block size 535                       15         8025
+block size 128                      112        14336
+block size 80                       569        45520
+^C
+
+
+USAGE message:
+
+# ./uobjnew -h
+usage: uobjnew.py [-h] [-C TOP_COUNT] [-S TOP_SIZE] [-v]
+                  {java,ruby,c} pid [interval]
+
+Summarize object allocations in high-level languages.
+
+positional arguments:
+  {java,ruby,c}         language to trace
+  pid                   process id to attach to
+  interval              print every specified number of seconds
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -C TOP_COUNT, --top-count TOP_COUNT
+                        number of most frequently allocated types to print
+  -S TOP_SIZE, --top-size TOP_SIZE
+                        number of largest types by allocated bytes to print
+  -v, --verbose         verbose mode: print the BPF program (for debugging
+                        purposes)
+
+examples:
+    ./uobjnew java 145         # summarize Java allocations in process 145
+    ./uobjnew c 2020 1         # grab malloc() sizes and print every second
+    ./uobjnew ruby 6712 -C 10  # top 10 Ruby types by number of allocations
+    ./uobjnew ruby 6712 -S 10  # top 10 Ruby types by total size
diff --git a/tools/ustat.py b/tools/ustat.py
new file mode 100755
index 0000000..cc410df
--- /dev/null
+++ b/tools/ustat.py
@@ -0,0 +1,279 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# ustat  Activity stats from high-level languages, including exceptions,
+#        method calls, class loads, garbage collections, and more.
+#        For Linux, uses BCC, eBPF.
+#
+# USAGE: ustat [-l {java,python,ruby,node}] [-C]
+#        [-S {cload,excp,gc,method,objnew,thread}] [-r MAXROWS] [-d]
+#        [interval [count]]
+#
+# This uses in-kernel eBPF maps to store per process summaries for efficiency.
+# Newly-created processes might only be traced at the next interval, if the
+# relevant USDT probe requires enabling through a semaphore.
+#
+# Copyright 2016 Sasha Goldshtein
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 26-Oct-2016   Sasha Goldshtein    Created this.
+
+from __future__ import print_function
+import argparse
+from bcc import BPF, USDT
+import os
+from subprocess import call
+from time import sleep, strftime
+
+class Category(object):     # table columns; uppercase names feed -S choices
+    THREAD = "THREAD"
+    METHOD = "METHOD"
+    OBJNEW = "OBJNEW"       # object allocations
+    CLOAD = "CLOAD"         # class loads
+    EXCP = "EXCP"           # exceptions
+    GC = "GC"
+
+class Probe(object):    # one language: its processes, USDT events, counters
+    def __init__(self, language, procnames, events):
+        """Create a probe set for one language.
+
+        language is the prefix of every generated BPF name, procnames is the
+        list of comm values to look for, and events maps USDT probe names
+        (such as 'gc__start') to Category values.
+        """
+        self.language = language
+        self.procnames = procnames
+        self.events = events
+
+    def _find_targets(self):
+        """Find pids where the comm is one of the specified list"""
+        self.targets = {}
+        for pid in [int(p) for p in os.listdir('/proc') if p.isdigit()]:
+            try:
+                with open('/proc/%d/comm' % pid) as comm_file:
+                    comm = comm_file.read().strip()
+                if comm in self.procnames:
+                    with open('/proc/%d/cmdline' % pid) as cmd_file:
+                        cmdline = cmd_file.read()
+                    self.targets[pid] = cmdline.replace('\0', ' ')
+            except IOError:
+                continue    # process may already have terminated
+
+    def _enable_probes(self):       # builds one USDT context per target pid
+        self.usdts = []
+        for pid in self.targets:
+            usdt = USDT(pid=pid)
+            for event in self.events:
+                try:
+                    usdt.enable_probe(event, "%s_%s" % (self.language, event))
+                except Exception:
+                    # This process might not have a recent version of the USDT
+                    # probes enabled, or might have been compiled without USDT
+                    # probes at all. The process could even have been shut down
+                    # and the pid been recycled. We have to gracefully handle
+                    # the possibility that we can't attach probes to it at all.
+                    pass
+            self.usdts.append(usdt)
+
+    def _generate_tables(self):     # one pid -> count hash per event
+        text = """
+BPF_HASH(%s_%s_counts, u32, u64);   // pid to event count
+        """
+        return ''.join(text % (self.language, event) for event in self.events)
+
+    def _generate_functions(self):  # one counting handler per event
+        text = """
+int %s_%s(void *ctx) {
+    u64 *valp, zero = 0;
+    u32 tgid = bpf_get_current_pid_tgid() >> 32;
+    valp = %s_%s_counts.lookup_or_init(&tgid, &zero);
+    ++(*valp);
+    return 0;
+}
+        """
+        lang = self.language
+        return str.join('', [text % (lang, event, lang, event)
+                             for event in self.events])
+
+    def get_program(self):          # also rescans /proc and enables probes
+        self._find_targets()
+        self._enable_probes()
+        return self._generate_tables() + self._generate_functions()
+
+    def get_usdts(self):
+        return self.usdts
+
+    def get_counts(self, bpf):
+        """Return {pid: {category: count}}; same-category events are summed"""
+        event_dict = dict([(category, 0) for category in self.events.values()])
+        result = dict([(pid, event_dict.copy()) for pid in self.targets])
+        for event, category in self.events.items():
+            counts = bpf["%s_%s_counts" % (self.language, event)]
+            for pid, count in counts.items():
+                if pid.value in result:  # global uprobes may hit untracked pids
+                    result[pid.value][category] += count.value
+            counts.clear()
+        return result
+
+    def cleanup(self):              # drops the USDT contexts
+        self.usdts = None
+
+class Tool(object):     # CLI driver: parse args, then attach/report/detach
+    def _parse_args(self):          # stores the parsed options in self.args
+        examples = """examples:
+  ./ustat              # stats for all languages, 1 second refresh
+  ./ustat -C           # don't clear the screen
+  ./ustat -l java      # Java processes only
+  ./ustat 5            # 5 second summaries
+  ./ustat 5 10         # 5 second summaries, 10 times only
+        """
+        parser = argparse.ArgumentParser(
+            description="Activity stats from high-level languages.",
+            formatter_class=argparse.RawDescriptionHelpFormatter,
+            epilog=examples)
+        parser.add_argument("-l", "--language",
+            choices=["java", "python", "ruby", "node"],
+            help="language to trace (default: all languages)")
+        parser.add_argument("-C", "--noclear", action="store_true",
+            help="don't clear the screen")
+        parser.add_argument("-S", "--sort",
+            choices=[cat.lower() for cat in dir(Category) if cat.isupper()],
+            help="sort by this field (descending order)")
+        parser.add_argument("-r", "--maxrows", default=20, type=int,
+            help="maximum rows to print, default 20")
+        parser.add_argument("-d", "--debug", action="store_true",
+            help="Print the resulting BPF program (for debugging purposes)")
+        parser.add_argument("interval", nargs="?", default=1, type=int,
+            help="output interval, in seconds")
+        parser.add_argument("count", nargs="?", default=99999999, type=int,
+            help="number of outputs")
+        self.args = parser.parse_args()
+
+    def _create_probes(self):       # sets self.probes: one language or all
+        probes_by_lang = {
+                "node": Probe("node", ["node"], {
+                    "gc__start": Category.GC
+                    }),
+                "python": Probe("python", ["python"], {
+                    "function__entry": Category.METHOD,
+                    "gc__start": Category.GC
+                    }),
+                "ruby": Probe("ruby", ["ruby", "irb"], {
+                    "method__entry": Category.METHOD,
+                    "cmethod__entry": Category.METHOD,
+                    "gc__mark__begin": Category.GC,
+                    "gc__sweep__begin": Category.GC,
+                    "object__create": Category.OBJNEW,
+                    "hash__create": Category.OBJNEW,
+                    "string__create": Category.OBJNEW,
+                    "array__create": Category.OBJNEW,
+                    "require__entry": Category.CLOAD,
+                    "load__entry": Category.CLOAD,
+                    "raise": Category.EXCP
+                    }),
+                "java": Probe("java", ["java"], {
+                    "gc__begin": Category.GC,
+                    "mem__pool__gc__begin": Category.GC,
+                    "thread__start": Category.THREAD,
+                    "class__loaded": Category.CLOAD,
+                    "object__alloc": Category.OBJNEW,
+                    "method__entry": Category.METHOD,
+                    "ExceptionOccurred__entry": Category.EXCP
+                    })
+                }
+
+        if self.args.language:
+            self.probes = [probes_by_lang[self.args.language]]
+        else:
+            self.probes = probes_by_lang.values()
+
+    def _attach_probes(self):       # builds self.bpf, attaches active uprobes
+        program = str.join('\n', [p.get_program() for p in self.probes])
+        if self.args.debug:
+            print(program)
+            for probe in self.probes:
+                print("Attached to %s processes:" % probe.language,
+                        str.join(', ', map(str, probe.targets)))
+        self.bpf = BPF(text=program)
+        usdts = [usdt for probe in self.probes for usdt in probe.get_usdts()]
+        # Filter out duplicates when we have multiple processes with the same
+        # uprobe. We are attaching to these probes manually instead of using
+        # the USDT support from the bcc module, because the USDT class attaches
+        # to each uprobe with a specific pid. When there is more than one
+        # process from some language, we end up attaching more than once to the
+        # same uprobe (albeit with different pids), which is not allowed.
+        # Instead, we use a global attach (with pid=-1).
+        uprobes = set([(path, func, addr) for usdt in usdts
+                       for (path, func, addr, _)
+                       in usdt.enumerate_active_probes()])
+        for (path, func, addr) in uprobes:
+            self.bpf.attach_uprobe(name=path, fn_name=func, addr=addr, pid=-1)
+
+    def _detach_probes(self):
+        for probe in self.probes:
+            probe.cleanup()     # Cleans up USDT contexts
+        self.bpf.cleanup()      # Cleans up all attached probes
+        self.bpf = None
+
+    def _loop_iter(self):           # one cycle: attach, sleep, print, detach
+        self._attach_probes()
+        try:
+            sleep(self.args.interval)
+        except KeyboardInterrupt:
+            self.exiting = True     # finish this report, then run() exits
+
+        if not self.args.noclear:
+            call("clear")
+        else:
+            print()
+        with open("/proc/loadavg") as stats:
+            print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read()))
+        print("%-6s %-20s %-10s %-6s %-10s %-8s %-6s %-6s" % (
+            "PID", "CMDLINE", "METHOD/s", "GC/s", "OBJNEW/s",
+            "CLOAD/s", "EXC/s", "THR/s"))
+
+        line = 0
+        counts = {}
+        targets = {}
+        for probe in self.probes:
+            counts.update(probe.get_counts(self.bpf))
+            targets.update(probe.targets)
+        if self.args.sort:  # one-argument lambdas: tuple params are py2-only
+            counts = sorted(counts.items(), key=lambda kv:
+                            -kv[1].get(self.args.sort.upper(), 0))
+        else:
+            counts = sorted(counts.items(), key=lambda kv: kv[0])
+        for pid, stats in counts:
+            print("%-6d %-20s %-10d %-6d %-10d %-8d %-6d %-6d" % (
+                  pid, targets[pid][:20],
+                  stats.get(Category.METHOD, 0) / self.args.interval,
+                  stats.get(Category.GC, 0) / self.args.interval,
+                  stats.get(Category.OBJNEW, 0) / self.args.interval,
+                  stats.get(Category.CLOAD, 0) / self.args.interval,
+                  stats.get(Category.EXCP, 0) / self.args.interval,
+                  stats.get(Category.THREAD, 0) / self.args.interval
+                  ))
+            line += 1
+            if line >= self.args.maxrows:
+                break
+        self._detach_probes()
+
+    def run(self):                  # entry point: loops until count or Ctrl-C
+        self._parse_args()
+        self._create_probes()
+        print('Tracing... Output every %d secs. Hit Ctrl-C to end' %
+              self.args.interval)
+        countdown = self.args.count
+        self.exiting = False
+        while True:
+            self._loop_iter()
+            countdown -= 1
+            if self.exiting or countdown == 0:
+                print("Detaching...")
+                exit()
+
+if __name__ == "__main__":
+    try:
+        Tool().run()
+    except KeyboardInterrupt:   # Ctrl-C outside the sleep: quit quietly
+        pass
diff --git a/tools/ustat_example.txt b/tools/ustat_example.txt
new file mode 100644
index 0000000..7da01e6
--- /dev/null
+++ b/tools/ustat_example.txt
@@ -0,0 +1,78 @@
+Demonstrations of ustat.
+
+
+ustat is a "top"-like tool for monitoring events in high-level languages. It 
+prints statistics about garbage collections, method calls, object allocations,
+and various other events for every process that it recognizes with a Java,
+Python, Ruby, or Node runtime.
+
+For example:
+
+# ./ustat.py
+Tracing... Output every 10 secs. Hit Ctrl-C to end
+12:17:17 loadavg: 0.33 0.08 0.02 5/211 26284
+
+PID    CMDLINE              METHOD/s   GC/s   OBJNEW/s   CLOAD/s  EXC/s  THR/s 
+3018   node/node            0          3      0          0        0      0     
+^C
+Detaching...
+
+
+If desired, you can instruct ustat to print a certain number of summaries and
+exit, which can be useful to get a quick picture of what's happening on the
+system over a short time interval. Here, we ask ustat to print 5-second
+summaries 12 times (for a total time of 1 minute):
+
+# ./ustat.py -C 5 12
+Tracing... Output every 5 secs. Hit Ctrl-C to end
+12:18:26 loadavg: 0.27 0.11 0.04 2/336 26455
+
+PID    CMDLINE              METHOD/s   GC/s   OBJNEW/s   CLOAD/s  EXC/s  THR/s 
+3018   node/node            0          1      0          0        0      0     
+
+12:18:31 loadavg: 0.33 0.12 0.04 2/336 26456
+
+PID    CMDLINE              METHOD/s   GC/s   OBJNEW/s   CLOAD/s  EXC/s  THR/s 
+3018   node/node            0          0      0          0        0      0     
+26439  java -XX:+ExtendedDT 2776045    0      0          0        0      0     
+
+12:18:37 loadavg: 0.38 0.14 0.05 2/336 26457
+
+PID    CMDLINE              METHOD/s   GC/s   OBJNEW/s   CLOAD/s  EXC/s  THR/s 
+3018   node/node            0          0      0          0        0      0     
+26439  java -XX:+ExtendedDT 2804378    0      0          0        0      0     
+
+(...more output omitted for brevity)
+
+
+USAGE message:
+
+# ./ustat.py -h
+usage: ustat.py [-h] [-l {java,python,ruby,node}] [-C]
+                [-S {cload,excp,gc,method,objnew,thread}] [-r MAXROWS] [-d]
+                [interval] [count]
+
+Activity stats from high-level languages.
+
+positional arguments:
+  interval              output interval, in seconds
+  count                 number of outputs
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -l {java,python,ruby,node}, --language {java,python,ruby,node}
+                        language to trace (default: all languages)
+  -C, --noclear         don't clear the screen
+  -S {cload,excp,gc,method,objnew,thread}, --sort {cload,excp,gc,method,objnew,thread}
+                        sort by this field (descending order)
+  -r MAXROWS, --maxrows MAXROWS
+                        maximum rows to print, default 20
+  -d, --debug           Print the resulting BPF program (for debugging
+                        purposes)
+
+examples:
+  ./ustat              # stats for all languages, 1 second refresh
+  ./ustat -C           # don't clear the screen
+  ./ustat -l java      # Java processes only
+  ./ustat 5            # 5 second summaries
+  ./ustat 5 10         # 5 second summaries, 10 times only 
diff --git a/tools/uthreads.py b/tools/uthreads.py
new file mode 100755
index 0000000..4f089d4
--- /dev/null
+++ b/tools/uthreads.py
@@ -0,0 +1,116 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# uthreads  Trace thread creation/destruction events in high-level languages.
+#           For Linux, uses BCC, eBPF.
+#
+# USAGE: uthreads [-l {java}] [-v] pid
+#
+# Copyright 2016 Sasha Goldshtein
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 25-Oct-2016   Sasha Goldshtein   Created this.
+
+from __future__ import print_function
+import argparse
+from bcc import BPF, USDT
+import ctypes as ct
+import time
+
+examples = """examples:
+    ./uthreads -l java 185   # trace Java threads in process 185
+    ./uthreads 12245         # trace only pthreads in process 12245
+"""
+parser = argparse.ArgumentParser(
+    description="Trace thread creation/destruction events in " +
+                "high-level languages.",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-l", "--language", choices=["java"],  # on top of pthreads
+    help="language to trace (none for pthreads only)")
+parser.add_argument("pid", type=int, help="process id to attach to")
+parser.add_argument("-v", "--verbose", action="store_true",
+    help="verbose mode: print the BPF program (for debugging purposes)")
+args = parser.parse_args()
+
+usdt = USDT(pid=args.pid)   # collects every probe enabled below
+
+program = """
+struct thread_event_t {
+    u64 runtime_id;
+    u64 native_id;
+    char type[8];
+    char name[80];
+};
+
+BPF_PERF_OUTPUT(threads);
+
+int trace_pthread(struct pt_regs *ctx) {
+    struct thread_event_t te = {};
+    u64 start_routine = 0;
+    char type[] = "pthread";
+    te.native_id = bpf_get_current_pid_tgid() & 0xFFFFFFFF;
+    bpf_usdt_readarg(2, ctx, &start_routine);
+    te.runtime_id = start_routine;  // This is really a function pointer
+    __builtin_memcpy(&te.type, type, sizeof(te.type));
+    threads.perf_submit(ctx, &te, sizeof(te));
+    return 0;
+}
+"""
+usdt.enable_probe("pthread_start", "trace_pthread")   # enabled with or without -l
+
+if args.language == "java":     # add Java thread__start/thread__stop handlers
+    template = """
+int %s(struct pt_regs *ctx) {
+    char type[] = "%s";
+    struct thread_event_t te = {};
+    u64 nameptr = 0, id = 0, native_id = 0;
+    bpf_usdt_readarg(1, ctx, &nameptr);
+    bpf_usdt_readarg(3, ctx, &id);
+    bpf_usdt_readarg(4, ctx, &native_id);
+    bpf_probe_read(&te.name, sizeof(te.name), (void *)nameptr);
+    te.runtime_id = id;
+    te.native_id = native_id;
+    __builtin_memcpy(&te.type, type, sizeof(type));  // type[] is the shorter
+    threads.perf_submit(ctx, &te, sizeof(te));
+    return 0;
+}
+    """
+    program += template % ("trace_start", "start")
+    program += template % ("trace_stop", "stop")
+    usdt.enable_probe("thread__start", "trace_start")
+    usdt.enable_probe("thread__stop", "trace_stop")
+
+if args.verbose:
+    print(usdt.get_text())
+    print(program)
+
+bpf = BPF(text=program, usdt_contexts=[usdt])
+print("Tracing thread events in process %d (language: %s)... Ctrl-C to quit." %
+      (args.pid, args.language or "none"))     # "none": pthreads only
+print("%-8s %-16s %-8s %-30s" % ("TIME", "ID", "TYPE", "DESCRIPTION"))
+
+class ThreadEvent(ct.Structure):    # same layout as struct thread_event_t
+    _fields_ = [
+        ("runtime_id", ct.c_ulonglong),     # Java id, or pthread start routine
+        ("native_id", ct.c_ulonglong),
+        ("type", ct.c_char * 8),            # "pthread", "start" or "stop"
+        ("name", ct.c_char * 80),           # filled in by the Java handlers
+        ]
+
+start_ts = time.time()
+
+def print_event(cpu, data, size):
+    """Perf-buffer callback: print one thread event as a table row."""
+    te = ct.cast(data, ct.POINTER(ThreadEvent)).contents
+    if te.type != "pthread":
+        ident = "R=%s/N=%s" % (te.runtime_id, te.native_id)
+        desc = te.name
+    else:       # pthread events carry the start routine in runtime_id
+        ident, desc = te.native_id, bpf.sym(te.runtime_id, args.pid)
+    elapsed = time.time() - start_ts
+    print("%-8.3f %-16s %-8s %-30s" % (elapsed, ident, te.type, desc))
+
+bpf["threads"].open_perf_buffer(print_event)    # deliver "threads" records
+while 1:                                        # no exit condition of its own
+    bpf.kprobe_poll()
diff --git a/tools/uthreads_example.txt b/tools/uthreads_example.txt
new file mode 100644
index 0000000..664b341
--- /dev/null
+++ b/tools/uthreads_example.txt
@@ -0,0 +1,58 @@
+Demonstrations of uthreads.
+
+
+uthreads traces thread creation events in Java or raw pthreads (and, for Java,
+thread stop events as well), and prints details about each thread. For Java
+threads, the thread name is printed; for pthreads, the thread's start function
+is printed, if there is symbol information to resolve it.
+
+For example, trace all Java thread creation events:
+
+# ./uthreads -l java 27420
+Tracing thread events in process 27420 (language: java)... Ctrl-C to quit.
+TIME     ID               TYPE     DESCRIPTION                   
+18.596   R=9/N=0          start    SIGINT handler                
+18.596   R=4/N=0          stop     Signal Dispatcher             
+^C
+
+The ID column in the preceding output shows the thread's runtime ID and native
+ID, when available. The accuracy of this information depends on the Java 
+runtime.
+
+
+Next, trace only pthread creation events in some native application:
+
+# ./uthreads 27450
+Tracing thread events in process 27450 (language: none)... Ctrl-C to quit.
+TIME     ID               TYPE     DESCRIPTION                   
+0.924    27462            pthread  primes_thread                 
+0.927    27463            pthread  primes_thread                 
+0.928    27464            pthread  primes_thread                 
+0.928    27465            pthread  primes_thread                 
+^C
+
+The start function name ("primes_thread" in this example) is resolved from the
+process's symbol information. If it is not present, the thread's start address
+is printed instead.
+
+
+USAGE message:
+
+# ./uthreads -h
+usage: uthreads.py [-h] [-l {java}] [-v] pid
+
+Trace thread creation/destruction events in high-level languages.
+
+positional arguments:
+  pid                   process id to attach to
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -l {java}, --language {java}
+                        language to trace (none for pthreads only)
+  -v, --verbose         verbose mode: print the BPF program (for debugging
+                        purposes)
+
+examples:
+    ./uthreads -l java 185   # trace Java threads in process 185
+    ./uthreads 12245         # trace only pthreads in process 12245