Snap for 8414751 from c2146834c218bbde0bf4f8db93698900408e6163 to tm-release

Change-Id: I7ec60dddb605bad1750245cbb5792b50222f2291
diff --git a/.github/workflows/bcc-test.yml b/.github/workflows/bcc-test.yml
index 4ce3360..43d1582 100644
--- a/.github/workflows/bcc-test.yml
+++ b/.github/workflows/bcc-test.yml
@@ -15,8 +15,13 @@
         env:
         - TYPE: Debug
           PYTHON_TEST_LOGFILE: critical.log
+          RW_ENGINE_ENABLED: ON
+        - TYPE: Debug
+          PYTHON_TEST_LOGFILE: critical.log
+          RW_ENGINE_ENABLED: OFF
         - TYPE: Release
           PYTHON_TEST_LOGFILE: critical.log
+          RW_ENGINE_ENABLED: ON
     steps:
     - uses: actions/checkout@v2
     - name: System info
@@ -43,7 +48,7 @@
                    bcc-docker \
                    /bin/bash -c \
                    'mkdir -p /bcc/build && cd /bcc/build && \
-                    cmake -DCMAKE_BUILD_TYPE=${TYPE} .. && make -j9'"
+                    cmake -DCMAKE_BUILD_TYPE=${TYPE} -DENABLE_LLVM_NATIVECODEGEN=${RW_ENGINE_ENABLED} .. && make -j9'"
     - name: Run bcc's cc tests
       env: ${{ matrix.env }}
       # tests are wrapped with `script` as a hack to get a TTY as github actions doesn't provide this
diff --git a/INSTALL.md b/INSTALL.md
index 383406b..f681ac6 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -392,7 +392,7 @@
 make install 
 
 ```
-after install , you may add bcc directory to your $PATH, which you can add to ~/.bashrc
+After installing, you may add the bcc directories to your $PATH, for example by adding the following to ~/.bashrc:
 ```
 bcctools=/usr/share/bcc/tools
 bccexamples=/usr/share/bcc/examples
diff --git a/README.md b/README.md
index 076d127..f206722 100644
--- a/README.md
+++ b/README.md
@@ -89,6 +89,7 @@
 - tools/[bindsnoop](tools/bindsnoop.py): Trace IPv4 and IPv6 bind() system calls (bind()). [Examples](tools/bindsnoop_example.txt).
 - tools/[biolatency](tools/biolatency.py): Summarize block device I/O latency as a histogram. [Examples](tools/biolatency_example.txt).
 - tools/[biotop](tools/biotop.py): Top for disks: Summarize block device I/O by process. [Examples](tools/biotop_example.txt).
+- tools/[biopattern](tools/biopattern.py): Identify random/sequential disk access patterns. [Examples](tools/biopattern_example.txt).
 - tools/[biosnoop](tools/biosnoop.py): Trace block device I/O with PID and latency. [Examples](tools/biosnoop_example.txt).
 - tools/[bitesize](tools/bitesize.py): Show per process I/O size histogram. [Examples](tools/bitesize_example.txt).
 - tools/[bpflist](tools/bpflist.py): Display processes with active BPF programs and maps. [Examples](tools/bpflist_example.txt).
@@ -165,6 +166,7 @@
 - tools/[tcpsynbl](tools/tcpsynbl.py): Show TCP SYN backlog. [Examples](tools/tcpsynbl_example.txt).
 - tools/[tcptop](tools/tcptop.py): Summarize TCP send/recv throughput by host. Top for TCP. [Examples](tools/tcptop_example.txt).
 - tools/[tcptracer](tools/tcptracer.py): Trace TCP established connections (connect(), accept(), close()). [Examples](tools/tcptracer_example.txt).
+- tools/[tcpcong](tools/tcpcong.py): Trace TCP socket congestion control status duration. [Examples](tools/tcpcong_example.txt).
 - tools/[threadsnoop](tools/threadsnoop.py): List new thread creation. [Examples](tools/threadsnoop_example.txt).
 - tools/[tplist](tools/tplist.py): Display kernel tracepoints or USDT probes and their formats. [Examples](tools/tplist_example.txt).
 - tools/[trace](tools/trace.py): Trace arbitrary functions, with filters. [Examples](tools/trace_example.txt).
@@ -250,8 +252,6 @@
 turns those statistics into a graph showing the traffic distribution at
 multiple granularities. See the code [here](examples/networking/tunnel_monitor).
 
-[![Screenshot](http://img.youtube.com/vi/yYy3Cwce02k/0.jpg)](https://youtu.be/yYy3Cwce02k)
-
 ## Contributing
 
 Already pumped up to commit some code? Here are some resources to join the
diff --git a/cmake/clang_libs.cmake b/cmake/clang_libs.cmake
index 3f1523b..f1b1261 100644
--- a/cmake/clang_libs.cmake
+++ b/cmake/clang_libs.cmake
@@ -22,6 +22,9 @@
   list(APPEND llvm_raw_libs bpfasmparser)
   list(APPEND llvm_raw_libs bpfdisassembler)
 endif()
+if (${LLVM_PACKAGE_VERSION} VERSION_EQUAL 15 OR ${LLVM_PACKAGE_VERSION} VERSION_GREATER 15)
+  list(APPEND llvm_raw_libs windowsdriver)
+endif()
 llvm_map_components_to_libnames(_llvm_libs ${llvm_raw_libs})
 llvm_expand_dependencies(llvm_libs ${_llvm_libs})
 endif()
diff --git a/debian/changelog b/debian/changelog
index 8a23841..2202804 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,17 @@
+bcc (0.24.0-1) unstable; urgency=low
+
+  * Support for kernel up to 5.16
+  * bcc tools: update for trace.py, sslsniff.py, tcptop.py, hardirqs.py, etc.
+  * new libbpf tools: bashreadline
+  * allow specifying wakeup_events for perf buffer
+  * support BPF_MAP_TYPE_{INODE, TASK}_STORAGE maps
+  * remove all deprecated libbpf function usage
+  * remove P4/B language support
+  * major test infra change, using github actions now
+  * doc update, bug fixes and other tools improvement
+
+ -- Yonghong Song <ys114321@gmail.com>  Fri, 14 Jan 2022 17:00:00 +0000
+
 bcc (0.23.0-1) unstable; urgency=low
 
   * Support for kernel up to 5.15
diff --git a/docs/kernel-versions.md b/docs/kernel-versions.md
index 2c64227..36ee30a 100644
--- a/docs/kernel-versions.md
+++ b/docs/kernel-versions.md
@@ -78,6 +78,7 @@
 Pass map values to map helpers | 4.18 | [`d71962f3e627`](https://github.com/torvalds/linux/commit/d71962f3e627b5941804036755c844fabfb65ff5)
 BPF socket reuseport | 4.19 | [`2dbb9b9e6df6`](https://github.com/torvalds/linux/commit/2dbb9b9e6df67d444fbe425c7f6014858d337adf)
 BPF flow dissector | 4.20 | [`d58e468b1112`](https://github.com/torvalds/linux/commit/d58e468b1112dcd1d5193c0a89ff9f98b5a3e8b9)
+BPF 1M insn limit | 5.2 | [`c04c0d2b968a`](https://github.com/torvalds/linux/commit/c04c0d2b968ac45d6ef020316808ef6c82325a82)
 BPF cgroup sysctl | 5.2 | [`7b146cebe30c`](https://github.com/torvalds/linux/commit/7b146cebe30cb481b0f70d85779da938da818637)
 BPF raw tracepoint writable | 5.2 | [`9df1c28bb752`](https://github.com/torvalds/linux/commit/9df1c28bb75217b244257152ab7d788bb2a386d0)
 BPF trampoline | 5.5 | [`fec56f5890d9`](https://github.com/torvalds/linux/commit/fec56f5890d93fc2ed74166c397dc186b1c25951)
@@ -153,6 +154,7 @@
 `LOOKUP_BATCH` | 5.6 | [`cb4d03ab499d`](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=cb4d03ab499d4c040f4ab6fd4389d2b49f42b5a5)
 `UPDATE_BATCH`, `DELETE_BATCH` | 5.6 | [`aa2e93b8e58e`](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=aa2e93b8e58e18442edfb2427446732415bc215e)
 `LOOKUP_AND_DELETE_BATCH` | 5.6 | [`057996380a42`](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=057996380a42bb64ccc04383cfa9c0ace4ea11f0)
+`LOOKUP_AND_DELETE_ELEM` support for hash maps | 5.14 | [`3e87f192b405`](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3e87f192b405960c0fe83e0925bd0dadf4f8cf43)
 
 ## XDP
 
@@ -212,6 +214,7 @@
 `BPF_FUNC_check_mtu()` | 5.12 |  | [`34b2021cc616`](https://github.com/torvalds/linux/commit/34b2021cc61642d61c3cf943d9e71925b827941b)
 `BPF_FUNC_clone_redirect()` | 4.2 |  | [`3896d655f4d4`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=3896d655f4d491c67d669a15f275a39f713410f8)
 `BPF_FUNC_copy_from_user()` | 5.10 |  | [`07be4c4a3e7a`](https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit?id=07be4c4a3e7a0db148e44b16c5190e753d1c8569)
+`BPF_FUNC_copy_from_user_task()` | 5.18 | GPL | [`376040e47334`](https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit?id=376040e47334c6dc6a939a32197acceb00fe4acf)
 `BPF_FUNC_csum_diff()` | 4.6 |  | [`7d672345ed29`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7d672345ed295b1356a5d9f7111da1d1d7d65867)
 `BPF_FUNC_csum_level()` | 5.7 |  | [`7cdec54f9713`](https://github.com/torvalds/linux/commit/7cdec54f9713256bb170873a1fc5c75c9127c9d2)
 `BPF_FUNC_csum_update()` | 4.9 |  | [`36bbef52c7eb`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=36bbef52c7eb646ed6247055a2acd3851e317857)
@@ -234,6 +237,7 @@
 `BPF_FUNC_get_func_arg_cnt()` | 5.17 |  | [`f92c1e183604`](https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit?id=f92c1e183604c20ce00eb889315fdaa8f2d9e509)
 `BPF_FUNC_get_func_ip()` | 5.15 |  | [`5d8b583d04ae`](https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=5d8b583d04aedb3bd5f6d227a334c210c7d735f9)
 `BPF_FUNC_get_func_ret()` | 5.17 |  | [`f92c1e183604`](https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit?id=f92c1e183604c20ce00eb889315fdaa8f2d9e509)
+`BPF_FUNC_get_retval()` | 5.18 |  | [`b44123b4a3dc`](https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit?id=b44123b4a3dcad4664d3a0f72c011ffd4c9c4d93)
 `BPF_FUNC_get_hash_recalc()` | 4.8 |  | [`13c5c240f789`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=13c5c240f789bbd2bcacb14a23771491485ae61f)
 `BPF_FUNC_get_listener_sock()` | 5.1 |  | [`dbafd7ddd623`](https://kernel.googlesource.com/pub/scm/linux/kernel/git/davem/net-next/+/dbafd7ddd62369b2f3926ab847cbf8fc40e800b7)
 `BPF_FUNC_get_local_storage()` | 4.19 |  | [`cd3394317653`](https://github.com/torvalds/linux/commit/cd3394317653837e2eb5c5d0904a8996102af9fc)
@@ -249,6 +253,7 @@
 `BPF_FUNC_get_stackid()` | 4.6 | GPL | [`d5a3b1f69186`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d5a3b1f691865be576c2bffa708549b8cdccda19)
 `BPF_FUNC_get_task_stack()` | 5.9 | | [`fa28dcb82a38`](https://kernel.googlesource.com/pub/scm/linux/kernel/git/bpf/bpf-next/+/fa28dcb82a38f8e3993b0fae9106b1a80b59e4f0)
 `BPF_FUNC_getsockopt()` | 4.15 |  | [`cd86d1fd2102`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=cd86d1fd21025fdd6daf23d1288da405e7ad0ec6)
+`BPF_FUNC_ima_file_hash()` | 5.18 |  | [`174b16946e39`](https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit?id=174b16946e39ebd369097e0f773536c91a8c1a4c)
 `BPF_FUNC_ima_inode_hash()` | 5.11 |  | [`27672f0d280a`](https://github.com/torvalds/linux/commit/27672f0d280a3f286a410a8db2004f46ace72a17)
 `BPF_FUNC_inode_storage_delete()` | 5.10 |  | [`8ea636848aca`](https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit?id=8ea636848aca35b9f97c5b5dee30225cf2dd0fe6)
 `BPF_FUNC_inode_storage_get()` | 5.10 |  | [`8ea636848aca`](https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git/commit?id=8ea636848aca35b9f97c5b5dee30225cf2dd0fe6)
@@ -311,6 +316,7 @@
 `BPF_FUNC_seq_write()` | 5.7 | GPL | [`492e639f0c22`](https://kernel.googlesource.com/pub/scm/linux/kernel/git/bpf/bpf-next/+/492e639f0c222784e2e0f121966375f641c61b15)
 `BPF_FUNC_set_hash()` | 4.13 |  | [`ded092cd73c2`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=ded092cd73c2c56a394b936f86897f29b2e131c0)
 `BPF_FUNC_set_hash_invalid()` | 4.9 |  | [`7a4b28c6cc9f`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7a4b28c6cc9ffac50f791b99cc7e46106436e5d8)
+`BPF_FUNC_set_retval()` | 5.18 |  | [`b44123b4a3dc`](https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit?id=b44123b4a3dcad4664d3a0f72c011ffd4c9c4d93)
 `BPF_FUNC_setsockopt()` | 4.13 |  | [`8c4b4c7e9ff0`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=8c4b4c7e9ff0447995750d9329949fa082520269)
 `BPF_FUNC_sk_ancestor_cgroup_id()` | 5.7 |  | [`f307fa2cb4c9`](https://kernel.googlesource.com/pub/scm/linux/kernel/git/bpf/bpf-next/+/f307fa2cb4c935f7f1ff0aeb880c7b44fb9a642b)
 `BPF_FUNC_sk_assign()` | 5.6 |  | [`cf7fbe660f2d`](https://github.com/torvalds/linux/commit/cf7fbe660f2dbd738ab58aea8e9b0ca6ad232449)
@@ -340,6 +346,7 @@
 `BPF_FUNC_skb_load_bytes_relative()` | 4.18 |  | [`4e1ec56cdc59`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=4e1ec56cdc59746943b2acfab3c171b930187bbe)
 `BPF_FUNC_skb_output()` | 5.5 |  | [`a7658e1a4164`](https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=a7658e1a4164ce2b9eb4a11aadbba38586e93bd6)
 `BPF_FUNC_skb_pull_data()` | 4.9 |  | [`36bbef52c7eb`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=36bbef52c7eb646ed6247055a2acd3851e317857)
+`BPF_FUNC_skb_set_tstamp()` | 5.18 |  | [`9bb984f28d5b`](https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit?id=9bb984f28d5bcb917d35d930fcfb89f90f9449fd)
 `BPF_FUNC_skb_set_tunnel_key()` | 4.3 |  | [`d3aa45ce6b94`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d3aa45ce6b94c65b83971257317867db13e5f492)
 `BPF_FUNC_skb_set_tunnel_opt()` | 4.6 |  | [`14ca0751c96f`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=14ca0751c96f8d3d0f52e8ed3b3236f8b34d3460)
 `BPF_FUNC_skb_store_bytes()` | 4.1 |  | [`91bc4822c3d6`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=91bc4822c3d61b9bb7ef66d3b77948a4f9177954)
@@ -388,6 +395,9 @@
 `BPF_FUNC_xdp_adjust_head()` | 4.10 |  | [`17bedab27231`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=17bedab2723145d17b14084430743549e6943d03)
 `BPF_FUNC_xdp_adjust_meta()` | 4.15 |  | [`de8f3a83b0a0`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=de8f3a83b0a0fddb2cf56e7a718127e9619ea3da)
 `BPF_FUNC_xdp_adjust_tail()` | 4.18 |  | [`b32cc5b9a346`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=b32cc5b9a346319c171e3ad905e0cddda032b5eb)
+`BPF_FUNC_xdp_get_buff_len()` | 5.18 |  | [`0165cc817075`](https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit?id=0165cc817075cf701e4289838f1d925ff1911b3e)
+`BPF_FUNC_xdp_load_bytes()` | 5.18 |  | [`3f364222d032`](https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit?id=3f364222d032eea6b245780e845ad213dab28cdd)
+`BPF_FUNC_xdp_store_bytes()` | 5.18 |  | [`3f364222d032`](https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit?id=3f364222d032eea6b245780e845ad213dab28cdd)
 `BPF_FUNC_xdp_output()` | 5.6 | GPL | [`d831ee84bfc9`](https://github.com/torvalds/linux/commit/d831ee84bfc9173eecf30dbbc2553ae81b996c60)
 `BPF_FUNC_override_return()` | 4.16 | GPL | [`9802d86585db`](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9802d86585db91655c7d1929a4f6bbe0952ea88e)
 `BPF_FUNC_sock_ops_cb_flags_set()` | 4.16 |  | [`b13d88072172`](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b13d880721729384757f235166068c315326f4a1)
diff --git a/docs/reference_guide.md b/docs/reference_guide.md
index 96e8a3f..ff18ab9 100644
--- a/docs/reference_guide.md
+++ b/docs/reference_guide.md
@@ -202,6 +202,9 @@
 
 This is a macro that instruments the tracepoint defined by *category*:*event*.
 
+The tracepoint name is `<category>:<event>`.
+The probe function name is `tracepoint__<category>__<event>`.
+
 Arguments are available in an ```args``` struct, which are the tracepoint arguments. One way to list these is to cat the relevant format file under /sys/kernel/debug/tracing/events/*category*/*event*/format.
 
 The ```args``` struct can be used in place of ``ctx`` in each functions requiring a context as an argument. This includes notably [perf_submit()](#3-perf_submit).
@@ -216,7 +219,11 @@
 }
 ```
 
-This instruments the random:urandom_read tracepoint, and prints the tracepoint argument ```got_bits```.
+This instruments the `random:urandom_read` tracepoint, and prints the tracepoint argument ```got_bits```.
+When using the Python API, this probe is automatically attached to the right tracepoint target.
+For C++, this tracepoint probe can be attached by specifying the tracepoint target and function name explicitly:
+`BPF::attach_tracepoint("random:urandom_read", "tracepoint__random__urandom_read")`
+Note the name of the probe function defined above is `tracepoint__random__urandom_read`.
 
 Examples in situ:
 [code](https://github.com/iovisor/bcc/blob/a4159da8c4ea8a05a3c6e402451f530d6e5a8b41/examples/tracing/urandomread.py#L19) ([output](https://github.com/iovisor/bcc/commit/e422f5e50ecefb96579b6391a2ada7f6367b83c4#diff-41e5ecfae4a3b38de5f4e0887ed160e5R10)),
@@ -1825,7 +1832,7 @@
 
 You can use flags like this ```BPF.attach_xdp(dev="device", fn=b.load_func("fn_name",BPF.XDP), flags=BPF.XDP_FLAGS_UPDATE_IF_NOEXIST)```
 
-The default value of flgas is 0. This means if there is no xdp program with `device`, the fn will run with that device. If there is an xdp program running with device, the old program will be replaced with new fn program.
+The default value of flags is 0. This means if there is no xdp program with `device`, the fn will run with that device. If there is an xdp program running with device, the old program will be replaced with new fn program.
 
 Currently, bcc does not support XDP_FLAGS_REPLACE flag. The following are the descriptions of other flags.
 
diff --git a/examples/networking/http_filter/http-parse-complete.c b/examples/networking/http_filter/http-parse-complete.c
index 61cee0f..ef102ba 100644
--- a/examples/networking/http_filter/http-parse-complete.c
+++ b/examples/networking/http_filter/http-parse-complete.c
@@ -100,7 +100,7 @@
 	unsigned long p[7];
 	int i = 0;
 	for (i = 0; i < 7; i++) {
-		p[i] = load_byte(skb , payload_offset + i);
+		p[i] = load_byte(skb, payload_offset + i);
 	}
 
 	//find a match with an HTTP message
diff --git a/examples/networking/http_filter/http-parse-simple.c b/examples/networking/http_filter/http-parse-simple.c
index 292cb7b..9afbe1e 100644
--- a/examples/networking/http_filter/http-parse-simple.c
+++ b/examples/networking/http_filter/http-parse-simple.c
@@ -71,7 +71,7 @@
 	unsigned long p[7];
 	int i = 0;
 	for (i = 0; i < 7; i++) {
-		p[i] = load_byte(skb , payload_offset + i);
+		p[i] = load_byte(skb, payload_offset + i);
 	}
 
 	//find a match with an HTTP message
diff --git a/examples/tracing/biolatpcts.py b/examples/tracing/biolatpcts.py
index c9bb834..68a5951 100755
--- a/examples/tracing/biolatpcts.py
+++ b/examples/tracing/biolatpcts.py
@@ -11,6 +11,7 @@
 
 bpf_source = """
 #include <linux/blk_types.h>
+#include <linux/blk-mq.h>
 #include <linux/blkdev.h>
 #include <linux/time64.h>
 
@@ -45,7 +46,10 @@
 """
 
 bpf = BPF(text=bpf_source)
-bpf.attach_kprobe(event='blk_account_io_done', fn_name='kprobe_blk_account_io_done')
+if BPF.get_kprobe_functions(b'__blk_account_io_done'):
+    bpf.attach_kprobe(event="__blk_account_io_done", fn_name="kprobe_blk_account_io_done")
+else:
+    bpf.attach_kprobe(event="blk_account_io_done", fn_name="kprobe_blk_account_io_done")
 
 cur_lat_100ms = bpf['lat_100ms']
 cur_lat_1ms = bpf['lat_1ms']
diff --git a/examples/tracing/bitehist.py b/examples/tracing/bitehist.py
index 89ceb30..81e8459 100755
--- a/examples/tracing/bitehist.py
+++ b/examples/tracing/bitehist.py
@@ -20,27 +20,32 @@
 # load BPF program
 b = BPF(text="""
 #include <uapi/linux/ptrace.h>
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 
 BPF_HISTOGRAM(dist);
 BPF_HISTOGRAM(dist_linear);
 
-int kprobe__blk_account_io_done(struct pt_regs *ctx, struct request *req)
+int trace_req_done(struct pt_regs *ctx, struct request *req)
 {
-	dist.increment(bpf_log2l(req->__data_len / 1024));
-	dist_linear.increment(req->__data_len / 1024);
-	return 0;
+    dist.increment(bpf_log2l(req->__data_len / 1024));
+    dist_linear.increment(req->__data_len / 1024);
+    return 0;
 }
 """)
 
+if BPF.get_kprobe_functions(b'__blk_account_io_done'):
+    b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_done")
+else:
+    b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_done")
+
 # header
 print("Tracing... Hit Ctrl-C to end.")
 
 # trace until Ctrl-C
 try:
-	sleep(99999999)
+    sleep(99999999)
 except KeyboardInterrupt:
-	print()
+    print()
 
 # output
 print("log2 histogram")
diff --git a/examples/tracing/disksnoop.py b/examples/tracing/disksnoop.py
index a35e1ab..7b6891b 100755
--- a/examples/tracing/disksnoop.py
+++ b/examples/tracing/disksnoop.py
@@ -19,7 +19,7 @@
 # load BPF program
 b = BPF(text="""
 #include <uapi/linux/ptrace.h>
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 
 BPF_HASH(start, struct request *);
 
@@ -46,7 +46,10 @@
 if BPF.get_kprobe_functions(b'blk_start_request'):
         b.attach_kprobe(event="blk_start_request", fn_name="trace_start")
 b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_start")
-b.attach_kprobe(event="blk_account_io_done", fn_name="trace_completion")
+if BPF.get_kprobe_functions(b'__blk_account_io_done'):
+    b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_completion")
+else:
+    b.attach_kprobe(event="blk_account_io_done", fn_name="trace_completion")
 
 # header
 print("%-18s %-2s %-7s %8s" % ("TIME(s)", "T", "BYTES", "LAT(ms)"))
diff --git a/libbpf-tools/.gitignore b/libbpf-tools/.gitignore
index 5937055..ce95db7 100644
--- a/libbpf-tools/.gitignore
+++ b/libbpf-tools/.gitignore
@@ -31,6 +31,7 @@
 /mountsnoop
 /numamove
 /offcputime
+/oomkill
 /opensnoop
 /readahead
 /runqlat
@@ -43,6 +44,7 @@
 /tcpconnect
 /tcpconnlat
 /tcprtt
+/tcpsynbl
 /vfsstat
 /xfsdist
 /xfsslower
diff --git a/libbpf-tools/Android.bp b/libbpf-tools/Android.bp
index e250f5d..03165fd 100644
--- a/libbpf-tools/Android.bp
+++ b/libbpf-tools/Android.bp
@@ -180,6 +180,26 @@
 }
 
 cc_object {
+    name: "drsnoop.bpf.o",
+    srcs: ["drsnoop.bpf.c"],
+    defaults: ["bcc_bpf_defaults"],
+}
+
+genrule {
+    name: "drsnoop.skel.h",
+    srcs: [":drsnoop.bpf.o"],
+    out: ["drsnoop.skel.h"],
+    defaults: ["bpf_skeleton_hdr_defaults"],
+}
+
+cc_binary {
+    name: "drsnoop",
+    srcs: ["drsnoop.c"],
+    generated_headers:  ["drsnoop.skel.h"],
+    defaults: ["bcc_binary_defaults"],
+}
+
+cc_object {
     name: "filelife.bpf.o",
     srcs: ["filelife.bpf.c"],
     defaults: ["bcc_bpf_defaults"],
@@ -220,6 +240,46 @@
 }
 
 cc_object {
+    name: "fsdist.bpf.o",
+    srcs: ["fsdist.bpf.c"],
+    defaults: ["bcc_bpf_defaults"],
+}
+
+genrule {
+    name: "fsdist.skel.h",
+    srcs: [":fsdist.bpf.o"],
+    out: ["fsdist.skel.h"],
+    defaults: ["bpf_skeleton_hdr_defaults"],
+}
+
+cc_binary {
+    name: "fsdist",
+    srcs: ["fsdist.c"],
+    generated_headers:  ["fsdist.skel.h"],
+    defaults: ["bcc_binary_defaults"],
+}
+
+cc_object {
+    name: "fsslower.bpf.o",
+    srcs: ["fsslower.bpf.c"],
+    defaults: ["bcc_bpf_defaults"],
+}
+
+genrule {
+    name: "fsslower.skel.h",
+    srcs: [":fsslower.bpf.o"],
+    out: ["fsslower.skel.h"],
+    defaults: ["bpf_skeleton_hdr_defaults"],
+}
+
+cc_binary {
+    name: "fsslower",
+    srcs: ["fsslower.c"],
+    generated_headers:  ["fsslower.skel.h"],
+    defaults: ["bcc_binary_defaults"],
+}
+
+cc_object {
     name: "funclatency.bpf.o",
     srcs: ["funclatency.bpf.c"],
     defaults: ["bcc_bpf_defaults"],
@@ -329,6 +389,26 @@
 }
 
 cc_object {
+    name: "oomkill.bpf.o",
+    srcs: ["oomkill.bpf.c"],
+    defaults: ["bcc_bpf_defaults"],
+}
+
+genrule {
+    name: "oomkill.skel.h",
+    srcs: [":oomkill.bpf.o"],
+    out: ["oomkill.skel.h"],
+    defaults: ["bpf_skeleton_hdr_defaults"],
+}
+
+cc_binary {
+    name: "oomkill",
+    srcs: ["oomkill.c"],
+    generated_headers:  ["oomkill.skel.h"],
+    defaults: ["bcc_binary_defaults"],
+}
+
+cc_object {
     name: "runqlat.bpf.o",
     srcs: ["runqlat.bpf.c"],
     defaults: ["bcc_bpf_defaults"],
@@ -409,6 +489,26 @@
 }
 
 cc_object {
+    name: "solisten.bpf.o",
+    srcs: ["solisten.bpf.c"],
+    defaults: ["bcc_bpf_defaults"],
+}
+
+genrule {
+    name: "solisten.skel.h",
+    srcs: [":solisten.bpf.o"],
+    out: ["solisten.skel.h"],
+    defaults: ["bpf_skeleton_hdr_defaults"],
+}
+
+cc_binary {
+    name: "solisten",
+    srcs: ["solisten.c"],
+    generated_headers:  ["solisten.skel.h"],
+    defaults: ["bcc_binary_defaults"],
+}
+
+cc_object {
     name: "tcpconnect.bpf.o",
     srcs: ["tcpconnect.bpf.c"],
     defaults: ["bcc_bpf_defaults"],
@@ -430,3 +530,43 @@
     generated_headers:  ["tcpconnect.skel.h"],
     defaults: ["bcc_binary_defaults"],
 }
+
+cc_object {
+    name: "tcprtt.bpf.o",
+    srcs: ["tcprtt.bpf.c"],
+    defaults: ["bcc_bpf_defaults"],
+}
+
+genrule {
+    name: "tcprtt.skel.h",
+    srcs: [":tcprtt.bpf.o"],
+    out: ["tcprtt.skel.h"],
+    defaults: ["bpf_skeleton_hdr_defaults"],
+}
+
+cc_binary {
+    name: "tcprtt",
+    srcs: ["tcprtt.c"],
+    generated_headers:  ["tcprtt.skel.h"],
+    defaults: ["bcc_binary_defaults"],
+}
+
+cc_object {
+    name: "vfsstat.bpf.o",
+    srcs: ["vfsstat.bpf.c"],
+    defaults: ["bcc_bpf_defaults"],
+}
+
+genrule {
+    name: "vfsstat.skel.h",
+    srcs: [":vfsstat.bpf.o"],
+    out: ["vfsstat.skel.h"],
+    defaults: ["bpf_skeleton_hdr_defaults"],
+}
+
+cc_binary {
+    name: "vfsstat",
+    srcs: ["vfsstat.c"],
+    generated_headers:  ["vfsstat.skel.h"],
+    defaults: ["bcc_binary_defaults"],
+}
diff --git a/libbpf-tools/Makefile b/libbpf-tools/Makefile
index 6bf1ed0..81a6faa 100644
--- a/libbpf-tools/Makefile
+++ b/libbpf-tools/Makefile
@@ -7,6 +7,7 @@
 LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a)
 INCLUDES := -I$(OUTPUT) -I../src/cc/libbpf/include/uapi
 CFLAGS := -g -O2 -Wall
+BPFCFLAGS := -g -O2 -Wall
 INSTALL ?= install
 prefix ?= /usr/local
 ARCH := $(shell uname -m | sed 's/x86_64/x86/' | sed 's/aarch64/arm64/' | sed 's/ppc64le/powerpc/' | sed 's/mips.*/mips/')
@@ -42,6 +43,7 @@
 	mountsnoop \
 	numamove \
 	offcputime \
+	oomkill \
 	opensnoop \
 	readahead \
 	runqlat \
@@ -54,6 +56,7 @@
 	tcpconnect \
 	tcpconnlat \
 	tcprtt \
+	tcpsynbl \
 	vfsstat \
 	#
 
@@ -106,7 +109,7 @@
 
 $(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) $(ARCH)/vmlinux.h | $(OUTPUT)
 	$(call msg,BPF,$@)
-	$(Q)$(CLANG) $(CFLAGS) -target bpf -D__TARGET_ARCH_$(ARCH)	      \
+	$(Q)$(CLANG) $(BPFCFLAGS) -target bpf -D__TARGET_ARCH_$(ARCH)	      \
 		     -I$(ARCH)/ $(INCLUDES) -c $(filter %.c,$^) -o $@ &&      \
 	$(LLVM_STRIP) -g $@
 
@@ -114,7 +117,7 @@
 $(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch]) | $(OUTPUT)/libbpf
 	$(call msg,LIB,$@)
 	$(Q)$(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1		      \
-		    OBJDIR=$(dir $@)/libbpf DESTDIR=$(dir $@)		      \
+		    OBJDIR=$(dir $@)libbpf DESTDIR=$(dir $@)		      \
 		    INCLUDEDIR= LIBDIR= UAPIDIR=			      \
 		    install
 
diff --git a/libbpf-tools/android/argp.cpp b/libbpf-tools/android/argp.cpp
index 5c39f79..6c1b22f 100644
--- a/libbpf-tools/android/argp.cpp
+++ b/libbpf-tools/android/argp.cpp
@@ -61,7 +61,7 @@
     // Handle positional arguments
     if (optind < argc) {
         for (int idx = optind; idx < argc; idx++) {
-            struct argp_state state = { .input = input, .argp = argp, .arg_num = idx };
+            struct argp_state state = { .input = input, .argp = argp, .arg_num = idx - optind };
             const error_t ret = argp->parser(ARGP_KEY_ARG, argv[idx], &state);
             if (ret) return ret;
         }
diff --git a/libbpf-tools/bashreadline.c b/libbpf-tools/bashreadline.c
index 2fcb2e2..0277f53 100644
--- a/libbpf-tools/bashreadline.c
+++ b/libbpf-tools/bashreadline.c
@@ -132,7 +132,7 @@
 	if (line)
 		free(line);
 	if (fp)
-		fclose(fp);
+		pclose(fp);
 	return result;
 }
 
diff --git a/libbpf-tools/bindsnoop.bpf.c b/libbpf-tools/bindsnoop.bpf.c
index bcbfc54..941826c 100644
--- a/libbpf-tools/bindsnoop.bpf.c
+++ b/libbpf-tools/bindsnoop.bpf.c
@@ -6,7 +6,6 @@
 #include <bpf/bpf_tracing.h>
 #include <bpf/bpf_endian.h>
 #include "bindsnoop.h"
-#include "maps.bpf.h"
 
 #define MAX_ENTRIES	10240
 #define MAX_PORTS	1024
diff --git a/libbpf-tools/bindsnoop.h b/libbpf-tools/bindsnoop.h
index 1c881b0..fa7b19d 100644
--- a/libbpf-tools/bindsnoop.h
+++ b/libbpf-tools/bindsnoop.h
@@ -11,8 +11,8 @@
 	__u32 bound_dev_if;
 	int ret;
 	__u16 port;
+	__u16 proto;
 	__u8 opts;
-	__u8 proto;
 	__u8 ver;
 	char task[TASK_COMM_LEN];
 };
diff --git a/libbpf-tools/cachestat.c b/libbpf-tools/cachestat.c
index 0578525..5556cfd 100644
--- a/libbpf-tools/cachestat.c
+++ b/libbpf-tools/cachestat.c
@@ -142,12 +142,31 @@
 	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
 	libbpf_set_print(libbpf_print_fn);
 
-	obj = cachestat_bpf__open_and_load();
+	obj = cachestat_bpf__open();
 	if (!obj) {
-		fprintf(stderr, "failed to open and/or load BPF object\n");
+		fprintf(stderr, "failed to open BPF object\n");
 		return 1;
 	}
 
+	/**
+	 * account_page_dirtied was renamed to folio_account_dirtied
+	 * in kernel commit 203a31516616 ("mm/writeback: Add __folio_mark_dirty()")
+	 */
+	if (fentry_can_attach("folio_account_dirtied", NULL)) {
+		err = bpf_program__set_attach_target(obj->progs.account_page_dirtied, 0,
+						     "folio_account_dirtied");
+		if (err) {
+			fprintf(stderr, "failed to set attach target\n");
+			goto cleanup;
+		}
+	}
+
+	err = cachestat_bpf__load(obj);
+	if (err) {
+		fprintf(stderr, "failed to load BPF object\n");
+		goto cleanup;
+	}
+
 	if (!obj->bss) {
 		fprintf(stderr, "Memory-mapping BPF maps is supported starting from Linux 5.7, please upgrade.\n");
 		goto cleanup;
diff --git a/libbpf-tools/filetop.bpf.c b/libbpf-tools/filetop.bpf.c
index c02a205..d8b9712 100644
--- a/libbpf-tools/filetop.bpf.c
+++ b/libbpf-tools/filetop.bpf.c
@@ -44,7 +44,8 @@
 	if (regular_file_only && !S_ISREG(mode))
 		return 0;
 
-	key.dev = BPF_CORE_READ(file, f_inode, i_rdev);
+	key.dev = BPF_CORE_READ(file, f_inode, i_sb, s_dev);
+	key.rdev = BPF_CORE_READ(file, f_inode, i_rdev);
 	key.inode = BPF_CORE_READ(file, f_inode, i_ino);
 	key.pid = pid;
 	key.tid = tid;
diff --git a/libbpf-tools/filetop.c b/libbpf-tools/filetop.c
index 70240d8..4e4554e 100644
--- a/libbpf-tools/filetop.c
+++ b/libbpf-tools/filetop.c
@@ -195,7 +195,7 @@
 		time(&t);
 		tm = localtime(&t);
 		strftime(ts, sizeof(ts), "%H:%M:%S", tm);
-		memset(buf, 0 , sizeof(buf));
+		memset(buf, 0, sizeof(buf));
 		n = fread(buf, 1, sizeof(buf), f);
 		if (n)
 			printf("%8s loadavg: %s\n", ts, buf);
diff --git a/libbpf-tools/filetop.h b/libbpf-tools/filetop.h
index 2974ebf..7ddf385 100644
--- a/libbpf-tools/filetop.h
+++ b/libbpf-tools/filetop.h
@@ -13,6 +13,7 @@
 struct file_id {
 	__u64 inode;
 	__u32 dev;
+	__u32 rdev;
 	__u32 pid;
 	__u32 tid;
 };
diff --git a/libbpf-tools/fsdist.c b/libbpf-tools/fsdist.c
index f411d16..88d1a09 100644
--- a/libbpf-tools/fsdist.c
+++ b/libbpf-tools/fsdist.c
@@ -233,7 +233,7 @@
 	for (i = 0; i < MAX_OP; i++) {
 		fn_name = fs_configs[fs_type].op_funcs[i];
 		module = fs_configs[fs_type].fs;
-		if (fn_name && !fentry_exists(fn_name, module)) {
+		if (fn_name && !fentry_can_attach(fn_name, module)) {
 			support_fentry = false;
 			break;
 		}
diff --git a/libbpf-tools/fsslower.c b/libbpf-tools/fsslower.c
index e96c9ef..820a201 100644
--- a/libbpf-tools/fsslower.c
+++ b/libbpf-tools/fsslower.c
@@ -201,7 +201,7 @@
 	for (i = 0; i < MAX_OP; i++) {
 		fn_name = fs_configs[fs_type].op_funcs[i];
 		module = fs_configs[fs_type].fs;
-		if (fn_name && !fentry_exists(fn_name, module)) {
+		if (fn_name && !fentry_can_attach(fn_name, module)) {
 			support_fentry = false;
 			break;
 		}
diff --git a/libbpf-tools/klockstat.bpf.c b/libbpf-tools/klockstat.bpf.c
index eddf8b7..2a5c8e7 100644
--- a/libbpf-tools/klockstat.bpf.c
+++ b/libbpf-tools/klockstat.bpf.c
@@ -107,6 +107,21 @@
 	bpf_map_update_elem(&lockholder_map, &tl, li, BPF_ANY);
 }
 
+static void lock_aborted(struct mutex *lock)
+{
+	u64 task_id;
+	struct task_lock tl = {};
+
+	if (targ_lock && targ_lock != lock)
+		return;
+	task_id = bpf_get_current_pid_tgid();
+	if (!tracing_task(task_id))
+		return;
+	tl.task_id = task_id;
+	tl.lock_ptr = (u64)lock;
+	bpf_map_delete_elem(&lockholder_map, &tl);
+}
+
 static void lock_acquired(struct mutex *lock)
 {
 	u64 task_id;
@@ -220,6 +235,40 @@
 	return 0;
 }
 
+SEC("fentry/mutex_lock_interruptible")
+int BPF_PROG(mutex_lock_interruptible, struct mutex *lock)
+{
+	lock_contended(ctx, lock);
+	return 0;
+}
+
+SEC("fexit/mutex_lock_interruptible")
+int BPF_PROG(mutex_lock_interruptible_exit, struct mutex *lock, long ret)
+{
+	if (ret)
+		lock_aborted(lock);
+	else
+		lock_acquired(lock);
+	return 0;
+}
+
+SEC("fentry/mutex_lock_killable")
+int BPF_PROG(mutex_lock_killable, struct mutex *lock)
+{
+	lock_contended(ctx, lock);
+	return 0;
+}
+
+SEC("fexit/mutex_lock_killable")
+int BPF_PROG(mutex_lock_killable_exit, struct mutex *lock, long ret)
+{
+	if (ret)
+		lock_aborted(lock);
+	else
+		lock_acquired(lock);
+	return 0;
+}
+
 SEC("fentry/mutex_unlock")
 int BPF_PROG(mutex_unlock, struct mutex *lock)
 {
diff --git a/libbpf-tools/klockstat.c b/libbpf-tools/klockstat.c
index b1cac63..d3a6fac 100644
--- a/libbpf-tools/klockstat.c
+++ b/libbpf-tools/klockstat.c
@@ -54,6 +54,7 @@
 	unsigned int iterations;
 	bool reset;
 	bool timestamp;
+	bool verbose;
 } env = {
 	.nr_locks = 99999999,
 	.nr_stack_entries = 1,
@@ -70,7 +71,7 @@
 static const char program_doc[] =
 "Trace mutex lock acquisition and hold times, in nsec\n"
 "\n"
-"Usage: klockstat [-hRT] [-p PID] [-t TID] [-c FUNC] [-L LOCK] [-n NR_LOCKS]\n"
+"Usage: klockstat [-hRTv] [-p PID] [-t TID] [-c FUNC] [-L LOCK] [-n NR_LOCKS]\n"
 "                 [-s NR_STACKS] [-S SORT] [-d DURATION] [-i INTERVAL]\n"
 "\v"
 "Examples:\n"
@@ -104,6 +105,7 @@
 	{ "interval", 'i', "SECONDS", 0, "Print interval" },
 	{ "reset", 'R', NULL, 0, "Reset stats each interval" },
 	{ "timestamp", 'T', NULL, 0, "Print timestamp" },
+	{ "verbose", 'v', NULL, 0, "Verbose debug output" },
 
 	{ NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" },
 	{},
@@ -230,6 +232,9 @@
 	case 'h':
 		argp_state_help(state, stderr, ARGP_HELP_STD_HELP);
 		break;
+	case 'v':
+		env->verbose = true;
+		break;
 	case ARGP_KEY_END:
 		if (env->duration) {
 			if (env->interval > env->duration)
@@ -324,7 +329,7 @@
 
 static void print_acq_header(void)
 {
-	printf("\n                               Caller  Avg Spin    Count   Max Spin   Total Spin\n");
+	printf("\n                               Caller  Avg Wait    Count   Max Wait   Total Wait\n");
 }
 
 static void print_acq_stat(struct ksyms *ksyms, struct stack_stat *ss,
@@ -471,6 +476,13 @@
 
 static struct sigaction sigact = {.sa_handler = sig_hand};
 
+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+	if (level == LIBBPF_DEBUG && !env.verbose)
+		return 0;
+	return vfprintf(stderr, format, args);
+}
+
 int main(int argc, char **argv)
 {
 	static const struct argp argp = {
@@ -494,6 +506,7 @@
 	sigaction(SIGINT, &sigact, 0);
 
 	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
+	libbpf_set_print(libbpf_print_fn);
 
 	ksyms = ksyms__load();
 	if (!ksyms) {
@@ -521,6 +534,21 @@
 	obj->rodata->targ_pid = env.tid;
 	obj->rodata->targ_lock = lock_addr;
 
+	if (fentry_can_attach("mutex_lock_nested", NULL)) {
+		bpf_program__set_attach_target(obj->progs.mutex_lock, 0,
+					       "mutex_lock_nested");
+		bpf_program__set_attach_target(obj->progs.mutex_lock_exit, 0,
+					       "mutex_lock_nested");
+		bpf_program__set_attach_target(obj->progs.mutex_lock_interruptible, 0,
+					       "mutex_lock_interruptible_nested");
+		bpf_program__set_attach_target(obj->progs.mutex_lock_interruptible_exit, 0,
+					       "mutex_lock_interruptible_nested");
+		bpf_program__set_attach_target(obj->progs.mutex_lock_killable, 0,
+					       "mutex_lock_killable_nested");
+		bpf_program__set_attach_target(obj->progs.mutex_lock_killable_exit, 0,
+					       "mutex_lock_killable_nested");
+	}
+
 	err = klockstat_bpf__load(obj);
 	if (err) {
 		warn("failed to load BPF object\n");
diff --git a/libbpf-tools/oomkill.bpf.c b/libbpf-tools/oomkill.bpf.c
new file mode 100644
index 0000000..2586683
--- /dev/null
+++ b/libbpf-tools/oomkill.bpf.c
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2022 Jingxiang Zeng
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_tracing.h>
+
+#include "oomkill.h"
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+	__uint(key_size, sizeof(u32));
+	__uint(value_size, sizeof(u32));
+} events SEC(".maps");
+
+SEC("kprobe/oom_kill_process")
+int BPF_KPROBE(oom_kill_process, struct oom_control *oc, const char *message)
+{
+	struct data_t data;
+
+	data.fpid = bpf_get_current_pid_tgid() >> 32;
+	data.tpid = BPF_CORE_READ(oc, chosen, tgid);
+	data.pages = BPF_CORE_READ(oc, totalpages);
+	bpf_get_current_comm(&data.fcomm, sizeof(data.fcomm));
+	bpf_probe_read_kernel(&data.tcomm, sizeof(data.tcomm), BPF_CORE_READ(oc, chosen, comm));
+	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &data, sizeof(data));
+	return 0;
+}
+
+char LICENSE[] SEC("license") = "GPL";
diff --git a/libbpf-tools/oomkill.c b/libbpf-tools/oomkill.c
new file mode 100644
index 0000000..92976b8
--- /dev/null
+++ b/libbpf-tools/oomkill.c
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+// Copyright (c) 2022 Jingxiang Zeng
+//
+// Based on oomkill(8) from BCC by Brendan Gregg.
+// 13-Jan-2022   Jingxiang Zeng   Created this.
+#include <argp.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include "oomkill.skel.h"
+#include "oomkill.h"
+#include "trace_helpers.h"
+
+#define PERF_POLL_TIMEOUT_MS	100
+
+static volatile sig_atomic_t exiting = 0;
+
+static bool verbose = false;
+
+const char *argp_program_version = "oomkill 0.1";
+const char *argp_program_bug_address =
+	"https://github.com/iovisor/bcc/tree/master/libbpf-tools";
+const char argp_program_doc[] =
+"Trace OOM kills.\n"
+"\n"
+"USAGE: oomkill [-h]\n"
+"\n"
+"EXAMPLES:\n"
+"    oomkill               # trace OOM kills\n";
+
+static const struct argp_option opts[] = {
+	{ "verbose", 'v', NULL, 0, "Verbose debug output" },
+	{ NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" },
+	{},
+};
+
+static error_t parse_arg(int key, char *arg, struct argp_state *state)
+{
+	switch (key) {
+	case 'v':
+		verbose = true;
+		break;
+	case 'h':
+		argp_state_help(state, stderr, ARGP_HELP_STD_HELP);
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+	return 0;
+}
+
+static void handle_event(void *ctx, int cpu, void *data, __u32 data_sz)
+{
+	FILE *f;
+	char buf[256];
+	int n = 0;
+	struct tm *tm;
+	char ts[32];
+	time_t t;
+	struct data_t *e = data;
+
+	f = fopen("/proc/loadavg", "r");
+	if (f) {
+		memset(buf, 0, sizeof(buf));
+		n = fread(buf, 1, sizeof(buf) - 1, f);	/* keep the last byte as NUL for %s */
+		fclose(f);
+	}
+	time(&t);
+	tm = localtime(&t);
+	strftime(ts, sizeof(ts), "%H:%M:%S", tm);
+
+	if (n)	/* loadavg is appended only when something was read */
+		printf("%s Triggered by PID %d (\"%s\"), OOM kill of PID %d (\"%s\"), %llu pages, loadavg: %s\n",
+			ts, e->fpid, e->fcomm, e->tpid, e->tcomm, e->pages, buf);
+	else
+		printf("%s Triggered by PID %d (\"%s\"), OOM kill of PID %d (\"%s\"), %llu pages\n",
+			ts, e->fpid, e->fcomm, e->tpid, e->tcomm, e->pages);
+}
+
+static void handle_lost_events(void *ctx, int cpu, __u64 lost_cnt)
+{
+	printf("Lost %llu events on CPU #%d!\n", lost_cnt, cpu);
+}
+
+static void sig_int(int signo)
+{
+	exiting = 1;
+}
+
+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+	if (level == LIBBPF_DEBUG && !verbose)
+		return 0;
+	return vfprintf(stderr, format, args);
+}
+
+int main(int argc, char **argv)
+{
+	static const struct argp argp = {
+		.options = opts,
+		.parser = parse_arg,
+		.doc = argp_program_doc,
+	};
+	struct perf_buffer *pb = NULL;
+	struct oomkill_bpf *obj;
+	int err;
+
+	err = argp_parse(&argp, argc, argv, 0, NULL, NULL);
+	if (err)
+		return err;
+
+	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
+	libbpf_set_print(libbpf_print_fn);
+
+	obj = oomkill_bpf__open_and_load();
+	if (!obj) {
+		fprintf(stderr, "failed to load and open BPF object\n");
+		return 1;
+	}
+
+	err = oomkill_bpf__attach(obj);
+	if (err) {
+		fprintf(stderr, "failed to attach BPF programs\n");
+		goto cleanup;
+	}
+
+	pb = perf_buffer__new(bpf_map__fd(obj->maps.events), 64,
+			      handle_event, handle_lost_events, NULL, NULL);
+	if (!pb) {
+		err = -errno;
+		fprintf(stderr, "failed to open perf buffer: %d\n", err);
+		goto cleanup;
+	}
+
+	if (signal(SIGINT, sig_int) == SIG_ERR) {
+		fprintf(stderr, "can't set signal handler: %s\n", strerror(errno));
+		err = 1;
+		goto cleanup;
+	}
+
+	printf("Tracing OOM kills... Ctrl-C to stop.\n");
+
+	while (!exiting) {
+		err = perf_buffer__poll(pb, PERF_POLL_TIMEOUT_MS);
+		if (err < 0 && err != -EINTR) {
+			fprintf(stderr, "error polling perf buffer: %d\n", err);
+			goto cleanup;
+		}
+		/* reset err to return 0 if exiting */
+		err = 0;
+	}
+
+cleanup:
+	perf_buffer__free(pb);
+	oomkill_bpf__destroy(obj);
+
+	return err != 0;
+}
diff --git a/libbpf-tools/oomkill.h b/libbpf-tools/oomkill.h
new file mode 100644
index 0000000..086099d
--- /dev/null
+++ b/libbpf-tools/oomkill.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+#ifndef __OOMKILL_H
+#define __OOMKILL_H
+
+#define TASK_COMM_LEN 16
+
+struct data_t {
+	__u32 fpid;
+	__u32 tpid;
+	__u64 pages;
+	char fcomm[TASK_COMM_LEN];
+	char tcomm[TASK_COMM_LEN];
+};
+
+#endif /* __OOMKILL_H */
diff --git a/libbpf-tools/opensnoop.bpf.c b/libbpf-tools/opensnoop.bpf.c
index e378dcc..e28131a 100644
--- a/libbpf-tools/opensnoop.bpf.c
+++ b/libbpf-tools/opensnoop.bpf.c
@@ -5,9 +5,6 @@
 #include <bpf/bpf_helpers.h>
 #include "opensnoop.h"
 
-#define TASK_RUNNING	0
-
-const volatile __u64 min_us = 0;
 const volatile pid_t targ_pid = 0;
 const volatile pid_t targ_tgid = 0;
 const volatile uid_t targ_uid = 0;
diff --git a/libbpf-tools/runqlen.c b/libbpf-tools/runqlen.c
index 9cbbc73..8c77693 100644
--- a/libbpf-tools/runqlen.c
+++ b/libbpf-tools/runqlen.c
@@ -29,7 +29,7 @@
 	bool runqocc;
 	bool timestamp;
 	time_t interval;
-	bool freq;
+	int freq;
 	int times;
 	bool verbose;
 } env = {
@@ -171,12 +171,13 @@
 
 static void print_runq_occupancy(struct runqlen_bpf__bss *bss)
 {
-	__u64 samples, idle = 0, queued = 0;
 	struct hist hist;
 	int slot, i = 0;
 	float runqocc;
 
 	do {
+		__u64 samples, idle = 0, queued = 0;
+
 		hist = bss->hists[i];
 		bss->hists[i] = zero;
 		for (slot = 0; slot < MAX_SLOTS; slot++) {
diff --git a/libbpf-tools/softirqs.c b/libbpf-tools/softirqs.c
index 34cfdb7..833bc1a 100644
--- a/libbpf-tools/softirqs.c
+++ b/libbpf-tools/softirqs.c
@@ -39,10 +39,10 @@
 "USAGE: softirqs [--help] [-T] [-N] [-d] [interval] [count]\n"
 "\n"
 "EXAMPLES:\n"
-"    softirqss            # sum soft irq event time\n"
-"    softirqss -d         # show soft irq event time as histograms\n"
-"    softirqss 1 10       # print 1 second summaries, 10 times\n"
-"    softirqss -NT 1      # 1s summaries, nanoseconds, and timestamps\n";
+"    softirqs            # sum soft irq event time\n"
+"    softirqs -d         # show soft irq event time as histograms\n"
+"    softirqs 1 10       # print 1 second summaries, 10 times\n"
+"    softirqs -NT 1      # 1s summaries, nanoseconds, and timestamps\n";
 
 static const struct argp_option opts[] = {
 	{ "distributed", 'd', NULL, 0, "Show distributions as histograms" },
diff --git a/libbpf-tools/solisten.c b/libbpf-tools/solisten.c
index adaa668..02f1ee5 100644
--- a/libbpf-tools/solisten.c
+++ b/libbpf-tools/solisten.c
@@ -156,7 +156,7 @@
 
 	obj->rodata->target_pid = target_pid;
 
-	if (fentry_exists("inet_listen", NULL)) {
+	if (fentry_can_attach("inet_listen", NULL)) {
 		bpf_program__set_autoload(obj->progs.inet_listen_entry, false);
 		bpf_program__set_autoload(obj->progs.inet_listen_exit, false);
 	} else {
diff --git a/libbpf-tools/tcprtt.c b/libbpf-tools/tcprtt.c
index bed6efa..cfc0ed5 100644
--- a/libbpf-tools/tcprtt.c
+++ b/libbpf-tools/tcprtt.c
@@ -243,7 +243,7 @@
 	obj->rodata->targ_daddr = env.raddr;
 	obj->rodata->targ_ms = env.milliseconds;
 
-	if (fentry_exists("tcp_rcv_established", NULL))
+	if (fentry_can_attach("tcp_rcv_established", NULL))
 		bpf_program__set_autoload(obj->progs.tcp_rcv_kprobe, false);
 	else
 		bpf_program__set_autoload(obj->progs.tcp_rcv, false);
diff --git a/libbpf-tools/tcpsynbl.bpf.c b/libbpf-tools/tcpsynbl.bpf.c
new file mode 100644
index 0000000..c7d47fa
--- /dev/null
+++ b/libbpf-tools/tcpsynbl.bpf.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2021 Yaqi Chen
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_endian.h>
+#include "tcpsynbl.h"
+#include "bits.bpf.h"
+#include "maps.bpf.h"
+
+#define MAX_ENTRIES 65536
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, MAX_ENTRIES);
+	__type(key, u64);
+	__type(value, struct hist);
+} hists SEC(".maps");
+
+static struct hist zero;
+
+static int do_entry(struct sock *sk)
+{
+	u64 max_backlog, backlog, slot;
+	struct hist *histp;
+
+	max_backlog = BPF_CORE_READ(sk, sk_max_ack_backlog);
+	backlog = BPF_CORE_READ(sk, sk_ack_backlog);
+	histp = bpf_map_lookup_or_try_init(&hists, &max_backlog, &zero);
+	if (!histp)
+		return 0;
+
+	slot = log2l(backlog);
+	if (slot >= MAX_SLOTS)
+		slot = MAX_SLOTS - 1;
+	__sync_fetch_and_add(&histp->slots[slot], 1);
+	return 0;
+}
+
+
+SEC("kprobe/tcp_v4_syn_recv_sock")
+int BPF_KPROBE(tcp_v4_syn_recv_kprobe, struct sock *sk)
+{
+	return do_entry(sk);
+}
+
+SEC("kprobe/tcp_v6_syn_recv_sock")
+int BPF_KPROBE(tcp_v6_syn_recv_kprobe, struct sock *sk)
+{
+	return do_entry(sk);
+}
+
+SEC("fentry/tcp_v4_syn_recv_sock")
+int BPF_PROG(tcp_v4_syn_recv, struct sock *sk)
+{
+	return do_entry(sk);
+}
+
+SEC("fentry/tcp_v6_syn_recv_sock")
+int BPF_PROG(tcp_v6_syn_recv, struct sock *sk)
+{
+	return do_entry(sk);
+}
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
diff --git a/libbpf-tools/tcpsynbl.c b/libbpf-tools/tcpsynbl.c
new file mode 100644
index 0000000..188a2af
--- /dev/null
+++ b/libbpf-tools/tcpsynbl.c
@@ -0,0 +1,250 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+// Copyright (c) 2021 Yaqi Chen
+//
+// Based on tcpsynbl(8) from BCC by Brendan Gregg.
+// 19-Dec-2021   Yaqi Chen   Created this.
+#include <argp.h>
+#include <stdio.h>
+#include <signal.h>
+#include <unistd.h>
+#include <time.h>
+#include <bpf/libbpf.h>
+#include <bpf/bpf.h>
+#include "tcpsynbl.h"
+#include "tcpsynbl.skel.h"
+#include "trace_helpers.h"
+
+static struct env {
+	bool ipv4;
+	bool ipv6;
+	time_t interval;
+	int times;
+	bool timestamp;
+	bool verbose;
+} env = {
+	.interval = 99999999,
+	.times = 99999999,
+};
+
+static volatile sig_atomic_t exiting = 0;
+
+const char *argp_program_version = "tcpsynbl 0.1";
+const char *argp_program_bug_address =
+	"https://github.com/iovisor/bcc/tree/master/libbpf-tools";
+const char argp_program_doc[] =
+"Summarize TCP SYN backlog as a histogram.\n"
+"\n"
+"USAGE: tcpsynbl [--help] [-T] [-4] [-6] [interval] [count]\n"
+"\n"
+"EXAMPLES:\n"
+"    tcpsynbl              # summarize TCP SYN backlog as a histogram\n"
+"    tcpsynbl 1 10         # print 1 second summaries, 10 times\n"
+"    tcpsynbl -T 1         # 1s summaries with timestamps\n"
+"    tcpsynbl -4           # trace IPv4 family only\n"
+"    tcpsynbl -6           # trace IPv6 family only\n";
+
+
+static const struct argp_option opts[] = {
+	{ "timestamp", 'T', NULL, 0, "Include timestamp on output" },
+	{ "ipv4", '4', NULL, 0, "Trace IPv4 family only" },
+	{ "ipv6", '6', NULL, 0, "Trace IPv6 family only" },
+	{ "verbose", 'v', NULL, 0, "Verbose debug output" },
+	{ NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" },
+	{},
+};
+
+
+static error_t parse_arg(int key, char *arg, struct argp_state *state)
+{
+	static int pos_args;	/* number of positional arguments consumed so far */
+
+	switch (key) {
+	case 'h':
+		argp_state_help(state, stderr, ARGP_HELP_STD_HELP);
+		break;
+	case 'v':
+		env.verbose = true;
+		break;
+	case 'T':
+		env.timestamp = true;
+		break;
+	case '4':
+		env.ipv4 = true;
+		break;
+	case '6':
+		env.ipv6 = true;
+		break;
+	case ARGP_KEY_ARG:
+		errno = 0;
+		if (pos_args == 0) {	/* first positional: interval */
+			env.interval = strtol(arg, NULL, 10);
+			if (errno) {
+				fprintf(stderr, "invalid interval\n");
+				argp_usage(state);
+			}
+		} else if (pos_args == 1) {	/* second positional: count */
+			env.times = strtol(arg, NULL, 10);
+			if (errno) {
+				fprintf(stderr, "invalid times\n");
+				argp_usage(state);
+			}
+		} else {
+			fprintf(stderr,
+				"unrecognized positional argument: %s\n", arg);
+			argp_usage(state);
+		}
+		pos_args++;
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+	return 0;
+}
+
+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+	if (level == LIBBPF_DEBUG && !env.verbose)
+		return 0;
+	return vfprintf(stderr, format, args);
+}
+
+static void sig_handler(int sig)
+{
+	exiting = true;
+}
+
+static void disable_all_progs(struct tcpsynbl_bpf *obj)
+{
+	bpf_program__set_autoload(obj->progs.tcp_v4_syn_recv_kprobe, false);
+	bpf_program__set_autoload(obj->progs.tcp_v6_syn_recv_kprobe, false);
+	bpf_program__set_autoload(obj->progs.tcp_v4_syn_recv, false);
+	bpf_program__set_autoload(obj->progs.tcp_v6_syn_recv, false);
+}
+
+static void set_autoload_prog(struct tcpsynbl_bpf *obj, int version)
+{
+	if (version == 4) {
+		if (fentry_can_attach("tcp_v4_syn_recv_sock", NULL))
+			bpf_program__set_autoload(obj->progs.tcp_v4_syn_recv, true);
+		else
+			bpf_program__set_autoload(obj->progs.tcp_v4_syn_recv_kprobe, true);
+	}
+
+	if (version == 6){
+		if (fentry_can_attach("tcp_v6_syn_recv_sock", NULL))
+			bpf_program__set_autoload(obj->progs.tcp_v6_syn_recv, true);
+		else
+			bpf_program__set_autoload(obj->progs.tcp_v6_syn_recv_kprobe, true);
+	}
+}
+
+static int print_log2_hists(int fd)
+{
+	__u64 lookup_key = -1, next_key;
+	struct hist hist;
+	int err;
+
+	while (!bpf_map_get_next_key(fd, &lookup_key, &next_key)) {	/* pass 1: print every histogram */
+		err = bpf_map_lookup_elem(fd, &next_key, &hist);
+		if (err < 0) {
+			fprintf(stderr, "failed to lookup hist: %d\n", err);
+			return -1;
+		}
+		printf("backlog_max = %llu\n", next_key);
+		print_log2_hist(hist.slots, MAX_SLOTS, "backlog");
+		lookup_key = next_key;
+	}
+
+	lookup_key = -1;
+	while (!bpf_map_get_next_key(fd, &lookup_key, &next_key)) {	/* pass 2: delete every entry */
+		err = bpf_map_delete_elem(fd, &next_key);
+		if (err < 0) {
+			fprintf(stderr, "failed to cleanup hist : %d\n", err);
+			return -1;
+		}
+		lookup_key = next_key;
+	}
+
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	static const struct argp argp = {
+		.options = opts,
+		.parser = parse_arg,
+		.doc = argp_program_doc
+	};
+
+	struct tcpsynbl_bpf *obj;
+	struct tm *tm;
+	char ts[32];
+	time_t t;
+	int err, map_fd;
+
+	err = argp_parse(&argp, argc, argv, 0, NULL, NULL);
+	if (err)
+		return err;
+
+	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
+	libbpf_set_print(libbpf_print_fn);
+
+	obj = tcpsynbl_bpf__open();
+	if (!obj) {
+		fprintf(stderr, "failed to open BPF object\n");
+		return 1;
+	}
+
+	disable_all_progs(obj);
+
+	if (env.ipv4) {
+		set_autoload_prog(obj, 4);
+	} else if (env.ipv6) {
+		set_autoload_prog(obj, 6);
+	} else {
+		set_autoload_prog(obj, 4);
+		set_autoload_prog(obj, 6);
+	}
+
+	err = tcpsynbl_bpf__load(obj);
+	if (err) {
+		fprintf(stderr, "failed to load BPF object: %d\n", err);
+		goto cleanup;
+	}
+
+	err = tcpsynbl_bpf__attach(obj);
+	if (err) {
+		fprintf(stderr, "failed to attach BPF programs\n");
+		goto cleanup;
+	}
+
+	map_fd= bpf_map__fd(obj->maps.hists);
+
+	signal(SIGINT, sig_handler);
+
+	printf("Tracing SYN backlog size. Ctrl-C to end.\n");
+
+	/* main: poll */
+	while (1) {
+		sleep(env.interval);
+		printf("\n");
+
+		if (env.timestamp) {
+			time(&t);
+			tm = localtime(&t);
+			strftime(ts, sizeof(ts), "%H:%M:%S", tm);
+			printf("%-8s\n", ts);
+		}
+
+		err = print_log2_hists(map_fd);
+		if (err)
+			break;
+
+		if (exiting || --env.times == 0)
+			break;
+	}
+
+cleanup:
+	tcpsynbl_bpf__destroy(obj);
+	return err != 0;
+}
diff --git a/libbpf-tools/tcpsynbl.h b/libbpf-tools/tcpsynbl.h
new file mode 100644
index 0000000..6c22abb
--- /dev/null
+++ b/libbpf-tools/tcpsynbl.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+#ifndef __TCPSYNBL_H
+#define __TCPSYNBL_H
+
+#define MAX_SLOTS 32
+
+struct hist {
+	__u32 slots[MAX_SLOTS];
+};
+
+#endif /* __TCPSYNBL_H */
diff --git a/libbpf-tools/trace_helpers.c b/libbpf-tools/trace_helpers.c
index 322b3c4..9165be4 100644
--- a/libbpf-tools/trace_helpers.c
+++ b/libbpf-tools/trace_helpers.c
@@ -15,6 +15,7 @@
 #include <fcntl.h>
 #include <sys/resource.h>
 #include <time.h>
+#include <bpf/bpf.h>
 #include <bpf/btf.h>
 #include <bpf/libbpf.h>
 #include <limits.h>
@@ -990,14 +991,33 @@
 	return found;
 }
 
-bool fentry_exists(const char *name, const char *mod)
+static bool fentry_try_attach(int id)
+{
+	struct bpf_insn insns[] = {	/* "r0 = 0; exit"; load needs R0 set and a non-NULL license */
+		{ .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0, .imm = 0 },
+		{ .code = BPF_JMP | BPF_EXIT },
+	};
+	LIBBPF_OPTS(bpf_prog_load_opts, opts);
+	int prog_fd, attach_fd;
+
+	opts.expected_attach_type = BPF_TRACE_FENTRY;
+	opts.attach_btf_id = id;
+	prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACING, "test", "GPL", insns, 2, &opts);
+	if (prog_fd < 0)
+		return false;
+	attach_fd = bpf_raw_tracepoint_open(NULL, prog_fd);	/* trial attach; closed right away */
+	if (attach_fd >= 0)
+		close(attach_fd);
+	close(prog_fd);
+	return attach_fd >= 0;
+}
+
+bool fentry_can_attach(const char *name, const char *mod)
 {
 	const char sysfs_vmlinux[] = "/sys/kernel/btf/vmlinux";
 	struct btf *base, *btf = NULL;
-	const struct btf_type *type;
-	const struct btf_enum *e;
 	char sysfs_mod[80];
-	int id = -1, i, err;
+	int id = -1, err;
 
 	base = btf__parse(sysfs_vmlinux, NULL);
 	if (!base) {
@@ -1021,28 +1041,12 @@
 		base = NULL;
 	}
 
-	id = btf__find_by_name_kind(btf, "bpf_attach_type", BTF_KIND_ENUM);
-	if (id < 0)
-		goto err_out;
-	type = btf__type_by_id(btf, id);
-
-	/*
-         * As kernel BTF is exposed starting from 5.4 kernel, but fentry/fexit
-         * is actually supported starting from 5.5, so that's check this gap
-         * first, then check if target func has btf type.
-	 */
-	for (id = -1, i = 0, e = btf_enum(type); i < btf_vlen(type); i++, e++) {
-		if (!strcmp(btf__name_by_offset(btf, e->name_off),
-			    "BPF_TRACE_FENTRY")) {
-			id = btf__find_by_name_kind(btf, name, BTF_KIND_FUNC);
-			break;
-		}
-	}
+	id = btf__find_by_name_kind(btf, name, BTF_KIND_FUNC);
 
 err_out:
 	btf__free(btf);
 	btf__free(base);
-	return id > 0;
+	return id > 0 && fentry_try_attach(id);
 }
 
 bool kprobe_exists(const char *name)
diff --git a/libbpf-tools/trace_helpers.h b/libbpf-tools/trace_helpers.h
index 98fd640..d68d246 100644
--- a/libbpf-tools/trace_helpers.h
+++ b/libbpf-tools/trace_helpers.h
@@ -77,7 +77,7 @@
  * *mod* is a hint that indicates the *name* may reside in module BTF,
  * if NULL, it means *name* belongs to vmlinux.
  */
-bool fentry_exists(const char *name, const char *mod);
+bool fentry_can_attach(const char *name, const char *mod);
 
 /*
  * The name of a kernel function to be attached to may be changed between
diff --git a/libbpf-tools/vfsstat.c b/libbpf-tools/vfsstat.c
index 5519c36..3cba0b0 100644
--- a/libbpf-tools/vfsstat.c
+++ b/libbpf-tools/vfsstat.c
@@ -160,7 +160,7 @@
 	}
 
 	/* It fallbacks to kprobes when kernel does not support fentry. */
-	if (vmlinux_btf_exists() && fentry_exists("vfs_read", NULL)) {
+	if (vmlinux_btf_exists() && fentry_can_attach("vfs_read", NULL)) {
 		bpf_program__set_autoload(skel->progs.kprobe_vfs_read, false);
 		bpf_program__set_autoload(skel->progs.kprobe_vfs_write, false);
 		bpf_program__set_autoload(skel->progs.kprobe_vfs_fsync, false);
diff --git a/man/man8/biopattern.8 b/man/man8/biopattern.8
new file mode 100644
index 0000000..451d667
--- /dev/null
+++ b/man/man8/biopattern.8
@@ -0,0 +1,78 @@
+.TH biopattern 8  "2022-02-21" "USER COMMANDS"
+.SH NAME
+biopattern \- Identify random/sequential disk access patterns.
+.SH SYNOPSIS
+.B biopattern [\-h] [\-d DISK] [interval] [count]
+.SH DESCRIPTION
+This traces block device I/O (disk I/O), and prints ratio of random/sequential I/O
+for each disk or the specified disk either on Ctrl-C, or after a given interval in seconds.
+
+This works by tracing kernel tracepoint block:block_rq_complete.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Show help message and exit.
+.TP
+\-d
+Trace this disk only.
+.TP
+interval
+Print output every interval seconds, if any.
+.TP
+count
+Number of interval summaries.
+.SH EXAMPLES
+.TP
+Trace access patterns of all disks, and print a summary on Ctrl-C:
+#
+.B biopattern
+.TP
+Trace disk sdb only:
+#
+.B biopattern -d sdb
+.TP
+Print 1 second summaries, 10 times:
+#
+.B biopattern 1 10
+.SH FIELDS
+.TP
+TIME
+Time of the output, in HH:MM:SS format.
+.TP
+DISK
+Disk device name.
+.TP
+%RND
+Ratio of random I/O.
+.TP
+%SEQ
+Ratio of sequential I/O.
+.TP
+COUNT
+Number of I/O during the interval.
+.TP
+KBYTES
+Total Kbytes for these I/O, during the interval.
+.SH OVERHEAD
+Since block device I/O usually has a relatively low frequency (< 10,000/s),
+the overhead for this tool is expected to be low or negligible. For high IOPS
+storage systems, test and quantify before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Rocky Xing
+.SH SEE ALSO
+biosnoop(8), biolatency(8), iostat(1)
diff --git a/man/man8/biotop.8 b/man/man8/biotop.8
index ed25521..47392bc 100644
--- a/man/man8/biotop.8
+++ b/man/man8/biotop.8
@@ -2,7 +2,7 @@
 .SH NAME
 biotop \- Block device (disk) I/O by process top.
 .SH SYNOPSIS
-.B biotop [\-h] [\-C] [\-r MAXROWS] [interval] [count]
+.B biotop [\-h] [\-C] [\-r MAXROWS] [\-p PID] [interval] [count]
 .SH DESCRIPTION
 This is top for disks. 
 
@@ -30,6 +30,9 @@
 \-r MAXROWS
 Maximum number of rows to print. Default is 20.
 .TP
+\-p PID
+Trace this PID only.
+.TP
 interval
 Interval between updates, seconds.
 .TP
@@ -98,7 +101,7 @@
 .SH STABILITY
 Unstable - in development.
 .SH AUTHOR
-Brendan Gregg
+Brendan Gregg, Rocky Xing
 .SH INSPIRATION
 top(1) by William LeFebvre
 .SH SEE ALSO
diff --git a/man/man8/cachetop.8 b/man/man8/cachetop.8
index 5642fa1..f6d1ea3 100644
--- a/man/man8/cachetop.8
+++ b/man/man8/cachetop.8
@@ -2,7 +2,7 @@
 .SH NAME
 cachetop \- Statistics for linux page cache hit/miss ratios per processes. Uses Linux eBPF/bcc.
 .SH SYNOPSIS
-.B cachetop
+.B cachetop [\-p PID]
 [interval]
 .SH DESCRIPTION
 This traces four kernel functions and prints per-processes summaries every
@@ -15,6 +15,10 @@
 customize which functions are traced.
 
 Since this uses BPF, only the root user can use this tool.
+.SH OPTIONS
+.TP
+\-p PID
+Trace this PID only.
 .SH KEYBINDINGS
 The following keybindings can be used to control the output of \fBcachetop\fR.
 .TP
@@ -86,6 +90,6 @@
 .SH STABILITY
 Unstable - in development.
 .SH AUTHOR
-Emmanuel Bretelle
+Emmanuel Bretelle, Rocky Xing
 .SH SEE ALSO
 cachestat (8)
diff --git a/man/man8/cpudist.8 b/man/man8/cpudist.8
index b517910..b59346b 100644
--- a/man/man8/cpudist.8
+++ b/man/man8/cpudist.8
@@ -2,7 +2,7 @@
 .SH NAME
 cpudist \- On- and off-CPU task time as a histogram.
 .SH SYNOPSIS
-.B cpudist [\-h] [-O] [\-T] [\-m] [\-P] [\-L] [\-p PID] [interval] [count]
+.B cpudist [\-h] [-O] [\-T] [\-m] [\-P] [\-L] [\-p PID] [\-I] [interval] [count]
 .SH DESCRIPTION
 This measures the time a task spends on the CPU before being descheduled, and
 shows the times as a histogram. Tasks that spend a very short time on the CPU
@@ -15,6 +15,8 @@
 operations, or alternatively very short descheduling times due to short-lived
 locks or timers.
 
+By default, CPU idle time is excluded by simply excluding PID 0.
+
 This tool uses in-kernel eBPF maps for storing timestamps and the histogram,
 for efficiency. Despite this, the overhead of this tool may become significant
 for some workloads: see the OVERHEAD section.
@@ -45,6 +47,9 @@
 \-p PID
 Only show this PID (filtered in kernel for efficiency).
 .TP
+\-I
+Include CPU idle time (by default it is excluded).
+.TP
 interval
 Output interval, in seconds.
 .TP
@@ -71,6 +76,10 @@
 Trace PID 185 only, 1 second summaries:
 #
 .B cpudist -p 185 1
+.TP
+Include CPU idle time:
+#
+.B cpudist -I
 .SH FIELDS
 .TP
 usecs
diff --git a/man/man8/hardirqs.8 b/man/man8/hardirqs.8
index 12ae6be..aa9afb8 100644
--- a/man/man8/hardirqs.8
+++ b/man/man8/hardirqs.8
@@ -33,6 +33,9 @@
 .TP
 \-d
 Show IRQ time distribution as histograms.
+.TP
+\-c CPU
+Trace on this CPU only.
 .SH EXAMPLES
 .TP
 Sum hard IRQ event time until Ctrl-C:
@@ -50,6 +53,10 @@
 1 second summaries, printed in nanoseconds, with timestamps:
 #
 .B hardirqs \-NT 1
+.TP
+Sum hard IRQ event time on CPU 1 until Ctrl-C:
+#
+.B hardirqs \-c 1
 .SH FIELDS
 .TP
 HARDIRQ
@@ -91,6 +98,6 @@
 .SH STABILITY
 Unstable - in development.
 .SH AUTHOR
-Brendan Gregg, Hengqi Chen
+Brendan Gregg, Hengqi Chen, Rocky Xing
 .SH SEE ALSO
 softirqs(8)
diff --git a/man/man8/softirqs.8 b/man/man8/softirqs.8
index a9a1441..fa475f7 100644
--- a/man/man8/softirqs.8
+++ b/man/man8/softirqs.8
@@ -2,7 +2,7 @@
 .SH NAME
 softirqs \- Measure soft IRQ (soft interrupt) event time. Uses Linux eBPF/bcc.
 .SH SYNOPSIS
-.B softirqs [\-h] [\-T] [\-N] [\-d] [interval] [count]
+.B softirqs [\-h] [\-T] [\-N] [\-C] [\-d] [\-c CPU] [interval] [count]
 .SH DESCRIPTION
 This summarizes the time spent servicing soft IRQs (soft interrupts), and can
 show this time as either totals or histogram distributions. A system-wide
@@ -26,16 +26,26 @@
 Include timestamps on output.
 .TP
 \-N
-Output in nanoseconds
+Output in nanoseconds.
+.TP
+\-C
+Show the number of soft irq events.
 .TP
 \-d
-Show IRQ time distribution as histograms
+Show IRQ time distribution as histograms.
+.TP
+\-c CPU
+Trace on this CPU only.
 .SH EXAMPLES
 .TP
 Sum soft IRQ event time until Ctrl-C:
 #
 .B softirqs
 .TP
+Show the number of soft irq events:
+#
+.B softirqs \-C
+.TP
 Show soft IRQ event time as histograms:
 #
 .B softirqs \-d
@@ -47,6 +57,10 @@
 1 second summaries, printed in nanoseconds, with timestamps:
 #
 .B softirqs \-NT 1
+.TP
+Sum soft IRQ event time on CPU 1 until Ctrl-C:
+#
+.B softirqs \-c 1
 .SH FIELDS
 .TP
 SOFTIRQ
@@ -88,6 +102,6 @@
 .SH STABILITY
 Unstable - in development.
 .SH AUTHORS
-Brendan Gregg, Sasha Goldshtein
+Brendan Gregg, Sasha Goldshtein, Rocky Xing
 .SH SEE ALSO
 hardirqs(8)
diff --git a/man/man8/sslsniff.8 b/man/man8/sslsniff.8
index df81664..4b80191 100644
--- a/man/man8/sslsniff.8
+++ b/man/man8/sslsniff.8
@@ -3,7 +3,8 @@
 sslsniff \- Print data passed to OpenSSL, GnuTLS or NSS. Uses Linux eBPF/bcc.
 .SH SYNOPSIS
 .B sslsniff [-h] [-p PID] [-u UID] [-x] [-c COMM] [-o] [-g] [-n] [-d]
-.B [--hexdump] [--max-buffer-size SIZE]
+.B [--hexdump] [--max-buffer-size SIZE] [-l] [--handshake]
+.B [--extra-lib EXTRA_LIB]
 .SH DESCRIPTION
 sslsniff prints data sent to write/send and read/recv functions of
 OpenSSL, GnuTLS and NSS, allowing us to read plain text content before
@@ -46,6 +47,16 @@
 \-\-max-buffer-size SIZE
 Sets maximum buffer size of intercepted data. Longer values would be truncated.
 Default value is 8 Kib, maximum possible value is a bit less than 32 Kib.
+.TP
+\-l, \-\-latency
+Show function latency in ms.
+.TP
+\-\-handshake
+Show handshake latency, enabled only if latency option is on.
+.TP
+\-\-extra-lib EXTRA_LIB
+Consists of the library type and the library path, separated by a colon.
+Supported library types are: openssl, gnutls, nss. Can be specified multiple times.
 .SH EXAMPLES
 .TP
 Print all calls to SSL write/send and read/recv system-wide:
@@ -55,6 +66,14 @@
 Print only OpenSSL calls issued by user with UID 1000
 #
 .B sslsniff -u 1000 --no-nss --no-gnutls
+.TP
+Print SSL handshake event and latency for all traced functions:
+#
+.B sslsniff -l --handshake
+.TP
+Print only calls to OpenSSL from /some/path/libssl.so
+.B sslsniff --no-openssl --no-gnutls --no-nss --extra-lib
+.B openssl:/some/path/libssl.so
 .SH FIELDS
 .TP
 FUNC
@@ -77,6 +96,9 @@
 .TP
 TID
 Thread ID, displayed only if launched with -x.
+.TP
+LAT(ms)
+Function latency in ms.
 .SH SOURCE
 This is from bcc.
 .IP
diff --git a/man/man8/tcpcong.8 b/man/man8/tcpcong.8
new file mode 100644
index 0000000..877ed80
--- /dev/null
+++ b/man/man8/tcpcong.8
@@ -0,0 +1,136 @@
+.TH tcpcong 8  "2022-01-27" "USER COMMANDS"
+.SH NAME
+tcpcong \- Measure tcp congestion state duration. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B tcpcong [\-h] [\-T] [\-L] [\-R] [\-u] [\-d] [interval] [outputs]
+.SH DESCRIPTION
+This tool measures tcp sockets congestion control status duration, and
+prints a summary of tcp congestion state durations along with the number
+of total state changes.
+  
+It uses dynamic tracing of kernel tcp congestion control status 
+updating functions,  and will need to be updated to match kernel changes.
+
+The traced functions are only called when there is a congestion state update,
+and therefore have low overhead. We also use a BPF map to store traced data
+to reduce overhead. See the OVERHEAD section for more details.
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-h
+Print usage message.
+.TP
+\-T
+Include a timestamp column.
+.TP
+\-L
+Specify local tcp port range.
+.TP
+\-R
+Specify remote tcp port range.
+.TP
+\-u
+Output in microseconds.
+.TP
+\-d
+Show congestion status duration distribution as histograms.
+.SH EXAMPLES
+.TP
+Show all tcp sockets congestion status duration until Ctrl-C:
+#
+.B tcpcong
+.TP
+Show all tcp sockets congestion status duration every 1 second and 10 times:
+#
+.B tcpcong 1 10
+.TP
+Show only local port 3000-3006 congestion status duration every 1 second:
+#
+.B tcpcong \-L 3000-3006  1
+.TP
+Show only remote port 5000-5005 congestion status duration every 1 second:
+#
+.B tcpcong \-R 5000-5005  1
+.TP
+Show 1 second summaries, printed in microseconds, with timestamps:
+#
+.B tcpcong \-uT 1
+.TP
+Show all tcp sockets congestion status duration as histograms:
+#
+.B tcpcong \-d
+.SH FIELDS
+.TP
+LAddrPort
+local ip address and tcp socket port.
+.TP
+RAddrPort
+remote ip address and tcp socket port.
+.TP
+Open_us
+Total duration in open status, in microseconds.
+.TP
+Dod_us
+Total duration in disorder status, in microseconds.
+.TP
+Rcov_us
+Total duration in recovery status, in microseconds.
+.TP
+Cwr_us
+Total duration in cwr status, in microseconds.
+.TP
+Los_us
+Total duration in loss status, in microseconds.
+.TP
+Open_ms
+Total duration in open status, in milliseconds.
+.TP
+Dod_ms
+Total duration in disorder status, in milliseconds.
+.TP
+Rcov_ms
+Total duration in recovery status, in milliseconds.
+.TP
+Cwr_ms
+Total duration in cwr status, in milliseconds.
+.TP
+Loss_ms
+Total duration in loss status, in milliseconds.
+.TP
+Chgs
+Total number of status changes.
+.TP
+usecs
+Range of microseconds for this bucket.
+.TP
+msecs
+Range of milliseconds for this bucket.
+.TP
+count
+Number of congestion status in this time range.
+.TP
+distribution
+ASCII representation of the distribution (the count column).
+.SH OVERHEAD
+This traces the kernel tcp congestion status change functions.
+Because the per-socket call rate of these functions is low (<10000 per second),
+the overhead is expected to be negligible. If you have an application that
+will create thousands of tcp connections, then test and understand the overhead
+before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+jacky gan
+.SH SEE ALSO
+tcpretrans(8), tcpconnect(8), tcptop(8), tcpdrop(8)
diff --git a/man/man8/trace.8 b/man/man8/trace.8
index 7afd252..acfff58 100644
--- a/man/man8/trace.8
+++ b/man/man8/trace.8
@@ -3,7 +3,7 @@
 trace \- Trace a function and print its arguments or return value, optionally evaluating a filter. Uses Linux eBPF/bcc.
 .SH SYNOPSIS
 .B trace [-h] [-b BUFFER_PAGES] [-p PID] [-L TID] [--uid UID] [-v] [-Z STRING_SIZE] [-S] [-s SYM_FILE_LIST]
-         [-M MAX_EVENTS] [-t] [-u] [-T] [-C] [-K] [-U] [-a] [-I header]
+         [-M MAX_EVENTS] [-t] [-u] [-T] [-C] [-K] [-U] [-a] [-I header] [-A]
          probe [probe ...]
 .SH DESCRIPTION
 trace probes functions you specify and displays trace messages if a particular
@@ -83,6 +83,9 @@
 filter or print expressions use types or data structures that are not available
 in the standard headers. For example: 'linux/mm.h'
 .TP
+\-A
+Print the aggregated amount of each trace. This should be used together with -M/--max-events.
+.TP
 probe [probe ...]
 One or more probes that attach to functions, filter conditions, and print
 information. See PROBE SYNTAX below.
diff --git a/src/cc/TEST_MAPPING b/src/cc/TEST_MAPPING
new file mode 100644
index 0000000..90892fe
--- /dev/null
+++ b/src/cc/TEST_MAPPING
@@ -0,0 +1,7 @@
+{
+  "presubmit": [
+    {
+      "name": "libbpf_load_test"
+    }
+  ]
+}
diff --git a/src/cc/api/BPFTable.cc b/src/cc/api/BPFTable.cc
index 689992b..23beae3 100644
--- a/src/cc/api/BPFTable.cc
+++ b/src/cc/api/BPFTable.cc
@@ -397,21 +397,21 @@
                                 "' is not a perf buffer");
 }
 
-StatusTuple BPFPerfBuffer::open_on_cpu(perf_reader_raw_cb cb,
-                                       perf_reader_lost_cb lost_cb, int cpu,
-                                       void* cb_cookie, int page_cnt) {
-  if (cpu_readers_.find(cpu) != cpu_readers_.end())
-    return StatusTuple(-1, "Perf buffer already open on CPU %d", cpu);
+StatusTuple BPFPerfBuffer::open_on_cpu(perf_reader_raw_cb cb, perf_reader_lost_cb lost_cb,
+                                       void* cb_cookie, int page_cnt,
+                                       struct bcc_perf_buffer_opts& opts) {
+  if (cpu_readers_.find(opts.cpu) != cpu_readers_.end())
+    return StatusTuple(-1, "Perf buffer already open on CPU %d", opts.cpu);
 
   auto reader = static_cast<perf_reader*>(
-      bpf_open_perf_buffer(cb, lost_cb, cb_cookie, -1, cpu, page_cnt));
+      bpf_open_perf_buffer_opts(cb, lost_cb, cb_cookie, page_cnt, &opts));
   if (reader == nullptr)
     return StatusTuple(-1, "Unable to construct perf reader");
 
   int reader_fd = perf_reader_fd(reader);
-  if (!update(&cpu, &reader_fd)) {
+  if (!update(&opts.cpu, &reader_fd)) {
     perf_reader_free(static_cast<void*>(reader));
-    return StatusTuple(-1, "Unable to open perf buffer on CPU %d: %s", cpu,
+    return StatusTuple(-1, "Unable to open perf buffer on CPU %d: %s", opts.cpu,
                        std::strerror(errno));
   }
 
@@ -424,13 +424,21 @@
                        std::strerror(errno));
   }
 
-  cpu_readers_[cpu] = reader;
+  cpu_readers_[opts.cpu] = reader;
   return StatusTuple::OK();
 }
 
 StatusTuple BPFPerfBuffer::open_all_cpu(perf_reader_raw_cb cb,
                                         perf_reader_lost_cb lost_cb,
                                         void* cb_cookie, int page_cnt) {
+  return open_all_cpu(cb, lost_cb, cb_cookie, page_cnt, 1);
+}
+
+StatusTuple BPFPerfBuffer::open_all_cpu(perf_reader_raw_cb cb,
+                                        perf_reader_lost_cb lost_cb,
+                                        void* cb_cookie, int page_cnt,
+                                        int wakeup_events)
+{
   if (cpu_readers_.size() != 0 || epfd_ != -1)
     return StatusTuple(-1, "Previously opened perf buffer not cleaned");
 
@@ -439,7 +447,12 @@
   epfd_ = epoll_create1(EPOLL_CLOEXEC);
 
   for (int i : cpus) {
-    auto res = open_on_cpu(cb, lost_cb, i, cb_cookie, page_cnt);
+    struct bcc_perf_buffer_opts opts = {
+      .pid = -1,
+      .cpu = i,
+      .wakeup_events = wakeup_events,
+    };
+    auto res = open_on_cpu(cb, lost_cb, cb_cookie, page_cnt, opts);
     if (!res.ok()) {
       TRY2(close_all_cpu());
       return res;
@@ -500,6 +513,14 @@
   return cnt;
 }
 
+int BPFPerfBuffer::consume() {
+  if (epfd_ < 0)
+    return -1;
+  for (auto it : cpu_readers_)
+    perf_reader_event_read(it.second);
+  return 0;
+}
+
 BPFPerfBuffer::~BPFPerfBuffer() {
   auto res = close_all_cpu();
   if (!res.ok())
diff --git a/src/cc/api/BPFTable.h b/src/cc/api/BPFTable.h
index 4b902dc..681b4a9 100644
--- a/src/cc/api/BPFTable.h
+++ b/src/cc/api/BPFTable.h
@@ -415,12 +415,15 @@
 
   StatusTuple open_all_cpu(perf_reader_raw_cb cb, perf_reader_lost_cb lost_cb,
                            void* cb_cookie, int page_cnt);
+  StatusTuple open_all_cpu(perf_reader_raw_cb cb, perf_reader_lost_cb lost_cb,
+                           void* cb_cookie, int page_cnt, int wakeup_events);
   StatusTuple close_all_cpu();
   int poll(int timeout_ms);
+  int consume();
 
  private:
   StatusTuple open_on_cpu(perf_reader_raw_cb cb, perf_reader_lost_cb lost_cb,
-                          int cpu, void* cb_cookie, int page_cnt);
+                          void* cb_cookie, int page_cnt, struct bcc_perf_buffer_opts& opts);
   StatusTuple close_on_cpu(int cpu);
 
   std::map<int, perf_reader*> cpu_readers_;
diff --git a/src/cc/bcc_btf.cc b/src/cc/bcc_btf.cc
index 7f551ae..be24861 100644
--- a/src/cc/bcc_btf.cc
+++ b/src/cc/bcc_btf.cc
@@ -652,9 +652,47 @@
 int BTF::get_map_tids(std::string map_name,
                       unsigned expected_ksize, unsigned expected_vsize,
                       unsigned *key_tid, unsigned *value_tid) {
-  return btf__get_map_kv_tids(btf_, map_name.c_str(),
-                              expected_ksize, expected_vsize,
-                              key_tid, value_tid);
+  auto struct_name = "____btf_map_" + map_name;
+  auto type_id = btf__find_by_name_kind(btf_, struct_name.c_str(), BTF_KIND_STRUCT);
+  if (type_id < 0) {
+    warning("struct %s not found in BTF\n", struct_name.c_str());
+    return -1;
+  }
+
+  auto struct_type = btf__type_by_id(btf_, type_id);
+  if (!struct_type || btf_vlen(struct_type) < 2) {
+    warning("struct %s is not a valid map struct\n", struct_name.c_str());
+    return -1;
+  }
+
+  auto members = btf_members(struct_type);
+  auto key = members[0];
+  auto key_name = btf__name_by_offset(btf_, key.name_off);
+  if (strcmp(key_name, "key")) {
+    warning("'key' should be the first member\n");
+    return -1;
+  }
+  auto key_size = btf__resolve_size(btf_, key.type);
+  if (key_size != expected_ksize) {
+    warning("expect key size to be %d, got %d\n", expected_ksize, key_size);
+    return -1;
+  }
+  *key_tid = key.type;
+
+  auto value = members[1];
+  auto value_name = btf__name_by_offset(btf_, value.name_off);
+  if (strcmp(value_name, "value")) {
+    warning("'value' should be the second member\n");
+    return -1;
+  }
+  auto value_size = btf__resolve_size(btf_, value.type);
+  if (value_size != expected_vsize) {
+    warning("expect value size to be %d, got %d\n", expected_vsize, value_size);
+    return -1;
+  }
+  *value_tid = value.type;
+
+  return 0;
 }
 
 } // namespace ebpf
diff --git a/src/cc/bcc_btf.h b/src/cc/bcc_btf.h
index b460eb3..96492b4 100644
--- a/src/cc/bcc_btf.h
+++ b/src/cc/bcc_btf.h
@@ -26,6 +26,7 @@
 #include "bpf_module.h"
 
 struct btf;
+struct btf_type;
 
 namespace btf_ext_vendored {
 
diff --git a/src/cc/bcc_common.cc b/src/cc/bcc_common.cc
index 5c349d7..c33e37a 100644
--- a/src/cc/bcc_common.cc
+++ b/src/cc/bcc_common.cc
@@ -37,6 +37,10 @@
   return mod;
 }
 
+bool bpf_module_rw_engine_enabled() {
+  return ebpf::bpf_module_rw_engine_enabled();
+}
+
 void bpf_module_destroy(void *program) {
   auto mod = static_cast<ebpf::BPFModule *>(program);
   if (!mod) return;
diff --git a/src/cc/bcc_common.h b/src/cc/bcc_common.h
index b5f77db..ed68f54 100644
--- a/src/cc/bcc_common.h
+++ b/src/cc/bcc_common.h
@@ -30,6 +30,7 @@
 void * bpf_module_create_c_from_string(const char *text, unsigned flags, const char *cflags[],
                                        int ncflags, bool allow_rlimit,
                                        const char *dev_name);
+bool bpf_module_rw_engine_enabled();
 void bpf_module_destroy(void *program);
 char * bpf_module_license(void *program);
 unsigned bpf_module_kern_version(void *program);
diff --git a/src/cc/bcc_debug.cc b/src/cc/bcc_debug.cc
index 52b6571..d7ed49f 100644
--- a/src/cc/bcc_debug.cc
+++ b/src/cc/bcc_debug.cc
@@ -19,6 +19,9 @@
 #include <tuple>
 #include <vector>
 
+#if LLVM_MAJOR_VERSION >= 15
+#include <llvm/DebugInfo/DWARF/DWARFCompileUnit.h>
+#endif
 #include <llvm/DebugInfo/DWARF/DWARFContext.h>
 #include <llvm/DebugInfo/DWARF/DWARFDebugLine.h>
 #include <llvm/IR/Module.h>
@@ -29,6 +32,9 @@
 #include <llvm/MC/MCInstrInfo.h>
 #include <llvm/MC/MCObjectFileInfo.h>
 #include <llvm/MC/MCRegisterInfo.h>
+#if LLVM_MAJOR_VERSION >= 15
+#include <llvm/MC/MCSubtargetInfo.h>
+#endif
 #if LLVM_MAJOR_VERSION >= 14
 #include <llvm/MC/TargetRegistry.h>
 #else
@@ -190,68 +196,67 @@
   vector<string> LineCache = buildLineCache();
 
   // Start to disassemble with source code annotation section by section
-  for (auto section : sections_)
-    if (!strncmp(fn_prefix_.c_str(), section.first.c_str(),
-                 fn_prefix_.size())) {
-      MCDisassembler::DecodeStatus S;
-      MCInst Inst;
-      uint64_t Size;
-      uint8_t *FuncStart = get<0>(section.second);
-      uint64_t FuncSize = get<1>(section.second);
+  prog_func_info_.for_each_func([&](std::string func_name, FuncInfo &info) {
+    MCDisassembler::DecodeStatus S;
+    MCInst Inst;
+    uint64_t Size;
+    uint8_t *FuncStart = info.start_;
+    uint64_t FuncSize = info.size_;
 #if LLVM_MAJOR_VERSION >= 9
-      unsigned SectionID = get<2>(section.second);
-#endif
-      ArrayRef<uint8_t> Data(FuncStart, FuncSize);
-      uint32_t CurrentSrcLine = 0;
-      string func_name = section.first.substr(fn_prefix_.size());
-
-      errs() << "Disassembly of section " << section.first << ":\n"
-             << func_name << ":\n";
-
-      string src_dbg_str;
-      llvm::raw_string_ostream os(src_dbg_str);
-      for (uint64_t Index = 0; Index < FuncSize; Index += Size) {
-#if LLVM_MAJOR_VERSION >= 10
-        S = DisAsm->getInstruction(Inst, Size, Data.slice(Index), Index,
-                                   nulls());
-#else
-        S = DisAsm->getInstruction(Inst, Size, Data.slice(Index), Index,
-                                   nulls(), nulls());
-#endif
-        if (S != MCDisassembler::Success) {
-          os << "Debug Error: disassembler failed: " << std::to_string(S)
+    auto section = sections_.find(info.section_);
+    if (section == sections_.end()) {
+      errs() << "Debug Error: no section entry for section " << info.section_
              << '\n';
-          break;
-        } else {
-          DILineInfo LineInfo;
-
-          LineTable->getFileLineInfoForAddress(
-#if LLVM_MAJOR_VERSION >= 9
-              {(uint64_t)FuncStart + Index, SectionID},
-#else
-              (uint64_t)FuncStart + Index,
-#endif
-              CU->getCompilationDir(),
-              DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath,
-              LineInfo);
-
-          adjustInstSize(Size, Data[Index], Data[Index + 1]);
-          dumpSrcLine(LineCache, LineInfo.FileName, LineInfo.Line,
-                      CurrentSrcLine, os);
-          os << format("%4" PRIu64 ":", Index >> 3) << '\t';
-          dumpBytes(Data.slice(Index, Size), os);
-#if LLVM_MAJOR_VERSION >= 10
-          IP->printInst(&Inst, 0, "", *STI, os);
-#else
-          IP->printInst(&Inst, os, "", *STI);
-#endif
-          os << '\n';
-        }
-      }
-      os.flush();
-      errs() << src_dbg_str << '\n';
-      src_dbg_fmap_[func_name] = src_dbg_str;
+      return;
     }
+    unsigned SectionID = get<2>(section->second);
+#endif
+    ArrayRef<uint8_t> Data(FuncStart, FuncSize);
+    uint32_t CurrentSrcLine = 0;
+
+    errs() << "Disassembly of function " << func_name << "\n";
+
+    string src_dbg_str;
+    llvm::raw_string_ostream os(src_dbg_str);
+    for (uint64_t Index = 0; Index < FuncSize; Index += Size) {
+#if LLVM_MAJOR_VERSION >= 10
+      S = DisAsm->getInstruction(Inst, Size, Data.slice(Index), Index, nulls());
+#else
+      S = DisAsm->getInstruction(Inst, Size, Data.slice(Index), Index, nulls(),
+                                 nulls());
+#endif
+      if (S != MCDisassembler::Success) {
+        os << "Debug Error: disassembler failed: " << std::to_string(S) << '\n';
+        break;
+      } else {
+        DILineInfo LineInfo;
+
+        LineTable->getFileLineInfoForAddress(
+#if LLVM_MAJOR_VERSION >= 9
+            {(uint64_t)FuncStart + Index, SectionID},
+#else
+            (uint64_t)FuncStart + Index,
+#endif
+            CU->getCompilationDir(),
+            DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath, LineInfo);
+
+        adjustInstSize(Size, Data[Index], Data[Index + 1]);
+        dumpSrcLine(LineCache, LineInfo.FileName, LineInfo.Line, CurrentSrcLine,
+                    os);
+        os << format("%4" PRIu64 ":", Index >> 3) << '\t';
+        dumpBytes(Data.slice(Index, Size), os);
+#if LLVM_MAJOR_VERSION >= 10
+        IP->printInst(&Inst, 0, "", *STI, os);
+#else
+        IP->printInst(&Inst, os, "", *STI);
+#endif
+        os << '\n';
+      }
+    }
+    os.flush();
+    errs() << src_dbg_str << '\n';
+    src_dbg_fmap_[func_name] = src_dbg_str;
+  });
 }
 
 }  // namespace ebpf
diff --git a/src/cc/bcc_debug.h b/src/cc/bcc_debug.h
index 1467ca8..f9bda11 100644
--- a/src/cc/bcc_debug.h
+++ b/src/cc/bcc_debug.h
@@ -15,19 +15,18 @@
  */
 
 #include "bpf_module.h"
+#include "frontends/clang/loader.h"
 
 namespace ebpf {
 
 class SourceDebugger {
  public:
-  SourceDebugger(
-      llvm::Module *mod,
-      sec_map_def &sections,
-      const std::string &fn_prefix, const std::string &mod_src,
-      std::map<std::string, std::string> &src_dbg_fmap)
+  SourceDebugger(llvm::Module *mod, sec_map_def &sections,
+                 ProgFuncInfo &prog_func_info, const std::string &mod_src,
+                 std::map<std::string, std::string> &src_dbg_fmap)
       : mod_(mod),
         sections_(sections),
-        fn_prefix_(fn_prefix),
+        prog_func_info_(prog_func_info),
         mod_src_(mod_src),
         src_dbg_fmap_(src_dbg_fmap) {}
 // Only support dump for llvm 6.x and later.
@@ -56,7 +55,7 @@
  private:
   llvm::Module *mod_;
   const sec_map_def &sections_;
-  const std::string &fn_prefix_;
+  ProgFuncInfo &prog_func_info_;
   const std::string &mod_src_;
   std::map<std::string, std::string> &src_dbg_fmap_;
 };
diff --git a/src/cc/bpf_module.cc b/src/cc/bpf_module.cc
index 36f9582..b029962 100644
--- a/src/cc/bpf_module.cc
+++ b/src/cc/bpf_module.cc
@@ -13,38 +13,48 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <fcntl.h>
-#include <map>
-#include <string>
-#include <sys/stat.h>
-#include <unistd.h>
-#include <vector>
-#include <set>
-#include <linux/bpf.h>
-#include <net/if.h>
+#include "bpf_module.h"
 
+#include <fcntl.h>
+#include <linux/bpf.h>
+#include <llvm-c/Transforms/IPO.h>
 #include <llvm/ExecutionEngine/MCJIT.h>
 #include <llvm/ExecutionEngine/SectionMemoryManager.h>
 #include <llvm/IR/IRPrintingPasses.h>
-#include <llvm/IR/LegacyPassManager.h>
 #include <llvm/IR/LLVMContext.h>
+#include <llvm/IR/LegacyPassManager.h>
 #include <llvm/IR/Module.h>
+
+#if LLVM_MAJOR_VERSION >= 15
+#include <llvm/Pass.h>
+#endif
+
 #include <llvm/IR/Verifier.h>
+#include <llvm/Object/ObjectFile.h>
+#include <llvm/Object/ELFObjectFile.h>
+#include <llvm/Object/SymbolSize.h>
 #include <llvm/Support/TargetSelect.h>
 #include <llvm/Transforms/IPO.h>
 #include <llvm/Transforms/IPO/PassManagerBuilder.h>
-#include <llvm-c/Transforms/IPO.h>
+#include <net/if.h>
+#include <sys/stat.h>
+#include <unistd.h>
 
-#include "common.h"
+#include <map>
+#include <set>
+#include <string>
+#include <iostream>
+#include <vector>
+
+#include "bcc_btf.h"
 #include "bcc_debug.h"
 #include "bcc_elf.h"
-#include "frontends/clang/loader.h"
-#include "frontends/clang/b_frontend_action.h"
-#include "bpf_module.h"
-#include "exported_files.h"
-#include "libbpf.h"
-#include "bcc_btf.h"
 #include "bcc_libbpf_inc.h"
+#include "common.h"
+#include "exported_files.h"
+#include "frontends/clang/b_frontend_action.h"
+#include "frontends/clang/loader.h"
+#include "libbpf.h"
 
 namespace ebpf {
 
@@ -58,15 +68,11 @@
 using std::vector;
 using namespace llvm;
 
-const string BPFModule::FN_PREFIX = BPF_FN_PREFIX;
-
 // Snooping class to remember the sections as the JIT creates them
 class MyMemoryManager : public SectionMemoryManager {
  public:
-
-  explicit MyMemoryManager(sec_map_def *sections)
-      : sections_(sections) {
-  }
+  explicit MyMemoryManager(sec_map_def *sections, ProgFuncInfo *prog_func_info)
+      : sections_(sections), prog_func_info_(prog_func_info) {}
 
   virtual ~MyMemoryManager() {}
   uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
@@ -74,8 +80,6 @@
                                StringRef SectionName) override {
     // The programs need to change from fake fd to real map fd, so not allocate ReadOnly regions.
     uint8_t *Addr = SectionMemoryManager::allocateDataSection(Size, Alignment, SectionID, SectionName, false);
-    //printf("allocateDataSection: %s Addr %p Size %ld Alignment %d SectionID %d\n",
-    //       SectionName.str().c_str(), (void *)Addr, Size, Alignment, SectionID);
     (*sections_)[SectionName.str()] = make_tuple(Addr, Size, SectionID);
     return Addr;
   }
@@ -85,12 +89,38 @@
     // The lines in .BTF.ext line_info, if corresponding to remapped files, will have empty source line.
     // The line_info will be fixed in place, so not allocate ReadOnly regions.
     uint8_t *Addr = SectionMemoryManager::allocateDataSection(Size, Alignment, SectionID, SectionName, false);
-    //printf("allocateDataSection: %s Addr %p Size %ld Alignment %d SectionID %d\n",
-    //       SectionName.str().c_str(), (void *)Addr, Size, Alignment, SectionID);
     (*sections_)[SectionName.str()] = make_tuple(Addr, Size, SectionID);
     return Addr;
   }
+
+  void notifyObjectLoaded(ExecutionEngine *EE,
+                          const object::ObjectFile &o) override {
+    auto sizes = llvm::object::computeSymbolSizes(o);
+    for (auto ss : sizes) {
+      auto maybe_name = ss.first.getName();
+      if (!maybe_name)
+        continue;
+
+      std::string name = maybe_name->str();
+      auto info = prog_func_info_->get_func(name);
+      if (!info)
+        continue;
+
+      auto section = ss.first.getSection();
+      if (!section)
+        continue;
+
+      auto sec_name = section.get()->getName();
+      if (!sec_name)
+        continue;
+
+      info->section_ = sec_name->str();
+      info->size_ = ss.second;
+    }
+  }
+
   sec_map_def *sections_;
+  ProgFuncInfo *prog_func_info_;
 };
 
 BPFModule::BPFModule(unsigned flags, TableStorage *ts, bool rw_engine_enabled,
@@ -120,7 +150,7 @@
     local_ts_ = createSharedTableStorage();
     ts_ = &*local_ts_;
   }
-  func_src_ = ebpf::make_unique<FuncSource>();
+  prog_func_info_ = ebpf::make_unique<ProgFuncInfo>();
 }
 
 static StatusTuple unimplemented_sscanf(const char *, void *) {
@@ -139,14 +169,18 @@
   }
 
   if (!rw_engine_enabled_) {
-    for (auto section : sections_)
-      delete[] get<0>(section.second);
+    prog_func_info_->for_each_func(
+        [&](std::string name, FuncInfo &info) {
+      if (!info.start_)
+        return;
+      delete[] info.start_;
+    });
   }
 
   engine_.reset();
   cleanup_rw_engine();
   ctx_.reset();
-  func_src_.reset();
+  prog_func_info_.reset();
 
   if (btf_)
     delete btf_;
@@ -162,7 +196,8 @@
 int BPFModule::load_cfile(const string &file, bool in_memory, const char *cflags[], int ncflags) {
   ClangLoader clang_loader(&*ctx_, flags_);
   if (clang_loader.parse(&mod_, *ts_, file, in_memory, cflags, ncflags, id_,
-                         *func_src_, mod_src_, maps_ns_, fake_fd_map_, perf_events_))
+                         *prog_func_info_, mod_src_, maps_ns_, fake_fd_map_,
+                         perf_events_))
     return -1;
   return 0;
 }
@@ -175,8 +210,9 @@
 int BPFModule::load_includes(const string &text) {
   ClangLoader clang_loader(&*ctx_, flags_);
   const char *cflags[] = {"-DB_WORKAROUND"};
-  if (clang_loader.parse(&mod_, *ts_, text, true, cflags, 1, "", *func_src_,
-                         mod_src_, "", fake_fd_map_, perf_events_))
+  if (clang_loader.parse(&mod_, *ts_, text, true, cflags, 1, "",
+                         *prog_func_info_, mod_src_, "", fake_fd_map_,
+                         perf_events_))
     return -1;
   return 0;
 }
@@ -426,26 +462,19 @@
   }
 
   // update instructions
-  for (auto section : sections) {
-    auto sec_name = section.first;
-    if (strncmp(".bpf.fn.", sec_name.c_str(), 8) == 0) {
-      uint8_t *addr = get<0>(section.second);
-      uintptr_t size = get<1>(section.second);
-      struct bpf_insn *insns = (struct bpf_insn *)addr;
-      int i, num_insns;
-
-      num_insns = size/sizeof(struct bpf_insn);
-      for (i = 0; i < num_insns; i++) {
-        if (insns[i].code == (BPF_LD | BPF_DW | BPF_IMM)) {
-          // change map_fd is it is a ld_pseudo */
-          if (insns[i].src_reg == BPF_PSEUDO_MAP_FD &&
-              map_fds.find(insns[i].imm) != map_fds.end())
-            insns[i].imm = map_fds[insns[i].imm];
-          i++;
-        }
+  prog_func_info_->for_each_func([&](std::string name, FuncInfo &info) {
+    struct bpf_insn *insns = (struct bpf_insn *)info.start_;
+    uint32_t i, num_insns = info.size_ / sizeof(struct bpf_insn);
+    for (i = 0; i < num_insns; i++) {
+      if (insns[i].code == (BPF_LD | BPF_DW | BPF_IMM)) {
+        // change map_fd if it is a ld_pseudo
+        if (insns[i].src_reg == BPF_PSEUDO_MAP_FD &&
+            map_fds.find(insns[i].imm) != map_fds.end())
+          insns[i].imm = map_fds[insns[i].imm];
+        i++;
       }
     }
-  }
+  });
 
   return 0;
 }
@@ -474,7 +503,8 @@
   string err;
   EngineBuilder builder(move(mod_));
   builder.setErrorStr(&err);
-  builder.setMCJITMemoryManager(ebpf::make_unique<MyMemoryManager>(sections_p));
+  builder.setMCJITMemoryManager(
+      ebpf::make_unique<MyMemoryManager>(sections_p, &*prog_func_info_));
   builder.setMArch("bpf");
 #if LLVM_MAJOR_VERSION <= 11
   builder.setUseOrcMCJITReplacement(false);
@@ -485,20 +515,19 @@
     return -1;
   }
 
-#if LLVM_MAJOR_VERSION >= 9
   engine_->setProcessAllSections(true);
-#else
-  if (flags_ & DEBUG_SOURCE)
-    engine_->setProcessAllSections(true);
-#endif
 
   if (int rc = run_pass_manager(*mod))
     return rc;
 
   engine_->finalizeObject();
+  prog_func_info_->for_each_func([&](std::string name, FuncInfo &info) {
+    info.start_ = (uint8_t *)engine_->getFunctionAddress(name);
+  });
+  finalize_prog_func_info();
 
   if (flags_ & DEBUG_SOURCE) {
-    SourceDebugger src_debugger(mod, *sections_p, FN_PREFIX, mod_src_,
+    SourceDebugger src_debugger(mod, *sections_p, *prog_func_info_, mod_src_,
                                 src_dbg_fmap_);
     src_debugger.dump();
   }
@@ -521,51 +550,74 @@
       }
       sections_[fname] = make_tuple(tmp_p, size, get<2>(section.second));
     }
+
+    prog_func_info_->for_each_func([](std::string name, FuncInfo &info) {
+      uint8_t *tmp_p = new uint8_t[info.size_];
+      memcpy(tmp_p, info.start_, info.size_);
+      info.start_ = tmp_p;
+    });
     engine_.reset();
     ctx_.reset();
   }
 
-  // give functions an id
-  for (auto section : sections_)
-    if (!strncmp(FN_PREFIX.c_str(), section.first.c_str(), FN_PREFIX.size()))
-      function_names_.push_back(section.first);
-
   return 0;
 }
 
-size_t BPFModule::num_functions() const {
-  return function_names_.size();
+void BPFModule::finalize_prog_func_info() {
+  // prog_func_info_'s FuncInfo data is gradually populated (first in frontend
+  // action, then bpf_module). It's possible for a FuncInfo to have been
+  // created by FrontendAction but no corresponding start location found in
+  // bpf_module - filter out these functions
+  //
+  // The numeric function ids in the new prog_func_info_ are considered
+  // canonical
+  std::unique_ptr<ProgFuncInfo> finalized = ebpf::make_unique<ProgFuncInfo>();
+  prog_func_info_->for_each_func([&](std::string name, FuncInfo &info) {
+    if(info.start_) {
+      auto i = finalized->add_func(name);
+      if (i) { // should always be true
+        *i = info;
+      }
+    }
+  });
+  prog_func_info_.swap(finalized);
 }
 
+size_t BPFModule::num_functions() const { return prog_func_info_->num_funcs(); }
+
 const char * BPFModule::function_name(size_t id) const {
-  if (id >= function_names_.size())
-    return nullptr;
-  return function_names_[id].c_str() + FN_PREFIX.size();
+  auto name = prog_func_info_->func_name(id);
+  if (name)
+    return name->c_str();
+  return nullptr;
 }
 
 uint8_t * BPFModule::function_start(size_t id) const {
-  if (id >= function_names_.size())
-    return nullptr;
-  auto section = sections_.find(function_names_[id]);
-  if (section == sections_.end())
-    return nullptr;
-  return get<0>(section->second);
+  auto fn = prog_func_info_->get_func(id);
+  if (fn)
+    return fn->start_;
+  return nullptr;
 }
 
 uint8_t * BPFModule::function_start(const string &name) const {
-  auto section = sections_.find(FN_PREFIX + name);
-  if (section == sections_.end())
-    return nullptr;
-
-  return get<0>(section->second);
+  auto fn = prog_func_info_->get_func(name);
+  if (fn)
+    return fn->start_;
+  return nullptr;
 }
 
 const char * BPFModule::function_source(const string &name) const {
-  return func_src_->src(name);
+  auto fn = prog_func_info_->get_func(name);
+  if (fn)
+    return fn->src_.c_str();
+  return "";
 }
 
 const char * BPFModule::function_source_rewritten(const string &name) const {
-  return func_src_->src_rewritten(name);
+  auto fn = prog_func_info_->get_func(name);
+  if (fn)
+    return fn->src_rewritten_.c_str();
+  return "";
 }
 
 int BPFModule::annotate_prog_tag(const string &name, int prog_fd,
@@ -637,20 +689,17 @@
 }
 
 size_t BPFModule::function_size(size_t id) const {
-  if (id >= function_names_.size())
-    return 0;
-  auto section = sections_.find(function_names_[id]);
-  if (section == sections_.end())
-    return 0;
-  return get<1>(section->second);
+  auto fn = prog_func_info_->get_func(id);
+  if (fn)
+    return fn->size_;
+  return 0;
 }
 
 size_t BPFModule::function_size(const string &name) const {
-  auto section = sections_.find(FN_PREFIX + name);
-  if (section == sections_.end())
-    return 0;
-
-  return get<1>(section->second);
+  auto fn = prog_func_info_->get_func(name);
+  if (fn)
+    return fn->size_;
+  return 0;
 }
 
 char * BPFModule::license() const {
@@ -903,7 +952,7 @@
     int btf_fd = btf_->get_fd();
     char secname[256];
 
-    ::snprintf(secname, sizeof(secname), ".bpf.fn.%s", name);
+    ::snprintf(secname, sizeof(secname), "%s%s", BPF_FN_PREFIX, name);
     ret = btf_->get_btf_info(secname, &func_info, &func_info_cnt,
                              &finfo_rec_size, &line_info,
                              &line_info_cnt, &linfo_rec_size);
diff --git a/src/cc/bpf_module.h b/src/cc/bpf_module.h
index 87938c3..fb368af 100644
--- a/src/cc/bpf_module.h
+++ b/src/cc/bpf_module.h
@@ -59,14 +59,13 @@
 class TableStorage;
 class BLoader;
 class ClangLoader;
-class FuncSource;
+class ProgFuncInfo;
 class BTF;
 
 bool bpf_module_rw_engine_enabled(void);
 
 class BPFModule {
  private:
-  static const std::string FN_PREFIX;
   int init_engine();
   void initialize_rw_engine();
   void cleanup_rw_engine();
@@ -74,6 +73,7 @@
   int finalize();
   int annotate();
   void annotate_light();
+  void finalize_prog_func_info();
   std::unique_ptr<llvm::ExecutionEngine> finalize_rw(std::unique_ptr<llvm::Module> mod);
   std::string make_reader(llvm::Module *mod, llvm::Type *type);
   std::string make_writer(llvm::Module *mod, llvm::Type *type);
@@ -162,11 +162,10 @@
   std::unique_ptr<llvm::ExecutionEngine> engine_;
   std::unique_ptr<llvm::ExecutionEngine> rw_engine_;
   std::unique_ptr<llvm::Module> mod_;
-  std::unique_ptr<FuncSource> func_src_;
+  std::unique_ptr<ProgFuncInfo> prog_func_info_;
   sec_map_def sections_;
   std::vector<TableDesc *> tables_;
   std::map<std::string, size_t> table_names_;
-  std::vector<std::string> function_names_;
   std::map<llvm::Type *, std::string> readers_;
   std::map<llvm::Type *, std::string> writers_;
   std::string id_;
diff --git a/src/cc/bpf_module_rw_engine.cc b/src/cc/bpf_module_rw_engine.cc
index 533d8a1..f164988 100644
--- a/src/cc/bpf_module_rw_engine.cc
+++ b/src/cc/bpf_module_rw_engine.cc
@@ -401,7 +401,12 @@
     GlobalValue *gvar = mod_->getNamedValue(table.name);
     if (!gvar) continue;
     if (PointerType *pt = dyn_cast<PointerType>(gvar->getType())) {
-      if (StructType *st = dyn_cast<StructType>(pt->getElementType())) {
+#if LLVM_MAJOR_VERSION >= 15
+      StructType *st = dyn_cast<StructType>(pt->getPointerElementType());
+#else
+      StructType *st = dyn_cast<StructType>(pt->getElementType());
+#endif
+      if (st) {
         if (st->getNumElements() < 2) continue;
         Type *key_type = st->elements()[0];
         Type *leaf_type = st->elements()[1];
diff --git a/src/cc/compat/linux/virtual_bpf.h b/src/cc/compat/linux/virtual_bpf.h
index 0f3a547..f54dd25 100644
--- a/src/cc/compat/linux/virtual_bpf.h
+++ b/src/cc/compat/linux/virtual_bpf.h
@@ -331,6 +331,8 @@
  *			*ctx_out*, *data_in* and *data_out* must be NULL.
  *			*repeat* must be zero.
  *
+ *		BPF_PROG_RUN is an alias for BPF_PROG_TEST_RUN.
+ *
  *	Return
  *		Returns zero on success. On error, -1 is returned and *errno*
  *		is set appropriately.
@@ -996,6 +998,7 @@
 	BPF_SK_REUSEPORT_SELECT,
 	BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
 	BPF_PERF_EVENT,
+	BPF_TRACE_KPROBE_MULTI,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -1010,6 +1013,7 @@
 	BPF_LINK_TYPE_NETNS = 5,
 	BPF_LINK_TYPE_XDP = 6,
 	BPF_LINK_TYPE_PERF_EVENT = 7,
+	BPF_LINK_TYPE_KPROBE_MULTI = 8,
 
 	MAX_BPF_LINK_TYPE,
 };
@@ -1112,6 +1116,16 @@
  */
 #define BPF_F_SLEEPABLE		(1U << 4)
 
+/* If BPF_F_XDP_HAS_FRAGS is used in BPF_PROG_LOAD command, the loaded program
+ * fully support xdp frags.
+ */
+#define BPF_F_XDP_HAS_FRAGS	(1U << 5)
+
+/* link_create.kprobe_multi.flags used in LINK_CREATE command for
+ * BPF_TRACE_KPROBE_MULTI attach type to create return probe.
+ */
+#define BPF_F_KPROBE_MULTI_RETURN	(1U << 0)
+
 /* When BPF ldimm64's insn[0].src_reg != 0 then this can have
  * the following extensions:
  *
@@ -1226,6 +1240,8 @@
 
 /* If set, run the test on the cpu specified by bpf_attr.test.cpu */
 #define BPF_F_TEST_RUN_ON_CPU	(1U << 0)
+/* If set, XDP frames will be transmitted after processing */
+#define BPF_F_TEST_XDP_LIVE_FRAMES	(1U << 1)
 
 /* type for BPF_ENABLE_STATS */
 enum bpf_stats_type {
@@ -1387,6 +1403,7 @@
 		__aligned_u64	ctx_out;
 		__u32		flags;
 		__u32		cpu;
+		__u32		batch_size;
 	} test;
 
 	struct { /* anonymous struct used by BPF_*_GET_*_ID */
@@ -1466,6 +1483,13 @@
 				 */
 				__u64		bpf_cookie;
 			} perf_event;
+			struct {
+				__u32		flags;
+				__u32		cnt;
+				__aligned_u64	syms;
+				__aligned_u64	addrs;
+				__aligned_u64	cookies;
+			} kprobe_multi;
 		};
 	} link_create;
 
@@ -1776,6 +1800,8 @@
  * 		0 on success, or a negative error in case of failure.
  *
  * u64 bpf_get_current_pid_tgid(void)
+ * 	Description
+ * 		Get the current pid and tgid.
  * 	Return
  * 		A 64-bit integer containing the current tgid and pid, and
  * 		created as such:
@@ -1783,6 +1809,8 @@
  * 		*current_task*\ **->pid**.
  *
  * u64 bpf_get_current_uid_gid(void)
+ * 	Description
+ * 		Get the current uid and gid.
  * 	Return
  * 		A 64-bit integer containing the current GID and UID, and
  * 		created as such: *current_gid* **<< 32 \|** *current_uid*.
@@ -2257,6 +2285,8 @@
  * 		The 32-bit hash.
  *
  * u64 bpf_get_current_task(void)
+ * 	Description
+ * 		Get the current task.
  * 	Return
  * 		A pointer to the current task struct.
  *
@@ -2287,8 +2317,8 @@
  * 	Return
  * 		The return value depends on the result of the test, and can be:
  *
- *		* 0, if current task belongs to the cgroup2.
- *		* 1, if current task does not belong to the cgroup2.
+ *		* 1, if current task belongs to the cgroup2.
+ *		* 0, if current task does not belong to the cgroup2.
  * 		* A negative error code, if an error occurred.
  *
  * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags)
@@ -2370,6 +2400,8 @@
  * 		indicate that the hash is outdated and to trigger a
  * 		recalculation the next time the kernel tries to access this
  * 		hash or when the **bpf_get_hash_recalc**\ () helper is called.
+ * 	Return
+ * 		void.
  *
  * long bpf_get_numa_node_id(void)
  * 	Description
@@ -2467,6 +2499,8 @@
  * 		A 8-byte long unique number or 0 if *sk* is NULL.
  *
  * u32 bpf_get_socket_uid(struct sk_buff *skb)
+ * 	Description
+ * 		Get the owner UID of the socket associated to *skb*.
  * 	Return
  * 		The owner UID of the socket associated to *skb*. If the socket
  * 		is **NULL**, or if it is not a full socket (i.e. if it is a
@@ -3241,6 +3275,9 @@
  * 		The id is returned or 0 in case the id could not be retrieved.
  *
  * u64 bpf_get_current_cgroup_id(void)
+ * 	Description
+ * 		Get the current cgroup id based on the cgroup within which
+ * 		the current task is running.
  * 	Return
  * 		A 64-bit integer containing the current cgroup id based
  * 		on the cgroup within which the current task is running.
@@ -5019,6 +5056,94 @@
  *
  *	Return
  *		The number of arguments of the traced function.
+ *
+ * int bpf_get_retval(void)
+ *	Description
+ *		Get the syscall's return value that will be returned to userspace.
+ *
+ *		This helper is currently supported by cgroup programs only.
+ *	Return
+ *		The syscall's return value.
+ *
+ * int bpf_set_retval(int retval)
+ *	Description
+ *		Set the syscall's return value that will be returned to userspace.
+ *
+ *		This helper is currently supported by cgroup programs only.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_xdp_get_buff_len(struct xdp_buff *xdp_md)
+ *	Description
+ *		Get the total size of a given xdp buff (linear and paged area)
+ *	Return
+ *		The total size of a given xdp buffer.
+ *
+ * long bpf_xdp_load_bytes(struct xdp_buff *xdp_md, u32 offset, void *buf, u32 len)
+ *	Description
+ *		This helper is provided as an easy way to load data from a
+ *		xdp buffer. It can be used to load *len* bytes from *offset* from
+ *		the frame associated to *xdp_md*, into the buffer pointed by
+ *		*buf*.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * long bpf_xdp_store_bytes(struct xdp_buff *xdp_md, u32 offset, void *buf, u32 len)
+ *	Description
+ *		Store *len* bytes from buffer *buf* into the frame
+ *		associated to *xdp_md*, at *offset*.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *
+ * long bpf_copy_from_user_task(void *dst, u32 size, const void *user_ptr, struct task_struct *tsk, u64 flags)
+ *	Description
+ *		Read *size* bytes from user space address *user_ptr* in *tsk*'s
+ *		address space, and stores the data in *dst*. *flags* is not
+ *		used yet and is provided for future extensibility. This helper
+ *		can only be used by sleepable programs.
+ *	Return
+ *		0 on success, or a negative error in case of failure. On error
+ *		*dst* buffer is zeroed out.
+ *
+ * long bpf_skb_set_tstamp(struct sk_buff *skb, u64 tstamp, u32 tstamp_type)
+ *	Description
+ *		Change the __sk_buff->tstamp_type to *tstamp_type*
+ *		and set *tstamp* to the __sk_buff->tstamp together.
+ *
+ *		If there is no need to change the __sk_buff->tstamp_type,
+ *		the tstamp value can be directly written to __sk_buff->tstamp
+ *		instead.
+ *
+ *		BPF_SKB_TSTAMP_DELIVERY_MONO is the only tstamp that
+ *		will be kept during bpf_redirect_*().  A non zero
+ *		*tstamp* must be used with the BPF_SKB_TSTAMP_DELIVERY_MONO
+ *		*tstamp_type*.
+ *
+ *		A BPF_SKB_TSTAMP_UNSPEC *tstamp_type* can only be used
+ *		with a zero *tstamp*.
+ *
+ *		Only IPv4 and IPv6 skb->protocol are supported.
+ *
+ *		This function is most useful when it needs to set a
+ *		mono delivery time to __sk_buff->tstamp and then
+ *		bpf_redirect_*() to the egress of an iface.  For example,
+ *		changing the (rcv) timestamp in __sk_buff->tstamp at
+ *		ingress to a mono delivery time and then bpf_redirect_*()
+ *		to sch_fq@phy-dev.
+ *	Return
+ *		0 on success.
+ *		**-EINVAL** for invalid input
+ *		**-EOPNOTSUPP** for unsupported protocol
+ *
+ * long bpf_ima_file_hash(struct file *file, void *dst, u32 size)
+ *	Description
+ *		Returns a calculated IMA hash of the *file*.
+ *		If the hash is larger than *size*, then only *size*
+ *		bytes will be copied to *dst*
+ *	Return
+ *		The **hash_algo** is returned on success,
+ *		**-EOPNOTSUPP** if the hash calculation failed or **-EINVAL** if
+ *		invalid arguments are passed.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5207,6 +5332,14 @@
 	FN(get_func_arg),		\
 	FN(get_func_ret),		\
 	FN(get_func_arg_cnt),		\
+	FN(get_retval),			\
+	FN(set_retval),			\
+	FN(xdp_get_buff_len),		\
+	FN(xdp_load_bytes),		\
+	FN(xdp_store_bytes),		\
+	FN(copy_from_user_task),	\
+	FN(skb_set_tstamp),		\
+	FN(ima_file_hash),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -5396,6 +5529,15 @@
 	__u64 :64;			\
 } __attribute__((aligned(8)))
 
+enum {
+	BPF_SKB_TSTAMP_UNSPEC,
+	BPF_SKB_TSTAMP_DELIVERY_MONO,	/* tstamp has mono delivery time */
+	/* For any BPF_SKB_TSTAMP_* that the bpf prog cannot handle,
+	 * the bpf prog should handle it like BPF_SKB_TSTAMP_UNSPEC
+	 * and try to deduce it by ingress, egress or skb->sk->sk_clockid.
+	 */
+};
+
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
  */
@@ -5436,7 +5578,8 @@
 	__u32 gso_segs;
 	__bpf_md_ptr(struct bpf_sock *, sk);
 	__u32 gso_size;
-	__u32 :32;		/* Padding, future use. */
+	__u8  tstamp_type;
+	__u32 :24;		/* Padding, future use. */
 	__u64 hwtstamp;
 };
 
@@ -5501,7 +5644,8 @@
 	__u32 src_ip4;
 	__u32 src_ip6[4];
 	__u32 src_port;		/* host byte order */
-	__u32 dst_port;		/* network byte order */
+	__be16 dst_port;	/* network byte order */
+	__u16 :16;		/* zero padding */
 	__u32 dst_ip4;
 	__u32 dst_ip6[4];
 	__u32 state;
@@ -6379,7 +6523,8 @@
 	__u32 protocol;		/* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */
 	__u32 remote_ip4;	/* Network byte order */
 	__u32 remote_ip6[4];	/* Network byte order */
-	__u32 remote_port;	/* Network byte order */
+	__be16 remote_port;	/* Network byte order */
+	__u16 :16;		/* Zero padding */
 	__u32 local_ip4;	/* Network byte order */
 	__u32 local_ip6[4];	/* Network byte order */
 	__u32 local_port;	/* Host byte order */
diff --git a/src/cc/export/helpers.h b/src/cc/export/helpers.h
index c1253e2..7ede57a 100644
--- a/src/cc/export/helpers.h
+++ b/src/cc/export/helpers.h
@@ -109,6 +109,12 @@
   void (*increment) (_key_type, ...); \
   void (*atomic_increment) (_key_type, ...); \
   int (*get_stackid) (void *, u64); \
+  void * (*sk_storage_get) (void *, void *, int); \
+  int (*sk_storage_delete) (void *); \
+  void * (*inode_storage_get) (void *, void *, int); \
+  int (*inode_storage_delete) (void *); \
+  void * (*task_storage_get) (void *, void *, int); \
+  int (*task_storage_delete) (void *); \
   u32 max_entries; \
   int flags; \
 }; \
@@ -164,8 +170,17 @@
 #define BPF_TABLE(_table_type, _key_type, _leaf_type, _name, _max_entries) \
 BPF_F_TABLE(_table_type, _key_type, _leaf_type, _name, _max_entries, 0)
 
-#define BPF_TABLE_PINNED(_table_type, _key_type, _leaf_type, _name, _max_entries, _pinned) \
-BPF_TABLE(_table_type ":" _pinned, _key_type, _leaf_type, _name, _max_entries)
+#define BPF_TABLE_PINNED7(_table_type, _key_type, _leaf_type, _name, _max_entries, _pinned, _flags) \
+  BPF_F_TABLE(_table_type ":" _pinned, _key_type, _leaf_type, _name, _max_entries, _flags)
+
+#define BPF_TABLE_PINNED6(_table_type, _key_type, _leaf_type, _name, _max_entries, _pinned) \
+  BPF_F_TABLE(_table_type ":" _pinned, _key_type, _leaf_type, _name, _max_entries, 0)
+
+#define BPF_TABLE_PINNEDX(_1, _2, _3, _4, _5, _6, _7, NAME, ...) NAME
+
+// Define a pinned table with optional flags argument
+#define BPF_TABLE_PINNED(...) \
+  BPF_TABLE_PINNEDX(__VA_ARGS__, BPF_TABLE_PINNED7, BPF_TABLE_PINNED6)(__VA_ARGS__)
 
 // define a table same as above but allow it to be referenced by other modules
 #define BPF_TABLE_PUBLIC(_table_type, _key_type, _leaf_type, _name, _max_entries) \
@@ -952,6 +967,20 @@
   (void *)BPF_FUNC_get_func_arg;
 static long (*bpf_get_func_ret)(void *ctx, __u64 *value) = (void *)BPF_FUNC_get_func_ret;
 static long (*bpf_get_func_arg_cnt)(void *ctx) = (void *)BPF_FUNC_get_func_arg_cnt;
+static int (*bpf_get_retval)(void) = (void *)BPF_FUNC_get_retval;
+static int (*bpf_set_retval)(int retval) = (void *)BPF_FUNC_set_retval;
+static __u64 (*bpf_xdp_get_buff_len)(struct xdp_md *xdp_md) = (void *)BPF_FUNC_xdp_get_buff_len;
+static long (*bpf_xdp_load_bytes)(struct xdp_md *xdp_md, __u32 offset, void *buf, __u32 len) =
+  (void *)BPF_FUNC_xdp_load_bytes;
+static long (*bpf_xdp_store_bytes)(struct xdp_md *xdp_md, __u32 offset, void *buf, __u32 len) =
+  (void *)BPF_FUNC_xdp_store_bytes;
+static long (*bpf_copy_from_user_task)(void *dst, __u32 size, const void *user_ptr,
+				       struct task_struct *tsk, __u64 flags) =
+  (void *)BPF_FUNC_copy_from_user_task;
+static long (*bpf_skb_set_tstamp)(struct __sk_buff *skb, __u64 tstamp, __u32 tstamp_type) =
+  (void *)BPF_FUNC_skb_set_tstamp;
+static long (*bpf_ima_file_hash)(struct file *file, void *dst, __u32 size) =
+  (void *)BPF_FUNC_ima_file_hash;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/src/cc/frontends/clang/b_frontend_action.cc b/src/cc/frontends/clang/b_frontend_action.cc
index 7bfc4ed..9b2853a 100644
--- a/src/cc/frontends/clang/b_frontend_action.cc
+++ b/src/cc/frontends/clang/b_frontend_action.cc
@@ -811,10 +811,23 @@
   if (fe_.is_rewritable_ext_func(D)) {
     current_fn_ = string(D->getName());
     string bd = rewriter_.getRewrittenText(expansionRange(D->getSourceRange()));
-    fe_.func_src_.set_src(current_fn_, bd);
+    auto func_info = fe_.prog_func_info_.add_func(current_fn_);
+    if (!func_info) {
+      // We should only reach add_func above once per function seen, but the
+      // BPF_PROG-helper using macros in export/helpers.h (KFUNC_PROBE ..
+      // LSM_PROBE) break this logic. TODO: adjust export/helpers.h to not
+      // do so and bail out here, or find a better place to do add_func
+      func_info = fe_.prog_func_info_.get_func(current_fn_);
+      //error(GET_BEGINLOC(D), "redefinition of existing function");
+      //return false;
+    }
+    func_info->src_ = bd;
     fe_.func_range_[current_fn_] = expansionRange(D->getSourceRange());
-    string attr = string("__attribute__((section(\"") + BPF_FN_PREFIX + D->getName().str() + "\")))\n";
-    rewriter_.InsertText(real_start_loc, attr);
+    if (!D->getAttr<SectionAttr>()) {
+      string attr = string("__attribute__((section(\"") + BPF_FN_PREFIX +
+                    D->getName().str() + "\")))\n";
+      rewriter_.InsertText(real_start_loc, attr);
+    }
     if (D->param_size() > MAX_CALLING_CONV_REGS + 1) {
       error(GET_BEGINLOC(D->getParamDecl(MAX_CALLING_CONV_REGS + 1)),
             "too many arguments, bcc only supports in-register parameters");
@@ -1689,13 +1702,12 @@
 
 }
 
-BFrontendAction::BFrontendAction(llvm::raw_ostream &os, unsigned flags,
-                                 TableStorage &ts, const std::string &id,
-                                 const std::string &main_path,
-                                 FuncSource &func_src, std::string &mod_src,
-                                 const std::string &maps_ns,
-                                 fake_fd_map_def &fake_fd_map,
-                                 std::map<std::string, std::vector<std::string>> &perf_events)
+BFrontendAction::BFrontendAction(
+    llvm::raw_ostream &os, unsigned flags, TableStorage &ts,
+    const std::string &id, const std::string &main_path,
+    ProgFuncInfo &prog_func_info, std::string &mod_src,
+    const std::string &maps_ns, fake_fd_map_def &fake_fd_map,
+    std::map<std::string, std::vector<std::string>> &perf_events)
     : os_(os),
       flags_(flags),
       ts_(ts),
@@ -1703,7 +1715,7 @@
       maps_ns_(maps_ns),
       rewriter_(new Rewriter),
       main_path_(main_path),
-      func_src_(func_src),
+      prog_func_info_(prog_func_info),
       mod_src_(mod_src),
       next_fake_fd_(-1),
       fake_fd_map_(fake_fd_map),
@@ -1781,7 +1793,9 @@
   for (auto func : func_range_) {
     auto f = func.first;
     string bd = rewriter_->getRewrittenText(func_range_[f]);
-    func_src_.set_src_rewritten(f, bd);
+    auto fn = prog_func_info_.get_func(f);
+    if (fn)
+      fn->src_rewritten_ = bd;
   }
   rewriter_->getEditBuffer(rewriter_->getSourceMgr().getMainFileID()).write(os_);
   os_.flush();
diff --git a/src/cc/frontends/clang/b_frontend_action.h b/src/cc/frontends/clang/b_frontend_action.h
index 530d322..2256459 100644
--- a/src/cc/frontends/clang/b_frontend_action.h
+++ b/src/cc/frontends/clang/b_frontend_action.h
@@ -40,7 +40,7 @@
 namespace ebpf {
 
 class BFrontendAction;
-class FuncSource;
+class ProgFuncInfo;
 
 // Traces maps with external pointers as values.
 class MapVisitor : public clang::RecursiveASTVisitor<MapVisitor> {
@@ -156,9 +156,8 @@
   // should be written.
   BFrontendAction(llvm::raw_ostream &os, unsigned flags, TableStorage &ts,
                   const std::string &id, const std::string &main_path,
-                  FuncSource &func_src, std::string &mod_src,
-                  const std::string &maps_ns,
-                  fake_fd_map_def &fake_fd_map,
+                  ProgFuncInfo &prog_func_info, std::string &mod_src,
+                  const std::string &maps_ns, fake_fd_map_def &fake_fd_map,
                   std::map<std::string, std::vector<std::string>> &perf_events);
 
   // Called by clang when the AST has been completed, here the output stream
@@ -192,7 +191,7 @@
   friend class BTypeVisitor;
   std::map<std::string, clang::SourceRange> func_range_;
   const std::string &main_path_;
-  FuncSource &func_src_;
+  ProgFuncInfo &prog_func_info_;
   std::string &mod_src_;
   std::set<clang::Decl *> m_;
   int next_fake_fd_;
diff --git a/src/cc/frontends/clang/loader.cc b/src/cc/frontends/clang/loader.cc
index 4f9914a..d0f4d88 100644
--- a/src/cc/frontends/clang/loader.cc
+++ b/src/cc/frontends/clang/loader.cc
@@ -66,6 +66,44 @@
 
 namespace ebpf {
 
+optional<FuncInfo &> ProgFuncInfo::get_func(std::string name) {
+  auto entry = funcs_.find(name);
+  if (entry == funcs_.end())
+    return nullopt;  // no function stored under this name
+  return entry->second;  // reference into funcs_, not a copy
+}
+
+optional<FuncInfo &> ProgFuncInfo::get_func(size_t id) {
+  auto entry = func_idx_.find(id);
+  if (entry == func_idx_.end())
+    return nullopt;  // unknown id
+  return get_func(entry->second);  // resolve id -> name -> FuncInfo
+}
+
+optional<std::string &> ProgFuncInfo::func_name(size_t id) {
+  auto entry = func_idx_.find(id);
+  if (entry == func_idx_.end())
+    return nullopt;  // unknown id
+  return entry->second;  // reference to the name stored in func_idx_
+}
+
+void ProgFuncInfo::for_each_func(
+    std::function<void(std::string, FuncInfo &)> cb) {
+  for (auto &entry : funcs_) {  // visits funcs_ in the map's key order
+    cb(entry.first, entry.second);
+  }
+}
+
+optional<FuncInfo &> ProgFuncInfo::add_func(std::string name) {
+  if (get_func(name))
+    return nullopt;  // name already registered; nothing is modified
+  // ids are handed out sequentially, in registration order
+  size_t next_id = funcs_.size();
+  funcs_.emplace(name, 0);  // the 0 only satisfies FuncInfo's dummy ctor
+  func_idx_.emplace(next_id, name);
+  return get_func(name);
+}
+
 ClangLoader::ClangLoader(llvm::LLVMContext *ctx, unsigned flags)
     : ctx_(ctx), flags_(flags)
 {
@@ -152,13 +190,12 @@
 
 }
 
-int ClangLoader::parse(unique_ptr<llvm::Module> *mod, TableStorage &ts,
-                       const string &file, bool in_memory, const char *cflags[],
-                       int ncflags, const std::string &id, FuncSource &func_src,
-                       std::string &mod_src,
-                       const std::string &maps_ns,
-                       fake_fd_map_def &fake_fd_map,
-                       std::map<std::string, std::vector<std::string>> &perf_events) {
+int ClangLoader::parse(
+    unique_ptr<llvm::Module> *mod, TableStorage &ts, const string &file,
+    bool in_memory, const char *cflags[], int ncflags, const std::string &id,
+    ProgFuncInfo &prog_func_info, std::string &mod_src,
+    const std::string &maps_ns, fake_fd_map_def &fake_fd_map,
+    std::map<std::string, std::vector<std::string>> &perf_events) {
   string main_path = "/virtual/main.c";
   unique_ptr<llvm::MemoryBuffer> main_buf;
   struct utsname un;
@@ -249,6 +286,7 @@
     return -1;
 #if LLVM_MAJOR_VERSION >= 9
   flags_cstr.push_back("-g");
+  flags_cstr.push_back("-gdwarf-4");
 #else
   if (flags_ & DEBUG_SOURCE)
     flags_cstr.push_back("-g");
@@ -280,7 +318,8 @@
 #endif
 
   if (do_compile(mod, ts, in_memory, flags_cstr, flags_cstr_rem, main_path,
-                 main_buf, id, func_src, mod_src, true, maps_ns, fake_fd_map, perf_events)) {
+                 main_buf, id, prog_func_info, mod_src, true, maps_ns,
+                 fake_fd_map, perf_events)) {
 #if BCC_BACKUP_COMPILE != 1
     return -1;
 #else
@@ -288,11 +327,12 @@
     llvm::errs() << "WARNING: compilation failure, trying with system bpf.h\n";
 
     ts.DeletePrefix(Path({id}));
-    func_src.clear();
+    prog_func_info.clear();
     mod_src.clear();
     fake_fd_map.clear();
     if (do_compile(mod, ts, in_memory, flags_cstr, flags_cstr_rem, main_path,
-                   main_buf, id, func_src, mod_src, false, maps_ns, fake_fd_map, perf_events))
+                   main_buf, id, prog_func_info, mod_src, false, maps_ns,
+                   fake_fd_map, perf_events))
       return -1;
 #endif
   }
@@ -334,17 +374,14 @@
   return string(ret);
 }
 
-int ClangLoader::do_compile(unique_ptr<llvm::Module> *mod, TableStorage &ts,
-                            bool in_memory,
-                            const vector<const char *> &flags_cstr_in,
-                            const vector<const char *> &flags_cstr_rem,
-                            const std::string &main_path,
-                            const unique_ptr<llvm::MemoryBuffer> &main_buf,
-                            const std::string &id, FuncSource &func_src,
-                            std::string &mod_src, bool use_internal_bpfh,
-                            const std::string &maps_ns,
-                            fake_fd_map_def &fake_fd_map,
-                            std::map<std::string, std::vector<std::string>> &perf_events) {
+int ClangLoader::do_compile(
+    unique_ptr<llvm::Module> *mod, TableStorage &ts, bool in_memory,
+    const vector<const char *> &flags_cstr_in,
+    const vector<const char *> &flags_cstr_rem, const std::string &main_path,
+    const unique_ptr<llvm::MemoryBuffer> &main_buf, const std::string &id,
+    ProgFuncInfo &prog_func_info, std::string &mod_src, bool use_internal_bpfh,
+    const std::string &maps_ns, fake_fd_map_def &fake_fd_map,
+    std::map<std::string, std::vector<std::string>> &perf_events) {
   using namespace clang;
 
   vector<const char *> flags_cstr = flags_cstr_in;
@@ -444,7 +481,7 @@
   // capture the rewritten c file
   string out_str1;
   llvm::raw_string_ostream os1(out_str1);
-  BFrontendAction bact(os1, flags_, ts, id, main_path, func_src, mod_src,
+  BFrontendAction bact(os1, flags_, ts, id, main_path, prog_func_info, mod_src,
                        maps_ns, fake_fd_map, perf_events);
   if (!compiler1.ExecuteAction(bact))
     return -1;
@@ -474,27 +511,4 @@
 
   return 0;
 }
-
-const char * FuncSource::src(const std::string& name) {
-  auto src = funcs_.find(name);
-  if (src == funcs_.end())
-    return "";
-  return src->second.src_.data();
-}
-
-const char * FuncSource::src_rewritten(const std::string& name) {
-  auto src = funcs_.find(name);
-  if (src == funcs_.end())
-    return "";
-  return src->second.src_rewritten_.data();
-}
-
-void FuncSource::set_src(const std::string& name, const std::string& src) {
-  funcs_[name].src_ = src;
-}
-
-void FuncSource::set_src_rewritten(const std::string& name, const std::string& src) {
-  funcs_[name].src_rewritten_ = src;
-}
-
 }  // namespace ebpf
diff --git a/src/cc/frontends/clang/loader.h b/src/cc/frontends/clang/loader.h
index 05db08c..aa6f9ee 100644
--- a/src/cc/frontends/clang/loader.h
+++ b/src/cc/frontends/clang/loader.h
@@ -16,13 +16,18 @@
 
 #pragma once
 
+#include <clang/Frontend/CompilerInvocation.h>
+
+#include <functional>
 #include <map>
 #include <memory>
 #include <string>
 
-#include <clang/Frontend/CompilerInvocation.h>
-
 #include "table_storage.h"
+#include "vendor/optional.hpp"
+
+using std::experimental::nullopt;
+using std::experimental::optional;
 
 namespace llvm {
 class Module;
@@ -32,21 +37,33 @@
 
 namespace ebpf {
 
-class FuncSource {
-  class SourceCode {
-   public:
-    SourceCode(const std::string& s1 = "", const std::string& s2 = ""): src_(s1), src_rewritten_(s2) {}
-    std::string src_;
-    std::string src_rewritten_;
-  };
-  std::map<std::string, SourceCode> funcs_;
+struct FuncInfo {
+  uint8_t *start_ = nullptr;
+  size_t size_ = 0;  // value returned by BPFModule::function_size()
+  std::string section_;
+  std::string src_;  // function text captured when the decl is visited
+  std::string src_rewritten_;  // text re-read just before output is written
+  // dummy constructor so emplace() works
+  FuncInfo(int i) {}
+};
+
+class ProgFuncInfo {
  public:
-  FuncSource() {}
-  void clear() { funcs_.clear(); }
-  const char * src(const std::string& name);
-  const char * src_rewritten(const std::string& name);
-  void set_src(const std::string& name, const std::string& src);
-  void set_src_rewritten(const std::string& name, const std::string& src);
+  ProgFuncInfo() {}
+  void clear() {
+    funcs_.clear();
+    func_idx_.clear();  // keep the id index in sync with funcs_
+  }
+  optional<FuncInfo &> get_func(std::string name);
+  optional<FuncInfo &> get_func(size_t id);
+  optional<std::string &> func_name(size_t id);
+  optional<FuncInfo &> add_func(std::string name);  // nullopt if name exists
+  size_t num_funcs() { return funcs_.size(); }
+  void for_each_func(std::function<void(std::string, FuncInfo &)> cb);
+
+ private:
+  std::map<std::string, FuncInfo> funcs_;  // keyed by function name
+  std::map<uint32_t, std::string> func_idx_;  // sequential id -> name
 };
 
 class ClangLoader {
@@ -55,7 +72,7 @@
   ~ClangLoader();
   int parse(std::unique_ptr<llvm::Module> *mod, TableStorage &ts,
             const std::string &file, bool in_memory, const char *cflags[],
-            int ncflags, const std::string &id, FuncSource &func_src,
+            int ncflags, const std::string &id, ProgFuncInfo &prog_func_info,
             std::string &mod_src, const std::string &maps_ns,
             fake_fd_map_def &fake_fd_map,
             std::map<std::string, std::vector<std::string>> &perf_events);
@@ -66,10 +83,9 @@
                  const std::vector<const char *> &flags_cstr_rem,
                  const std::string &main_path,
                  const std::unique_ptr<llvm::MemoryBuffer> &main_buf,
-                 const std::string &id, FuncSource &func_src,
+                 const std::string &id, ProgFuncInfo &prog_func_info,
                  std::string &mod_src, bool use_internal_bpfh,
-                 const std::string &maps_ns,
-                 fake_fd_map_def &fake_fd_map,
+                 const std::string &maps_ns, fake_fd_map_def &fake_fd_map,
                  std::map<std::string, std::vector<std::string>> &perf_events);
   void add_remapped_includes(clang::CompilerInvocation& invocation);
   void add_main_input(clang::CompilerInvocation& invocation,
diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c
index bbd6161..c93f82f 100644
--- a/src/cc/libbpf.c
+++ b/src/cc/libbpf.c
@@ -289,7 +289,15 @@
   {"strncmp", "5.17"},
   {"get_func_arg", "5.17"},
   {"get_func_ret", "5.17"},
-  {"get_func_arg_cnt", "5.17"},
+  {"get_func_ret", "5.17"},
+  {"get_retval", "5.18"},
+  {"set_retval", "5.18"},
+  {"xdp_get_buff_len", "5.18"},
+  {"xdp_load_bytes", "5.18"},
+  {"xdp_store_bytes", "5.18"},
+  {"copy_from_user_task", "5.18"},
+  {"skb_set_tstamp", "5.18"},
+  {"ima_file_hash", "5.18"},
 };
 
 static uint64_t ptr_to_u64(void *ptr)
@@ -1386,7 +1394,16 @@
 #ifndef MINIMAL_LIBBPF
 bool bpf_has_kernel_btf(void)
 {
-  return libbpf_find_vmlinux_btf_id("bpf_prog_put", 0) > 0;
+  struct btf *vmlinux_btf;
+
+  /* Kernel BTF counts as present only if the sysfs blob parses. */
+  vmlinux_btf = btf__parse_raw("/sys/kernel/btf/vmlinux");
+  if (libbpf_get_error(vmlinux_btf))
+    return false;
+
+  /* Only existence matters; release the parsed object right away. */
+  btf__free(vmlinux_btf);
+  return true;
 }
 
 int kernel_struct_has_field(const char *struct_name, const char *field_name)
diff --git a/src/cc/perf_reader.c b/src/cc/perf_reader.c
index dedb11d..f4c24fd 100644
--- a/src/cc/perf_reader.c
+++ b/src/cc/perf_reader.c
@@ -93,7 +93,7 @@
     return -1;
   }
 
-  reader->base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE , MAP_SHARED, reader->fd, 0);
+  reader->base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, reader->fd, 0);
   if (reader->base == MAP_FAILED) {
     perror("mmap");
     return -1;
@@ -237,6 +237,14 @@
   return 0;
 }
 
+int perf_reader_consume(int num_readers, struct perf_reader **readers) {
+  int idx = 0;
+  /* Read every reader once, in array order; there is no polling here. */
+  while (idx < num_readers)
+    perf_reader_event_read(readers[idx++]);
+  return 0;
+}
+
 void perf_reader_set_fd(struct perf_reader *reader, int fd) {
   reader->fd = fd;
 }
diff --git a/src/cc/perf_reader.h b/src/cc/perf_reader.h
index dbe9cfb..278b885 100644
--- a/src/cc/perf_reader.h
+++ b/src/cc/perf_reader.h
@@ -32,6 +32,7 @@
 int perf_reader_mmap(struct perf_reader *reader);
 void perf_reader_event_read(struct perf_reader *reader);
 int perf_reader_poll(int num_readers, struct perf_reader **readers, int timeout);
+int perf_reader_consume(int num_readers, struct perf_reader **readers);
 int perf_reader_fd(struct perf_reader *reader);
 void perf_reader_set_fd(struct perf_reader *reader, int fd);
 
diff --git a/src/cc/usdt.h b/src/cc/usdt.h
index 5f12588..a3d9bfe 100644
--- a/src/cc/usdt.h
+++ b/src/cc/usdt.h
@@ -166,6 +166,22 @@
     X64_REG_14,
     X64_REG_15,
     X64_REG_RIP,
+    X64_REG_XMM0,
+    X64_REG_XMM1,
+    X64_REG_XMM2,
+    X64_REG_XMM3,
+    X64_REG_XMM4,
+    X64_REG_XMM5,
+    X64_REG_XMM6,
+    X64_REG_XMM7,
+    X64_REG_XMM8,
+    X64_REG_XMM9,
+    X64_REG_XMM10,
+    X64_REG_XMM11,
+    X64_REG_XMM12,
+    X64_REG_XMM13,
+    X64_REG_XMM14,
+    X64_REG_XMM15,
   };
 
   struct RegInfo {
diff --git a/src/cc/usdt/usdt.cc b/src/cc/usdt/usdt.cc
index e3d0c44..84f8076 100644
--- a/src/cc/usdt/usdt.cc
+++ b/src/cc/usdt/usdt.cc
@@ -175,6 +175,11 @@
   if (arg_count == 0)
     return true;
 
+  uint64_t page_size = sysconf(_SC_PAGESIZE);
+  std::unordered_set<int> page_offsets;
+  for (Location &location : locations_)
+    page_offsets.insert(location.address_ % page_size);
+
   for (size_t arg_n = 0; arg_n < arg_count; ++arg_n) {
     std::string ctype = largest_arg_type(arg_n);
     std::string cptr = tfm::format("*((%s *)dest)", ctype);
@@ -193,15 +198,22 @@
         return false;
       stream << "\n  return 0;\n}\n";
     } else {
-      stream << "  switch(PT_REGS_IP(ctx)) {\n";
+      if (page_offsets.size() == locations_.size())
+        tfm::format(stream, "  switch (PT_REGS_IP(ctx) %% 0x%xULL) {\n", page_size);
+      else
+        stream << "  switch (PT_REGS_IP(ctx)) {\n";
       for (Location &location : locations_) {
-        uint64_t global_address;
+        if (page_offsets.size() == locations_.size()) {
+          tfm::format(stream, "  case 0x%xULL: ", location.address_ % page_size);
+        } else {
+          uint64_t global_address;
 
-        if (!resolve_global_address(&global_address, location.bin_path_,
-                                    location.address_))
-          return false;
+          if (!resolve_global_address(&global_address, location.bin_path_,
+                                      location.address_))
+            return false;
 
-        tfm::format(stream, "  case 0x%xULL: ", global_address);
+          tfm::format(stream, "  case 0x%xULL: ", global_address);
+        }
         if (!location.arguments_[arg_n].assign_to_local(stream, cptr, location.bin_path_,
                                                         pid_))
           return false;
diff --git a/src/cc/usdt/usdt_args.cc b/src/cc/usdt/usdt_args.cc
index c3384e1..88555c3 100644
--- a/src/cc/usdt/usdt_args.cc
+++ b/src/cc/usdt/usdt_args.cc
@@ -69,7 +69,13 @@
   }
 
   if (!deref_offset_) {
-    tfm::format(stream, "%s = ctx->%s;", local_name, *base_register_name_);
+    if(base_register_name_->substr(0,3) == "xmm") {
+      // TODO: When we can read xmm registers from BPF, update this to read
+      // the actual value
+      tfm::format(stream, "%s = 0;", local_name);
+    } else {
+      tfm::format(stream, "%s = ctx->%s;", local_name, *base_register_name_);
+    }
     // Put a compiler barrier to prevent optimization
     // like llvm SimplifyCFG SinkThenElseCodeToEnd
     // Volatile marking is not sufficient to prevent such optimization.
@@ -532,6 +538,23 @@
         {"r15w", {X64_REG_15, 2}}, {"r15b", {X64_REG_15, 1}},
 
         {"rip", {X64_REG_RIP, 8}},
+
+        {"xmm0", {X64_REG_XMM0, 16}},
+        {"xmm1", {X64_REG_XMM1, 16}},
+        {"xmm2", {X64_REG_XMM2, 16}},
+        {"xmm3", {X64_REG_XMM3, 16}},
+        {"xmm4", {X64_REG_XMM4, 16}},
+        {"xmm5", {X64_REG_XMM5, 16}},
+        {"xmm6", {X64_REG_XMM6, 16}},
+        {"xmm7", {X64_REG_XMM7, 16}},
+        {"xmm8", {X64_REG_XMM8, 16}},
+        {"xmm9", {X64_REG_XMM9, 16}},
+        {"xmm10", {X64_REG_XMM10, 16}},
+        {"xmm11", {X64_REG_XMM11, 16}},
+        {"xmm12", {X64_REG_XMM12, 16}},
+        {"xmm13", {X64_REG_XMM13, 16}},
+        {"xmm14", {X64_REG_XMM14, 16}},
+        {"xmm15", {X64_REG_XMM15, 16}},
 };
 
 void ArgumentParser_x64::reg_to_name(std::string *norm, Register reg) {
@@ -590,6 +613,56 @@
   case X64_REG_RIP:
     *norm = "ip";
     break;
+
+  case X64_REG_XMM0:
+    *norm = "xmm0";
+    break;
+  case X64_REG_XMM1:
+    *norm = "xmm1";
+    break;
+  case X64_REG_XMM2:
+    *norm = "xmm2";
+    break;
+  case X64_REG_XMM3:
+    *norm = "xmm3";
+    break;
+  case X64_REG_XMM4:
+    *norm = "xmm4";
+    break;
+  case X64_REG_XMM5:
+    *norm = "xmm5";
+    break;
+  case X64_REG_XMM6:
+    *norm = "xmm6";
+    break;
+  case X64_REG_XMM7:
+    *norm = "xmm7";
+    break;
+  case X64_REG_XMM8:
+    *norm = "xmm8";
+    break;
+  case X64_REG_XMM9:
+    *norm = "xmm9";
+    break;
+  case X64_REG_XMM10:
+    *norm = "xmm10";
+    break;
+  case X64_REG_XMM11:
+    *norm = "xmm11";
+    break;
+  case X64_REG_XMM12:
+    *norm = "xmm12";
+    break;
+  case X64_REG_XMM13:
+    *norm = "xmm13";
+    break;
+  case X64_REG_XMM14:
+    *norm = "xmm14";
+    break;
+  case X64_REG_XMM15:
+    *norm = "xmm15";
+    break;
+
   }
 }
 
diff --git a/src/python/bcc/__init__.py b/src/python/bcc/__init__.py
index 2ff5cf0..1118698 100644
--- a/src/python/bcc/__init__.py
+++ b/src/python/bcc/__init__.py
@@ -957,7 +957,8 @@
             ct.cast(None, ct.POINTER(bcc_symbol_option)),
             ct.byref(sym),
         ) < 0:
-            raise Exception("could not determine address of symbol %s" % symname)
+            raise Exception("could not determine address of symbol %s in %s"
+                            % (symname.decode(), module.decode()))
         new_addr = sym.offset + sym_off
         module_path = ct.cast(sym.module, ct.c_char_p).value
         lib.bcc_procutils_free(sym.module)
@@ -1667,6 +1668,18 @@
             readers[i] = v
         lib.perf_reader_poll(len(readers), readers, timeout)
 
+    def perf_buffer_consume(self):
+        """perf_buffer_consume(self)
+
+        Consume all open perf buffers, regardless of whether or not
+        they currently contain events data. Necessary to catch 'remainder'
+        events when wakeup_events > 1 is set in open_perf_buffer
+        """
+        readers = (ct.c_void_p * len(self.perf_buffers))()
+        for i, v in enumerate(self.perf_buffers.values()):
+            readers[i] = v
+        lib.perf_reader_consume(len(readers), readers)
+
     def kprobe_poll(self, timeout = -1):
         """kprobe_poll(self)
 
diff --git a/src/python/bcc/libbcc.py b/src/python/bcc/libbcc.py
index f9b83b3..ca5584c 100644
--- a/src/python/bcc/libbcc.py
+++ b/src/python/bcc/libbcc.py
@@ -26,6 +26,8 @@
 lib.bpf_module_create_c_from_string.restype = ct.c_void_p
 lib.bpf_module_create_c_from_string.argtypes = [ct.c_char_p, ct.c_uint,
         ct.POINTER(ct.c_char_p), ct.c_int, ct.c_bool, ct.c_char_p]
+lib.bpf_module_rw_engine_enabled.restype = ct.c_bool
+lib.bpf_module_rw_engine_enabled.argtypes = None
 lib.bpf_module_destroy.restype = None
 lib.bpf_module_destroy.argtypes = [ct.c_void_p]
 lib.bpf_module_license.restype = ct.c_char_p
@@ -147,6 +149,8 @@
 lib.bpf_open_perf_event.argtypes = [ct.c_uint, ct.c_ulonglong, ct.c_int, ct.c_int]
 lib.perf_reader_poll.restype = ct.c_int
 lib.perf_reader_poll.argtypes = [ct.c_int, ct.POINTER(ct.c_void_p), ct.c_int]
+lib.perf_reader_consume.restype = ct.c_int
+lib.perf_reader_consume.argtypes = [ct.c_int, ct.POINTER(ct.c_void_p)]
 lib.perf_reader_free.restype = None
 lib.perf_reader_free.argtypes = [ct.c_void_p]
 lib.perf_reader_fd.restype = int
diff --git a/tests/cc/test_bpf_table.cc b/tests/cc/test_bpf_table.cc
index 2d5a564..43bf28b 100644
--- a/tests/cc/test_bpf_table.cc
+++ b/tests/cc/test_bpf_table.cc
@@ -21,7 +21,7 @@
 #include "BPF.h"
 #include "catch.hpp"
 
-TEST_CASE("test bpf table", "[bpf_table]") {
+TEST_CASE("test bpf table", ebpf::bpf_module_rw_engine_enabled() ? "[bpf_table]" : "[bpf_table][!mayfail]") {
   const std::string BPF_PROGRAM = R"(
     BPF_TABLE("hash", int, int, myhash, 128);
   )";
@@ -92,7 +92,7 @@
 }
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0)
-TEST_CASE("test bpf percpu tables", "[bpf_percpu_table]") {
+TEST_CASE("test bpf percpu tables", ebpf::bpf_module_rw_engine_enabled() ? "[bpf_percpu_table]" : "[bpf_percpu_table][!mayfail]") {
   const std::string BPF_PROGRAM = R"(
     BPF_PERCPU_HASH(myhash, int, u64, 128);
   )";
diff --git a/tests/cc/test_pinned_table.cc b/tests/cc/test_pinned_table.cc
index 265a8be..e478b40 100644
--- a/tests/cc/test_pinned_table.cc
+++ b/tests/cc/test_pinned_table.cc
@@ -47,7 +47,7 @@
   // test table access
   {
     const std::string BPF_PROGRAM = R"(
-      BPF_TABLE_PINNED("hash", u64, u64, ids, 1024, "/sys/fs/bpf/test_pinned_table");
+      BPF_TABLE_PINNED("hash", u64, u64, ids, 0, "/sys/fs/bpf/test_pinned_table", BPF_F_NO_PREALLOC);
     )";
 
     ebpf::BPF bpf;
@@ -85,3 +85,65 @@
   }
 }
 #endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0)
+TEST_CASE("test pinned sk_storage table", "[pinned_sk_storage_table]") {
+  bool mounted = false;
+  if (system("mount | grep /sys/fs/bpf")) {
+    REQUIRE(system("mkdir -p /sys/fs/bpf") == 0);
+    REQUIRE(system("mount -o nosuid,nodev,noexec,mode=700 -t bpf bpf /sys/fs/bpf") == 0);
+    mounted = true;
+  }
+  // prepare test by pinning table to bpffs
+  {
+    const std::string BPF_PROGRAM = R"(
+      BPF_SK_STORAGE(sk_stg, __u64);
+      int test(struct __sk_buff *skb) { return 0; }
+    )";
+
+    ebpf::BPF bpf;
+    ebpf::StatusTuple res(0);
+    res = bpf.init(BPF_PROGRAM);
+    REQUIRE(res.ok());
+
+    REQUIRE(bpf_obj_pin(bpf.get_sk_storage_table<unsigned long long>("sk_stg").get_fd(), "/sys/fs/bpf/test_pinned_table") == 0);
+  }
+
+  // exercise <pinned_map>.sk_storage_get().
+  {
+    const std::string BPF_PROGRAM = R"(
+      BPF_TABLE_PINNED("sk_storage", __u32, __u64, sk_stg, 0, "/sys/fs/bpf/test_pinned_table");
+      int test(struct __sk_buff *skb) {
+        struct bpf_sock *sk;
+        __u64 *val;
+
+        sk = skb->sk;
+        if (!sk)
+          return 0;
+        sk = bpf_sk_fullsock(sk);
+        if (!sk)
+          return 0;
+
+        val = sk_stg.sk_storage_get(sk, NULL, BPF_SK_STORAGE_GET_F_CREATE);
+        if (!val)
+          return 0;
+
+        return 1;
+      }
+    )";
+
+    ebpf::BPF bpf;
+    ebpf::StatusTuple res(0);
+    res = bpf.init(BPF_PROGRAM);
+    REQUIRE(res.ok());
+    int prog_fd;
+    res = bpf.load_func("test", BPF_PROG_TYPE_CGROUP_SKB, prog_fd);
+    REQUIRE(res.ok());
+  }
+
+  unlink("/sys/fs/bpf/test_pinned_table");
+  if (mounted) {
+    REQUIRE(umount("/sys/fs/bpf") == 0);
+  }
+}
+#endif
diff --git a/tests/python/test_clang.py b/tests/python/test_clang.py
index 7bf12cc..519e502 100755
--- a/tests/python/test_clang.py
+++ b/tests/python/test_clang.py
@@ -3,6 +3,7 @@
 # Licensed under the Apache License, Version 2.0 (the "License")
 
 from bcc import BPF
+from bcc.libbcc import lib
 import ctypes as ct
 from unittest import main, skipUnless, TestCase
 from utils import kernel_version_ge
@@ -143,6 +144,7 @@
         b = BPF(text=text, debug=0)
         fns = b.load_funcs(BPF.KPROBE)
 
+    @skipUnless(lib.bpf_module_rw_engine_enabled(), "requires enabled rwengine")
     def test_sscanf(self):
         text = """
 BPF_HASH(stats, int, struct { u64 a; u64 b; u64 c:36; u64 d:28; struct { u32 a; u32 b; } s; }, 10);
@@ -164,6 +166,7 @@
         self.assertEqual(l.s.a, 5)
         self.assertEqual(l.s.b, 6)
 
+    @skipUnless(lib.bpf_module_rw_engine_enabled(), "requires enabled rwengine")
     def test_sscanf_array(self):
         text = """
 BPF_HASH(stats, int, struct { u32 a[3]; u32 b; }, 10);
@@ -180,6 +183,7 @@
         self.assertEqual(l.a[2], 3)
         self.assertEqual(l.b, 4)
 
+    @skipUnless(lib.bpf_module_rw_engine_enabled(), "requires enabled rwengine")
     def test_sscanf_string(self):
         text = """
 struct Symbol {
diff --git a/tests/python/test_tools_smoke.py b/tests/python/test_tools_smoke.py
index ebc1728..879bdb1 100755
--- a/tests/python/test_tools_smoke.py
+++ b/tests/python/test_tools_smoke.py
@@ -358,6 +358,9 @@
     def test_tcptop(self):
         self.run_with_duration("tcptop.py 1 1")
 
+    def test_tcpcong(self):
+        self.run_with_duration("tcpcong.py 1 1")
+
     def test_tplist(self):
         self.run_with_duration("tplist.py -p %d" % os.getpid())
 
diff --git a/tools/bashreadline.py b/tools/bashreadline.py
index 908a145..3e18997 100755
--- a/tools/bashreadline.py
+++ b/tools/bashreadline.py
@@ -68,11 +68,11 @@
 b.attach_uretprobe(name=name, sym="readline", fn_name="printret")
 
 # header
-print("%-9s %-6s %s" % ("TIME", "PID", "COMMAND"))
+print("%-9s %-7s %s" % ("TIME", "PID", "COMMAND"))
 
 def print_event(cpu, data, size):
     event = b["events"].event(data)
-    print("%-9s %-6d %s" % (strftime("%H:%M:%S"), event.pid,
+    print("%-9s %-7d %s" % (strftime("%H:%M:%S"), event.pid,
                             event.str.decode('utf-8', 'replace')))
 
 b["events"].open_perf_buffer(print_event)
diff --git a/tools/bindsnoop.py b/tools/bindsnoop.py
index ac3a8aa..0750335 100755
--- a/tools/bindsnoop.py
+++ b/tools/bindsnoop.py
@@ -27,7 +27,7 @@
 # 14-Feb-2020   Pavel Dubovitsky   Created this.
 
 from __future__ import print_function, absolute_import, unicode_literals
-from bcc import BPF, DEBUG_SOURCE
+from bcc import BPF
 from bcc.containers import filter_by_containers
 from bcc.utils import printb
 import argparse
@@ -243,10 +243,14 @@
     opts.fields.reuseport = bitfield >> 4 & 0x01;
 
     // workaround for reading the sk_protocol bitfield (from tcpaccept.py):
-    u8 protocol;
+    u16 protocol;
     int gso_max_segs_offset = offsetof(struct sock, sk_gso_max_segs);
     int sk_lingertime_offset = offsetof(struct sock, sk_lingertime);
-    if (sk_lingertime_offset - gso_max_segs_offset == 4)
+
+    // Since kernel v5.6 sk_protocol has its own u16 field
+    if (sk_lingertime_offset - gso_max_segs_offset == 2)
+        protocol = skp->sk_protocol;
+    else if (sk_lingertime_offset - gso_max_segs_offset == 4)
         // 4.10+ with little endian
 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
         protocol = *(u8 *)((u64)&skp->sk_gso_max_segs - 3);
diff --git a/tools/biolatency.py b/tools/biolatency.py
index f4e2c9e..63a2a57 100755
--- a/tools/biolatency.py
+++ b/tools/biolatency.py
@@ -4,7 +4,7 @@
 # biolatency    Summarize block device I/O latency as a histogram.
 #       For Linux, uses BCC, eBPF.
 #
-# USAGE: biolatency [-h] [-T] [-Q] [-m] [-D] [-e] [interval] [count]
+# USAGE: biolatency [-h] [-T] [-Q] [-m] [-D] [-F] [-e] [-j] [interval] [count]
 #
 # Copyright (c) 2015 Brendan Gregg.
 # Licensed under the Apache License, Version 2.0 (the "License")
@@ -64,7 +64,7 @@
 # define BPF program
 bpf_text = """
 #include <uapi/linux/ptrace.h>
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 
 typedef struct disk_key {
     char disk[DISK_NAME_LEN];
@@ -128,12 +128,16 @@
 store_str = ""
 if args.disks:
     storage_str += "BPF_HISTOGRAM(dist, disk_key_t);"
-    store_str += """
+    disks_str = """
     disk_key_t key = {.slot = bpf_log2l(delta)};
-    void *__tmp = (void *)req->rq_disk->disk_name;
+    void *__tmp = (void *)req->__RQ_DISK__->disk_name;
     bpf_probe_read(&key.disk, sizeof(key.disk), __tmp);
     dist.atomic_increment(key);
     """
+    if BPF.kernel_struct_has_field(b'request', b'rq_disk'):
+        store_str += disks_str.replace('__RQ_DISK__', 'rq_disk')
+    else:
+        store_str += disks_str.replace('__RQ_DISK__', 'q->disk')
 elif args.flags:
     storage_str += "BPF_HISTOGRAM(dist, flag_key_t);"
     store_str += """
@@ -184,6 +188,12 @@
 if not args.json:
     print("Tracing block device I/O... Hit Ctrl-C to end.")
 
+def disk_print(s):
+    disk = s.decode('utf-8', 'replace')
+    if not disk:
+        disk = "<unknown>"
+    return disk
+
 # see blk_fill_rwbs():
 req_opf = {
     0: "Read",
@@ -252,9 +262,8 @@
 
         if args.flags:
             dist.print_json_hist(label, "flags", flags_print)
-
         else:
-            dist.print_json_hist(label)
+            dist.print_json_hist(label, "disk", disk_print)
 
     else:
         if args.timestamp:
@@ -263,7 +272,7 @@
         if args.flags:
             dist.print_log2_hist(label, "flags", flags_print)
         else:
-            dist.print_log2_hist(label, "disk")
+            dist.print_log2_hist(label, "disk", disk_print)
         if args.extension:
             total = extension[0].total
             counts = extension[0].count
diff --git a/tools/biolatpcts.py b/tools/biolatpcts.py
index a2f5959..ea8b1ce 100755
--- a/tools/biolatpcts.py
+++ b/tools/biolatpcts.py
@@ -56,6 +56,7 @@
 bpf_source = """
 #include <linux/blk_types.h>
 #include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <linux/time64.h>
 
 BPF_PERCPU_ARRAY(rwdf_100ms, u64, 400);
@@ -71,9 +72,9 @@
         if (!rq->__START_TIME_FIELD__)
                 return;
 
-        if (!rq->rq_disk ||
-            rq->rq_disk->major != __MAJOR__ ||
-            rq->rq_disk->first_minor != __MINOR__)
+        if (!rq->__RQ_DISK__ ||
+            rq->__RQ_DISK__->major != __MAJOR__ ||
+            rq->__RQ_DISK__->first_minor != __MINOR__)
                 return;
 
         cmd_flags = rq->cmd_flags;
@@ -141,6 +142,11 @@
 bpf_source = bpf_source.replace('__MAJOR__', str(major))
 bpf_source = bpf_source.replace('__MINOR__', str(minor))
 
+if BPF.kernel_struct_has_field(b'request', b'rq_disk'):
+    bpf_source = bpf_source.replace('__RQ_DISK__', 'rq_disk')
+else:
+    bpf_source = bpf_source.replace('__RQ_DISK__', 'q->disk')
+
 bpf = BPF(text=bpf_source)
 if BPF.get_kprobe_functions(b'__blk_account_io_done'):
     bpf.attach_kprobe(event="__blk_account_io_done", fn_name="kprobe_blk_account_io_done")
diff --git a/tools/biopattern.py b/tools/biopattern.py
new file mode 100755
index 0000000..9bfc077
--- /dev/null
+++ b/tools/biopattern.py
@@ -0,0 +1,147 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# biopattern - Identify random/sequential disk access patterns.
+#              For Linux, uses BCC, eBPF.
+#
+# Copyright (c) 2022 Rocky Xing.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 21-Feb-2022   Rocky Xing   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep, strftime
+import argparse
+import os
+
+examples = """examples:
+    ./biopattern            # show block device I/O pattern.
+    ./biopattern 1 10       # print 1 second summaries, 10 times
+    ./biopattern -d sdb     # show sdb only
+"""
+parser = argparse.ArgumentParser(
+    description="Show block device I/O pattern.",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-d", "--disk", type=str,
+    help="Trace this disk only")
+parser.add_argument("interval", nargs="?", default=99999999,
+    help="Output interval in seconds")
+parser.add_argument("count", nargs="?", default=99999999,
+    help="Number of outputs")
+args = parser.parse_args()
+countdown = int(args.count)
+
+# BPF program: for every completed block request, compare its start sector with
+# the sector the previous request on the same device ended at, and count it as
+# sequential when they match, random otherwise.
+bpf_text="""
+struct counter {
+    u64 last_sector;
+    u64 bytes;
+    u32 sequential;
+    u32 random;
+};
+
+BPF_HASH(counters, u32, struct counter);
+
+TRACEPOINT_PROBE(block, block_rq_complete)
+{
+    struct counter *counterp;
+    struct counter zero = {};
+    u32 dev = args->dev;
+    u64 sector = args->sector;
+    u32 nr_sector = args->nr_sector;
+
+    DISK_FILTER
+
+    counterp = counters.lookup_or_try_init(&dev, &zero);
+    if (counterp == 0) {
+        return 0;
+    }
+
+    if (counterp->last_sector) {
+        if (counterp->last_sector == sector) {
+            __sync_fetch_and_add(&counterp->sequential, 1);
+        } else {
+            __sync_fetch_and_add(&counterp->random, 1);
+        }
+        __sync_fetch_and_add(&counterp->bytes, nr_sector * 512);
+    }
+    counterp->last_sector = sector + nr_sector;
+
+    return 0;
+}
+"""
+
+# Width of the minor-number field in the device numbers this script builds;
+# 20 matches MINORBITS in the kernel's <linux/kdev_t.h>.
+dev_minor_bits = 20
+
+def mkdev(major, minor):
+    """Pack a (major, minor) pair into a single device number."""
+    return (major << dev_minor_bits) | minor
+
+
+# Device number -> partition name, used to label output rows.
+partitions = {}
+
+with open("/proc/partitions", 'r') as f:
+    # The first two lines of the file are skipped, not parsed as partitions.
+    for line in f.readlines()[2:]:
+        words = line.split()
+        if len(words) < 4:
+            # Tolerate blank or truncated lines instead of raising IndexError.
+            continue
+        partitions[mkdev(int(words[0]), int(words[1]))] = words[3]
+
+if args.disk is not None:
+    disk_path = os.path.join('/dev', args.disk)
+    if not os.path.exists(disk_path):
+        print("no such disk '%s'" % args.disk)
+        exit(1)
+
+    stat_info = os.stat(disk_path)
+    major = os.major(stat_info.st_rdev)
+    minor = os.minor(stat_info.st_rdev)
+    bpf_text = bpf_text.replace('DISK_FILTER',
+                                'if (dev != %s) { return 0; }' % mkdev(major, minor))
+else:
+    bpf_text = bpf_text.replace('DISK_FILTER', '')
+
+b = BPF(text=bpf_text)
+
+exiting = 0 if args.interval else 1
+counters = b.get_table("counters")
+
+print("%-9s %-7s %5s %5s %8s %10s" %
+    ("TIME", "DISK", "%RND", "%SEQ", "COUNT", "KBYTES"))
+
+while True:
+    try:
+        sleep(int(args.interval))
+    except KeyboardInterrupt:
+        exiting = 1
+
+    for k, v in counters.items():
+        total = v.random + v.sequential
+        if total == 0:
+            continue
+
+        part_name = partitions.get(k.value, "Unknown")
+
+        # Floor division keeps the values integral on both Python 2 and 3.
+        print("%-9s %-7s %5d %5d %8d %10d" % (
+            strftime("%H:%M:%S"),
+            part_name,
+            v.random * 100 // total,
+            v.sequential * 100 // total,
+            total,
+            v.bytes // 1024))
+
+    counters.clear()
+
+    countdown -= 1
+    if exiting or countdown == 0:
+        exit()
diff --git a/tools/biopattern_example.txt b/tools/biopattern_example.txt
new file mode 100644
index 0000000..ac3e5c6
--- /dev/null
+++ b/tools/biopattern_example.txt
@@ -0,0 +1,45 @@
+Demonstrations of biopattern, the Linux eBPF/bcc version.
+
+
+biopattern identifies random/sequential disk access patterns. Example:
+
+# ./biopattern.py
+TIME      DISK     %RND  %SEQ    COUNT     KBYTES
+22:03:51  vdb         0    99      788       3184
+22:03:51  Unknown     0   100        4          0
+22:03:51  vda        85    14       21        488
+[...]
+
+
+The -d option prints only the matched disk.
+
+# ./biopattern.py -d vdb 1 10
+TIME      DISK     %RND  %SEQ    COUNT     KBYTES
+22:12:57  vdb         0    99      193        772
+22:12:58  vdb         0   100     1119       4476
+22:12:59  vdb         0   100     1126       4504
+22:13:00  vdb         0   100     1009       4036
+22:13:01  vdb         0   100      958       3832
+22:13:02  vdb         0    99      957       3856
+22:13:03  vdb         0   100     1130       4520
+22:13:04  vdb         0   100     1051       4204
+22:13:05  vdb         0   100     1158       4632
+[...]
+
+
+USAGE message:
+
+Show block device I/O pattern.
+
+positional arguments:
+  interval              Output interval in seconds
+  count                 Number of outputs
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -d DISK, --disk DISK  Trace this disk only
+
+examples:
+    ./biopattern            # show block device I/O pattern.
+    ./biopattern 1 10       # print 1 second summaries, 10 times
+    ./biopattern -d sdb     # show sdb only
diff --git a/tools/biosnoop.py b/tools/biosnoop.py
index 2b954ac..5e7c6e6 100755
--- a/tools/biosnoop.py
+++ b/tools/biosnoop.py
@@ -37,7 +37,7 @@
 # define BPF program
 bpf_text="""
 #include <uapi/linux/ptrace.h>
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 
 // for saving the timestamp and __data_len of each request
 struct start_req_t {
@@ -125,7 +125,7 @@
         data.pid = valp->pid;
         data.sector = req->__sector;
         bpf_probe_read_kernel(&data.name, sizeof(data.name), valp->name);
-        struct gendisk *rq_disk = req->rq_disk;
+        struct gendisk *rq_disk = req->__RQ_DISK__;
         bpf_probe_read_kernel(&data.disk_name, sizeof(data.disk_name),
                        rq_disk->disk_name);
     }
@@ -156,6 +156,10 @@
     bpf_text = bpf_text.replace('##QUEUE##', '1')
 else:
     bpf_text = bpf_text.replace('##QUEUE##', '0')
+if BPF.kernel_struct_has_field(b'request', b'rq_disk'):
+    bpf_text = bpf_text.replace('__RQ_DISK__', 'rq_disk')
+else:
+    bpf_text = bpf_text.replace('__RQ_DISK__', 'q->disk')
 if debug or args.ebpf:
     print(bpf_text)
     if args.ebpf:
@@ -176,7 +180,7 @@
     b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_completion")
 
 # header
-print("%-11s %-14s %-6s %-7s %-1s %-10s %-7s" % ("TIME(s)", "COMM", "PID",
+print("%-11s %-14s %-7s %-9s %-1s %-10s %-7s" % ("TIME(s)", "COMM", "PID",
     "DISK", "T", "SECTOR", "BYTES"), end="")
 if args.queue:
     print("%7s " % ("QUE(ms)"), end="")
@@ -202,10 +206,13 @@
 
     delta = float(event.ts) - start_ts
 
-    print("%-11.6f %-14.14s %-6s %-7s %-1s %-10s %-7s" % (
+    disk_name = event.disk_name.decode('utf-8', 'replace')
+    if not disk_name:
+        disk_name = '<unknown>'
+
+    print("%-11.6f %-14.14s %-7s %-9s %-1s %-10s %-7s" % (
         delta / 1000000, event.name.decode('utf-8', 'replace'), event.pid,
-        event.disk_name.decode('utf-8', 'replace'), rwflg, event.sector,
-        event.len), end="")
+        disk_name, rwflg, event.sector, event.len), end="")
     if args.queue:
         print("%7.2f " % (float(event.qdelta) / 1000000), end="")
     print("%7.2f" % (float(event.delta) / 1000000))
diff --git a/tools/biotop.py b/tools/biotop.py
index eac4dab..3c9c071 100755
--- a/tools/biotop.py
+++ b/tools/biotop.py
@@ -4,7 +4,7 @@
 # biotop  block device (disk) I/O by process.
 #         For Linux, uses BCC, eBPF.
 #
-# USAGE: biotop.py [-h] [-C] [-r MAXROWS] [interval] [count]
+# USAGE: biotop.py [-h] [-C] [-r MAXROWS] [-p PID] [interval] [count]
 #
 # This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O
 # request, as well as a starting timestamp for calculating I/O latency.
@@ -13,6 +13,7 @@
 # Licensed under the Apache License, Version 2.0 (the "License")
 #
 # 06-Feb-2016   Brendan Gregg   Created this.
+# 17-Mar-2022   Rocky Xing      Added PID filter support.
 
 from __future__ import print_function
 from bcc import BPF
@@ -24,6 +25,7 @@
 examples = """examples:
     ./biotop            # block device I/O top, 1 second refresh
     ./biotop -C         # don't clear the screen
+    ./biotop -p 181     # only trace PID 181
     ./biotop 5          # 5 second summaries
     ./biotop 5 10       # 5 second summaries, 10 times only
 """
@@ -35,6 +37,8 @@
     help="don't clear the screen")
 parser.add_argument("-r", "--maxrows", default=20,
     help="maximum rows to print, default 20")
+parser.add_argument("-p", "--pid", type=int, metavar="PID",
+    help="trace this PID only")
 parser.add_argument("interval", nargs="?", default=1,
     help="output interval, in seconds")
 parser.add_argument("count", nargs="?", default=99999999,
@@ -54,7 +58,7 @@
 # load BPF program
 bpf_text = """
 #include <uapi/linux/ptrace.h>
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 
 // for saving the timestamp and __data_len of each request
 struct start_req_t {
@@ -92,9 +96,14 @@
 int trace_pid_start(struct pt_regs *ctx, struct request *req)
 {
     struct who_t who = {};
+    u32 pid;
 
     if (bpf_get_current_comm(&who.name, sizeof(who.name)) == 0) {
-        who.pid = bpf_get_current_pid_tgid() >> 32;
+        pid = bpf_get_current_pid_tgid() >> 32;
+        if (FILTER_PID)
+            return 0;
+
+        who.pid = pid;
         whobyreq.update(&req, &who);
     }
 
@@ -124,13 +133,25 @@
     }
 
     struct who_t *whop;
+    u32 pid;
+
+    whop = whobyreq.lookup(&req);
+    pid = whop != 0 ? whop->pid : 0;
+    if (FILTER_PID) {
+        start.delete(&req);
+        if (whop != 0) {
+            whobyreq.delete(&req);
+        }
+        return 0;
+    }
+
     struct val_t *valp, zero = {};
     u64 delta_us = (bpf_ktime_get_ns() - startp->ts) / 1000;
 
     // setup info_t key
     struct info_t info = {};
-    info.major = req->rq_disk->major;
-    info.minor = req->rq_disk->first_minor;
+    info.major = req->__RQ_DISK__->major;
+    info.minor = req->__RQ_DISK__->first_minor;
 /*
  * The following deals with a kernel version change (in mainline 4.7, although
  * it may be backported to earlier kernels) with how block request write flags
@@ -146,7 +167,6 @@
     info.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
 #endif
 
-    whop = whobyreq.lookup(&req);
     if (whop == 0) {
         // missed pid who, save stats as pid 0
         valp = counts.lookup_or_try_init(&info, &zero);
@@ -174,6 +194,16 @@
     print(bpf_text)
     exit()
 
+if BPF.kernel_struct_has_field(b'request', b'rq_disk'):
+    bpf_text = bpf_text.replace('__RQ_DISK__', 'rq_disk')
+else:
+    bpf_text = bpf_text.replace('__RQ_DISK__', 'q->disk')
+
+if args.pid is not None:
+    bpf_text = bpf_text.replace('FILTER_PID', 'pid != %d' % args.pid)
+else:
+    bpf_text = bpf_text.replace('FILTER_PID', '0')
+
 b = BPF(text=bpf_text)
 if BPF.get_kprobe_functions(b'__blk_account_io_start'):
     b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_pid_start")
@@ -211,7 +241,7 @@
         print()
     with open(loadavg) as stats:
         print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read()))
-    print("%-6s %-16s %1s %-3s %-3s %-8s %5s %7s %6s" % ("PID", "COMM",
+    print("%-7s %-16s %1s %-3s %-3s %-8s %5s %7s %6s" % ("PID", "COMM",
         "D", "MAJ", "MIN", "DISK", "I/O", "Kbytes", "AVGms"))
 
     # by-PID output
@@ -229,7 +259,7 @@
 
         # print line
         avg_ms = (float(v.us) / 1000) / v.io
-        print("%-6d %-16s %1s %-3d %-3d %-8s %5s %7s %6.2f" % (k.pid,
+        print("%-7d %-16s %1s %-3d %-3d %-8s %5s %7s %6.2f" % (k.pid,
             k.name.decode('utf-8', 'replace'), "W" if k.rwflag else "R",
             k.major, k.minor, diskname, v.io, v.bytes / 1024, avg_ms))
 
diff --git a/tools/btrfsdist.py b/tools/btrfsdist.py
index 72ea304..a9bf6e4 100755
--- a/tools/btrfsdist.py
+++ b/tools/btrfsdist.py
@@ -231,7 +231,7 @@
     if args.interval and (not args.notimestamp):
         print(strftime("%H:%M:%S:"))
 
-    dist.print_log2_hist(label, "operation")
+    dist.print_log2_hist(label, "operation", section_print_fn=bytes.decode)
     dist.clear()
 
     countdown -= 1
diff --git a/tools/btrfsslower.py b/tools/btrfsslower.py
index 9e46243..6de02e0 100755
--- a/tools/btrfsslower.py
+++ b/tools/btrfsslower.py
@@ -310,7 +310,7 @@
             type, event.size, event.offset, event.delta_us,
             event.file.decode('utf-8', 'replace')))
         return
-    print("%-8s %-14.14s %-6s %1s %-7s %-8d %7.2f %s" % (strftime("%H:%M:%S"),
+    print("%-8s %-14.14s %-7d %1s %-7s %-8d %7.2f %s" % (strftime("%H:%M:%S"),
         event.task.decode('utf-8', 'replace'), event.pid, type, event.size,
         event.offset / 1024, float(event.delta_us) / 1000,
         event.file.decode('utf-8', 'replace')))
@@ -336,7 +336,7 @@
         print("Tracing btrfs operations")
     else:
         print("Tracing btrfs operations slower than %d ms" % min_ms)
-    print("%-8s %-14s %-6s %1s %-7s %-8s %7s %s" % ("TIME", "COMM", "PID", "T",
+    print("%-8s %-14s %-7s %1s %-7s %-8s %7s %s" % ("TIME", "COMM", "PID", "T",
         "BYTES", "OFF_KB", "LAT(ms)", "FILENAME"))
 
 # read events
diff --git a/tools/cachetop.py b/tools/cachetop.py
index 7c02455..d02b72b 100755
--- a/tools/cachetop.py
+++ b/tools/cachetop.py
@@ -11,6 +11,7 @@
 # Licensed under the Apache License, Version 2.0 (the "License")
 #
 # 13-Jul-2016   Emmanuel Bretelle first version
+# 17-Mar-2022   Rocky Xing        Added PID filter support.
 
 from __future__ import absolute_import
 from __future__ import division
@@ -152,12 +153,15 @@
     BPF_HASH(counts, struct key_t);
 
     int do_count(struct pt_regs *ctx) {
+        u32 pid = bpf_get_current_pid_tgid() >> 32;
+        if (FILTER_PID)
+            return 0;
+
         struct key_t key = {};
-        u64 pid = bpf_get_current_pid_tgid();
         u32 uid = bpf_get_current_uid_gid();
 
         key.ip = PT_REGS_IP(ctx);
-        key.pid = pid >> 32;
+        key.pid = pid;
         key.uid = uid;
         bpf_get_current_comm(&(key.comm), 16);
 
@@ -166,6 +170,12 @@
     }
 
     """
+
+    if args.pid is not None:  # compare to None so that "-p 0" still filters
+        bpf_text = bpf_text.replace('FILTER_PID', 'pid != %d' % args.pid)
+    else:
+        bpf_text = bpf_text.replace('FILTER_PID', '0')
+
     b = BPF(text=bpf_text)
     b.attach_kprobe(event="add_to_page_cache_lru", fn_name="do_count")
     b.attach_kprobe(event="mark_page_accessed", fn_name="do_count")
@@ -251,9 +261,11 @@
 
 def parse_arguments():
     parser = argparse.ArgumentParser(
-        description='show Linux page cache hit/miss statistics including read '
+        description='Show Linux page cache hit/miss statistics including read '
                     'and write hit % per processes in a UI like top.'
     )
+    parser.add_argument("-p", "--pid", type=int, metavar="PID",
+        help="trace this PID only")
     parser.add_argument(
         'interval', type=int, default=5, nargs='?',
         help='Interval between probes.'
diff --git a/tools/compactsnoop.py b/tools/compactsnoop.py
index 71ef95b..9daaf48 100755
--- a/tools/compactsnoop.py
+++ b/tools/compactsnoop.py
@@ -18,6 +18,7 @@
 import argparse
 import platform
 from datetime import datetime, timedelta
+import sys
 
 # arguments
 examples = """examples:
@@ -390,6 +391,8 @@
             print("\t%s" % sym)
         print("")
 
+    sys.stdout.flush()
+
 # loop with callback to print_event
 b["events"].open_perf_buffer(print_event, page_cnt=64)
 start_time = datetime.now()
diff --git a/tools/cpudist.py b/tools/cpudist.py
index a4303f8..3f58aa1 100755
--- a/tools/cpudist.py
+++ b/tools/cpudist.py
@@ -3,13 +3,17 @@
 #
 # cpudist   Summarize on- and off-CPU time per task as a histogram.
 #
-# USAGE: cpudist [-h] [-O] [-T] [-m] [-P] [-L] [-p PID] [interval] [count]
+# USAGE: cpudist [-h] [-O] [-T] [-m] [-P] [-L] [-p PID] [-I] [interval] [count]
 #
 # This measures the time a task spends on or off the CPU, and shows this time
 # as a histogram, optionally per-process.
 #
+# By default CPU idle time is excluded by simply excluding PID 0.
+#
 # Copyright 2016 Sasha Goldshtein
 # Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 27-Mar-2022   Rocky Xing      Changed to exclude CPU idle time by default.
 
 from __future__ import print_function
 from bcc import BPF
@@ -23,6 +27,7 @@
     cpudist -mT 1        # 1s summaries, milliseconds, and timestamps
     cpudist -P           # show each PID separately
     cpudist -p 185       # trace PID 185 only
+    cpudist -I           # include CPU idle time
 """
 parser = argparse.ArgumentParser(
     description="Summarize on-CPU time per task as a histogram.",
@@ -40,6 +45,8 @@
     help="print a histogram per thread ID")
 parser.add_argument("-p", "--pid",
     help="trace this PID only")
+parser.add_argument("-I", "--include-idle", action="store_true",
+    help="include CPU idle time")
 parser.add_argument("interval", nargs="?", default=99999999,
     help="output interval, in seconds")
 parser.add_argument("count", nargs="?", default=99999999,
@@ -58,29 +65,42 @@
     bpf_text += "#define ONCPU\n"
 
 bpf_text += """
+typedef struct entry_key {
+    u32 pid;
+    u32 cpu;
+} entry_key_t;
+
 typedef struct pid_key {
     u64 id;
     u64 slot;
 } pid_key_t;
 
 
-BPF_HASH(start, u32, u64, MAX_PID);
+BPF_HASH(start, entry_key_t, u64, MAX_PID);
 STORAGE
 
-static inline void store_start(u32 tgid, u32 pid, u64 ts)
+static inline void store_start(u32 tgid, u32 pid, u32 cpu, u64 ts)
 {
-    if (FILTER)
+    if (PID_FILTER)
         return;
 
-    start.update(&pid, &ts);
+    if (IDLE_FILTER)
+        return;
+
+    entry_key_t entry_key = { .pid = pid, .cpu = cpu };
+    start.update(&entry_key, &ts);
 }
 
-static inline void update_hist(u32 tgid, u32 pid, u64 ts)
+static inline void update_hist(u32 tgid, u32 pid, u32 cpu, u64 ts)
 {
-    if (FILTER)
+    if (PID_FILTER)
         return;
 
-    u64 *tsp = start.lookup(&pid);
+    if (IDLE_FILTER)
+        return;
+
+    entry_key_t entry_key = { .pid = pid, .cpu = cpu };
+    u64 *tsp = start.lookup(&entry_key);
     if (tsp == 0)
         return;
 
@@ -99,20 +119,21 @@
     u64 ts = bpf_ktime_get_ns();
     u64 pid_tgid = bpf_get_current_pid_tgid();
     u32 tgid = pid_tgid >> 32, pid = pid_tgid;
+    u32 cpu = bpf_get_smp_processor_id();
 
     u32 prev_pid = prev->pid;
     u32 prev_tgid = prev->tgid;
 #ifdef ONCPU
-    update_hist(prev_tgid, prev_pid, ts);
+    update_hist(prev_tgid, prev_pid, cpu, ts);
 #else
-    store_start(prev_tgid, prev_pid, ts);
+    store_start(prev_tgid, prev_pid, cpu, ts);
 #endif
 
 BAIL:
 #ifdef ONCPU
-    store_start(tgid, pid, ts);
+    store_start(tgid, pid, cpu, ts);
 #else
-    update_hist(tgid, pid, ts);
+    update_hist(tgid, pid, cpu, ts);
 #endif
 
     return 0;
@@ -120,9 +141,16 @@
 """
 
 if args.pid:
-    bpf_text = bpf_text.replace('FILTER', 'tgid != %s' % args.pid)
+    bpf_text = bpf_text.replace('PID_FILTER', 'tgid != %s' % args.pid)
 else:
-    bpf_text = bpf_text.replace('FILTER', '0')
+    bpf_text = bpf_text.replace('PID_FILTER', '0')
+
+# set idle filter
+idle_filter = 'pid == 0'
+if args.include_idle:
+    idle_filter = '0'
+bpf_text = bpf_text.replace('IDLE_FILTER', idle_filter)
+
 if args.milliseconds:
     bpf_text = bpf_text.replace('FACTOR', 'delta /= 1000000;')
     label = "msecs"
diff --git a/tools/cpudist_example.txt b/tools/cpudist_example.txt
index 7da4354..43be7a0 100644
--- a/tools/cpudist_example.txt
+++ b/tools/cpudist_example.txt
@@ -6,6 +6,8 @@
 overhead due to excessive context switching (e.g. a common shared lock for
 multiple threads), uneven workload distribution, too-granular tasks, and more.
 
+By default, CPU idle time is excluded by simply excluding PID 0.
+
 Alternatively, the same options are available for summarizing task off-CPU
 time, which helps understand how often threads are being descheduled and how
 long they spend waiting for I/O, locks, timers, and other causes of suspension.
@@ -280,7 +282,7 @@
 
 # ./cpudist.py -h
 
-usage: cpudist.py [-h] [-O] [-T] [-m] [-P] [-L] [-p PID] [interval] [count]
+usage: cpudist.py [-h] [-O] [-T] [-m] [-P] [-L] [-p PID] [-I] [interval] [count]
 
 Summarize on-CPU time per task as a histogram.
 
@@ -296,6 +298,7 @@
   -P, --pids          print a histogram per process ID
   -L, --tids          print a histogram per thread ID
   -p PID, --pid PID   trace this PID only
+  -I, --include-idle  include CPU idle time
 
 examples:
     cpudist              # summarize on-CPU time as a histogram
@@ -304,3 +307,5 @@
     cpudist -mT 1        # 1s summaries, milliseconds, and timestamps
     cpudist -P           # show each PID separately
     cpudist -p 185       # trace PID 185 only
+    cpudist -I           # include CPU idle time
+
diff --git a/tools/dbslower.py b/tools/dbslower.py
index 090d521..1d45917 100755
--- a/tools/dbslower.py
+++ b/tools/dbslower.py
@@ -212,7 +212,7 @@
 
 def print_event(cpu, data, size):
     event = bpf["events"].event(data)
-    print("%-14.6f %-6d %8.3f %s" % (
+    print("%-14.6f %-7d %8.3f %s" % (
         float(event.timestamp - start) / 1000000000,
         event.pid, float(event.duration) / 1000000, event.query))
 
@@ -223,7 +223,7 @@
     print("Tracing database queries for pids %s slower than %d ms..." %
         (', '.join(map(str, args.pids)), args.threshold))
 
-print("%-14s %-6s %8s %s" % ("TIME(s)", "PID", "MS", "QUERY"))
+print("%-14s %-7s %8s %s" % ("TIME(s)", "PID", "MS", "QUERY"))
 
 bpf["events"].open_perf_buffer(print_event, page_cnt=64)
 while True:
diff --git a/tools/dcsnoop.py b/tools/dcsnoop.py
index 274eaa5..74a3914 100755
--- a/tools/dcsnoop.py
+++ b/tools/dcsnoop.py
@@ -148,13 +148,13 @@
 
 def print_event(cpu, data, size):
     event = b["events"].event(data)
-    print("%-11.6f %-6d %-16s %1s %s" % (
+    print("%-11.6f %-7d %-16s %1s %s" % (
             time.time() - start_ts, event.pid,
             event.comm.decode('utf-8', 'replace'), mode_s[event.type],
             event.filename.decode('utf-8', 'replace')))
 
 # header
-print("%-11s %-6s %-16s %1s %s" % ("TIME(s)", "PID", "COMM", "T", "FILE"))
+print("%-11s %-7s %-16s %1s %s" % ("TIME(s)", "PID", "COMM", "T", "FILE"))
 
 b["events"].open_perf_buffer(print_event, page_cnt=64)
 while 1:
diff --git a/tools/drsnoop.py b/tools/drsnoop.py
index e4ea922..e0344d1 100755
--- a/tools/drsnoop.py
+++ b/tools/drsnoop.py
@@ -20,6 +20,7 @@
 from datetime import datetime, timedelta
 import os
 import math
+import sys
 
 # symbols
 kallsyms = "/proc/kallsyms"
@@ -224,6 +225,8 @@
     else:
         print("")
 
+    sys.stdout.flush()
+
 
 # loop with callback to print_event
 b["events"].open_perf_buffer(print_event, page_cnt=64)
diff --git a/tools/execsnoop.py b/tools/execsnoop.py
index 53052d3..ea8f40b 100755
--- a/tools/execsnoop.py
+++ b/tools/execsnoop.py
@@ -236,7 +236,7 @@
     print("%-8s" % ("TIME(s)"), end="")
 if args.print_uid:
     print("%-6s" % ("UID"), end="")
-print("%-16s %-6s %-6s %3s %s" % ("PCOMM", "PID", "PPID", "RET", "ARGS"))
+print("%-16s %-7s %-7s %3s %s" % ("PCOMM", "PID", "PPID", "RET", "ARGS"))
 
 class EventType(object):
     EVENT_ARG = 0
@@ -290,7 +290,7 @@
             ppid = event.ppid if event.ppid > 0 else get_ppid(event.pid)
             ppid = b"%d" % ppid if ppid > 0 else b"?"
             argv_text = b' '.join(argv[event.pid]).replace(b'\n', b'\\n')
-            printb(b"%-16s %-6d %-6s %3d %s" % (event.comm, event.pid,
+            printb(b"%-16s %-7d %-7s %3d %s" % (event.comm, event.pid,
                    ppid, event.retval, argv_text))
         try:
             del(argv[event.pid])
diff --git a/tools/ext4slower.py b/tools/ext4slower.py
index 90663a5..5cd75ab 100755
--- a/tools/ext4slower.py
+++ b/tools/ext4slower.py
@@ -101,7 +101,7 @@
 // own function, for reads. So we need to trace that and then filter on ext4,
 // which I do by checking file->f_op.
 // The new Linux version (since form 4.10) uses ext4_file_read_iter(), And if the 'CONFIG_FS_DAX' 
-// is not set ,then ext4_file_read_iter() will call generic_file_read_iter(), else it will call 
+// is not set, then ext4_file_read_iter() will call generic_file_read_iter(), else it will call
 // ext4_dax_read_iter(), and trace generic_file_read_iter() will fail.
 int trace_read_entry(struct pt_regs *ctx, struct kiocb *iocb)
 {
diff --git a/tools/filelife.py b/tools/filelife.py
index 9b7562f..e869607 100755
--- a/tools/filelife.py
+++ b/tools/filelife.py
@@ -118,12 +118,12 @@
 b.attach_kprobe(event="vfs_unlink", fn_name="trace_unlink")
 
 # header
-print("%-8s %-6s %-16s %-7s %s" % ("TIME", "PID", "COMM", "AGE(s)", "FILE"))
+print("%-8s %-7s %-16s %-7s %s" % ("TIME", "PID", "COMM", "AGE(s)", "FILE"))
 
 # process event
 def print_event(cpu, data, size):
     event = b["events"].event(data)
-    print("%-8s %-6d %-16s %-7.2f %s" % (strftime("%H:%M:%S"), event.pid,
+    print("%-8s %-7d %-16s %-7.2f %s" % (strftime("%H:%M:%S"), event.pid,
         event.comm.decode('utf-8', 'replace'), float(event.delta) / 1000,
         event.fname.decode('utf-8', 'replace')))
 
diff --git a/tools/filetop.py b/tools/filetop.py
index 9a79a64..aec11a8 100755
--- a/tools/filetop.py
+++ b/tools/filetop.py
@@ -67,6 +67,7 @@
 struct info_t {
     unsigned long inode;
     dev_t dev;
+    dev_t rdev;
     u32 pid;
     u32 name_len;
     char comm[TASK_COMM_LEN];
@@ -105,7 +106,8 @@
     struct info_t info = {
         .pid = pid,
         .inode = file->f_inode->i_ino,
-        .dev = file->f_inode->i_rdev,
+        .dev = file->f_inode->i_sb->s_dev,
+        .rdev = file->f_inode->i_rdev,
     };
     bpf_get_current_comm(&info.comm, sizeof(info.comm));
     info.name_len = d_name.len;
diff --git a/tools/funcslower.py b/tools/funcslower.py
index ffa618d..ddd786f 100755
--- a/tools/funcslower.py
+++ b/tools/funcslower.py
@@ -88,6 +88,13 @@
     u64 args[5];
 #endif
 #endif
+#ifdef USER_STACKS
+    int user_stack_id;
+#endif
+#ifdef KERNEL_STACKS
+    int kernel_stack_id;
+    u64 kernel_ip;
+#endif
 };
 
 struct data_t {
@@ -143,6 +150,40 @@
 #endif
 #endif
 
+#ifdef USER_STACKS
+    entry.user_stack_id = stacks.get_stackid(ctx, BPF_F_USER_STACK);
+#endif
+
+#ifdef KERNEL_STACKS
+    entry.kernel_stack_id = stacks.get_stackid(ctx, 0);
+
+    if (entry.kernel_stack_id >= 0) {
+        u64 ip = PT_REGS_IP(ctx);
+        u64 page_offset;
+
+        // if ip isn't sane, leave key ips as zero for later checking
+#if defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE)
+        // x64, 4.16, ..., 4.11, etc., but some earlier kernel didn't have it
+        page_offset = __PAGE_OFFSET_BASE;
+#elif defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE_L4)
+        // x64, 4.17, and later
+#if defined(CONFIG_DYNAMIC_MEMORY_LAYOUT) && defined(CONFIG_X86_5LEVEL)
+        page_offset = __PAGE_OFFSET_BASE_L5;
+#else
+        page_offset = __PAGE_OFFSET_BASE_L4;
+#endif
+#else
+        // earlier x86_64 kernels, e.g., 4.6, come here
+        // arm64, s390, powerpc, x86_32
+        page_offset = PAGE_OFFSET;
+#endif
+
+        if (ip > page_offset) {
+            entry.kernel_ip = ip;
+        }
+    }
+#endif
+
     entryinfo.update(&tgid_pid, &entry);
 
     return 0;
@@ -172,37 +213,12 @@
     data.retval = PT_REGS_RC(ctx);
 
 #ifdef USER_STACKS
-    data.user_stack_id = stacks.get_stackid(ctx, BPF_F_USER_STACK);
+    data.user_stack_id = entryp->user_stack_id;
 #endif
 
 #ifdef KERNEL_STACKS
-    data.kernel_stack_id = stacks.get_stackid(ctx, 0);
-
-    if (data.kernel_stack_id >= 0) {
-        u64 ip = PT_REGS_IP(ctx);
-        u64 page_offset;
-
-        // if ip isn't sane, leave key ips as zero for later checking
-#if defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE)
-        // x64, 4.16, ..., 4.11, etc., but some earlier kernel didn't have it
-        page_offset = __PAGE_OFFSET_BASE;
-#elif defined(CONFIG_X86_64) && defined(__PAGE_OFFSET_BASE_L4)
-        // x64, 4.17, and later
-#if defined(CONFIG_DYNAMIC_MEMORY_LAYOUT) && defined(CONFIG_X86_5LEVEL)
-        page_offset = __PAGE_OFFSET_BASE_L5;
-#else
-        page_offset = __PAGE_OFFSET_BASE_L4;
-#endif
-#else
-        // earlier x86_64 kernels, e.g., 4.6, comes here
-        // arm64, s390, powerpc, x86_32
-        page_offset = PAGE_OFFSET;
-#endif
-
-        if (ip > page_offset) {
-            data.kernel_ip = ip;
-        }
-    }
+    data.kernel_stack_id = entryp->kernel_stack_id;
+    data.kernel_ip = entryp->kernel_ip;
 #endif
 
 #ifdef GRAB_ARGS
diff --git a/tools/hardirqs.py b/tools/hardirqs.py
index 0eedddd..3bcf649 100755
--- a/tools/hardirqs.py
+++ b/tools/hardirqs.py
@@ -4,7 +4,7 @@
 # hardirqs  Summarize hard IRQ (interrupt) event time.
 #           For Linux, uses BCC, eBPF.
 #
-# USAGE: hardirqs [-h] [-T] [-N] [-C] [-d] [interval] [outputs]
+# USAGE: hardirqs [-h] [-T] [-N] [-C] [-d] [-c CPU] [interval] [outputs]
 #
 # Thanks Amer Ather for help understanding irq behavior.
 #
@@ -13,11 +13,13 @@
 #
 # 19-Oct-2015   Brendan Gregg   Created this.
 # 22-May-2021   Hengqi Chen     Migrated to kernel tracepoints.
+# 07-Mar-2022   Rocky Xing      Added CPU filter support.
 
 from __future__ import print_function
 from bcc import BPF
 from time import sleep, strftime
 import argparse
+import sys
 
 # arguments
 examples = """examples:
@@ -25,6 +27,7 @@
     ./hardirqs -d         # show hard irq event time as histograms
     ./hardirqs 1 10       # print 1 second summaries, 10 times
     ./hardirqs -NT 1      # 1s summaries, nanoseconds, and timestamps
+    ./hardirqs -c 1       # sum hard irq event time on CPU 1 only
 """
 parser = argparse.ArgumentParser(
     description="Summarize hard irq event time as histograms",
@@ -38,6 +41,8 @@
     help="show event counts instead of timing")
 parser.add_argument("-d", "--dist", action="store_true",
     help="show distributions as histograms")
+parser.add_argument("-c", "--cpu", type=int,
+    help="trace this CPU only")
 parser.add_argument("interval", nargs="?", default=99999999,
     help="output interval, in seconds")
 parser.add_argument("outputs", nargs="?", default=99999999,
@@ -94,9 +99,12 @@
 {
     struct entry_key key = {};
     irq_name_t name = {};
+    u32 cpu = bpf_get_smp_processor_id();
+
+    FILTER_CPU
 
     key.tid = bpf_get_current_pid_tgid();
-    key.cpu_id = bpf_get_smp_processor_id();
+    key.cpu_id = cpu;
 
     TP_DATA_LOC_READ_STR(&name.name, name, sizeof(name));
     irqnames.update(&key, &name);
@@ -106,9 +114,12 @@
 TRACEPOINT_PROBE(irq, irq_handler_exit)
 {
     struct entry_key key = {};
+    u32 cpu = bpf_get_smp_processor_id();
+
+    FILTER_CPU
 
     key.tid = bpf_get_current_pid_tgid();
-    key.cpu_id = bpf_get_smp_processor_id();
+    key.cpu_id = cpu;
 
     // check ret value of irq handler is not IRQ_NONE to make sure
     // the current event belong to this irq handler
@@ -137,9 +148,12 @@
     u64 ts = bpf_ktime_get_ns();
     irq_name_t name = {};
     struct entry_key key = {};
+    u32 cpu = bpf_get_smp_processor_id();
+
+    FILTER_CPU
 
     key.tid = bpf_get_current_pid_tgid();
-    key.cpu_id = bpf_get_smp_processor_id();
+    key.cpu_id = cpu;
 
     TP_DATA_LOC_READ_STR(&name.name, name, sizeof(name));
     irqnames.update(&key, &name);
@@ -152,9 +166,10 @@
     u64 *tsp, delta;
     irq_name_t *namep;
     struct entry_key key = {};
+    u32 cpu = bpf_get_smp_processor_id();
 
     key.tid = bpf_get_current_pid_tgid();
-    key.cpu_id = bpf_get_smp_processor_id();
+    key.cpu_id = cpu;
 
     // check ret value of irq handler is not IRQ_NONE to make sure
     // the current event belong to this irq handler
@@ -195,6 +210,11 @@
         'irq_key_t key = {.slot = 0 /* ignore */};' +
         'bpf_probe_read_kernel(&key.name, sizeof(key.name), name);' +
         'dist.atomic_increment(key, delta);')
+if args.cpu is not None:
+    bpf_text = bpf_text.replace('FILTER_CPU',
+        'if (cpu != %d) { return 0; }' % int(args.cpu))
+else:
+    bpf_text = bpf_text.replace('FILTER_CPU', '')
 if debug or args.ebpf:
     print(bpf_text)
     if args.ebpf:
@@ -222,13 +242,15 @@
         print("%-8s\n" % strftime("%H:%M:%S"), end="")
 
     if args.dist:
-        dist.print_log2_hist(label, "hardirq")
+        dist.print_log2_hist(label, "hardirq", section_print_fn=bytes.decode)
     else:
         print("%-26s %11s" % ("HARDIRQ", "TOTAL_" + label))
         for k, v in sorted(dist.items(), key=lambda dist: dist[1].value):
             print("%-26s %11d" % (k.name.decode('utf-8', 'replace'), v.value / factor))
     dist.clear()
 
+    sys.stdout.flush()
+
     countdown -= 1
     if exiting or countdown == 0:
         exit()
diff --git a/tools/klockstat.py b/tools/klockstat.py
index d157b7b..b8cafd9 100755
--- a/tools/klockstat.py
+++ b/tools/klockstat.py
@@ -367,9 +367,30 @@
 
 """
 
+program_kfunc_nested = """
+KFUNC_PROBE(mutex_unlock, void *lock)
+{
+    return do_mutex_unlock_enter();
+}
+
+KRETFUNC_PROBE(mutex_lock_nested, void *lock, int ret)
+{
+    return do_mutex_lock_return();
+}
+
+KFUNC_PROBE(mutex_lock_nested, void *lock)
+{
+    return do_mutex_lock_enter(ctx, 3);
+}
+
+"""
+
 is_support_kfunc = BPF.support_kfunc()
 if is_support_kfunc:
-    program += program_kfunc
+    if BPF.get_kprobe_functions(b"mutex_lock_nested"):
+        program += program_kfunc_nested
+    else:
+        program += program_kfunc
 else:
     program += program_kprobe
 
@@ -428,9 +449,14 @@
 b = BPF(text=program)
 
 if not is_support_kfunc:
-    b.attach_kprobe(event="mutex_unlock",  fn_name="mutex_unlock_enter")
-    b.attach_kretprobe(event="mutex_lock", fn_name="mutex_lock_return")
-    b.attach_kprobe(event="mutex_lock",    fn_name="mutex_lock_enter")
+    b.attach_kprobe(event="mutex_unlock", fn_name="mutex_unlock_enter")
+    # Depending on whether DEBUG_LOCK_ALLOC is set, the proper kprobe may be either mutex_lock or mutex_lock_nested
+    if BPF.get_kprobe_functions(b"mutex_lock_nested"):
+        b.attach_kretprobe(event="mutex_lock_nested", fn_name="mutex_lock_return")
+        b.attach_kprobe(event="mutex_lock_nested", fn_name="mutex_lock_enter")
+    else:
+        b.attach_kretprobe(event="mutex_lock", fn_name="mutex_lock_return")
+        b.attach_kprobe(event="mutex_lock", fn_name="mutex_lock_enter")
 
 enabled = b.get_table("enabled");
 
diff --git a/tools/mdflush.py b/tools/mdflush.py
index 8a23520..5dea0b4 100755
--- a/tools/mdflush.py
+++ b/tools/mdflush.py
@@ -55,12 +55,12 @@
 
 # header
 print("Tracing md flush requests... Hit Ctrl-C to end.")
-print("%-8s %-6s %-16s %s" % ("TIME", "PID", "COMM", "DEVICE"))
+print("%-8s %-7s %-16s %s" % ("TIME", "PID", "COMM", "DEVICE"))
 
 # process event
 def print_event(cpu, data, size):
     event = b["events"].event(data)
-    print("%-8s %-6d %-16s %s" % (strftime("%H:%M:%S"), event.pid,
+    print("%-8s %-7d %-16s %s" % (strftime("%H:%M:%S"), event.pid,
         event.comm.decode('utf-8', 'replace'),
         event.disk.decode('utf-8', 'replace')))
 
diff --git a/tools/memleak.py b/tools/memleak.py
index 6cda150..27a2e09 100755
--- a/tools/memleak.py
+++ b/tools/memleak.py
@@ -272,6 +272,19 @@
         return gen_alloc_exit(ctx);
 }
 
+int mmap_enter(struct pt_regs *ctx) {
+        size_t size = (size_t)PT_REGS_PARM2(ctx);
+        return gen_alloc_enter(ctx, size);
+}
+
+int mmap_exit(struct pt_regs *ctx) {
+        return gen_alloc_exit(ctx);
+}
+
+int munmap_enter(struct pt_regs *ctx, void *address) {
+        return gen_free_enter(ctx, address);
+}
+
 int posix_memalign_enter(struct pt_regs *ctx, void **memptr, size_t alignment,
                          size_t size) {
         u64 memptr64 = (u64)(size_t)memptr;
@@ -449,6 +462,7 @@
         attach_probes("malloc")
         attach_probes("calloc")
         attach_probes("realloc")
+        attach_probes("mmap")
         attach_probes("posix_memalign")
         attach_probes("valloc", can_fail=True) # failed on Android, is deprecated in libc.so from bionic directory
         attach_probes("memalign")
@@ -456,6 +470,8 @@
         attach_probes("aligned_alloc", can_fail=True)  # added in C11
         bpf.attach_uprobe(name=obj, sym="free", fn_name="free_enter",
                                   pid=pid)
+        bpf.attach_uprobe(name=obj, sym="munmap", fn_name="munmap_enter",
+                                  pid=pid)
 
 else:
         print("Attaching to kernel allocators, Ctrl+C to quit.")
@@ -494,7 +510,7 @@
                         stack = list(stack_traces.walk(info.stack_id))
                         combined = []
                         for addr in stack:
-                                combined.append(bpf.sym(addr, pid,
+                                combined.append(('0x'+format(addr, '016x')+'\t').encode('utf-8') + bpf.sym(addr, pid,
                                         show_module=True, show_offset=True))
                         alloc_info[info.stack_id] = Allocation(combined,
                                                                info.size)
diff --git a/tools/mountsnoop.py b/tools/mountsnoop.py
index a6d7ece..d186602 100755
--- a/tools/mountsnoop.py
+++ b/tools/mountsnoop.py
@@ -420,6 +420,7 @@
                 print('{:16} {:<7} {:<7} {:<11} {}'.format(
                     syscall['comm'].decode('utf-8', 'replace'), syscall['tgid'],
                     syscall['pid'], syscall['mnt_ns'], call))
+        sys.stdout.flush()
     except KeyError:
         # This might happen if we lost an event.
         pass
diff --git a/tools/mysqld_qslower.py b/tools/mysqld_qslower.py
index 088cd63..e5e3b84 100755
--- a/tools/mysqld_qslower.py
+++ b/tools/mysqld_qslower.py
@@ -108,7 +108,7 @@
 # header
 print("Tracing MySQL server queries for PID %d slower than %s ms..." % (pid,
     min_ms_text))
-print("%-14s %-6s %8s %s" % ("TIME(s)", "PID", "MS", "QUERY"))
+print("%-14s %-7s %8s %s" % ("TIME(s)", "PID", "MS", "QUERY"))
 
 # process event
 start = 0
@@ -117,7 +117,7 @@
     event = b["events"].event(data)
     if start == 0:
         start = event.ts
-    print("%-14.6f %-6d %8.3f %s" % (float(event.ts - start) / 1000000000,
+    print("%-14.6f %-7d %8.3f %s" % (float(event.ts - start) / 1000000000,
         event.pid, float(event.delta) / 1000000, event.query))
 
 # loop with callback to print_event
diff --git a/tools/oomkill.py b/tools/oomkill.py
index 3d6e927..1bf441c 100755
--- a/tools/oomkill.py
+++ b/tools/oomkill.py
@@ -37,12 +37,12 @@
 
 void kprobe__oom_kill_process(struct pt_regs *ctx, struct oom_control *oc, const char *message)
 {
-    unsigned long totalpages;
     struct task_struct *p = oc->chosen;
     struct data_t data = {};
     u32 pid = bpf_get_current_pid_tgid() >> 32;
+
     data.fpid = pid;
-    data.tpid = p->pid;
+    data.tpid = p->tgid;
     data.pages = oc->totalpages;
     bpf_get_current_comm(&data.fcomm, sizeof(data.fcomm));
     bpf_probe_read_kernel(&data.tcomm, sizeof(data.tcomm), p->comm);
diff --git a/tools/softirqs.py b/tools/softirqs.py
index ba0dac3..0ed18c4 100755
--- a/tools/softirqs.py
+++ b/tools/softirqs.py
@@ -4,25 +4,30 @@
 # softirqs  Summarize soft IRQ (interrupt) event time.
 #           For Linux, uses BCC, eBPF.
 #
-# USAGE: softirqs [-h] [-T] [-N] [-d] [interval] [count]
+# USAGE: softirqs [-h] [-T] [-N] [-C] [-d] [-c CPU] [interval] [count]
 #
 # Copyright (c) 2015 Brendan Gregg.
 # Licensed under the Apache License, Version 2.0 (the "License")
 #
 # 20-Oct-2015   Brendan Gregg     Created this.
 # 03-Apr-2017   Sasha Goldshtein  Migrated to kernel tracepoints.
+# 07-Mar-2022   Rocky Xing        Added CPU filter support.
+# 24-Mar-2022   Rocky Xing        Added event counting support.
 
 from __future__ import print_function
 from bcc import BPF
 from time import sleep, strftime
 import argparse
+import sys
 
 # arguments
 examples = """examples:
     ./softirqs            # sum soft irq event time
+    ./softirqs -C         # show the number of soft irq events
     ./softirqs -d         # show soft irq event time as histograms
     ./softirqs 1 10       # print 1 second summaries, 10 times
     ./softirqs -NT 1      # 1s summaries, nanoseconds, and timestamps
+    ./softirqs -c 1       # sum soft irq event time on CPU 1 only
 """
 parser = argparse.ArgumentParser(
     description="Summarize soft irq event time as histograms.",
@@ -32,8 +37,12 @@
     help="include timestamp on output")
 parser.add_argument("-N", "--nanoseconds", action="store_true",
     help="output in nanoseconds")
+parser.add_argument("-C", "--events", action="store_true",
+    help="show the number of soft irq events")
 parser.add_argument("-d", "--dist", action="store_true",
     help="show distributions as histograms")
+parser.add_argument("-c", "--cpu", type=int,
+    help="trace this CPU only")
 parser.add_argument("interval", nargs="?", default=99999999,
     help="output interval, in seconds")
 parser.add_argument("count", nargs="?", default=99999999,
@@ -42,7 +51,13 @@
     help=argparse.SUPPRESS)
 args = parser.parse_args()
 countdown = int(args.count)
-if args.nanoseconds:
+if args.events and (args.dist or args.nanoseconds):
+    print("The --events option can't be used with time-based options")
+    exit()
+if args.events:
+    factor = 1
+    label = "count"
+elif args.nanoseconds:
     factor = 1
     label = "nsecs"
 else:
@@ -70,16 +85,36 @@
 } account_val_t;
 
 BPF_HASH(start, entry_key_t, account_val_t);
-BPF_HASH(iptr, u32);
 BPF_HISTOGRAM(dist, irq_key_t);
+"""
 
+bpf_text_count = """
+TRACEPOINT_PROBE(irq, softirq_entry)
+{
+    u32 cpu = bpf_get_smp_processor_id();
+
+    FILTER_CPU
+
+    irq_key_t key = { .slot = 0 /* ignore */ };
+    key.vec = args->vec;
+
+    dist.atomic_increment(key);
+
+    return 0;
+}
+"""
+
+bpf_text_time = """
 TRACEPOINT_PROBE(irq, softirq_entry)
 {
     account_val_t val = {};
     entry_key_t key = {};
+    u32 cpu = bpf_get_smp_processor_id();
+
+    FILTER_CPU
 
     key.pid = bpf_get_current_pid_tgid();
-    key.cpu = bpf_get_smp_processor_id();
+    key.cpu = cpu;
     val.ts = bpf_ktime_get_ns();
     val.vec = args->vec;
 
@@ -95,9 +130,12 @@
     account_val_t *valp;
     irq_key_t key = {0};
     entry_key_t entry_key = {};
+    u32 cpu = bpf_get_smp_processor_id();
+
+    FILTER_CPU
 
     entry_key.pid = bpf_get_current_pid_tgid();
-    entry_key.cpu = bpf_get_smp_processor_id();
+    entry_key.cpu = cpu;
 
     // fetch timestamp and calculate delta
     valp = start.lookup(&entry_key);
@@ -115,6 +153,11 @@
 }
 """
 
+if args.events:
+    bpf_text += bpf_text_count
+else:
+    bpf_text += bpf_text_time
+
 # code substitutions
 if args.dist:
     bpf_text = bpf_text.replace('STORE',
@@ -124,6 +167,11 @@
     bpf_text = bpf_text.replace('STORE',
         'key.vec = valp->vec; ' +
         'dist.atomic_increment(key, delta);')
+if args.cpu is not None:
+    bpf_text = bpf_text.replace('FILTER_CPU',
+        'if (cpu != %d) { return 0; }' % int(args.cpu))
+else:
+    bpf_text = bpf_text.replace('FILTER_CPU', '')
 if debug or args.ebpf:
     print(bpf_text)
     if args.ebpf:
@@ -138,7 +186,10 @@
     return ["hi", "timer", "net_tx", "net_rx", "block", "irq_poll",
             "tasklet", "sched", "hrtimer", "rcu"][vec]
 
-print("Tracing soft irq event time... Hit Ctrl-C to end.")
+if args.events:
+    print("Tracing soft irq events... Hit Ctrl-C to end.")
+else:
+    print("Tracing soft irq event time... Hit Ctrl-C to end.")
 
 # output
 exiting = 0 if args.interval else 1
@@ -161,6 +212,8 @@
             print("%-16s %11d" % (vec_to_name(k.vec), v.value / factor))
     dist.clear()
 
+    sys.stdout.flush()
+
     countdown -= 1
     if exiting or countdown == 0:
         exit()
diff --git a/tools/softirqs_example.txt b/tools/softirqs_example.txt
index ef3174a..a914143 100644
--- a/tools/softirqs_example.txt
+++ b/tools/softirqs_example.txt
@@ -179,12 +179,27 @@
      16384 -> 32767      : 24       |**                                      |
 
 
+Sometimes you just want counts of events, and don't need the distribution
+of times. You can use the -C or --events option:
+
+# ./softirqs.py -C
+Tracing soft irq events... Hit Ctrl-C to end.
+^C
+SOFTIRQ          TOTAL_count
+block                      5
+tasklet                    6
+net_rx                   402
+sched                   5251
+rcu                     5748
+timer                   9530
+
+
 USAGE message:
 
 # ./softirqs -h
-usage: softirqs [-h] [-T] [-N] [-d] [interval] [count]
+usage: softirqs [-h] [-T] [-N] [-C] [-d] [-c CPU] [interval] [count]
 
-Summarize soft irq event time as histograms
+Summarize soft irq event time as histograms.
 
 positional arguments:
   interval           output interval, in seconds
@@ -194,10 +209,15 @@
   -h, --help         show this help message and exit
   -T, --timestamp    include timestamp on output
   -N, --nanoseconds  output in nanoseconds
+  -C, --events       show the number of soft irq events
   -d, --dist         show distributions as histograms
+  -c CPU, --cpu CPU  trace this CPU only
 
 examples:
     ./softirqs            # sum soft irq event time
+    ./softirqs -C         # show the number of soft irq events
     ./softirqs -d         # show soft irq event time as histograms
     ./softirqs 1 10       # print 1 second summaries, 10 times
     ./softirqs -NT 1      # 1s summaries, nanoseconds, and timestamps
+    ./softirqs -c 1       # sum soft irq event time on CPU 1 only
+
diff --git a/tools/sslsniff.py b/tools/sslsniff.py
index 8bc61ce..4621e9f 100755
--- a/tools/sslsniff.py
+++ b/tools/sslsniff.py
@@ -5,7 +5,7 @@
 #           For Linux, uses BCC, eBPF.
 #
 # USAGE: sslsniff.py [-h] [-p PID] [-u UID] [-x] [-c COMM] [-o] [-g] [-n] [-d]
-#                    [--hexdump] [--max-buffer-size SIZE]
+#                    [--hexdump] [--max-buffer-size SIZE] [-l] [--handshake]
 #
 # Licensed under the Apache License, Version 2.0 (the "License")
 #
@@ -19,6 +19,7 @@
 import argparse
 import binascii
 import textwrap
+import os.path
 
 # arguments
 examples = """examples:
@@ -31,7 +32,29 @@
     ./sslsniff --no-nss     # don't show NSS calls
     ./sslsniff --hexdump    # show data as hex instead of trying to decode it as UTF-8
     ./sslsniff -x           # show process UID and TID
+    ./sslsniff -l           # show function latency
+    ./sslsniff -l --handshake  # show SSL handshake latency
+    ./sslsniff --extra-lib openssl:/path/libssl.so.1.1 # sniff extra library
 """
+
+
+def ssllib_type(input_str):
+    valid_types = frozenset(['openssl', 'gnutls', 'nss'])
+
+    try:
+        lib_type, lib_path = input_str.split(':', 1)
+    except ValueError:
+        raise argparse.ArgumentTypeError("Invalid SSL library param: %r" % input_str)
+
+    if lib_type not in valid_types:
+        raise argparse.ArgumentTypeError("Invalid SSL library type: %r" % lib_type)
+
+    if not os.path.isfile(lib_path):
+        raise argparse.ArgumentTypeError("Invalid library path: %r" % lib_path)
+
+    return lib_type, lib_path
+
+
 parser = argparse.ArgumentParser(
     description="Sniff SSL data",
     formatter_class=argparse.RawDescriptionHelpFormatter,
@@ -57,6 +80,12 @@
                     help="show data as hexdump instead of trying to decode it as UTF-8")
 parser.add_argument('--max-buffer-size', type=int, default=8192,
                     help='Size of captured buffer')
+parser.add_argument("-l", "--latency", action="store_true",
+                    help="show function latency")
+parser.add_argument("--handshake", action="store_true",
+                    help="show SSL handshake latency, enabled only if latency option is on.")
+parser.add_argument("--extra-lib", type=ssllib_type, action='append',
+                    help="Intercept calls from extra library (format: lib_type:lib_path)")
 args = parser.parse_args()
 
 
@@ -68,11 +97,13 @@
 
 struct probe_SSL_data_t {
         u64 timestamp_ns;
+        u64 delta_ns;
         u32 pid;
         u32 tid;
         u32 uid;
         u32 len;
         int buf_filled;
+        int rw;
         char comm[TASK_COMM_LEN];
         u8 buf[MAX_BUF_SIZE];
 };
@@ -80,69 +111,37 @@
 #define BASE_EVENT_SIZE ((size_t)(&((struct probe_SSL_data_t*)0)->buf))
 #define EVENT_SIZE(X) (BASE_EVENT_SIZE + ((size_t)(X)))
 
-
 BPF_PERCPU_ARRAY(ssl_data, struct probe_SSL_data_t, 1);
-BPF_PERF_OUTPUT(perf_SSL_write);
+BPF_PERF_OUTPUT(perf_SSL_rw);
 
-int probe_SSL_write(struct pt_regs *ctx, void *ssl, void *buf, int num) {
+BPF_HASH(start_ns, u32);
+BPF_HASH(bufs, u32, u64);
+
+int probe_SSL_rw_enter(struct pt_regs *ctx, void *ssl, void *buf, int num) {
         int ret;
         u32 zero = 0;
         u64 pid_tgid = bpf_get_current_pid_tgid();
         u32 pid = pid_tgid >> 32;
         u32 tid = pid_tgid;
         u32 uid = bpf_get_current_uid_gid();
-
-        PID_FILTER
-        UID_FILTER
-        struct probe_SSL_data_t *data = ssl_data.lookup(&zero);
-        if (!data)
-                return 0;
-
-        data->timestamp_ns = bpf_ktime_get_ns();
-        data->pid = pid;
-        data->tid = tid;
-        data->uid = uid;
-        data->len = num;
-        data->buf_filled = 0;
-        bpf_get_current_comm(&data->comm, sizeof(data->comm));
-        u32 buf_copy_size = min((size_t)MAX_BUF_SIZE, (size_t)num);
-
-        if (buf != 0)
-                ret = bpf_probe_read_user(data->buf, buf_copy_size, buf);
-
-        if (!ret)
-                data->buf_filled = 1;
-        else
-                buf_copy_size = 0;
-
-        perf_SSL_write.perf_submit(ctx, data, EVENT_SIZE(buf_copy_size));
-        return 0;
-}
-
-BPF_PERF_OUTPUT(perf_SSL_read);
-
-BPF_HASH(bufs, u32, u64);
-
-int probe_SSL_read_enter(struct pt_regs *ctx, void *ssl, void *buf, int num) {
-        u64 pid_tgid = bpf_get_current_pid_tgid();
-        u32 pid = pid_tgid >> 32;
-        u32 tid = (u32)pid_tgid;
-        u32 uid = bpf_get_current_uid_gid();
+        u64 ts = bpf_ktime_get_ns();
 
         PID_FILTER
         UID_FILTER
 
         bufs.update(&tid, (u64*)&buf);
+        start_ns.update(&tid, &ts);
         return 0;
 }
 
-int probe_SSL_read_exit(struct pt_regs *ctx, void *ssl, void *buf, int num) {
+static int SSL_exit(struct pt_regs *ctx, int rw) {
+        int ret;
         u32 zero = 0;
         u64 pid_tgid = bpf_get_current_pid_tgid();
         u32 pid = pid_tgid >> 32;
         u32 tid = (u32)pid_tgid;
         u32 uid = bpf_get_current_uid_gid();
-        int ret;
+        u64 ts = bpf_ktime_get_ns();
 
         PID_FILTER
         UID_FILTER
@@ -151,20 +150,26 @@
         if (bufp == 0)
                 return 0;
 
+        u64 *tsp = start_ns.lookup(&tid);
+        if (tsp == 0)
+                return 0;
+
         int len = PT_REGS_RC(ctx);
-        if (len <= 0) // read failed
+        if (len <= 0) // no data
                 return 0;
 
         struct probe_SSL_data_t *data = ssl_data.lookup(&zero);
         if (!data)
                 return 0;
 
-        data->timestamp_ns = bpf_ktime_get_ns();
+        data->timestamp_ns = ts;
+        data->delta_ns = ts - *tsp;
         data->pid = pid;
         data->tid = tid;
         data->uid = uid;
         data->len = (u32)len;
         data->buf_filled = 0;
+        data->rw = rw;
         u32 buf_copy_size = min((size_t)MAX_BUF_SIZE, (size_t)len);
 
         bpf_get_current_comm(&data->comm, sizeof(data->comm));
@@ -173,13 +178,76 @@
                 ret = bpf_probe_read_user(&data->buf, buf_copy_size, (char *)*bufp);
 
         bufs.delete(&tid);
+        start_ns.delete(&tid);
 
         if (!ret)
                 data->buf_filled = 1;
         else
                 buf_copy_size = 0;
 
-        perf_SSL_read.perf_submit(ctx, data, EVENT_SIZE(buf_copy_size));
+        perf_SSL_rw.perf_submit(ctx, data, EVENT_SIZE(buf_copy_size));
+        return 0;
+}
+
+int probe_SSL_read_exit(struct pt_regs *ctx) {
+        return (SSL_exit(ctx, 0));
+}
+
+int probe_SSL_write_exit(struct pt_regs *ctx) {
+        return (SSL_exit(ctx, 1));
+}
+
+BPF_PERF_OUTPUT(perf_SSL_do_handshake);
+
+// Record the SSL_do_handshake() entry time for the calling thread.
+int probe_SSL_do_handshake_enter(struct pt_regs *ctx, void *ssl) {
+        u64 pid_tgid = bpf_get_current_pid_tgid();
+        u32 pid = pid_tgid >> 32;
+        u32 tid = (u32)pid_tgid;
+        u32 uid = bpf_get_current_uid_gid(); // read by the uid filter below
+        u64 ts = bpf_ktime_get_ns();
+        PID_FILTER
+        UID_FILTER
+        start_ns.update(&tid, &ts); // consumed by the matching exit probe
+        return 0;
+}
+
+int probe_SSL_do_handshake_exit(struct pt_regs *ctx) {
+        u32 zero = 0;
+        u64 pid_tgid = bpf_get_current_pid_tgid();
+        u32 pid = pid_tgid >> 32;
+        u32 tid = (u32)pid_tgid;
+        u32 uid = bpf_get_current_uid_gid();
+        u64 ts = bpf_ktime_get_ns();
+        int ret;
+
+        PID_FILTER
+        UID_FILTER
+
+        u64 *tsp = start_ns.lookup(&tid);
+        if (tsp == 0)
+                return 0;
+
+        ret = PT_REGS_RC(ctx);
+        if (ret <= 0) // handshake failed
+                return 0;
+
+        struct probe_SSL_data_t *data = ssl_data.lookup(&zero);
+        if (!data)
+                return 0;
+
+        data->timestamp_ns = ts;
+        data->delta_ns = ts - *tsp;
+        data->pid = pid;
+        data->tid = tid;
+        data->uid = uid;
+        data->len = ret;
+        data->buf_filled = 0;
+        data->rw = 2;
+        bpf_get_current_comm(&data->comm, sizeof(data->comm));
+        start_ns.delete(&tid);
+
+        perf_SSL_do_handshake.perf_submit(ctx, data, EVENT_SIZE(0));
         return 0;
 }
 """
@@ -208,59 +276,92 @@
 # need to stash the buffer address in a map on the function entry and read it
 # on its exit (Mark Drayton)
 #
+def attach_openssl(lib):
+    """Attach SSL_write/SSL_read (and optional handshake) probes to lib."""
+    target_pid = args.pid or -1
+    for sym, exit_fn in (("SSL_write", "probe_SSL_write_exit"),
+                         ("SSL_read", "probe_SSL_read_exit")):
+        b.attach_uprobe(name=lib, sym=sym,
+                        fn_name="probe_SSL_rw_enter", pid=target_pid)
+        b.attach_uretprobe(name=lib, sym=sym, fn_name=exit_fn, pid=target_pid)
+    if args.latency and args.handshake:
+        # use lib (not a hard-coded "ssl") so --extra-lib targets are covered
+        b.attach_uprobe(name=lib, sym="SSL_do_handshake",
+                        fn_name="probe_SSL_do_handshake_enter", pid=target_pid)
+        b.attach_uretprobe(name=lib, sym="SSL_do_handshake",
+                           fn_name="probe_SSL_do_handshake_exit", pid=target_pid)
+
+def attach_gnutls(lib):
+    """Attach entry/exit probes to gnutls_record_send/recv in lib."""
+    target_pid = args.pid or -1
+    # (symbol, exit probe); both symbols share the same entry probe
+    for sym, exit_fn in (("gnutls_record_send", "probe_SSL_write_exit"),
+                         ("gnutls_record_recv", "probe_SSL_read_exit")):
+        b.attach_uprobe(name=lib, sym=sym,
+                        fn_name="probe_SSL_rw_enter", pid=target_pid)
+        b.attach_uretprobe(name=lib, sym=sym, fn_name=exit_fn, pid=target_pid)
+
+def attach_nss(lib):
+    """Attach entry/exit probes to PR_Write/PR_Send/PR_Read/PR_Recv in lib."""
+    target_pid = args.pid or -1
+    # (symbol, exit probe) pairs, listed in the order they are attached
+    probes = (
+        ("PR_Write", "probe_SSL_write_exit"),
+        ("PR_Send", "probe_SSL_write_exit"),
+        ("PR_Read", "probe_SSL_read_exit"),
+        ("PR_Recv", "probe_SSL_read_exit"),
+    )
+
+    for sym, exit_fn in probes:
+        # every symbol shares one entry probe; only the exit probe differs
+        b.attach_uprobe(name=lib, sym=sym,
+                        fn_name="probe_SSL_rw_enter", pid=target_pid)
+        b.attach_uretprobe(name=lib, sym=sym,
+                           fn_name=exit_fn, pid=target_pid)
+
+
+LIB_TRACERS = {
+    "openssl": attach_openssl,
+    "gnutls": attach_gnutls,
+    "nss": attach_nss,
+}
+
+
 if args.openssl:
-    b.attach_uprobe(name="ssl", sym="SSL_write", fn_name="probe_SSL_write",
-                    pid=args.pid or -1)
-    b.attach_uprobe(name="ssl", sym="SSL_read", fn_name="probe_SSL_read_enter",
-                    pid=args.pid or -1)
-    b.attach_uretprobe(name="ssl", sym="SSL_read",
-                       fn_name="probe_SSL_read_exit", pid=args.pid or -1)
-
+    attach_openssl("ssl")
 if args.gnutls:
-    b.attach_uprobe(name="gnutls", sym="gnutls_record_send",
-                    fn_name="probe_SSL_write", pid=args.pid or -1)
-    b.attach_uprobe(name="gnutls", sym="gnutls_record_recv",
-                    fn_name="probe_SSL_read_enter", pid=args.pid or -1)
-    b.attach_uretprobe(name="gnutls", sym="gnutls_record_recv",
-                       fn_name="probe_SSL_read_exit", pid=args.pid or -1)
-
+    attach_gnutls("gnutls")
 if args.nss:
-    b.attach_uprobe(name="nspr4", sym="PR_Write", fn_name="probe_SSL_write",
-                    pid=args.pid or -1)
-    b.attach_uprobe(name="nspr4", sym="PR_Send", fn_name="probe_SSL_write",
-                    pid=args.pid or -1)
-    b.attach_uprobe(name="nspr4", sym="PR_Read", fn_name="probe_SSL_read_enter",
-                    pid=args.pid or -1)
-    b.attach_uretprobe(name="nspr4", sym="PR_Read",
-                       fn_name="probe_SSL_read_exit", pid=args.pid or -1)
-    b.attach_uprobe(name="nspr4", sym="PR_Recv", fn_name="probe_SSL_read_enter",
-                    pid=args.pid or -1)
-    b.attach_uretprobe(name="nspr4", sym="PR_Recv",
-                       fn_name="probe_SSL_read_exit", pid=args.pid or -1)
+    attach_nss("nspr4")
+
+
+if args.extra_lib:
+    for lib_type, lib_path in args.extra_lib:
+        LIB_TRACERS[lib_type](lib_path)
 
 # define output data structure in Python
 
 
 # header
-header = "%-12s %-18s %-16s %-7s %-6s" % ("FUNC", "TIME(s)", "COMM", "PID", "LEN")
+header = "%-12s %-18s %-16s %-7s %-7s" % ("FUNC", "TIME(s)", "COMM", "PID", "LEN")
 
 if args.extra:
     header += " %-7s %-7s" % ("UID", "TID")
 
+if args.latency:
+    header += " %-7s" % ("LAT(ms)")
+
 print(header)
 # process event
 start = 0
 
+def print_event_rw(cpu, data, size):
+    print_event(cpu, data, size, "perf_SSL_rw")
 
-def print_event_write(cpu, data, size):
-    print_event(cpu, data, size, "WRITE/SEND", "perf_SSL_write")
+def print_event_handshake(cpu, data, size):
+    print_event(cpu, data, size, "perf_SSL_do_handshake")
 
-
-def print_event_read(cpu, data, size):
-    print_event(cpu, data, size, "READ/RECV", "perf_SSL_read")
-
-
-def print_event(cpu, data, size, rw, evt):
+def print_event(cpu, data, size, evt):
     global start
     event = b[evt].event(data)
     if event.len <= args.max_buffer_size:
@@ -283,6 +384,8 @@
         start = event.timestamp_ns
     time_s = (float(event.timestamp_ns - start)) / 1000000000
 
+    lat_str = "%.3f" % (event.delta_ns / 1000000) if event.delta_ns else "N/A"
+
     s_mark = "-" * 5 + " DATA " + "-" * 5
 
     e_mark = "-" * 5 + " END DATA " + "-" * 5
@@ -297,6 +400,9 @@
     if args.extra:
         base_fmt += " %(uid)-7d %(tid)-7d"
 
+    if args.latency:
+        base_fmt += " %(lat)-7s"
+
     fmt = ''.join([base_fmt, "\n%(begin)s\n%(data)s\n%(end)s\n\n"])
     if args.hexdump:
         unwrapped_data = binascii.hexlify(buf)
@@ -304,9 +410,16 @@
     else:
         data = buf.decode('utf-8', 'replace')
 
+    rw_event = {
+        0: "READ/RECV",
+        1: "WRITE/SEND",
+        2: "HANDSHAKE"
+    }
+
     fmt_data = {
-        'func': rw,
+        'func': rw_event[event.rw],
         'time': time_s,
+        'lat': lat_str,
         'comm': event.comm.decode('utf-8', 'replace'),
         'pid': event.pid,
         'tid': event.tid,
@@ -317,11 +430,14 @@
         'data': data
     }
 
-    print(fmt % fmt_data)
+    # use base_fmt if no buf filled
+    if buf_size == 0:
+        print(base_fmt % fmt_data)
+    else:
+        print(fmt % fmt_data)
 
-
-b["perf_SSL_write"].open_perf_buffer(print_event_write)
-b["perf_SSL_read"].open_perf_buffer(print_event_read)
+b["perf_SSL_rw"].open_perf_buffer(print_event_rw)
+b["perf_SSL_do_handshake"].open_perf_buffer(print_event_handshake)
 while 1:
     try:
         b.perf_buffer_poll()
diff --git a/tools/sslsniff_example.txt b/tools/sslsniff_example.txt
index fa36c40..905f8a0 100644
--- a/tools/sslsniff_example.txt
+++ b/tools/sslsniff_example.txt
@@ -103,10 +103,75 @@
 characters.
 
 
+Use -l or --latency option to show function latency, and show handshake latency
+by using both -l and --handshake. This is useful for SSL/TLS performance
+analysis. Tracing output of "echo | openssl s_client -connect example.com:443":
+
+# ./sslsniff.py -l --handshake
+FUNC         TIME(s)            COMM             PID     LEN    LAT(ms)
+WRITE/SEND   0.000000000        openssl          10377   1      0.005
+----- DATA -----
+
+
+----- END DATA -----
+
+Trace localhost server instead of example.com. It takes 0.7ms for server
+handshake before secure connection is ready for initial SSL_read or SSL_write.
+
+# ./sslsniff.py -l --handshake
+FUNC         TIME(s)            COMM             PID     LEN    LAT(ms)
+HANDSHAKE    0.000000000        nginx            7081    1      0.699
+WRITE/SEND   0.000132180        openssl          14800   1      0.010
+----- DATA -----
+
+
+----- END DATA -----
+
+READ/RECV    0.000136583        nginx            7081    1      0.004
+----- DATA -----
+
+
+----- END DATA -----
+
+Tracing output of "echo | gnutls-cli -p 443 example.com":
+
+# ./sslsniff.py -l --handshake
+FUNC         TIME(s)            COMM             PID     LEN    LAT(ms)
+WRITE/SEND   0.000000000        gnutls-cli       43554   1      0.012
+----- DATA -----
+
+
+----- END DATA -----
+
+Tracing output of "echo | gnutls-cli -p 443 --insecure localhost":
+
+# ./sslsniff.py -l --handshake
+FUNC         TIME(s)            COMM             PID     LEN    LAT(ms)
+HANDSHAKE    0.000000000        nginx            7081    1      0.710
+WRITE/SEND   0.000045126        gnutls-cli       43752   1      0.014
+----- DATA -----
+
+
+----- END DATA -----
+
+READ/RECV    0.000049464        nginx            7081    1      0.004
+----- DATA -----
+
+
+----- END DATA -----
+
+Tracing a few extra libraries (useful for docker containers and other isolated
+apps):
+
+# ./sslsniff.py --extra-lib openssl:/var/lib/docker/overlay2/l/S4EMHE/lib/libssl.so.1.1
+
+
+
 USAGE message:
 
 usage: sslsniff.py [-h] [-p PID] [-u UID] [-x] [-c COMM] [-o] [-g] [-n] [-d]
-                   [--hexdump] [--max-buffer-size MAX_BUFFER_SIZE]
+                   [--hexdump] [--max-buffer-size MAX_BUFFER_SIZE] [-l]
+                   [--handshake] [--extra-lib EXTRA_LIB]
 
 Sniff SSL data
 
@@ -124,6 +189,14 @@
                         UTF-8
   --max-buffer-size MAX_BUFFER_SIZE
                         Size of captured buffer
+  -l, --latency         show function latency
+  --handshake           show SSL handshake latency, enabled only if latency
+                        option is on. 
+  --extra-lib EXTRA_LIB
+                        Intercept calls from extra library
+                        (format: lib_type:lib_path)
+
+
 
 examples:
     ./sslsniff              # sniff OpenSSL and GnuTLS functions
@@ -135,3 +208,6 @@
     ./sslsniff --no-nss     # don't show NSS calls
     ./sslsniff --hexdump    # show data as hex instead of trying to decode it as UTF-8
     ./sslsniff -x           # show process UID and TID
+    ./sslsniff -l           # show function latency
+    ./sslsniff -l --handshake  # show SSL handshake latency
+    ./sslsniff --extra-lib openssl:/path/libssl.so.1.1 # sniff extra library
diff --git a/tools/swapin.py b/tools/swapin.py
index e94000a..67a10db 100755
--- a/tools/swapin.py
+++ b/tools/swapin.py
@@ -74,11 +74,11 @@
 
     if not args.notime:
         print(strftime("%H:%M:%S"))
-    print("%-16s %-6s %s" % ("COMM", "PID", "COUNT"))
+    print("%-16s %-7s %s" % ("COMM", "PID", "COUNT"))
     counts = b.get_table("counts")
     for k, v in sorted(counts.items(),
 		       key=lambda counts: counts[1].value):
-        print("%-16s %-6d %d" % (k.comm, k.pid, v.value))
+        print("%-16s %-7d %d" % (k.comm, k.pid, v.value))
     counts.clear()
     print()
 
diff --git a/tools/syncsnoop.py b/tools/syncsnoop.py
index e5fa78e..e96cd3c 100755
--- a/tools/syncsnoop.py
+++ b/tools/syncsnoop.py
@@ -15,6 +15,7 @@
 
 from __future__ import print_function
 from bcc import BPF
+import sys
 
 # load BPF program
 b = BPF(text="""
@@ -40,6 +41,7 @@
 def print_event(cpu, data, size):
     event = b["events"].event(data)
     print("%-18.9f sync()" % (float(event.ts) / 1000000))
+    sys.stdout.flush()
 
 # loop with callback to print_event
 b["events"].open_perf_buffer(print_event)
diff --git a/tools/tcpaccept.py b/tools/tcpaccept.py
index d3e4414..b2ace4f 100755
--- a/tools/tcpaccept.py
+++ b/tools/tcpaccept.py
@@ -116,7 +116,7 @@
         return 0;
 
     // check this is TCP
-    u8 protocol = 0;
+    u16 protocol = 0;
     // workaround for reading the sk_protocol bitfield:
 
     // Following comments add by Joe Yin:
@@ -132,7 +132,12 @@
     int gso_max_segs_offset = offsetof(struct sock, sk_gso_max_segs);
     int sk_lingertime_offset = offsetof(struct sock, sk_lingertime);
 
-    if (sk_lingertime_offset - gso_max_segs_offset == 4)
+
+    // Since kernel v5.6 sk_protocol is its own u16 field and gso_max_segs
+    // precedes sk_lingertime.
+    if (sk_lingertime_offset - gso_max_segs_offset == 2)
+        protocol = newsk->sk_protocol;
+    else if (sk_lingertime_offset - gso_max_segs_offset == 4)
         // 4.10+ with little endian
 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
         protocol = *(u8 *)((u64)&newsk->sk_gso_max_segs - 3);
diff --git a/tools/tcpcong.py b/tools/tcpcong.py
new file mode 100755
index 0000000..671cd11
--- /dev/null
+++ b/tools/tcpcong.py
@@ -0,0 +1,559 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# tcpcong  Measure tcp congestion control status duration.
+#           For Linux, uses BCC, eBPF.
+#
+# USAGE: tcpcong [-h] [-T] [-L] [-R] [-u] [-d] [interval] [outputs]
+#
+# Copyright (c) Ping Gan.
+#
+# 27-Jan-2022   Ping Gan   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+from time import sleep, strftime
+from struct import pack
+from socket import inet_ntop, AF_INET, AF_INET6
+from struct import pack
+import argparse
+
+examples = """examples:
+    ./tcpcong                 # show tcp congestion status duration
+    ./tcpcong 1 10            # show 1 second summaries, 10 times
+    ./tcpcong -L 3000-3006 1  # 1s summaries, local port 3000-3006
+    ./tcpcong -R 5000-5005 1  # 1s summaries, remote port 5000-5005
+    ./tcpcong -uT 1           # 1s summaries, microseconds, and timestamps
+    ./tcpcong -d              # show the duration as histograms
+"""
+
+parser = argparse.ArgumentParser(
+    description="Summarize tcp socket congestion control status duration",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-L", "--localport",
+            help="trace local ports only")
+parser.add_argument("-R", "--remoteport",
+            help="trace the dest ports only")
+parser.add_argument("-T", "--timestamp", action="store_true",
+    help="include timestamp on output")
+parser.add_argument("-d", "--dist", action="store_true",
+    help="show distributions as histograms")
+parser.add_argument("-u", "--microseconds", action="store_true",
+    help="output in microseconds")
+parser.add_argument("interval", nargs="?", default=99999999,
+    help="output interval, in seconds")
+parser.add_argument("outputs", nargs="?", default=99999999,
+    help="number of outputs")
+parser.add_argument("--ebpf", action="store_true",
+    help=argparse.SUPPRESS)
+args = parser.parse_args()
+countdown = int(args.outputs)
+debug = 0
+
+start_rport = end_rport = -1
+if args.remoteport:
+    rports = args.remoteport.split("-")
+    if (len(rports) != 2) and (len(rports) != 1):
+        print("unrecognized remote port range")
+        exit(1)
+    if len(rports) == 2:
+        start_rport = int(rports[0])
+        end_rport = int(rports[1])
+    else:
+        start_rport = int(rports[0])
+        end_rport = int(rports[0])
+if start_rport > end_rport:
+    tmp = start_rport
+    start_rport = end_rport
+    end_rport = tmp
+
+start_lport = end_lport = -1
+if args.localport:
+    lports = args.localport.split("-")
+    if (len(lports) != 2) and (len(lports) != 1):
+        print("unrecognized local port range")
+        exit(1)
+    if len(lports) == 2:
+        start_lport = int(lports[0])
+        end_lport = int(lports[1])
+    else:
+        start_lport = int(lports[0])
+        end_lport = int(lports[0])
+if start_lport > end_lport:
+    tmp = start_lport
+    start_lport = end_lport
+    end_lport = tmp
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <net/sock.h>
+#include <bcc/proto.h>
+#include <net/tcp.h>
+#include <net/inet_connection_sock.h>
+
+typedef struct ipv4_flow_key {
+    u32 saddr;
+    u32 daddr;
+    u16 lport;
+    u16 dport;
+} ipv4_flow_key_t;
+
+typedef struct ipv6_flow_key {
+    unsigned __int128 saddr;
+    unsigned __int128 daddr;
+    u16 lport;
+    u16 dport;
+} ipv6_flow_key_t;
+
+typedef struct process_key {
+    char comm[TASK_COMM_LEN];
+    u32  tid;
+} process_key_t;
+
+typedef struct ipv4_flow_val {
+    ipv4_flow_key_t ipv4_key;
+    u16  cong_state;
+} ipv4_flow_val_t;
+
+typedef struct ipv6_flow_val {
+    ipv6_flow_key_t ipv6_key;
+    u16  cong_state;
+} ipv6_flow_val_t;
+
+BPF_HASH(start_ipv4, process_key_t, ipv4_flow_val_t);
+BPF_HASH(start_ipv6, process_key_t, ipv6_flow_val_t);
+SOCK_STORE_DEF
+
+typedef struct data_val {
+    DEF_TEXT
+    u64  last_ts;
+    u16  last_cong_stat;
+} data_val_t;
+
+typedef struct cong {
+    u8  cong_stat:5,
+        ca_inited:1,
+        ca_setsockopt:1,
+        ca_dstlocked:1;
+} cong_status_t;
+
+BPF_HASH(ipv4_stat, ipv4_flow_key_t, data_val_t);
+BPF_HASH(ipv6_stat, ipv6_flow_key_t, data_val_t);
+
+HIST_TABLE
+
+static int entry_state_update_func(struct sock *sk)
+{
+    u16 dport = 0, lport = 0;
+    u32 tid = bpf_get_current_pid_tgid();
+    process_key_t key = {0};
+    bpf_get_current_comm(&key.comm, sizeof(key.comm));
+    key.tid = tid;
+
+    u64 family = sk->__sk_common.skc_family;
+    struct inet_connection_sock *icsk = inet_csk(sk);
+    cong_status_t cong_status;
+    bpf_probe_read_kernel(&cong_status, sizeof(cong_status),
+        (void *)((long)&icsk->icsk_retransmits) - 1);
+    if (family == AF_INET) {
+        ipv4_flow_val_t ipv4_val = {0};
+        ipv4_val.ipv4_key.saddr = sk->__sk_common.skc_rcv_saddr;
+        ipv4_val.ipv4_key.daddr = sk->__sk_common.skc_daddr;
+        ipv4_val.ipv4_key.lport = sk->__sk_common.skc_num;
+        dport = sk->__sk_common.skc_dport;
+        dport = ntohs(dport);
+        lport = ipv4_val.ipv4_key.lport;
+        FILTER_LPORT
+        FILTER_DPORT
+        ipv4_val.ipv4_key.dport = dport;
+        ipv4_val.cong_state = cong_status.cong_stat + 1;
+        start_ipv4.update(&key, &ipv4_val);
+    } else if (family == AF_INET6) {
+        ipv6_flow_val_t ipv6_val = {0};
+        bpf_probe_read_kernel(&ipv6_val.ipv6_key.saddr,
+            sizeof(ipv6_val.ipv6_key.saddr),
+            &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
+        bpf_probe_read_kernel(&ipv6_val.ipv6_key.daddr,
+            sizeof(ipv6_val.ipv6_key.daddr),
+            &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
+        ipv6_val.ipv6_key.lport = sk->__sk_common.skc_num;
+        dport = sk->__sk_common.skc_dport;
+        dport = ntohs(dport);
+        lport = ipv6_val.ipv6_key.lport;
+        FILTER_LPORT
+        FILTER_DPORT
+        ipv6_val.ipv6_key.dport = dport;
+        ipv6_val.cong_state = cong_status.cong_stat + 1;
+        start_ipv6.update(&key, &ipv6_val);
+    }
+    SOCK_STORE_ADD
+    return 0;
+}
+
+static int ret_state_update_func(struct sock *sk)
+{
+    u64 ts, ts1;
+    u16 family, last_cong_state;
+    u16 dport = 0, lport = 0;
+    u32 tid = bpf_get_current_pid_tgid();
+    process_key_t key = {0};
+    bpf_get_current_comm(&key.comm, sizeof(key.comm));
+    key.tid = tid;
+
+    struct inet_connection_sock *icsk = inet_csk(sk);
+    cong_status_t cong_status;
+    bpf_probe_read_kernel(&cong_status, sizeof(cong_status),
+        (void *)((long)&icsk->icsk_retransmits) - 1);
+    data_val_t *datap, data = {0};
+    STATE_KEY
+    bpf_probe_read_kernel(&family, sizeof(family),
+        &sk->__sk_common.skc_family);
+    if (family == AF_INET) {
+        ipv4_flow_val_t *val4 = start_ipv4.lookup(&key);
+        if (val4 == 0) {
+            SOCK_STORE_DEL
+            return 0; //missed
+        }
+        ipv4_flow_key_t keyv4 = {0};
+        bpf_probe_read_kernel(&keyv4, sizeof(ipv4_flow_key_t),
+            &(val4->ipv4_key));
+        dport = keyv4.dport;
+        lport = keyv4.lport;
+        FILTER_LPORT
+        FILTER_DPORT
+        datap = ipv4_stat.lookup(&keyv4);
+        if (datap == 0) {
+            data.last_ts = bpf_ktime_get_ns();
+            data.last_cong_stat = val4->cong_state;
+            ipv4_stat.update(&keyv4, &data);
+        } else {
+            last_cong_state = val4->cong_state;
+            if ((cong_status.cong_stat + 1) != last_cong_state) {
+                ts1 = bpf_ktime_get_ns();
+                ts = ts1 - datap->last_ts;
+                datap->last_ts = ts1;
+                datap->last_cong_stat = cong_status.cong_stat + 1;
+                ts /= 1000;
+                STORE
+            }
+        }
+        start_ipv4.delete(&key);
+    } else if (family == AF_INET6) {
+        ipv6_flow_val_t *val6 = start_ipv6.lookup(&key);
+        if (val6 == 0) {
+            SOCK_STORE_DEL
+            return 0; //missed
+        }
+        ipv6_flow_key_t keyv6 = {0};
+        bpf_probe_read_kernel(&keyv6, sizeof(ipv6_flow_key_t),
+            &(val6->ipv6_key));
+        dport = keyv6.dport;
+        lport = keyv6.lport;
+        FILTER_LPORT
+        FILTER_DPORT
+        datap = ipv6_stat.lookup(&keyv6);
+        if (datap == 0) {
+            data.last_ts = bpf_ktime_get_ns();
+            data.last_cong_stat = val6->cong_state;
+            ipv6_stat.update(&keyv6, &data);
+        } else {
+            last_cong_state = val6->cong_state;
+            if ((cong_status.cong_stat + 1) != last_cong_state) {
+                ts1 = bpf_ktime_get_ns();
+                ts = ts1 - datap->last_ts;
+                datap->last_ts = ts1;
+                datap->last_cong_stat = (cong_status.cong_stat + 1);
+                ts /= 1000;
+                STORE
+            }
+        }
+        start_ipv6.delete(&key);
+    }
+    SOCK_STORE_DEL
+    return 0;
+}
+"""
+
+kprobe_program = """
+int entry_func(struct pt_regs *ctx, struct sock *sk)
+{
+    return entry_state_update_func(sk);
+}
+
+int ret_func(struct pt_regs *ctx)
+{
+    u32 tid = bpf_get_current_pid_tgid();
+    process_key_t key = {0};
+    bpf_get_current_comm(&key.comm, sizeof(key.comm));
+    key.tid = tid;
+    struct sock **sockpp;
+    sockpp = sock_store.lookup(&key);
+    if (sockpp == 0) {
+        return 0; //miss the entry
+    }
+    struct sock *sk = *sockpp;
+    return ret_state_update_func(sk);
+}
+"""
+
+kfunc_program = """
+KFUNC_PROBE(tcp_fastretrans_alert, struct sock *sk)
+{
+    return entry_state_update_func(sk);
+}
+
+KRETFUNC_PROBE(tcp_fastretrans_alert, struct sock *sk)
+{
+    return ret_state_update_func(sk);
+}
+
+KFUNC_PROBE(tcp_enter_cwr, struct sock *sk)
+{
+    return entry_state_update_func(sk);
+}
+
+KRETFUNC_PROBE(tcp_enter_cwr, struct sock *sk)
+{
+    return ret_state_update_func(sk);
+}
+
+KFUNC_PROBE(tcp_enter_loss, struct sock *sk)
+{
+    return entry_state_update_func(sk);
+}
+
+KRETFUNC_PROBE(tcp_enter_loss, struct sock *sk)
+{
+    return ret_state_update_func(sk);
+}
+
+KFUNC_PROBE(tcp_enter_recovery, struct sock *sk)
+{
+    return entry_state_update_func(sk);
+}
+
+KRETFUNC_PROBE(tcp_enter_recovery, struct sock *sk)
+{
+    return ret_state_update_func(sk);
+}
+
+KFUNC_PROBE(tcp_process_tlp_ack, struct sock *sk)
+{
+    return entry_state_update_func(sk);
+}
+
+KRETFUNC_PROBE(tcp_process_tlp_ack, struct sock *sk)
+{
+    return ret_state_update_func(sk);
+}
+"""
+
+# code replace
+is_support_kfunc = BPF.support_kfunc()
+if is_support_kfunc:
+    bpf_text += kfunc_program
+    bpf_text = bpf_text.replace('SOCK_STORE_DEF', '')
+    bpf_text = bpf_text.replace('SOCK_STORE_ADD', '')
+    bpf_text = bpf_text.replace('SOCK_STORE_DEL', '')
+else:
+    bpf_text += kprobe_program
+    bpf_text = bpf_text.replace('SOCK_STORE_DEF',
+                   'BPF_HASH(sock_store, process_key_t, struct sock *);')
+    bpf_text = bpf_text.replace('SOCK_STORE_ADD',
+                   'sock_store.update(&key, &sk);')
+    bpf_text = bpf_text.replace('SOCK_STORE_DEL',
+                   'sock_store.delete(&key);')
+
+if args.localport:
+    bpf_text = bpf_text.replace('FILTER_LPORT',
+        'if (lport < %d || lport > %d) { return 0; }'
+        % (start_lport, end_lport))
+else:
+    bpf_text = bpf_text.replace('FILTER_LPORT', '')
+
+if args.remoteport:
+    bpf_text = bpf_text.replace('FILTER_DPORT',
+        'if (dport < %d || dport > %d) { return 0; }'
+        % (start_rport, end_rport))
+else:
+    bpf_text = bpf_text.replace('FILTER_DPORT', '')
+
+table_def_text = """
+    u64  open_dura;
+    u64  loss_dura;
+    u64  disorder_dura;
+    u64  recover_dura;
+    u64  cwr_dura;
+    u64  total_changes;
+"""
+
+store_text = """
+                datap->total_changes += 1;
+                if (last_cong_state == (TCP_CA_Open + 1)) {
+                    datap->open_dura += ts;
+                } else if (last_cong_state == (TCP_CA_Disorder + 1)) {
+                    datap->disorder_dura += ts;
+                } else if (last_cong_state == (TCP_CA_CWR + 1)) {
+                    datap->cwr_dura += ts;
+                } else if (last_cong_state == (TCP_CA_Recovery + 1)) {
+                    datap->recover_dura += ts;
+                } else if (last_cong_state == (TCP_CA_Loss + 1)) {
+                    datap->loss_dura += ts;
+                }
+"""
+
+store_dist_text = """
+                if (last_cong_state == (TCP_CA_Open + 1)) {
+                    key_s.state = TCP_CA_Open;
+                } else if (last_cong_state == (TCP_CA_Disorder + 1)) {
+                    key_s.state = TCP_CA_Disorder;
+                } else if (last_cong_state == (TCP_CA_CWR + 1)) {
+                    key_s.state = TCP_CA_CWR;
+                } else if (last_cong_state == (TCP_CA_Recovery + 1)) {
+                    key_s.state = TCP_CA_Recovery;
+                } else if (last_cong_state == (TCP_CA_Loss + 1)) {
+                    key_s.state = TCP_CA_Loss;
+                }
+                TIME_UNIT
+                key_s.slot = bpf_log2l(ts);
+                dist.atomic_increment(key_s);
+"""
+
+hist_table_text = """
+typedef struct congest_state_key {
+    u32  state;
+    u64  slot;
+}congest_state_key_t;
+
+BPF_HISTOGRAM(dist, congest_state_key_t);
+"""
+
+if args.dist:
+    bpf_text = bpf_text.replace('DEF_TEXT', '')
+    bpf_text = bpf_text.replace('STORE', store_dist_text)
+    bpf_text = bpf_text.replace('STATE_KEY',
+        'congest_state_key_t key_s = {0};')
+    bpf_text = bpf_text.replace('HIST_TABLE', hist_table_text)
+    if args.microseconds:
+        bpf_text = bpf_text.replace('TIME_UNIT', '')
+    else:
+        bpf_text = bpf_text.replace('TIME_UNIT', 'ts /= 1000;')
+else:
+    bpf_text = bpf_text.replace('DEF_TEXT', table_def_text)
+    bpf_text = bpf_text.replace('STORE', store_text)
+    bpf_text = bpf_text.replace('STATE_KEY', '')
+    bpf_text = bpf_text.replace('HIST_TABLE', '')
+
+
+if debug or args.ebpf:
+    print(bpf_text)
+    if args.ebpf:
+        exit()
+
+# load BPF program
+b = BPF(text=bpf_text)
+
+if not is_support_kfunc:
+    # Every tcp congestion control status update goes through one of the
+    # five kernel functions below, so each one gets the same probe pair:
+    # "entry_func" on entry and "ret_func" on return.
+    for traced_fn in (
+        "tcp_fastretrans_alert",
+        "tcp_enter_cwr",
+        "tcp_process_tlp_ack",
+        "tcp_enter_loss",
+        "tcp_enter_recovery",
+    ):
+        b.attach_kprobe(event=traced_fn, fn_name="entry_func")
+        b.attach_kretprobe(event=traced_fn, fn_name="ret_func")
+
+print("Tracing tcp congestion control status duration... Hit Ctrl-C to end.")
+
+
+def cong_state_to_name(state):
+    """Return the display name for a kernel congestion state index."""
+    # Positions must follow the kernel's state numbering ("open" is 0).
+    return ("open", "disorder", "cwr", "recovery", "loss")[state]
+
+# output
+exiting = 0 if args.interval else 1
+ipv6_stat = b.get_table("ipv6_stat")
+ipv4_stat = b.get_table("ipv4_stat")
+if args.dist:
+    dist = b.get_table("dist")
+# Durations in the stat tables are handled as microseconds: printed as-is
+# when args.microseconds is set, otherwise divided by 1000 and labelled "ms".
+label = "us" if args.microseconds else "ms"
+divisor = 1 if args.microseconds else 1000
+
+
+def _print_flow_table(stat, family, width, suffix):
+    """Print one per-flow duration table, ordered by local port.
+
+    The header is printed whenever the table is non-empty; rows whose
+    total_changes is zero are left out. Header and rows are built from the
+    same column widths so that the columns line up.
+
+    The module-level label and divisor select the unit that is shown.
+
+    Args:
+        stat: BPF table mapping a flow key to its accumulated durations.
+        family: AF_INET or AF_INET6; selects how key addresses are decoded.
+        width: width of each "address/port" column.
+        suffix: text appended to the address column titles ("" or "6").
+    """
+    # Shared prefix for the two address columns of the header and the rows.
+    addr_fmt = "%%-%ds %%-%ds " % (width, width)
+    if stat:
+        print((addr_fmt + "%-7s %-6s %-7s %-7s %-6s %-5s") % (
+            "LAddrPort" + suffix, "RAddrPort" + suffix, "Open_" + label,
+            "Dod_" + label, "Rcov_" + label, "Cwr_" + label,
+            "Los_" + label, "Chgs"))
+    for k, v in sorted(stat.items(), key=lambda item: item[0].lport):
+        if v.total_changes == 0:
+            # Nothing was recorded for this flow; do not print a row.
+            continue
+        if family == AF_INET:
+            # IPv4 keys hold the address as one integer; repack it to bytes.
+            laddr = inet_ntop(AF_INET, pack("I", k.saddr))
+            raddr = inet_ntop(AF_INET, pack("I", k.daddr))
+        else:
+            laddr = inet_ntop(AF_INET6, bytes(k.saddr))
+            raddr = inet_ntop(AF_INET6, bytes(k.daddr))
+        # Floor division keeps the values integral for the %d columns.
+        print((addr_fmt + "%-7d %-6d %-7d %-7d %-6d %-5d") % (
+            laddr + "/" + str(k.lport), raddr + "/" + str(k.dport),
+            v.open_dura // divisor, v.disorder_dura // divisor,
+            v.recover_dura // divisor, v.cwr_dura // divisor,
+            v.loss_dura // divisor, v.total_changes))
+
+
+# Main loop: wait one interval (or until Ctrl-C), print what was collected,
+# clear the tables, and exit after an interrupt or once countdown hits zero.
+while True:
+    try:
+        sleep(int(args.interval))
+    except KeyboardInterrupt:
+        # Still print the data gathered so far before exiting below.
+        exiting = 1
+
+    print()
+    if args.timestamp:
+        print("%-8s\n" % strftime("%H:%M:%S"), end="")
+    if args.dist:
+        # One log2 histogram per congestion state, named by
+        # cong_state_to_name.
+        dist.print_log2_hist("usecs" if args.microseconds else "msecs",
+            "tcp_congest_state", section_print_fn=cong_state_to_name)
+        dist.clear()
+    else:
+        _print_flow_table(ipv4_stat, AF_INET, 21, "")
+        _print_flow_table(ipv6_stat, AF_INET6, 32, "6")
+    # Reset the per-flow tables so the next interval starts from zero.
+    ipv4_stat.clear()
+    ipv6_stat.clear()
+    countdown -= 1
+    if exiting or countdown == 0:
+        exit()
diff --git a/tools/tcpcong_example.txt b/tools/tcpcong_example.txt
new file mode 100644
index 0000000..837c3b2
--- /dev/null
+++ b/tools/tcpcong_example.txt
@@ -0,0 +1,491 @@
+Demonstrations of tcpcong, the Linux eBPF/bcc version.
+
+This tool traces the Linux kernel's TCP congestion control state change
+functions, calculates and records how long each connection spends in every
+state, and finally prints the result as a table or histogram, which can be
+used for evaluating the performance of a TCP congestion control algorithm.
+
+For example:
+
+./tcpcong 
+Tracing tcp congestion control status duration... Hit Ctrl-C to end.
+^C
+LAddrPort            RAddrPort             Open_ms Dod_ms Rcov_ms Cwr_ms  Los_ms Chgs 
+192.168.219.3/34968   192.168.219.4/19230   884     12     102     507     0      2721
+192.168.219.3/34976   192.168.219.4/19230   869     12     133     490     0      2737
+192.168.219.3/34982   192.168.219.4/19230   807     0      0       699     0      3158
+192.168.219.3/34988   192.168.219.4/19230   892     16     88      508     0      2540
+192.168.219.3/38946   192.168.219.4/19229   894     13     97      500     0      2697
+192.168.219.3/38950   192.168.219.4/19229   840     10     73      579     1      1840
+192.168.219.3/38970   192.168.219.4/19229   862     17     91      534     0      2339
+192.168.219.3/38982   192.168.219.4/19229   812     13     92      587     0      2102
+192.168.219.3/39070   192.168.219.1/19225   855     7      61      580     0      2826
+192.168.219.3/39098   192.168.219.1/19225   880     8      47      568     0      2557
+192.168.219.3/39112   192.168.219.1/19225   674     2      10      819     0      2867
+192.168.219.3/39120   192.168.219.1/19225   757     1      11      736     0      2978
+192.168.219.3/41146   192.168.219.1/19227   736     1      10      758     0      2972
+192.168.219.3/41162   192.168.219.1/19227   662     2      10      830     0      2889
+192.168.219.3/41178   192.168.219.1/19227   646     2      11      846     0      2858
+192.168.219.3/41192   192.168.219.1/19227   812     9      67      615     0      2204
+192.168.219.3/43856   192.168.219.2/19225   745     1      5       754     0      3067
+192.168.219.3/43858   192.168.219.2/19225   827     4      36      636     0      2130
+192.168.219.3/43872   192.168.219.2/19225   739     0      2       764     0      3035
+192.168.219.3/43880   192.168.219.2/19225   747     0      3       756     0      3144
+192.168.219.3/47230   192.168.219.2/19227   830     4      38      632     0      2554
+192.168.219.3/47242   192.168.219.2/19227   782     3      32      687     0      2136
+192.168.219.3/47272   192.168.219.2/19227   611     1      3       889     0      2629
+192.168.219.3/47294   192.168.219.2/19227   832     3      38      630     0      2631
+192.168.219.3/49716   192.168.219.2/19226   846     4      44      610     0      2562
+192.168.219.3/49746   192.168.219.2/19226   765     0      4       736     0      2998
+192.168.219.3/49760   192.168.219.2/19226   812     2      47      644     0      2273
+192.168.219.3/49766   192.168.219.2/19226   724     0      2       779     0      3106
+192.168.219.3/54076   192.168.219.1/19226   690     1      9       804     0      2939
+192.168.219.3/54096   192.168.219.1/19226   715     2      10      778     0      2974
+192.168.219.3/54114   192.168.219.1/19226   878     6      61      558     0      2742
+192.168.219.3/54120   192.168.219.1/19226   738     0      9       757     0      2959
+192.168.219.3/60926   192.168.219.4/19228   711     11     80      702     0      1870
+192.168.219.3/60930   192.168.219.4/19228   785     0      0       720     0      3325
+192.168.219.3/60942   192.168.219.4/19228   762     0      1       743     0      3342
+192.168.219.3/60948   192.168.219.4/19228   877     11     102     514     0      2654
+
+The example shows, in milliseconds, how long every tcp socket spent in each
+congestion status. The open_ms column is the duration in open status, in
+which cwnd can increase; dod_ms is the duration in disorder status, in which
+disordered packets were received; rcov_ms is the duration in recovery status,
+in which 3 duplicated acks were received; cwr_ms is the duration in cwr
+status, in which an explicit congestion notification and two acks were
+received to reduce the cwnd; los_ms is the duration in loss status. The last
+column, chgs, is the total number of status changes of the socket.
+
+An interval can be provided, and also optionally a count. Eg, printing output
+every 1 second, and including timestamps (-T):
+./tcpcong -T 1 3 
+Tracing tcp congestion control status duration... Hit Ctrl-C to end.
+
+07:37:55
+LAddrPort            RAddrPort             Open_ms Dod_ms Rcov_ms Cwr_ms  Los_ms Chgs 
+192.168.219.3/34968   192.168.219.4/19230   742     15     82      311     0      1678
+192.168.219.3/34976   192.168.219.4/19230   700     12     98      340     0      1965
+192.168.219.3/34982   192.168.219.4/19230   634     0      1       516     0      2471
+192.168.219.3/34988   192.168.219.4/19230   692     12     94      354     0      1941
+192.168.219.3/38946   192.168.219.4/19229   722     12     90      323     0      2006
+192.168.219.3/38950   192.168.219.4/19229   420     7      264     439     1      951
+192.168.219.3/38970   192.168.219.4/19229   724     14     90      323     0      1986
+192.168.219.3/38982   192.168.219.4/19229   686     13     87      365     0      1675
+192.168.219.3/39070   192.168.219.1/19225   653     5      46      446     0      1998
+192.168.219.3/39098   192.168.219.1/19225   667     4      38      440     0      2098
+192.168.219.3/39112   192.168.219.1/19225   606     0      1       543     0      2146
+192.168.219.3/39120   192.168.219.1/19225   492     0      205     453     0      1916
+192.168.219.3/41146   192.168.219.1/19227   583     0      3       564     0      2332
+192.168.219.3/41162   192.168.219.1/19227   536     0      1       613     0      2192
+192.168.219.3/41178   192.168.219.1/19227   499     0      2       649     0      2064
+192.168.219.3/41192   192.168.219.1/19227   622     6      34      488     0      1660
+192.168.219.3/43856   192.168.219.2/19225   555     0      1       593     0      2359
+192.168.219.3/43858   192.168.219.2/19225   618     3      28      502     0      1773
+192.168.219.3/43872   192.168.219.2/19225   558     0      0       592     0      2318
+192.168.219.3/43880   192.168.219.2/19225   580     0      1       569     0      2303
+192.168.219.3/47230   192.168.219.2/19227   646     1      18      485     0      1776
+192.168.219.3/47242   192.168.219.2/19227   634     0      20      495     0      1582
+192.168.219.3/47272   192.168.219.2/19227   463     0      1       687     0      1854
+192.168.219.3/47294   192.168.219.2/19227   636     2      27      486     0      1901
+192.168.219.3/49716   192.168.219.2/19226   646     2      28      475     0      1832
+192.168.219.3/49746   192.168.219.2/19226   583     0      0       567     0      2333
+192.168.219.3/49760   192.168.219.2/19226   628     2      26      495     0      1755
+192.168.219.3/49766   192.168.219.2/19226   558     0      0       592     0      2412
+192.168.219.3/54076   192.168.219.1/19226   581     0      2       567     0      2042
+192.168.219.3/54096   192.168.219.1/19226   554     0      2       594     0      2239
+192.168.219.3/54114   192.168.219.1/19226   685     4      33      427     0      1859
+192.168.219.3/54120   192.168.219.1/19226   611     0      3       537     0      2322
+192.168.219.3/60926   192.168.219.4/19228   681     20     101     347     0      1636
+192.168.219.3/60930   192.168.219.4/19228   616     0      1       532     0      2310
+192.168.219.3/60942   192.168.219.4/19228   607     0      1       543     0      2433
+192.168.219.3/60948   192.168.219.4/19228   597     11     76      293     0      1641
+
+07:37:57
+LAddrPort            RAddrPort             Open_ms Dod_ms Rcov_ms Cwr_ms  Los_ms Chgs 
+192.168.219.3/34968   192.168.219.4/19230   469     9      255     265     0      1305
+192.168.219.3/34976   192.168.219.4/19230   580     11     91      316     0      1916
+192.168.219.3/34982   192.168.219.4/19230   566     0      0       433     0      2092
+192.168.219.3/34988   192.168.219.4/19230   583     9      63      345     0      1871
+192.168.219.3/38946   192.168.219.4/19229   449     16     69      464     0      1425
+192.168.219.3/38950   192.168.219.4/19229   569     10     68      349     0      1848
+192.168.219.3/38970   192.168.219.4/19229   573     20     66      339     0      1839
+192.168.219.3/38982   192.168.219.4/19229   553     9      60      378     0      1483
+192.168.219.3/39070   192.168.219.1/19225   471     3      243     280     0      1279
+192.168.219.3/39098   192.168.219.1/19225   598     4      37      355     0      1717
+192.168.219.3/39112   192.168.219.1/19225   522     0      1       476     0      1816
+192.168.219.3/39120   192.168.219.1/19225   518     0      1       480     0      2031
+192.168.219.3/41146   192.168.219.1/19227   500     0      3       497     0      1996
+192.168.219.3/41162   192.168.219.1/19227   448     0      2       548     0      1849
+192.168.219.3/41178   192.168.219.1/19227   441     0      4       554     0      1693
+192.168.219.3/41192   192.168.219.1/19227   555     4      34      405     0      1341
+192.168.219.3/43856   192.168.219.2/19225   471     0      3       525     0      2118
+192.168.219.3/43858   192.168.219.2/19225   541     1      25      430     0      1446
+192.168.219.3/43872   192.168.219.2/19225   483     0      1       516     0      2044
+192.168.219.3/43880   192.168.219.2/19225   492     0      0       507     0      2073
+192.168.219.3/47230   192.168.219.2/19227   581     3      29      385     0      1453
+192.168.219.3/47242   192.168.219.2/19227   571     2      22      403     0      1292
+192.168.219.3/47272   192.168.219.2/19227   393     0      0       604     0      1516
+192.168.219.3/47294   192.168.219.2/19227   575     2      27      393     0      1660
+192.168.219.3/49716   192.168.219.2/19226   584     1      25      389     0      1582
+192.168.219.3/49746   192.168.219.2/19226   513     0      0       486     0      2017
+192.168.219.3/49760   192.168.219.2/19226   560     1      24      412     0      1370
+192.168.219.3/49766   192.168.219.2/19226   474     0      0       525     0      2121
+192.168.219.3/54076   192.168.219.1/19226   504     0      1       494     0      1724
+192.168.219.3/54096   192.168.219.1/19226   490     0      2       507     0      1906
+192.168.219.3/54114   192.168.219.1/19226   611     3      25      360     0      1560
+192.168.219.3/54120   192.168.219.1/19226   520     0      1       479     0      2010
+192.168.219.3/60926   192.168.219.4/19228   527     9      53      408     0      1473
+192.168.219.3/60930   192.168.219.4/19228   551     0      0       448     0      1951
+192.168.219.3/60942   192.168.219.4/19228   538     0      0       461     0      2038
+192.168.219.3/60948   192.168.219.4/19228   511     9      68      295     1      1701
+
+07:37:58
+LAddrPort            RAddrPort             Open_ms Dod_ms Rcov_ms Cwr_ms  Los_ms Chgs
+192.168.219.3/34968   192.168.219.4/19230   293     1      226     211     0      755
+192.168.219.3/34976   192.168.219.4/19230   424     4      36      354     0      1489
+192.168.219.3/34982   192.168.219.4/19230   552     0      0       446     0      2249
+192.168.219.3/34988   192.168.219.4/19230   493     4      42      327     0      1715
+192.168.219.3/38946   192.168.219.4/19229   425     4      37      340     41     1478
+192.168.219.3/38950   192.168.219.4/19229   465     5      45      335     0      1586
+192.168.219.3/38970   192.168.219.4/19229   531     5      41      420     0      1863
+192.168.219.3/38982   192.168.219.4/19229   525     5      41      427     0      1625
+192.168.219.3/39070   192.168.219.1/19225   576     4      44      374     0      1787
+192.168.219.3/39098   192.168.219.1/19225   596     6      41      355     0      1782
+192.168.219.3/39112   192.168.219.1/19225   501     0      3       494     0      1887
+192.168.219.3/39120   192.168.219.1/19225   511     0      4       483     0      2070
+192.168.219.3/41146   192.168.219.1/19227   503     0      3       492     0      2068
+192.168.219.3/41162   192.168.219.1/19227   449     1      3       545     0      1962
+192.168.219.3/41178   192.168.219.1/19227   445     0      5       546     0      1907
+192.168.219.3/41192   192.168.219.1/19227   436     4      248     309     0      1208
+192.168.219.3/43856   192.168.219.2/19225   480     0      0       519     0      2108
+192.168.219.3/43858   192.168.219.2/19225   534     3      24      437     0      1644
+192.168.219.3/43872   192.168.219.2/19225   480     0      0       519     0      2068
+192.168.219.3/43880   192.168.219.2/19225   490     0      0       508     0      2083
+192.168.219.3/47230   192.168.219.2/19227   561     3      22      411     0      1556
+192.168.219.3/47242   192.168.219.2/19227   550     2      22      424     0      1485
+192.168.219.3/47272   192.168.219.2/19227   398     0      0       601     0      1537
+192.168.219.3/47294   192.168.219.2/19227   551     1      19      427     0      1712
+192.168.219.3/49716   192.168.219.2/19226   570     1      20      405     0      1712
+192.168.219.3/49746   192.168.219.2/19226   494     0      0       503     0      2052
+192.168.219.3/49760   192.168.219.2/19226   547     1      18      431     0      1673
+192.168.219.3/49766   192.168.219.2/19226   497     0      0       501     0      1983
+192.168.219.3/54076   192.168.219.1/19226   495     0      4       499     0      1849
+192.168.219.3/54096   192.168.219.1/19226   485     0      4       508     0      2037
+192.168.219.3/54114   192.168.219.1/19226   603     5      37      354     0      1671
+192.168.219.3/54120   192.168.219.1/19226   516     0      1       482     0      2047
+192.168.219.3/60926   192.168.219.4/19228   543     5      39      412     0      1708
+192.168.219.3/60930   192.168.219.4/19228   530     0      0       469     0      2096
+192.168.219.3/60942   192.168.219.4/19228   510     0      0       489     0      2234
+192.168.219.3/60948   192.168.219.4/19228   565     4      61      367     0      1956
+
+A local port range and a remote port range can be specified, and also
+optionally a count. Eg printing output every 1 second, and including
+timestamps (-T), for local ports 30000-40000 and remote ports 19225-19227:
+./tcpcong -T -L 30000-40000 -R 19225-19227 1 3    
+Tracing tcp congestion control status duration... Hit Ctrl-C to end.
+
+07:39:11
+LAddrPort            RAddrPort             Open_ms Dod_ms Rcov_ms Cwr_ms  Los_ms Chgs 
+192.168.219.3/39070   192.168.219.1/19225   668     4      32      455     0      1706
+192.168.219.3/39098   192.168.219.1/19225   692     4      38      424     0      2110
+192.168.219.3/39112   192.168.219.1/19225   564     0      2       593     0      2291
+192.168.219.3/39120   192.168.219.1/19225   599     0      4       555     0      2387
+
+07:39:12
+LAddrPort            RAddrPort             Open_ms Dod_ms Rcov_ms Cwr_ms  Los_ms Chgs 
+192.168.219.3/39070   192.168.219.1/19225   576     3      27      391     0      1525
+192.168.219.3/39098   192.168.219.1/19225   580     3      36      379     0      1893
+192.168.219.3/39112   192.168.219.1/19225   474     1      10      512     0      2009
+192.168.219.3/39120   192.168.219.1/19225   505     1      9       483     0      2022
+
+07:39:13
+LAddrPort            RAddrPort             Open_ms Dod_ms Rcov_ms Cwr_ms  Los_ms Chgs 
+192.168.219.3/39070   192.168.219.1/19225   546     6      27      418     0      1659
+192.168.219.3/39098   192.168.219.1/19225   564     4      40      390     0      1937
+192.168.219.3/39112   192.168.219.1/19225   479     0      3       514     0      2008
+192.168.219.3/39120   192.168.219.1/19225   515     0      4       479     0      1982
+
+The (-u) option can be specified for recording the duration as microseconds.
+Eg printing output every 1 second, and including timestamps (-T) and 
+microseconds (-u) for local ports 30000-40000 and remote ports 19225-19227:
+./tcpcong -T -u -L 30000-40000 -R 19225-19227 1 3 
+Tracing tcp congestion control status duration... Hit Ctrl-C to end.
+
+07:39:44
+LAddrPort            RAddrPort             Open_us Dod_us Rcov_us Cwr_us  Los_us Chgs 
+192.168.219.3/39070   192.168.219.1/19225   600971  3232   38601   509796  0      1843
+192.168.219.3/39098   192.168.219.1/19225   667184  5585   26285   453575  0      1969
+192.168.219.3/39112   192.168.219.1/19225   580982  22     1502    569479  0      2210
+192.168.219.3/39120   192.168.219.1/19225   600280  201    955     550752  0      2327
+
+07:39:45
+LAddrPort            RAddrPort             Open_us Dod_us Rcov_us Cwr_us  Los_us Chgs 
+192.168.219.3/39070   192.168.219.1/19225   567189  2029   25966   404698  0      1612
+192.168.219.3/39098   192.168.219.1/19225   597201  2263   24073   376454  0      1578
+192.168.219.3/39112   192.168.219.1/19225   500792  846    9297    489264  0      1850
+192.168.219.3/39120   192.168.219.1/19225   518700  94     749     480171  0      1967
+
+07:39:46
+LAddrPort            RAddrPort             Open_us Dod_us Rcov_us Cwr_us  Los_us Chgs 
+192.168.219.3/39070   192.168.219.1/19225   587340  5324   37035   370066  0      1602
+192.168.219.3/39098   192.168.219.1/19225   532986  5630   31624   345336  0      1319
+192.168.219.3/39112   192.168.219.1/19225   481936  1129   6244    510235  0      1909
+192.168.219.3/39120   192.168.219.1/19225   507196  316    6200    485737  0      1957
+
+
+An IPv6 example with the (-u) option is shown below.
+Eg printing output every 1 second, and including timestamps (-T) and
+microseconds (-u) for local ports 30000-40000 and remote ports 19225-19227:
+./tcpcong.py -T -u -L 30000-40000 -R 19225-19227 1 3
+Tracing tcp congestion control status duration... Hit Ctrl-C to end.
+
+11:31:55
+LAddrPort6                       RAddrPort6                       Open_us Dod_us Rcov_us Cwr_us  Los_us Chgs
+fe80::bace:f6ff:fe14:d21c/32810  fe80::bace:f6ff:fe43:fe96/19226  876328  0       0       137957 0      235
+fe80::bace:f6ff:fe14:d21c/32812  fe80::bace:f6ff:fe43:fe96/19226  757739  0       0       283114 0      590
+fe80::bace:f6ff:fe14:d21c/32814  fe80::bace:f6ff:fe43:fe96/19226  855426  0       0       136134 0      231
+fe80::bace:f6ff:fe14:d21c/32816  fe80::bace:f6ff:fe43:fe96/19226  695271  0       0       345443 0      606
+
+11:31:56
+LAddrPort6                       RAddrPort6                       Open_us Dod_us Rcov_us Cwr_us  Los_us Chgs
+fe80::bace:f6ff:fe14:d21c/32810  fe80::bace:f6ff:fe43:fe96/19226  913925  0       0       81995  0      92
+fe80::bace:f6ff:fe14:d21c/32812  fe80::bace:f6ff:fe43:fe96/19226  785024  0       0       202819 0      777
+fe80::bace:f6ff:fe14:d21c/32814  fe80::bace:f6ff:fe43:fe96/19226  920963  0       0       80715  0      111
+fe80::bace:f6ff:fe14:d21c/32816  fe80::bace:f6ff:fe43:fe96/19226  765172  0       0       222897 0      734
+
+11:31:57
+LAddrPort6                       RAddrPort6                       Open_us Dod_us Rcov_us Cwr_us  Los_us Chgs
+fe80::bace:f6ff:fe14:d21c/32810  fe80::bace:f6ff:fe43:fe96/19226  839563  0       0       98313  0      149
+fe80::bace:f6ff:fe14:d21c/32812  fe80::bace:f6ff:fe43:fe96/19226  534816  0       0       329683 0      495
+fe80::bace:f6ff:fe14:d21c/32814  fe80::bace:f6ff:fe43:fe96/19226  841706  103     2404    91273  0      132
+fe80::bace:f6ff:fe14:d21c/32816  fe80::bace:f6ff:fe43:fe96/19226  633320  0       0       286584 0      565
+
+
+The distribution of congestion status duration can be printed as a histogram
+with the -d option, and also optionally a count. Eg printing output every
+1 second in microseconds, and including timestamps (-T):
+./tcpcong.py -d -u -T 1 2
+Tracing tcp congestion control status duration... Hit Ctrl-C to end.
+
+07:40:12
+
+tcp_congest_state = cwr
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 11       |                                        |
+         8 -> 15         : 10       |                                        |
+        16 -> 31         : 25       |                                        |
+        32 -> 63         : 58       |                                        |
+        64 -> 127        : 117      |                                        |
+       128 -> 255        : 2924     |*******                                 |
+       256 -> 511        : 16249    |****************************************|
+       512 -> 1023       : 15340    |*************************************   |
+      1024 -> 2047       : 786      |*                                       |
+      2048 -> 4095       : 24       |                                        |
+      4096 -> 8191       : 7        |                                        |
+      8192 -> 16383      : 0        |                                        |
+     16384 -> 32767      : 0        |                                        |
+     32768 -> 65535      : 1        |                                        |
+     65536 -> 131071     : 0        |                                        |
+    131072 -> 262143     : 1        |                                        |
+
+tcp_congest_state = recovery
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 1        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 2        |                                        |
+        32 -> 63         : 9        |                                        |
+        64 -> 127        : 28       |                                        |
+       128 -> 255        : 895      |******************************          |
+       256 -> 511        : 1190     |****************************************|
+       512 -> 1023       : 384      |************                            |
+      1024 -> 2047       : 66       |**                                      |
+      2048 -> 4095       : 2        |                                        |
+      4096 -> 8191       : 4        |                                        |
+      8192 -> 16383      : 2        |                                        |
+     16384 -> 32767      : 0        |                                        |
+     32768 -> 65535      : 0        |                                        |
+     65536 -> 131071     : 0        |                                        |
+    131072 -> 262143     : 2        |                                        |
+
+tcp_congest_state = disorder
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 21       |**                                      |
+         8 -> 15         : 59       |*****                                   |
+        16 -> 31         : 102      |*********                               |
+        32 -> 63         : 256      |*************************               |
+        64 -> 127        : 409      |****************************************|
+       128 -> 255        : 255      |************************                |
+       256 -> 511        : 104      |**********                              |
+       512 -> 1023       : 8        |                                        |
+
+tcp_congest_state = open
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 11       |                                        |
+         4 -> 7          : 266      |                                        |
+         8 -> 15         : 319      |                                        |
+        16 -> 31         : 396      |*                                       |
+        32 -> 63         : 488      |*                                       |
+        64 -> 127        : 695      |**                                      |
+       128 -> 255        : 4395     |*************                           |
+       256 -> 511        : 13329    |****************************************|
+       512 -> 1023       : 12727    |**************************************  |
+      1024 -> 2047       : 3327     |*********                               |
+      2048 -> 4095       : 601      |*                                       |
+      4096 -> 8191       : 45       |                                        |
+      8192 -> 16383      : 3        |                                        |
+     16384 -> 32767      : 1        |                                        |
+     32768 -> 65535      : 1        |                                        |
+
+tcp_congest_state = loss
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 1        |****************************************|
+       256 -> 511        : 1        |****************************************|
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 0        |                                        |
+      2048 -> 4095       : 0        |                                        |
+      4096 -> 8191       : 0        |                                        |
+      8192 -> 16383      : 0        |                                        |
+     16384 -> 32767      : 0        |                                        |
+     32768 -> 65535      : 1        |****************************************|
+
+07:40:14
+
+tcp_congest_state = cwr
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 7        |                                        |
+         4 -> 7          : 162      |                                        |
+         8 -> 15         : 591      |*                                       |
+        16 -> 31         : 462      |                                        |
+        32 -> 63         : 351      |                                        |
+        64 -> 127        : 441      |                                        |
+       128 -> 255        : 4073     |********                                |
+       256 -> 511        : 19188    |****************************************|
+       512 -> 1023       : 16127    |*********************************       |
+      1024 -> 2047       : 725      |*                                       |
+      2048 -> 4095       : 23       |                                        |
+      4096 -> 8191       : 3        |                                        |
+      8192 -> 16383      : 2        |                                        |
+     16384 -> 32767      : 0        |                                        |
+     32768 -> 65535      : 4        |                                        |
+     65536 -> 131071     : 0        |                                        |
+    131072 -> 262143     : 2        |                                        |
+
+tcp_congest_state = recovery
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 3        |                                        |
+         8 -> 15         : 16       |                                        |
+        16 -> 31         : 22       |                                        |
+        32 -> 63         : 37       |*                                       |
+        64 -> 127        : 75       |**                                      |
+       128 -> 255        : 1082     |*******************************         |
+       256 -> 511        : 1364     |****************************************|
+       512 -> 1023       : 369      |**********                              |
+      1024 -> 2047       : 67       |*                                       |
+      2048 -> 4095       : 0        |                                        |
+      4096 -> 8191       : 2        |                                        |
+      8192 -> 16383      : 0        |                                        |
+     16384 -> 32767      : 0        |                                        |
+     32768 -> 65535      : 0        |                                        |
+     65536 -> 131071     : 0        |                                        |
+    131072 -> 262143     : 5        |                                        |
+
+tcp_congest_state = disorder
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 4        |                                        |
+         4 -> 7          : 43       |****                                    |
+         8 -> 15         : 107      |***********                             |
+        16 -> 31         : 145      |***************                         |
+        32 -> 63         : 312      |*********************************       |
+        64 -> 127        : 370      |****************************************|
+       128 -> 255        : 256      |***************************             |
+       256 -> 511        : 101      |**********                              |
+       512 -> 1023       : 8        |                                        |
+
+tcp_congest_state = open
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 21       |                                        |
+         4 -> 7          : 359      |                                        |
+         8 -> 15         : 516      |*                                       |
+        16 -> 31         : 484      |*                                       |
+        32 -> 63         : 522      |*                                       |
+        64 -> 127        : 818      |**                                      |
+       128 -> 255        : 5081     |*************                           |
+       256 -> 511        : 14852    |****************************************|
+       512 -> 1023       : 13753    |*************************************   |
+      1024 -> 2047       : 3224     |********                                |
+      2048 -> 4095       : 598      |*                                       |
+      4096 -> 8191       : 41       |                                        |
+      8192 -> 16383      : 0        |                                        |
+     16384 -> 32767      : 1        |                                        |
+     32768 -> 65535      : 0        |                                        |
+     65536 -> 131071     : 0        |                                        |
+    131072 -> 262143     : 1        |                                        |
+
+tcp_congest_state = loss
+     usecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 1        |******                                  |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 2        |*************                           |
+       512 -> 1023       : 6        |****************************************|
+      1024 -> 2047       : 0        |                                        |
+      2048 -> 4095       : 0        |                                        |
+      4096 -> 8191       : 0        |                                        |
+      8192 -> 16383      : 0        |                                        |
+     16384 -> 32767      : 0        |                                        |
+     32768 -> 65535      : 1        |******                                  |
+
+
+USAGE:
+./tcpcong -h
+usage: tcpcong [-h] [-L LOCALPORT] [-R REMOTEPORT] [-T] [-d] [-u]
+                  [interval] [outputs]
+
+Summarize tcp socket congestion control status duration
+
+positional arguments:
+  interval              output interval, in seconds
+  outputs               number of outputs
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -L LOCALPORT, --localport LOCALPORT
+                        trace local ports only
+  -R REMOTEPORT, --remoteport REMOTEPORT
+                        trace the dest ports only
+  -T, --timestamp       include timestamp on output
+  -d, --dist            show distributions as histograms
+  -u, --microseconds    output in microseconds
+
+examples:
+    ./tcpcong                 # show tcp congestion status duration
+    ./tcpcong 1 10            # show 1 second summaries, 10 times
+    ./tcpcong -L 3000-3006 1  # 1s summaries, local port 3000-3006
+    ./tcpcong -R 5000-5005 1  # 1s summaries, remote port 5000-5005
+    ./tcpcong -uT 1           # 1s summaries, microseconds, and timestamps
+    ./tcpcong -d              # show the duration as histograms
diff --git a/tools/tcpconnect.py b/tools/tcpconnect.py
index 8b49c70..531459e 100755
--- a/tools/tcpconnect.py
+++ b/tools/tcpconnect.py
@@ -178,7 +178,7 @@
     u16 dport = skp->__sk_common.skc_dport;
 
     FILTER_PORT
-    
+
     FILTER_FAMILY
 
     if (ipver == 4) {
@@ -295,7 +295,7 @@
         return 0;
 
     struct msghdr *msghdr = (struct msghdr *)*msgpp;
-    if (msghdr->msg_iter.type != ITER_IOVEC)
+    if (msghdr->msg_iter.TYPE_FIELD != ITER_IOVEC)
         goto delete_and_return;
 
     int copied = (int)PT_REGS_RC(ctx);
@@ -361,6 +361,10 @@
 bpf_text = bpf_text.replace('FILTER_UID', '')
 
 if args.dns:
+    if BPF.kernel_struct_has_field(b'iov_iter', b'iter_type') == 1:
+        dns_bpf_text = dns_bpf_text.replace('TYPE_FIELD', 'iter_type')
+    else:
+        dns_bpf_text = dns_bpf_text.replace('TYPE_FIELD', 'type')
     bpf_text += dns_bpf_text
 
 if debug or args.ebpf:
@@ -380,12 +384,12 @@
         printb(b"%-6d" % event.uid, nl="")
     dest_ip = inet_ntop(AF_INET, pack("I", event.daddr)).encode()
     if args.lport:
-        printb(b"%-6d %-12.12s %-2d %-16s %-6d %-16s %-6d %s" % (event.pid,
+        printb(b"%-7d %-12.12s %-2d %-16s %-6d %-16s %-6d %s" % (event.pid,
             event.task, event.ip,
             inet_ntop(AF_INET, pack("I", event.saddr)).encode(), event.lport,
             dest_ip, event.dport, print_dns(dest_ip)))
     else:
-        printb(b"%-6d %-12.12s %-2d %-16s %-16s %-6d %s" % (event.pid,
+        printb(b"%-7d %-12.12s %-2d %-16s %-16s %-6d %s" % (event.pid,
             event.task, event.ip,
             inet_ntop(AF_INET, pack("I", event.saddr)).encode(),
             dest_ip, event.dport, print_dns(dest_ip)))
@@ -401,12 +405,12 @@
         printb(b"%-6d" % event.uid, nl="")
     dest_ip = inet_ntop(AF_INET6, event.daddr).encode()
     if args.lport:
-        printb(b"%-6d %-12.12s %-2d %-16s %-6d %-16s %-6d %s" % (event.pid,
+        printb(b"%-7d %-12.12s %-2d %-16s %-6d %-16s %-6d %s" % (event.pid,
             event.task, event.ip,
             inet_ntop(AF_INET6, event.saddr).encode(), event.lport,
             dest_ip, event.dport, print_dns(dest_ip)))
     else:
-        printb(b"%-6d %-12.12s %-2d %-16s %-16s %-6d %s" % (event.pid,
+        printb(b"%-7d %-12.12s %-2d %-16s %-16s %-6d %s" % (event.pid,
             event.task, event.ip,
             inet_ntop(AF_INET6, event.saddr).encode(),
             dest_ip, event.dport, print_dns(dest_ip)))
@@ -528,10 +532,10 @@
     if args.print_uid:
         print("%-6s" % ("UID"), end="")
     if args.lport:
-        print("%-6s %-12s %-2s %-16s %-6s %-16s %-6s" % ("PID", "COMM", "IP", "SADDR",
+        print("%-7s %-12s %-2s %-16s %-6s %-16s %-6s" % ("PID", "COMM", "IP", "SADDR",
             "LPORT", "DADDR", "DPORT"), end="")
     else:
-        print("%-6s %-12s %-2s %-16s %-16s %-6s" % ("PID", "COMM", "IP", "SADDR",
+        print("%-7s %-12s %-2s %-16s %-16s %-6s" % ("PID", "COMM", "IP", "SADDR",
             "DADDR", "DPORT"), end="")
     if args.dns:
         print(" QUERY")
diff --git a/tools/tcpconnlat.py b/tools/tcpconnlat.py
index 093f267..885b26d 100755
--- a/tools/tcpconnlat.py
+++ b/tools/tcpconnlat.py
@@ -231,13 +231,13 @@
             start_ts = event.ts_us
         print("%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), end="")
     if args.lport:
-        print("%-6d %-12.12s %-2d %-16s %-6d %-16s %-5d %.2f" % (event.pid,
+        print("%-7d %-12.12s %-2d %-16s %-6d %-16s %-5d %.2f" % (event.pid,
             event.task.decode('utf-8', 'replace'), event.ip,
             inet_ntop(AF_INET, pack("I", event.saddr)), event.lport,
             inet_ntop(AF_INET, pack("I", event.daddr)), event.dport,
             float(event.delta_us) / 1000))
     else:
-        print("%-6d %-12.12s %-2d %-16s %-16s %-5d %.2f" % (event.pid,
+        print("%-7d %-12.12s %-2d %-16s %-16s %-5d %.2f" % (event.pid,
             event.task.decode('utf-8', 'replace'), event.ip,
             inet_ntop(AF_INET, pack("I", event.saddr)),
             inet_ntop(AF_INET, pack("I", event.daddr)), event.dport,
@@ -251,13 +251,13 @@
             start_ts = event.ts_us
         print("%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), end="")
     if args.lport:
-        print("%-6d %-12.12s %-2d %-16s %-6d %-16s %-5d %.2f" % (event.pid,
+        print("%-7d %-12.12s %-2d %-16s %-6d %-16s %-5d %.2f" % (event.pid,
             event.task.decode('utf-8', 'replace'), event.ip,
             inet_ntop(AF_INET6, event.saddr), event.lport,
             inet_ntop(AF_INET6, event.daddr),
             event.dport, float(event.delta_us) / 1000))
     else:
-        print("%-6d %-12.12s %-2d %-16s %-16s %-5d %.2f" % (event.pid,
+        print("%-7d %-12.12s %-2d %-16s %-16s %-5d %.2f" % (event.pid,
             event.task.decode('utf-8', 'replace'), event.ip,
             inet_ntop(AF_INET6, event.saddr), inet_ntop(AF_INET6, event.daddr),
             event.dport, float(event.delta_us) / 1000))
@@ -266,10 +266,10 @@
 if args.timestamp:
     print("%-9s" % ("TIME(s)"), end="")
 if args.lport:
-    print("%-6s %-12s %-2s %-16s %-6s %-16s %-5s %s" % ("PID", "COMM",
+    print("%-7s %-12s %-2s %-16s %-6s %-16s %-5s %s" % ("PID", "COMM",
         "IP", "SADDR", "LPORT", "DADDR", "DPORT", "LAT(ms)"))
 else:
-    print("%-6s %-12s %-2s %-16s %-16s %-5s %s" % ("PID", "COMM", "IP",
+    print("%-7s %-12s %-2s %-16s %-16s %-5s %s" % ("PID", "COMM", "IP",
         "SADDR", "DADDR", "DPORT", "LAT(ms)"))
 
 # read events
diff --git a/tools/tcpretrans.py b/tools/tcpretrans.py
index 79b481b..79ff1ca 100755
--- a/tools/tcpretrans.py
+++ b/tools/tcpretrans.py
@@ -355,7 +355,7 @@
 # process event
 def print_ipv4_event(cpu, data, size):
     event = b["ipv4_events"].event(data)
-    print("%-8s %-6d %-2d %-20s %1s> %-20s" % (
+    print("%-8s %-7d %-2d %-20s %1s> %-20s" % (
         strftime("%H:%M:%S"), event.pid, event.ip,
         "%s:%d" % (inet_ntop(AF_INET, pack('I', event.saddr)), event.lport),
         type[event.type],
@@ -368,7 +368,7 @@
 
 def print_ipv6_event(cpu, data, size):
     event = b["ipv6_events"].event(data)
-    print("%-8s %-6d %-2d %-20s %1s> %-20s" % (
+    print("%-8s %-7d %-2d %-20s %1s> %-20s" % (
         strftime("%H:%M:%S"), event.pid, event.ip,
         "%s:%d" % (inet_ntop(AF_INET6, event.saddr), event.lport),
         type[event.type],
@@ -415,7 +415,7 @@
 # read events
 else:
     # header
-    print("%-8s %-6s %-2s %-20s %1s> %-20s" % ("TIME", "PID", "IP",
+    print("%-8s %-7s %-2s %-20s %1s> %-20s" % ("TIME", "PID", "IP",
         "LADDR:LPORT", "T", "RADDR:RPORT"), end='')
     if args.sequence:
         print(" %-12s %-10s" % ("STATE", "SEQ"))
diff --git a/tools/tcptop.py b/tools/tcptop.py
index c8bde8f..d369e13 100755
--- a/tools/tcptop.py
+++ b/tools/tcptop.py
@@ -281,14 +281,14 @@
     ipv4_recv_bytes.clear()
 
     if ipv4_throughput:
-        print("%-6s %-12s %-21s %-21s %6s %6s" % ("PID", "COMM",
+        print("%-7s %-12s %-21s %-21s %6s %6s" % ("PID", "COMM",
             "LADDR", "RADDR", "RX_KB", "TX_KB"))
 
     # output
     for k, (send_bytes, recv_bytes) in sorted(ipv4_throughput.items(),
                                               key=lambda kv: sum(kv[1]),
                                               reverse=True):
-        print("%-6d %-12.12s %-21s %-21s %6d %6d" % (k.pid,
+        print("%-7d %-12.12s %-21s %-21s %6d %6d" % (k.pid,
             k.name,
             k.laddr + ":" + str(k.lport),
             k.daddr + ":" + str(k.dport),
@@ -308,14 +308,14 @@
 
     if ipv6_throughput:
         # more than 80 chars, sadly.
-        print("\n%-6s %-12s %-32s %-32s %6s %6s" % ("PID", "COMM",
+        print("\n%-7s %-12s %-32s %-32s %6s %6s" % ("PID", "COMM",
             "LADDR6", "RADDR6", "RX_KB", "TX_KB"))
 
     # output
     for k, (send_bytes, recv_bytes) in sorted(ipv6_throughput.items(),
                                               key=lambda kv: sum(kv[1]),
                                               reverse=True):
-        print("%-6d %-12.12s %-32s %-32s %6d %6d" % (k.pid,
+        print("%-7d %-12.12s %-32s %-32s %6d %6d" % (k.pid,
             k.name,
             k.laddr + ":" + str(k.lport),
             k.daddr + ":" + str(k.dport),
diff --git a/tools/threadsnoop.py b/tools/threadsnoop.py
index 471b0c3..8adca2e 100755
--- a/tools/threadsnoop.py
+++ b/tools/threadsnoop.py
@@ -45,7 +45,7 @@
 except Exception:
     b.attach_uprobe(name="c", sym="pthread_create", fn_name="do_entry")
 
-print("%-10s %-6s %-16s %s" % ("TIME(ms)", "PID", "COMM", "FUNC"))
+print("%-10s %-7s %-16s %s" % ("TIME(ms)", "PID", "COMM", "FUNC"))
 
 start_ts = 0
 
@@ -58,7 +58,7 @@
     func = b.sym(event.start, event.pid)
     if (func == "[unknown]"):
         func = hex(event.start)
-    print("%-10d %-6d %-16s %s" % ((event.ts - start_ts) / 1000000,
+    print("%-10d %-7d %-16s %s" % ((event.ts - start_ts) / 1000000,
         event.pid, event.comm, func))
 
 b["events"].open_perf_buffer(print_event)
diff --git a/tools/trace.py b/tools/trace.py
index 0f6d90e..b51cccf 100755
--- a/tools/trace.py
+++ b/tools/trace.py
@@ -5,6 +5,7 @@
 #
 # usage: trace [-h] [-p PID] [-L TID] [-v] [-Z STRING_SIZE] [-S] [-c cgroup_path]
 #              [-M MAX_EVENTS] [-s SYMBOLFILES] [-T] [-t] [-K] [-U] [-a] [-I header]
+#              [-A]
 #              probe [probe ...]
 #
 # Licensed under the Apache License, Version 2.0 (the "License")
@@ -40,6 +41,8 @@
         uid = -1
         page_cnt = None
         build_id_enabled = False
+        aggregate = False
+        symcount = {}
 
         @classmethod
         def configure(cls, args):
@@ -58,6 +61,10 @@
                 cls.page_cnt = args.buffer_pages
                 cls.bin_cmp = args.bin_cmp
                 cls.build_id_enabled = args.sym_file_list is not None
+                cls.aggregate = args.aggregate
+                if cls.aggregate and cls.max_events is None:
+                        raise ValueError("-M/--max-events should be specified"
+                                         " with -A/--aggregate")
 
         def __init__(self, probe, string_size, kernel_stack, user_stack,
                      cgroup_map_name, name, msg_filter):
@@ -584,18 +591,20 @@
                 else:   # self.probe_type == 't'
                         return self.tp_event
 
-        def print_stack(self, bpf, stack_id, tgid):
+        def _stack_to_string(self, bpf, stack_id, tgid):
             if stack_id < 0:
-                print("        %d" % stack_id)
-                return
+                return ("        %d" % stack_id)
 
+            stackstr = ''
             stack = list(bpf.get_table(self.stacks_name).walk(stack_id))
             for addr in stack:
-                print("        ", end="")
+                stackstr += '        '
                 if Probe.print_address:
-                    print("%16x " % addr, end="")
-                print("%s" % (bpf.sym(addr, tgid,
-                                     show_module=True, show_offset=True)))
+                    stackstr += ("%16x " % addr)
+                symstr = bpf.sym(addr, tgid, show_module=True, show_offset=True)
+                stackstr += ('%s\n' % (symstr.decode('utf-8')))
+
+            return stackstr
 
         def _format_message(self, bpf, tgid, values):
                 # Replace each %K with kernel sym and %U with user sym in tgid
@@ -610,6 +619,11 @@
                                            show_module=True, show_offset=True)
                 return self.python_format % tuple(values)
 
+        def print_aggregate_events(self):
+                for k, v in sorted(self.symcount.items(), key=lambda item: \
+                                   item[1], reverse=True):
+                    print("%s-->COUNT %d\n\n" % (k, v), end="")
+
         def print_event(self, bpf, cpu, data, size):
                 # Cast as the generated structure type and display
                 # according to the format string in the probe.
@@ -621,32 +635,43 @@
                 msg = self._format_message(bpf, event.tgid, values)
                 if self.msg_filter and self.msg_filter not in msg:
                     return
+                eventstr = ''
                 if Probe.print_time:
                     time = strftime("%H:%M:%S") if Probe.use_localtime else \
                            Probe._time_off_str(event.timestamp_ns)
                     if Probe.print_unix_timestamp:
-                        print("%-17s " % time[:17], end="")
+                        eventstr += ("%-17s " % time[:17])
                     else:
-                        print("%-8s " % time[:8], end="")
+                        eventstr += ("%-8s " % time[:8])
                 if Probe.print_cpu:
-                    print("%-3s " % event.cpu, end="")
-                print("%-7d %-7d %-15s %-16s %s" %
+                    eventstr += ("%-3s " % event.cpu)
+                eventstr += ("%-7d %-7d %-15s %-16s %s\n" %
                       (event.tgid, event.pid,
                        event.comm.decode('utf-8', 'replace'),
                        self._display_function(), msg))
 
                 if self.kernel_stack:
-                        self.print_stack(bpf, event.kernel_stack_id, -1)
+                        eventstr += self._stack_to_string(bpf, event.kernel_stack_id, -1)
                 if self.user_stack:
-                        self.print_stack(bpf, event.user_stack_id, event.tgid)
-                if self.user_stack or self.kernel_stack:
+                        eventstr += self._stack_to_string(bpf, event.user_stack_id, event.tgid)
+
+                if self.aggregate is False:
+                    print(eventstr, end="")
+                    if self.kernel_stack or self.user_stack:
                         print("")
+                else:
+                    if eventstr in self.symcount:
+                        self.symcount[eventstr] += 1
+                    else:
+                        self.symcount[eventstr] = 1
 
                 Probe.event_count += 1
                 if Probe.max_events is not None and \
                    Probe.event_count >= Probe.max_events:
-                        exit()
-                sys.stdout.flush()
+                    if self.aggregate:
+                        self.print_aggregate_events()
+                    sys.stdout.flush()
+                    exit()
 
         def attach(self, bpf, verbose):
                 if len(self.library) == 0:
@@ -700,7 +725,7 @@
 trace kfree_skb+0x12
         Trace the kfree_skb kernel function after the instruction on the 0x12 offset
 trace 'do_sys_open "%s", arg2@user'
-        Trace the open syscall and print the filename. being opened @user is
+        Trace the open syscall and print the filename being opened. @user is
         added to arg2 in kprobes to ensure that char * should be copied from
         the userspace stack to the bpf stack. If not specified, previous
         behaviour is expected.
@@ -752,6 +777,9 @@
         to 53 (DNS; 13568 in big endian order)
 trace -I 'linux/fs_struct.h' 'mntns_install "users = %d", $task->fs->users'
         Trace the number of users accessing the file system of the current task
+trace -s /lib/x86_64-linux-gnu/libc.so.6,/bin/ping 'p:c:inet_pton' -U
+        Trace inet_pton system call and use the specified libraries/executables for
+        symbol resolution.
 """
 
         def __init__(self):
@@ -815,6 +843,8 @@
                        "as either full path, "
                        "or relative to current working directory, "
                        "or relative to default kernel header search path")
+                parser.add_argument("-A", "--aggregate", action="store_true",
+                  help="aggregate amount of each trace")
                 parser.add_argument("--ebpf", action="store_true",
                   help=argparse.SUPPRESS)
                 self.args = parser.parse_args()
diff --git a/tools/trace_example.txt b/tools/trace_example.txt
index 36010d6..ccefdaa 100644
--- a/tools/trace_example.txt
+++ b/tools/trace_example.txt
@@ -237,6 +237,32 @@
 need to do that here because `struct timespec` is used internally by the tool,
 so it always includes this header file.
 
+To aggregate identical trace events and count them, specify -A together with
+-M EVENTS. A typical workflow:
+1. 'top' shows that the sys CPU utilization is unusually high
+2. 'irqtop' shows that timer interrupts are more frequent than normal
+3. confirm the kernel timer setting frequency with 'funccount -i 1 clockevents_program_event'
+4. trace the timer setting with 'trace clockevents_program_event -K -A -M 1000'
+
+1294576 1294584 CPU 0/KVM       clockevents_program_event
+        clockevents_program_event+0x1 [kernel]
+        hrtimer_start_range_ns+0x209 [kernel]
+        start_sw_timer+0x173 [kvm]
+        restart_apic_timer+0x6c [kvm]
+        kvm_set_msr_common+0x442 [kvm]
+        __kvm_set_msr+0xa2 [kvm]
+        kvm_emulate_wrmsr+0x36 [kvm]
+        vcpu_enter_guest+0x326 [kvm]
+        kvm_arch_vcpu_ioctl_run+0xcc [kvm]
+        kvm_vcpu_ioctl+0x22f [kvm]
+        do_vfs_ioctl+0xa1 [kernel]
+        ksys_ioctl+0x60 [kernel]
+        __x64_sys_ioctl+0x16 [kernel]
+        do_syscall_64+0x59 [kernel]
+        entry_SYSCALL_64_after_hwframe+0x44 [kernel]
+-->COUNT 271
+...
+So we know that 271 of the most recent 1000 timer-setting events (~27%) came from this call stack.
 
 As a final example, let's trace open syscalls for a specific process. By
 default, tracing is system-wide, but the -p switch overrides this:
@@ -384,6 +410,7 @@
                         as either full path, or relative to current working
                         directory, or relative to default kernel header search
                         path
+  -A, --aggregate       aggregate amount of each trace
 
 EXAMPLES:
 
@@ -392,10 +419,11 @@
 trace kfree_skb+0x12
         Trace the kfree_skb kernel function after the instruction on the 0x12 offset
 trace 'do_sys_open "%s", arg2@user'
-        Trace the open syscall and print the filename being opened. @user is
+        Trace the open syscall and print the filename being opened. @user is
         added to arg2 in kprobes to ensure that char * should be copied from
         the userspace stack to the bpf stack. If not specified, previous
         behaviour is expected.
+
 trace 'do_sys_open "%s", arg2@user' -n main
         Trace the open syscall and only print event that process names containing "main"
 trace 'do_sys_open "%s", arg2@user' --uid 1001
@@ -420,6 +448,8 @@
         Trace the block_rq_complete kernel tracepoint and print # of tx sectors
 trace 'u:pthread:pthread_create (arg4 != 0)'
         Trace the USDT probe pthread_create when its 4th argument is non-zero
+trace 'u:pthread:libpthread:pthread_create (arg4 != 0)'
+        Ditto, but the provider name "libpthread" is specified.
 trace 'p::SyS_nanosleep(struct timespec *ts) "sleep for %lld ns", ts->tv_nsec'
         Trace the nanosleep syscall and print the sleep duration in ns
 trace -c /sys/fs/cgroup/system.slice/workload.service '__x64_sys_nanosleep' '__x64_sys_clone'
@@ -435,7 +465,7 @@
         in kernel/sched/sched.h which is in kernel source tree and not in kernel-devel
         package.  So this command needs to run at the kernel source tree root directory
         so that the added header file can be found by the compiler.
-trace -I 'net/sock.h' \\
+trace -I 'net/sock.h' \
       'udpv6_sendmsg(struct sock *sk) (sk->sk_dport == 13568)'
         Trace udpv6 sendmsg calls only if socket's destination port is equal
         to 53 (DNS; 13568 in big endian order)
@@ -444,4 +474,3 @@
 trace -s /lib/x86_64-linux-gnu/libc.so.6,/bin/ping 'p:c:inet_pton' -U
         Trace inet_pton system call and use the specified libraries/executables for
         symbol resolution.
-"
diff --git a/tools/vfsstat.py b/tools/vfsstat.py
index a9c213d..a862d33 100755
--- a/tools/vfsstat.py
+++ b/tools/vfsstat.py
@@ -65,11 +65,11 @@
 """
 
 bpf_text_kfunc = """
-KFUNC_PROBE(vfs_read)   { stats_increment(S_READ); return 0; }
-KFUNC_PROBE(vfs_write)  { stats_increment(S_WRITE); return 0; }
-KFUNC_PROBE(vfs_fsync)  { stats_increment(S_FSYNC); return 0; }
-KFUNC_PROBE(vfs_open)   { stats_increment(S_OPEN); return 0; }
-KFUNC_PROBE(vfs_create) { stats_increment(S_CREATE); return 0; }
+KFUNC_PROBE(vfs_read)         { stats_increment(S_READ); return 0; }
+KFUNC_PROBE(vfs_write)        { stats_increment(S_WRITE); return 0; }
+KFUNC_PROBE(vfs_fsync_range)  { stats_increment(S_FSYNC); return 0; }
+KFUNC_PROBE(vfs_open)         { stats_increment(S_OPEN); return 0; }
+KFUNC_PROBE(vfs_create)       { stats_increment(S_CREATE); return 0; }
 """
 
 is_support_kfunc = BPF.support_kfunc()
@@ -81,11 +81,11 @@
 
 b = BPF(text=bpf_text)
 if not is_support_kfunc:
-    b.attach_kprobe(event="vfs_read",   fn_name="do_read")
-    b.attach_kprobe(event="vfs_write",  fn_name="do_write")
-    b.attach_kprobe(event="vfs_fsync",  fn_name="do_fsync")
-    b.attach_kprobe(event="vfs_open",   fn_name="do_open")
-    b.attach_kprobe(event="vfs_create", fn_name="do_create")
+    b.attach_kprobe(event="vfs_read",         fn_name="do_read")
+    b.attach_kprobe(event="vfs_write",        fn_name="do_write")
+    b.attach_kprobe(event="vfs_fsync_range",  fn_name="do_fsync")
+    b.attach_kprobe(event="vfs_open",         fn_name="do_open")
+    b.attach_kprobe(event="vfs_create",       fn_name="do_create")
 
 # stat column labels and indexes
 stat_types = {
diff --git a/tools/xfsdist.py b/tools/xfsdist.py
index 58f73af..163c220 100755
--- a/tools/xfsdist.py
+++ b/tools/xfsdist.py
@@ -169,7 +169,7 @@
     if args.interval and (not args.notimestamp):
         print(strftime("%H:%M:%S:"))
 
-    dist.print_log2_hist(label, "operation")
+    dist.print_log2_hist(label, "operation", section_print_fn=bytes.decode)
     dist.clear()
 
     countdown -= 1
diff --git a/tools/zfsdist.py b/tools/zfsdist.py
index a30671d..f9c229c 100755
--- a/tools/zfsdist.py
+++ b/tools/zfsdist.py
@@ -183,7 +183,7 @@
     if args.interval and (not args.notimestamp):
         print(strftime("%H:%M:%S:"))
 
-    dist.print_log2_hist(label, "operation")
+    dist.print_log2_hist(label, "operation", section_print_fn=bytes.decode)
     dist.clear()
 
     countdown -= 1