Merge pull request #701 from dkronst/master

Making selection of kernel headers type automatic
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a1bcfb0..9de4b70 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -68,11 +68,19 @@
 endif()
 
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall")
-# iterate over all available directories in LLVM_INCLUDE_DIRS to
-# generate a correctly tokenized list of parameters
-foreach(ONE_LLVM_INCLUDE_DIR ${LLVM_INCLUDE_DIRS})
-  set(CXX_ISYSTEM_DIRS "${CXX_ISYSTEM_DIRS} -isystem ${ONE_LLVM_INCLUDE_DIR}")
-endforeach()
+
+# As reported in issue #735, GCC 6 has some behavioral problems when
+# dealing with -isystem. Hence, skip the warning optimization
+# altogether on that compiler.
+execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
+if (GCC_VERSION VERSION_LESS 6.0)
+  # iterate over all available directories in LLVM_INCLUDE_DIRS to
+  # generate a correctly tokenized list of parameters
+  foreach(ONE_LLVM_INCLUDE_DIR ${LLVM_INCLUDE_DIRS})
+    set(CXX_ISYSTEM_DIRS "${CXX_ISYSTEM_DIRS} -isystem ${ONE_LLVM_INCLUDE_DIR}")
+  endforeach()
+endif()
+
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall ${CXX_ISYSTEM_DIRS}")
 endif()
 
diff --git a/INSTALL.md b/INSTALL.md
index 9336e47..3c61681 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -82,7 +82,7 @@
 ```bash
 echo "deb [trusted=yes] https://repo.iovisor.org/apt/trusty trusty-nightly main" | sudo tee /etc/apt/sources.list.d/iovisor.list
 sudo apt-get update
-sudo apt-get install bcc-tools
+sudo apt-get install bcc-tools libbcc-examples
 ```
 
 Test it:
@@ -183,8 +183,13 @@
 ### Install binary clang
 
 ```
+# FC22
 wget http://llvm.org/releases/3.7.1/clang+llvm-3.7.1-x86_64-fedora22.tar.xz
 sudo tar xf clang+llvm-3.7.1-x86_64-fedora22.tar.xz -C /usr/local --strip 1
+
+# FC23 and FC24
+wget http://llvm.org/releases/3.9.0/clang+llvm-3.9.0-x86_64-fedora23.tar.xz
+sudo tar xf clang+llvm-3.9.0-x86_64-fedora23.tar.xz -C /usr/local --strip 1
 ```
 
 ### Install and compile BCC
diff --git a/README.md b/README.md
index 29e3aad..4766143 100644
--- a/README.md
+++ b/README.md
@@ -95,7 +95,7 @@
 - tools/[fileslower](tools/fileslower.py): Trace slow synchronous file reads and writes. [Examples](tools/fileslower_example.txt).
 - tools/[filetop](tools/filetop.py): File reads and writes by filename and process. Top for files. [Examples](tools/filetop_example.txt).
 - tools/[funccount](tools/funccount.py): Count kernel function calls. [Examples](tools/funccount_example.txt).
-- tools/[funclatency](tools/funclatency.py): Time kernel functions and show their latency distribution. [Examples](tools/funclatency_example.txt).
+- tools/[funclatency](tools/funclatency.py): Time functions and show their latency distribution. [Examples](tools/funclatency_example.txt).
 - tools/[gethostlatency](tools/gethostlatency.py): Show latency for getaddrinfo/gethostbyname[2] calls. [Examples](tools/gethostlatency_example.txt).
 - tools/[hardirqs](tools/hardirqs.py):  Measure hard IRQ (hard interrupt) event time. [Examples](tools/hardirqs_example.txt).
 - tools/[killsnoop](tools/killsnoop.py): Trace signals issued by the kill() syscall. [Examples](tools/killsnoop_example.txt).
@@ -123,6 +123,7 @@
 - tools/[tcptop](tools/tcptop.py): Summarize TCP send/recv throughput by host. Top for TCP. [Examples](tools/tcptop_example.txt).
 - tools/[tplist](tools/tplist.py): Display kernel tracepoints or USDT probes and their formats. [Examples](tools/tplist_example.txt).
 - tools/[trace](tools/trace.py): Trace arbitrary functions, with filters. [Examples](tools/trace_example.txt)
+- tools/[ttysnoop](tools/ttysnoop.py): Watch live output from a tty or pts device. [Examples](tools/ttysnoop_example.txt)
 - tools/[vfscount](tools/vfscount.py) tools/[vfscount.c](tools/vfscount.c): Count VFS calls. [Examples](tools/vfscount_example.txt).
 - tools/[vfsstat](tools/vfsstat.py) tools/[vfsstat.c](tools/vfsstat.c): Count some VFS calls, with column output. [Examples](tools/vfsstat_example.txt).
 - tools/[wakeuptime](tools/wakeuptime.py): Summarize sleep to wakeup time by waker kernel stack. [Examples](tools/wakeuptime_example.txt).
diff --git a/examples/lua/kprobe-latency.lua b/examples/lua/kprobe-latency.lua
new file mode 100644
index 0000000..60ac2c1
--- /dev/null
+++ b/examples/lua/kprobe-latency.lua
@@ -0,0 +1,79 @@
+#!/usr/bin/env bcc-lua
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+-- This example program measures latency of block device operations and plots it
+-- in a histogram. It is similar to BPF example:
+-- https://github.com/torvalds/linux/blob/master/samples/bpf/tracex3_kern.c
+local ffi = require('ffi')
+local bpf = require('bpf')
+local S = require('syscall')
+
+-- Shared part of the program
+local bins = 100
+local map = bpf.map('hash', 512, ffi.typeof('uint64_t'), ffi.typeof('uint64_t'))
+local lat_map = bpf.map('array', bins)
+
+-- Kernel-space part of the program
+local trace_start = bpf.kprobe('myprobe:blk_start_request', function (ptregs)
+	map[ptregs.parm1] = time()
+end, false, -1, 0)
+local trace_end = bpf.kprobe('myprobe2:blk_account_io_completion', function (ptregs)
+	-- The lines below are computing index
+	-- using log10(x)*10 = log2(x)*10/log2(10) = log2(x)*3
+	-- index = 29 ~ 1 usec
+	-- index = 59 ~ 1 msec
+	-- index = 89 ~ 1 sec
+	-- index = 99 ~ 10sec or more
+	local delta = time() - map[ptregs.parm1]
+	local index = 3 * math.log2(delta)
+	if index >= bins then
+		index = bins-1
+	end
+	xadd(lat_map[index], 1)
+	return true
+end, false, -1, 0)
+-- User-space part of the program
+pcall(function()
+	local counter = 0
+	local sym = {' ',' ','.','.','*','*','o','o','O','O','#','#'}
+	while true do
+		-- Print header once in a while
+		if counter % 50 == 0 then
+			print('|1us      |10us     |100us    |1ms      |10ms     |100ms    |1s       |10s')
+			counter = 0
+		end
+		counter = counter + 1
+		-- Collect all events
+		local hist, events = {}, 0
+		for i=29,bins-1 do
+			local v = tonumber(lat_map[i] or 0)
+			if v > 0 then
+				hist[i] = hist[i] or 0 + v
+				events = events + v
+			end
+		end
+		-- Print histogram symbols based on relative frequency
+		local s = ''
+		for i=29,bins-1 do
+			if hist[i] then
+				local c = math.ceil((hist[i] / (events + 1)) * #sym)
+				s = s .. sym[c]
+			else s = s .. ' ' end
+		end
+		print(s .. string.format('  ; %d events', events))
+		S.sleep(1)
+	end
+end)
\ No newline at end of file
diff --git a/examples/lua/kprobe-write.lua b/examples/lua/kprobe-write.lua
new file mode 100644
index 0000000..38f5a20
--- /dev/null
+++ b/examples/lua/kprobe-write.lua
@@ -0,0 +1,35 @@
+#!/usr/bin/env bcc-lua
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+-- Simple tracing example that executes a program on
+-- return from sys_write() and tracks the number of hits
+local ffi = require('ffi')
+local bpf = require('bpf')
+local S = require('syscall')
+
+-- Shared part of the program
+local map = bpf.map('array', 1)
+-- Kernel-space part of the program
+local probe = bpf.kprobe('myprobe:sys_write', function (ptregs)
+   xadd(map[0], 1)
+end, true)
+-- User-space part of the program
+pcall(function()
+	for _ = 1, 10 do
+	   print('hits: ', tonumber(map[0]))
+	   S.sleep(1)
+	end
+end)
diff --git a/examples/lua/sock-parse-dns.lua b/examples/lua/sock-parse-dns.lua
new file mode 100644
index 0000000..3c20517
--- /dev/null
+++ b/examples/lua/sock-parse-dns.lua
@@ -0,0 +1,56 @@
+#!/usr/bin/env bcc-lua
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+-- Simple parsing example of UDP/DNS that counts frequency of QTYPEs.
+-- It shows how to parse variable-length packet structures.
+local ffi = require("ffi")
+local bpf = require("bpf")
+local S = require("syscall")
+
+-- Shared part of the program
+local map = assert(bpf.map('array', 256))
+-- Kernel-space part of the program
+local prog = bpf.socket('lo', function (skb)
+	local ip = pkt.ip   -- Accept only UDP messages
+	if ip.proto ~= c.ip.proto_udp then return false end
+	local udp = ip.udp  -- Only messages >12 octets (DNS header)
+	if udp.length < 12 then return false end
+	-- Unroll QNAME (up to 2 labels)
+	udp = udp.data + 12
+	local label = udp[0]
+	if label > 0 then
+		udp = udp + label + 1
+		label = udp[0]
+		if label > 0 then
+			udp = udp + label + 1
+		end
+	end
+	-- Track QTYPE (low types)
+	if udp[0] == 0 then
+		local qtype = udp[2] -- Low octet from QTYPE
+		xadd(map[qtype], 1)
+	end
+end)
+-- User-space part of the program
+for _ = 1, 10 do
+	for k,v in map.pairs,map,0 do
+		v = tonumber(v)
+		if v > 0 then
+			print(string.format('TYPE%d: %d', k, v))
+		end
+	end
+	S.sleep(1)
+end
\ No newline at end of file
diff --git a/examples/lua/sock-parse-http.lua b/examples/lua/sock-parse-http.lua
new file mode 100644
index 0000000..f8918fb
--- /dev/null
+++ b/examples/lua/sock-parse-http.lua
@@ -0,0 +1,57 @@
+#!/usr/bin/env bcc-lua
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+-- Simple parsing example of TCP/HTTP that counts frequency of types of requests
+-- and shows more complicated pattern matching constructions and slices.
+-- Rewrite of a BCC example:
+-- https://github.com/iovisor/bcc/blob/master/examples/networking/http_filter/http-parse-simple.c
+local ffi = require("ffi")
+local bpf = require("bpf")
+local S = require("syscall")
+
+-- Shared part of the program
+local map = bpf.map('hash', 64)
+-- Kernel-space part of the program
+local prog = bpf.socket('lo', function (skb)
+	-- Only ingress so we don't count twice on loopback
+	if skb.ingress_ifindex == 0 then return end
+	local data = pkt.ip.tcp.data  -- Get TCP protocol dissector
+	-- Continue only if we have 7 bytes of TCP data
+	if data + 7 > skb.len then return end
+	-- Fetch 4 bytes of TCP data and compare
+	local h = data(0, 4)
+	if h == 'HTTP' or h == 'GET ' or
+	   h == 'POST' or h == 'PUT ' or 
+	   h == 'HEAD' or h == 'DELE' then
+	   	-- If hash key doesn't exist, create it
+	   	-- otherwise increment counter
+	   local v = map[h]
+	   if not v then map[h] = 1
+	   else          xadd(map[h], 1)
+	   end
+	end
+end)
+-- User-space part of the program
+for _ = 1, 10 do
+	local strkey = ffi.new('uint32_t [1]')
+	local s = ''
+	for k,v in map.pairs,map,0 do
+		strkey[0] = bpf.ntoh(k)
+		s = s..string.format('%s %d ', ffi.string(strkey, 4):match '^%s*(.-)%s*$', tonumber(v))
+	end
+	if #s > 0 then print(s..'messages') end
+	S.sleep(1)
+end
\ No newline at end of file
diff --git a/examples/lua/sock-proto.lua b/examples/lua/sock-proto.lua
new file mode 100644
index 0000000..ab9d3e2
--- /dev/null
+++ b/examples/lua/sock-proto.lua
@@ -0,0 +1,38 @@
+#!/usr/bin/env bcc-lua
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+-- This program looks at IP, UDP and ICMP packets and
+-- increments counter for each packet of given type seen
+-- Rewrite of https://github.com/torvalds/linux/blob/master/samples/bpf/sock_example.c
+local ffi = require("ffi")
+local bpf = require("bpf")
+local S = require("syscall")
+
+-- Shared part of the program
+local map = bpf.map('hash', 256)
+map[1], map[6], map[17] = 0, 0, 0
+-- Kernel-space part of the program
+bpf.socket('lo', function (skb)
+   local proto = pkt.ip.proto  -- Get byte (ip.proto) from frame at [23]
+   xadd(map[proto], 1)         -- Atomic `map[proto] += 1`
+end)
+-- User-space part of the program
+for _ = 1, 10 do
+   local icmp, udp, tcp = map[1], map[17], map[6]
+   print(string.format('TCP %d UDP %d ICMP %d packets',
+   	     tonumber(tcp or 0), tonumber(udp or 0), tonumber(icmp or 0)))
+   S.sleep(1)
+end
\ No newline at end of file
diff --git a/examples/lua/sock-protolen.lua b/examples/lua/sock-protolen.lua
new file mode 100644
index 0000000..6ad6e3b
--- /dev/null
+++ b/examples/lua/sock-protolen.lua
@@ -0,0 +1,38 @@
+#!/usr/bin/env bcc-lua
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+-- This program counts total bytes received per-protocol in 64-bit counters.
+-- The map backend is array in this case to avoid key allocations.
+-- increments counter for each packet of given type seen
+-- Rewrite of https://github.com/torvalds/linux/blob/master/samples/bpf/sock_example.c
+local ffi = require("ffi")
+local bpf = require("bpf")
+local S = require("syscall")
+
+-- Shared part of the program
+local map = bpf.map('array', 256, ffi.typeof('uint32_t'), ffi.typeof('uint64_t'))
+-- Kernel-space part of the program
+bpf.socket('lo', function (skb)
+	local proto = pkt.ip.proto  -- Get byte (ip.proto) from frame at [23]
+	xadd(map[proto], skb.len)   -- Atomic `map[proto] += <payload length>`
+end)
+-- User-space part of the program
+for _ = 1, 10 do
+	local icmp, udp, tcp = map[1], map[17], map[6]
+	print(string.format('TCP %d UDP %d ICMP %d bytes',
+		tonumber(tcp or 0), tonumber(udp or 0), tonumber(icmp or 0)))
+	S.sleep(1)
+end
\ No newline at end of file
diff --git a/examples/lua/tracepoint-offcputime.lua b/examples/lua/tracepoint-offcputime.lua
new file mode 100644
index 0000000..fccf0b7
--- /dev/null
+++ b/examples/lua/tracepoint-offcputime.lua
@@ -0,0 +1,80 @@
+#!/usr/bin/env bcc-lua
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+-- Summarize off-CPU time by stack trace
+-- Related tool: https://github.com/iovisor/bcc/blob/master/tools/offcputime.py
+local ffi = require('ffi')
+local bpf = require('bpf')
+local S = require('syscall')
+-- Create BPF maps
+-- TODO: made smaller to fit default memory limits
+local key_t = 'struct { char name[16]; int32_t stack_id; }'
+local starts = assert(bpf.map('hash', 128, ffi.typeof('uint32_t'), ffi.typeof('uint64_t')))
+local counts = assert(bpf.map('hash', 128, ffi.typeof(key_t), ffi.typeof('uint64_t')))
+local stack_traces = assert(bpf.map('stack_trace', 16))
+-- Open tracepoint and attach BPF program
+-- The 'arg' parses tracepoint format automatically
+local tp = bpf.tracepoint('sched/sched_switch', function (arg)
+	-- Update previous thread sleep time
+	local pid = arg.prev_pid
+	local now = time()
+	starts[pid] = now
+	-- Calculate current thread's delta time
+	pid = arg.next_pid
+	local from = starts[pid]
+	if not from then
+		return 0
+	end
+	local delta = (now - from) / 1000
+	starts[pid] = nil
+	-- Check if the delta is below 1us
+	if delta < 1 then
+		return
+	end
+	-- Create key for this thread
+	local key = ffi.new(key_t)
+	comm(key.name)
+	key.stack_id = stack_id(stack_traces, BPF.F_FAST_STACK_CMP)
+	-- Update current thread off cpu time with delta
+	local val = counts[key]
+	if not val then
+		counts[key] = 0
+	end
+	xadd(counts[key], delta)
+end, 0, -1)
+-- Helper: load kernel symbols
+ffi.cdef 'unsigned long long strtoull(const char *, char **, int);'
+local ksyms = {}
+for l in io.lines('/proc/kallsyms') do
+	local addr, sym = l:match '(%w+) %w (%S+)'
+	if addr then ksyms[ffi.C.strtoull(addr, nil, 16)] = sym end
+end
+-- User-space part of the program
+while true do
+	for k,v in counts.pairs,counts,nil do
+		local s = ''
+		local traces = stack_traces[k.stack_id]
+		if traces then
+			for i, ip in ipairs(traces) do
+				s = s .. string.format("    %-16p %s", ip, ksyms[ip])
+			end
+		end
+		s = s .. string.format("    %-16s %s", "-", ffi.string(k.name))
+		s = s .. string.format("        %d", tonumber(v))
+		print(s)
+	end
+	S.sleep(1)
+end
diff --git a/examples/lua/uprobe-readline-perf.lua b/examples/lua/uprobe-readline-perf.lua
new file mode 100644
index 0000000..aaf3f40
--- /dev/null
+++ b/examples/lua/uprobe-readline-perf.lua
@@ -0,0 +1,42 @@
+#!/usr/bin/env bcc-lua
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+-- Trace readline() call from all bash instances (print bash commands from all running shells).
+-- This is a rough equivalent to `bashreadline` with output through perf event API.
+-- Source: http://www.brendangregg.com/blog/2016-02-08/linux-ebpf-bcc-uprobes.html
+local ffi = require('ffi')
+local bpf = require('bpf')
+local S = require('syscall')
+-- Perf event map
+local sample_t = 'struct { uint64_t pid; char str[80]; }'
+local events = bpf.map('perf_event_array')
+-- Kernel-space part of the program
+local probe = bpf.uprobe('/bin/bash:readline', function (ptregs)
+	local sample = ffi.new(sample_t)
+	sample.pid = pid_tgid()
+	ffi.copy(sample.str, ffi.cast('char *', ptregs.ax)) -- Cast `ax` to string pointer and copy to buffer
+	perf_submit(events, sample)                         -- Write buffer to perf event map
+end, true, -1, 0)
+-- User-space part of the program
+local log = events:reader(nil, 0, sample_t) -- Must specify PID or CPU_ID to observe
+print('            TASK-PID         TIMESTAMP  FUNCTION')
+print('               | |               |         |')
+while true do
+	log:block()               -- Wait until event reader is readable
+	for _,e in log:read() do  -- Collect available reader events
+		print(string.format('%12s%-16s %-10s %s', '', tonumber(e.pid), os.date("%H:%M:%S"), ffi.string(e.str)))
+	end
+end
diff --git a/examples/lua/uprobe-readline.lua b/examples/lua/uprobe-readline.lua
new file mode 100644
index 0000000..7c76950
--- /dev/null
+++ b/examples/lua/uprobe-readline.lua
@@ -0,0 +1,37 @@
+#!/usr/bin/env bcc-lua
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+-- Trace readline() call from all bash instances (print bash commands from all running shells).
+-- This is a rough equivalent to `bashreadline`
+-- Source: http://www.brendangregg.com/blog/2016-02-08/linux-ebpf-bcc-uprobes.html
+local ffi = require('ffi')
+local bpf = require('bpf')
+local S = require('syscall')
+-- Kernel-space part of the program
+local probe = bpf.uprobe('/bin/bash:readline', function (ptregs)
+	local line = ffi.new('char [40]')              -- Create a 40 byte buffer on stack
+	ffi.copy(line, ffi.cast('char *', ptregs.ax))  -- Cast `ax` to string pointer and copy to buffer
+	print('%s\n', line)                            -- Print to trace_pipe
+end, true, -1, 0)
+-- User-space part of the program
+local ok, err = pcall(function()
+	local log = bpf.tracelog()
+	print('            TASK-PID   CPU#         TIMESTAMP  FUNCTION')
+	print('               | |      |               |         |')
+	while true do
+		print(log:read())
+	end
+end)
diff --git a/examples/lua/uprobe-tailkt.lua b/examples/lua/uprobe-tailkt.lua
new file mode 100644
index 0000000..071b2de
--- /dev/null
+++ b/examples/lua/uprobe-tailkt.lua
@@ -0,0 +1,65 @@
+#!/usr/bin/env bcc-lua
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+-- Trace operations on keys matching given pattern in KyotoTycoon daemon.
+-- This can show you if certain keys were modified or read during the lifetime
+-- even if KT doesn't support this. It also shows how to attach to C++ mangled symbols.
+local ffi = require('ffi')
+local bpf = require('bpf')
+local S = require('syscall')
+local function help(err)
+	print(string.format('%s [get|set] [key]', arg[0]))
+	if err then print('error: '..err) end
+	os.exit(1)
+end
+-- Accept the same format as ktremotemgr for clarity: <get|set> <key>
+local writeable, watch_key, klen = 'any', arg[2] or '*', 80
+if     arg[1] == 'get' then writeable = 0
+elseif arg[1] == 'set' then writeable = 1
+elseif arg[1] == '-h' or arg[1] == '--help' then help()
+elseif arg[1] and arg[1] ~= 'any' then
+	help(string.format('bad cmd: "%s"', arg[1]))
+end
+if watch_key ~= '*' then klen = #watch_key end
+
+-- Find a good entrypoint that has both key and differentiates read/write in KT
+-- That is going to serve as an attachment point for BPF program
+-- ABI: bool accept(void *this, const char* kbuf, size_t ksiz, Visitor* visitor, bool writable)
+local key_type = string.format('char [%d]', klen)
+local probe = bpf.uprobe('/usr/local/bin/ktserver:kyotocabinet::StashDB::accept',
+function (ptregs)
+	-- Watch either get/set or both
+	if writeable ~= 'any' then
+		if ptregs.parm5 ~= writeable then return end
+	end
+	local line = ffi.new(key_type)
+	ffi.copy(line, ffi.cast('char *', ptregs.parm2))
+	-- Check if we're looking for specific key
+	if watch_key ~= '*' then
+		if ptregs.parm3 ~= klen then return false end
+		if line ~= watch_key then return false end
+	end
+	print('%s write:%d\n', line, ptregs.parm5)
+end, false, -1, 0)
+-- User-space part of the program
+local ok, err = pcall(function()
+	local log = bpf.tracelog()
+	print('            TASK-PID   CPU#         TIMESTAMP  FUNCTION')
+	print('               | |      |               |         |')
+	while true do
+		print(log:read())
+	end
+end)
diff --git a/man/man8/funclatency.8 b/man/man8/funclatency.8
index 587546c..7b7771b 100644
--- a/man/man8/funclatency.8
+++ b/man/man8/funclatency.8
@@ -1,10 +1,10 @@
 .TH funclatency 8  "2015-08-18" "USER COMMANDS"
 .SH NAME
-funclatency \- Time kernel functions and print latency as a histogram.
+funclatency \- Time functions and print latency as a histogram.
 .SH SYNOPSIS
-.B funclatency [\-h] [\-p PID] [\-i INTERVAL] [\-T] [\-u] [\-m] [\-r] [\-F] pattern
+.B funclatency [\-h] [\-p PID] [\-i INTERVAL] [\-T] [\-u] [\-m] [\-F] [\-r] [\-v] pattern
 .SH DESCRIPTION
-This tool traces kernel function calls and times their duration (latency), and
+This tool traces function calls and times their duration (latency), and
 shows the latency distribution as a histogram. The time is measured from when
 the function is called to when it returns, and is inclusive of both on-CPU
 time and time spent blocked.
@@ -17,7 +17,7 @@
 functions, or groups of functions that run at the same stack layer, and
 don't ultimately call each other.
 
-WARNING: This uses dynamic tracing of (what can be many) kernel functions, an
+WARNING: This uses dynamic tracing of (what can be many) functions, an
 activity that has had issues on some kernel versions (risk of panics or
 freezes). Test, and know what you are doing, before use.
 
@@ -51,12 +51,19 @@
 .TP
 \-r
 Use regular expressions for the search pattern.
+.TP
+\-v
+Print the BPF program (for debugging purposes).
 .SH EXAMPLES
 .TP
 Time the do_sys_open() kernel function, and print the distribution as a histogram:
 #
 .B funclatency do_sys_open
 .TP
+Time the read() function in libc across all processes on the system:
+#
+.B funclatency c:read
+.TP
 Time vfs_read(), and print the histogram in units of microseconds:
 #
 .B funclatency \-u vfs_read
@@ -116,6 +123,6 @@
 .SH STABILITY
 Unstable - in development.
 .SH AUTHOR
-Brendan Gregg
+Brendan Gregg, Sasha Goldshtein
 .SH SEE ALSO
 funccount(8)
diff --git a/man/man8/mountsnoop.8 b/man/man8/mountsnoop.8
new file mode 100644
index 0000000..450301a
--- /dev/null
+++ b/man/man8/mountsnoop.8
@@ -0,0 +1,55 @@
+.TH mountsnoop 8  "2016-10-14" "USER COMMANDS"
+.SH NAME
+mountsnoop \- Trace mount() and umount() syscalls. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B mountsnoop
+.SH DESCRIPTION
+mountsnoop traces the mount() and umount() syscalls, showing which processes
+are mounting and unmounting filesystems in what mount namespaces. This can be
+useful for troubleshooting system and container setup.
+
+This works by tracing the kernel sys_mount() and sys_umount() functions using
+dynamic tracing, and will need updating to match any changes to these functions.
+
+This makes use of a Linux 4.4 feature (bpf_perf_event_output()).
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH FIELDS
+.TP
+COMM
+Process name
+.TP
+PID
+Process ID
+.TP
+TID
+Thread ID
+.TP
+MNT_NS
+Mount namespace inode number
+.TP
+CALL
+System call, arguments, and return value
+.SH OVERHEAD
+This traces the kernel mount and umount functions and prints output for each
+event. As the rate of these calls is generally expected to be very low, the
+overhead is also expected to be negligible. If your system calls mount() and
+umount() at a high rate, then test and understand overhead before use.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Omar Sandoval
+.SH SEE ALSO
+mount(2)
+umount(2)
diff --git a/man/man8/opensnoop.8 b/man/man8/opensnoop.8
index 5c17672..21bd87b 100644
--- a/man/man8/opensnoop.8
+++ b/man/man8/opensnoop.8
@@ -2,7 +2,7 @@
 .SH NAME
 opensnoop \- Trace open() syscalls. Uses Linux eBPF/bcc.
 .SH SYNOPSIS
-.B opensnoop [\-h] [\-t] [\-x] [\-p PID] [\-n name]
+.B opensnoop [\-h] [\-T] [\-x] [\-p PID] [\-t TID] [\-n name]
 .SH DESCRIPTION
 opensnoop traces the open() syscall, showing which processes are attempting
 to open which files. This can be useful for determining the location of config
@@ -24,7 +24,7 @@
 \-h
 Print usage message.
 .TP
-\-t
+\-T
 Include a timestamp column.
 .TP
 \-x
@@ -33,6 +33,9 @@
 \-p PID
 Trace this process ID only (filtered in-kernel).
 .TP
+\-t TID
+Trace this thread ID only (filtered in-kernel).
+.TP
 \-n name
 Only print processes where its name partially matches 'name'
 .SH EXAMPLES
@@ -43,7 +46,7 @@
 .TP
 Trace all open() syscalls, and include timestamps:
 #
-.B opensnoop \-t
+.B opensnoop \-T
 .TP
 Trace only open() syscalls that failed:
 #
@@ -64,6 +67,9 @@
 PID
 Process ID
 .TP
+TID
+Thread ID
+.TP
 COMM
 Process name
 .TP
diff --git a/man/man8/stackcount.8 b/man/man8/stackcount.8
index c3a9146..10999e2 100644
--- a/man/man8/stackcount.8
+++ b/man/man8/stackcount.8
@@ -1,13 +1,15 @@
 .TH stackcount 8  "2016-01-14" "USER COMMANDS"
 .SH NAME
-stackcount \- Count kernel function calls and their stack traces. Uses Linux eBPF/bcc.
+stackcount \- Count function calls and their stack traces. Uses Linux eBPF/bcc.
 .SH SYNOPSIS
-.B stackcount [\-h] [\-p PID] [\-i INTERVAL] [\-T] [\-r] pattern
+.B stackcount [\-h] [\-p PID] [\-i INTERVAL] [\-T] [\-r] [\-s]
+              [\-P] [\-v] [\-d] pattern
 .SH DESCRIPTION
-stackcount traces kernel functions and frequency counts them with their entire
-kernel stack trace, summarized in-kernel for efficiency. This allows higher
+stackcount traces functions and frequency counts them with their entire
+stack trace, summarized in-kernel for efficiency. This allows higher
 frequency events to be studied. The output consists of unique stack traces,
-and their occurrence counts.
+and their occurrence counts. In addition to kernel and user functions, kernel
+tracepoints and USDT tracepoints are also supported.
 
 The pattern is a string with optional '*' wildcards, similar to file globbing.
 If you'd prefer to use regular expressions, use the \-r option.
@@ -35,14 +37,18 @@
 \-v
 Show raw addresses.
 .TP
+\-d
+Print the source of the BPF program when loading it (for debugging purposes).
+.TP
 \-i interval
 Summary interval, in seconds.
 .TP
 \-p PID
 Trace this process ID only (filtered in-kernel).
 .TP
+.TP
 pattern
-A kernel function name, or a search pattern. Can include wildcards ("*"). If the
+A function name, or a search pattern. Can include wildcards ("*"). If the
 \-r option is used, can include regular expressions.
 .SH EXAMPLES
 .TP
@@ -77,6 +83,18 @@
 Only count stacks when PID 185 is on-CPU:
 #
 .B stackcount -p 185 ip_output
+.TP
+Count user stacks for dynamic heap allocations with malloc in PID 185:
+#
+.B stackcount -p 185 c:malloc
+.TP
+Count user stacks for thread creation (USDT tracepoint) in PID 185:
+#
+.B stackcount -p 185 u:pthread:pthread_create
+.TP
+Count kernel stacks for context switch events using a kernel tracepoint:
+#
+.B stackcount t:sched:sched_switch
 .SH OVERHEAD
 This summarizes unique stack traces in-kernel for efficiency, allowing it to
 trace a higher rate of function calls than methods that post-process in user
@@ -99,6 +117,6 @@
 .SH STABILITY
 Unstable - in development.
 .SH AUTHOR
-Brendan Gregg
+Brendan Gregg, Sasha Goldshtein
 .SH SEE ALSO
 stacksnoop(8), funccount(8)
diff --git a/man/man8/tplist.8 b/man/man8/tplist.8
index 474b6ad..da5edf3 100644
--- a/man/man8/tplist.8
+++ b/man/man8/tplist.8
@@ -22,7 +22,8 @@
 or executable can be found in the standard paths, a full path is not required.
 .TP
 \-v
-Display the variables associated with the tracepoint or USDT probe.
+Increase the verbosity level. Can be used to display the variables, locations,
+and arguments of tracepoints and USDT probes.
 .TP
 [filter]
 A wildcard expression that specifies which tracepoints or probes to print.
@@ -45,6 +46,10 @@
 Print all USDT probes in process 4717 from the libc provider:
 $
 .B tplist -p 4717 'libc:*'
+.TP
+Print all the USDT probes in the node executable:
+$
+.B tplist -l node
 .SH SOURCE
 This is from bcc.
 .IP
diff --git a/man/man8/trace.8 b/man/man8/trace.8
index 2a97a99..f33d5e4 100644
--- a/man/man8/trace.8
+++ b/man/man8/trace.8
@@ -92,6 +92,12 @@
 format specifier replacements may be any C expressions, and may refer to the
 same special keywords as in the predicate (arg1, arg2, etc.).
 
+In addition to the above format specifiers, you can also use %K and %U when
+the expression is an address that potentially points to executable code (i.e.,
+a symbol). trace will resolve %K specifiers to a kernel symbol, such as
+vfs_read, and will resolve %U specifiers to a user-space symbol in that
+process, such as sprintf.
+
 In tracepoints, both the predicate and the arguments may refer to the tracepoint
 format structure, which is stored in the special "args" variable. For example, the
 block:block_rq_complete tracepoint can print or filter by args->nr_sector. To 
diff --git a/man/man8/ttysnoop.8 b/man/man8/ttysnoop.8
new file mode 100644
index 0000000..9f37aaa
--- /dev/null
+++ b/man/man8/ttysnoop.8
@@ -0,0 +1,60 @@
+.TH ttysnoop 8  "2016-02-08" "USER COMMANDS"
+.SH NAME
+ttysnoop \- Watch output from a tty or pts device. Uses Linux eBPF/bcc.
+.SH SYNOPSIS
+.B ttysnoop [\-h] [\-C] device
+.SH DESCRIPTION
+ttysnoop watches a tty or pts device, and prints the same output that is
+appearing on that device. It can be used to mirror the output from a shell
+session, or the system console.
+
+This works by use of kernel dynamic tracing of the tty_write() function.
+This tool will need updating in case that kernel function changes in a future
+kernel version.
+
+Since this uses BPF, only the root user can use this tool.
+.SH REQUIREMENTS
+CONFIG_BPF and bcc.
+.SH OPTIONS
+.TP
+\-C
+Don't clear the screen.
+.TP
+device
+Either a path to a tty device (eg, /dev/tty0) or a pts number (eg, the "3"
+from /dev/pts/3).
+.SH EXAMPLES
+.TP
+Snoop output from /dev/pts/2
+#
+.B ttysnoop /dev/pts/2
+.TP
+Snoop output from /dev/pts/2 (shortcut)
+#
+.B ttysnoop 2
+.TP
+Snoop output from the system console
+#
+.B ttysnoop /dev/console
+.TP
+Snoop output from /dev/tty0
+#
+.B ttysnoop /dev/tty0
+.SH OVERHEAD
+As the rate of tty_write() is expected to be very low (<100/s), the overhead
+of this tool is expected to be negligible.
+.SH SOURCE
+This is from bcc.
+.IP
+https://github.com/iovisor/bcc
+.PP
+Also look in the bcc distribution for a companion _examples.txt file containing
+example usage, output, and commentary for this tool.
+.SH OS
+Linux
+.SH STABILITY
+Unstable - in development.
+.SH AUTHOR
+Brendan Gregg
+.SH SEE ALSO
+opensnoop(8)
diff --git a/src/cc/bcc_elf.c b/src/cc/bcc_elf.c
index 13d919a..5f38b91 100644
--- a/src/cc/bcc_elf.c
+++ b/src/cc/bcc_elf.c
@@ -165,7 +165,7 @@
         continue;
 
       if (callback(name, sym.st_value, sym.st_size, sym.st_info, payload) < 0)
-        break;
+        return 1;      // signal termination to caller
     }
   }
 
@@ -184,9 +184,13 @@
     if (header.sh_type != SHT_SYMTAB && header.sh_type != SHT_DYNSYM)
       continue;
 
-    if (list_in_scn(e, section, header.sh_link, header.sh_entsize, callback,
-                    payload) < 0)
-      return -1;
+    int rc = list_in_scn(e, section, header.sh_link, header.sh_entsize,
+                         callback, payload);
+    if (rc == 1)
+      break;    // callback signaled termination
+
+    if (rc < 0)
+      return rc;
   }
 
   return 0;
diff --git a/src/cc/bcc_syms.cc b/src/cc/bcc_syms.cc
index b1f1736..15a68fb 100644
--- a/src/cc/bcc_syms.cc
+++ b/src/cc/bcc_syms.cc
@@ -270,6 +270,32 @@
   return bcc_elf_foreach_sym(sym->module, _find_sym, sym);
 }
 
+struct sym_search_t {
+  struct bcc_symbol *syms;
+  int start;
+  int requested;
+  int *actual;
+};
+
+// see <elf.h>
+#define ELF_TYPE_IS_FUNCTION(flags) (((flags) & 0xf) == 2)
+
+static int _list_sym(const char *symname, uint64_t addr, uint64_t end,
+                     int flags, void *payload) {
+  if (!ELF_TYPE_IS_FUNCTION(flags) || addr == 0)
+    return 0;
+
+  SYM_CB cb = (SYM_CB) payload;
+  return cb(symname, addr); 
+}
+
+int bcc_foreach_symbol(const char *module, SYM_CB cb) {
+  if (module == 0 || cb == 0)
+    return -1;
+
+  return bcc_elf_foreach_sym(module, _list_sym, (void *)cb);
+}
+
 int bcc_resolve_symname(const char *module, const char *symname,
                         const uint64_t addr, struct bcc_symbol *sym) {
   uint64_t load_addr;
diff --git a/src/cc/bcc_syms.h b/src/cc/bcc_syms.h
index d130c5e..8d7258a 100644
--- a/src/cc/bcc_syms.h
+++ b/src/cc/bcc_syms.h
@@ -29,6 +29,8 @@
   uint64_t offset;
 };
 
+typedef int(* SYM_CB)(const char *symname, uint64_t addr);
+
 void *bcc_symcache_new(int pid);
 int bcc_symcache_resolve(void *symcache, uint64_t addr, struct bcc_symbol *sym);
 int bcc_symcache_resolve_name(void *resolver, const char *name, uint64_t *addr);
@@ -36,6 +38,7 @@
 
 int bcc_resolve_global_addr(int pid, const char *module, const uint64_t address,
                             uint64_t *global);
+int bcc_foreach_symbol(const char *module, SYM_CB cb);
 int bcc_find_symbol_addr(struct bcc_symbol *sym);
 int bcc_resolve_symname(const char *module, const char *symname,
                         const uint64_t addr, struct bcc_symbol *sym);
diff --git a/src/cc/bcc_usdt.h b/src/cc/bcc_usdt.h
index 4cbe29f..d03fd7e 100644
--- a/src/cc/bcc_usdt.h
+++ b/src/cc/bcc_usdt.h
@@ -35,8 +35,32 @@
     int num_arguments;
 };
 
+struct bcc_usdt_location {
+    uint64_t address;
+};
+
+#define BCC_USDT_ARGUMENT_NONE          0x0
+#define BCC_USDT_ARGUMENT_CONSTANT      0x1
+#define BCC_USDT_ARGUMENT_DEREF_OFFSET  0x2
+#define BCC_USDT_ARGUMENT_DEREF_IDENT   0x4
+#define BCC_USDT_ARGUMENT_REGISTER_NAME 0x8
+
+struct bcc_usdt_argument {
+    int size;
+    int valid;
+    int constant;
+    int deref_offset;
+    const char *deref_ident;
+    const char *register_name;
+};
+
 typedef void (*bcc_usdt_cb)(struct bcc_usdt *);
 void bcc_usdt_foreach(void *usdt, bcc_usdt_cb callback);
+int bcc_usdt_get_location(void *usdt, const char *probe_name,
+                          int index, struct bcc_usdt_location *location);
+int bcc_usdt_get_argument(void *usdt, const char *probe_name,
+                          int location_index, int argument_index,
+                          struct bcc_usdt_argument *argument);
 
 int bcc_usdt_enable_probe(void *, const char *, const char *);
 const char *bcc_usdt_genargs(void *);
diff --git a/src/cc/export/proto.h b/src/cc/export/proto.h
index 40e209d..ed47775 100644
--- a/src/cc/export/proto.h
+++ b/src/cc/export/proto.h
@@ -127,4 +127,19 @@
   unsigned int key:24;
   unsigned int rsv4:8;
 } BPF_PACKET_HEADER;
+
+struct vxlan_gbp_t {
+  unsigned int gflag:1;
+  unsigned int rsv1:3;
+  unsigned int iflag:1;
+  unsigned int rsv2:3;
+  unsigned int rsv3:1;
+  unsigned int dflag:1;
+  unsigned int rsv4:1;
+  unsigned int aflag:1;
+  unsigned int rsv5:3;
+  unsigned int tag:16;
+  unsigned int key:24;
+  unsigned int rsv6:8;
+} BPF_PACKET_HEADER;
 )********"
diff --git a/src/cc/usdt.cc b/src/cc/usdt.cc
index 29711b0..0bbc9dc 100644
--- a/src/cc/usdt.cc
+++ b/src/cc/usdt.cc
@@ -28,7 +28,7 @@
 
 namespace USDT {
 
-Probe::Location::Location(uint64_t addr, const char *arg_fmt) : address_(addr) {
+Location::Location(uint64_t addr, const char *arg_fmt) : address_(addr) {
   ArgumentParser_x64 parser(arg_fmt);
   while (!parser.done()) {
     Argument arg;
@@ -274,7 +274,7 @@
     if (!p->enabled())
       continue;
 
-    for (Probe::Location &loc : p->locations_) {
+    for (Location &loc : p->locations_) {
       callback(p->bin_path_.c_str(), p->attached_to_->c_str(), loc.address_,
                pid_.value_or(-1));
     }
@@ -357,6 +357,52 @@
   ctx->each(callback);
 }
 
+int bcc_usdt_get_location(void *usdt, const char *probe_name,
+                          int index, struct bcc_usdt_location *location) {
+    USDT::Context *ctx = static_cast<USDT::Context *>(usdt);
+    USDT::Probe *probe = ctx->get(probe_name);
+    if (!probe)
+        return -1;
+    if (index < 0 || (size_t)index >= probe->num_locations())
+        return -1;
+    location->address = probe->address(index);
+    return 0;
+}
+
+int bcc_usdt_get_argument(void *usdt, const char *probe_name,
+                          int location_index, int argument_index,
+                          struct bcc_usdt_argument *argument) {
+    USDT::Context *ctx = static_cast<USDT::Context *>(usdt);
+    USDT::Probe *probe = ctx->get(probe_name);
+    if (!probe)
+        return -1;
+    if (argument_index < 0 || (size_t)argument_index >= probe->num_arguments())
+        return -1;
+    if (location_index < 0 || (size_t)location_index >= probe->num_locations())
+        return -1;
+    auto const &location = probe->location(location_index);
+    auto const &arg = location.arguments_[argument_index];
+    argument->size = arg.arg_size();
+    argument->valid = BCC_USDT_ARGUMENT_NONE;
+    if (arg.constant()) {
+        argument->valid |= BCC_USDT_ARGUMENT_CONSTANT;
+        argument->constant = *(arg.constant());
+    }
+    if (arg.deref_offset()) {
+        argument->valid |= BCC_USDT_ARGUMENT_DEREF_OFFSET;
+        argument->deref_offset = *(arg.deref_offset());
+    }
+    if (arg.deref_ident()) {
+        argument->valid |= BCC_USDT_ARGUMENT_DEREF_IDENT;
+        argument->deref_ident = arg.deref_ident()->c_str();
+    }
+    if (arg.register_name()) {
+        argument->valid |= BCC_USDT_ARGUMENT_REGISTER_NAME;
+        argument->register_name = arg.register_name()->c_str();
+    }
+    return 0;
+}
+
 void bcc_usdt_foreach_uprobe(void *usdt, bcc_usdt_uprobe_cb callback) {
   USDT::Context *ctx = static_cast<USDT::Context *>(usdt);
   ctx->each_uprobe(callback);
diff --git a/src/cc/usdt.h b/src/cc/usdt.h
index 1676192..bdf9412 100644
--- a/src/cc/usdt.h
+++ b/src/cc/usdt.h
@@ -117,18 +117,18 @@
   ArgumentParser_x64(const char *arg) : ArgumentParser(arg) {}
 };
 
+struct Location {
+  uint64_t address_;
+  std::vector<Argument> arguments_;
+  Location(uint64_t addr, const char *arg_fmt);
+};
+
 class Probe {
   std::string bin_path_;
   std::string provider_;
   std::string name_;
   uint64_t semaphore_;
 
-  struct Location {
-    uint64_t address_;
-    std::vector<Argument> arguments_;
-    Location(uint64_t addr, const char *arg_fmt);
-  };
-
   std::vector<Location> locations_;
 
   optional<int> pid_;
@@ -153,6 +153,7 @@
   uint64_t semaphore()   const { return semaphore_; }
 
   uint64_t address(size_t n = 0) const { return locations_[n].address_; }
+  const Location &location(size_t n) const { return locations_[n]; }
   bool usdt_getarg(std::ostream &stream);
   std::string get_arg_ctype(int arg_index) {
     return largest_arg_type(arg_index);
diff --git a/src/cc/usdt_args.cc b/src/cc/usdt_args.cc
index ad33737..739d9ec 100644
--- a/src/cc/usdt_args.cc
+++ b/src/cc/usdt_args.cc
@@ -146,6 +146,9 @@
   } else {
     dest->deref_offset_ = 0;
     pos = parse_identifier(pos, &dest->deref_ident_);
+    if (arg_[pos] == '+' || arg_[pos] == '-') {
+      pos = parse_number(pos, &dest->deref_offset_);
+    }
   }
 
   if (arg_[pos] != '(')
@@ -184,10 +187,12 @@
   ssize_t res = parse_1(cur_pos_, dest);
   if (res < 0) {
     print_error(-res);
+    cur_pos_ = -res;
     return false;
   }
   if (!isspace(arg_[res]) && arg_[res] != '\0') {
     print_error(res);
+    cur_pos_ = res;
     return false;
   }
   while (isspace(arg_[res])) res++;
diff --git a/src/lua/CMakeLists.txt b/src/lua/CMakeLists.txt
index 94cd0d2..97e2bb4 100644
--- a/src/lua/CMakeLists.txt
+++ b/src/lua/CMakeLists.txt
@@ -4,7 +4,8 @@
 if (LUAJIT_LIBRARIES AND LUAJIT)
 	FILE(GLOB_RECURSE SRC_LUA
 		${CMAKE_CURRENT_SOURCE_DIR}/bcc/*.lua
-		${CMAKE_CURRENT_SOURCE_DIR}/bcc/vendor/*.lua)
+		${CMAKE_CURRENT_SOURCE_DIR}/bcc/vendor/*.lua
+		${CMAKE_CURRENT_SOURCE_DIR}/bpf/*.lua)
 
 	ADD_CUSTOM_COMMAND(
 		OUTPUT bcc.lua
diff --git a/src/lua/README.md b/src/lua/README.md
index 670392c..8f303a6 100644
--- a/src/lua/README.md
+++ b/src/lua/README.md
@@ -1,7 +1,7 @@
 Lua Tools for BCC
 -----------------
 
-This directory contains Lua tooling for [BCC](https://github.com/iovisor/bcc)
+This directory contains Lua tooling for [BCC][bcc]
 (the BPF Compiler Collection).
 
 BCC is a toolkit for creating userspace and kernel tracing programs. By
@@ -52,3 +52,104 @@
     ```
     sudo ./bcc-probe examples/lua/task_switch.lua
     ```
+
+## LuaJIT BPF compiler
+
+Now it is also possible to write Lua functions and compile them transparently to BPF bytecode, here is a simple socket filter example:
+
+```lua
+local S = require('syscall')
+local bpf = require('bpf')
+local map = bpf.map('array', 256)
+-- Kernel-space part of the program
+local prog = assert(bpf(function ()
+    local proto = pkt.ip.proto  -- Get byte (ip.proto) from frame at [23]
+    xadd(map[proto], 1)         -- Increment packet count
+end))
+-- User-space part of the program
+local sock = assert(bpf.socket('lo', prog))
+for i=1,10 do
+    local icmp, udp, tcp = map[1], map[17], map[6]
+    print('TCP', tcp, 'UDP', udp, 'ICMP', icmp, 'packets')
+    S.sleep(1)
+end
+```
+
+The other application of BPF programs is attaching to probes for [perf event tracing][tracing]. That means you can trace events inside the kernel (or user-space), and then collect results - for example histogram of `sendto()` latency, off-cpu time stack traces, syscall latency, and so on. While kernel probes and perf events have unstable ABI, with a dynamic language we can create and use proper type based on the tracepoint ABI on runtime.
+
+Runtime automatically recognizes reads that need a helper to be accessed. The type casts denote the source of the objects, for example the [bashreadline][bashreadline] example that prints entered bash commands from all running shells:
+
+```lua
+local ffi = require('ffi')
+local bpf = require('bpf')
+-- Perf event map
+local sample_t = 'struct { uint64_t pid; char str[80]; }'
+local events = bpf.map('perf_event_array')
+-- Kernel-space part of the program
+bpf.uprobe('/bin/bash:readline', function (req)
+    local sample = ffi.new(sample_t)
+    sample.pid = pid_tgid()
+    ffi.copy(sample.str, ffi.cast('char *', req.ax)) -- Cast `ax` to string pointer and copy to buffer
+    perf_submit(events, sample)                      -- Write sample to perf event map
+end, true, -1, 0)
+-- User-space part of the program
+local log = events:reader(nil, 0, sample_t) -- Must specify PID or CPU_ID to observe
+while true do
+    log:block()               -- Wait until event reader is readable
+    for _,e in log:read() do  -- Collect available reader events
+        print(tonumber(e.pid), ffi.string(e.str))
+    end
+end
+```
+
+Where cast to `struct pt_regs` flags the source of data as probe arguments, which means any pointer derived
+from this structure points to the kernel and a helper is needed to access it. Casting `req.ax` to pointer is then required for `ffi.copy` semantics, otherwise it would be treated as `u64` and only its value would be
+copied. The type detection is automatic most of the time (socket filters and `bpf.tracepoint`), but not with uprobes and kprobes.
+
+### Installation
+
+```bash
+$ luarocks install bpf
+```
+
+### Examples
+
+See `examples/lua` directory.
+
+### Helpers
+
+* `print(...)` is a wrapper for `bpf_trace_printk`, the output is captured in `cat /sys/kernel/debug/tracing/trace_pipe`
+* `bit.*` library **is** supported (`lshift, rshift, arshift, bnot, band, bor, bxor`)
+* `math.*` library *partially* supported (`log2, log, log10`)
+* `ffi.cast()` is implemented (including structures and arrays)
+* `ffi.new(...)` allocates memory on stack, initializers are NYI
+* `ffi.copy(...)` copies memory (possibly using helpers) between stack/kernel/registers
+* `ntoh(x[, width])` - convert from network to host byte order.
+* `hton(x[, width])` - convert from host to network byte order.
+* `xadd(dst, inc)` - exclusive add, a synchronous `*dst += b` if Lua had `+=` operator
+
+Below is a list of BPF-specific helpers:
+
+* `time()` - return current monotonic time in nanoseconds (uses `bpf_ktime_get_ns`)
+* `cpu()` - return current CPU number (uses `bpf_get_smp_processor_id`)
+* `pid_tgid()` - return caller `tgid << 32 | pid` (uses `bpf_get_current_pid_tgid`)
+* `uid_gid()` - return caller `gid << 32 | uid` (uses `bpf_get_current_uid_gid`)
+* `comm(var)` - write current process name (uses `bpf_get_current_comm`)
+* `perf_submit(map, var)` - submit variable to perf event array BPF map
+* `stack_id(map, flags)` - return stack trace identifier from stack trace BPF map
+
+### Current state
+
+* Not all LuaJIT bytecode opcodes are supported *(notable mentions below)*
+* Closures `UCLO` will probably never be supported, although you can use upvalues inside compiled function.
+* Type narrowing is opportunistic. Numbers are 64-bit by default, but 64-bit immediate loads are not supported (e.g. `local x = map[ffi.cast('uint64_t', 1000)]`)
+* Tail calls `CALLT`, and iterators `ITERI` are NYI (as of now)
+* Arbitrary ctype **is** supported both for map keys and values
+* Basic optimisations like: constant propagation, partial DCE, liveness analysis and speculative register allocation are implemented, but there's no control flow analysis yet. This means the compiler has the visibility when things are used and dead-stores occur, but there's no rewriter pass to eliminate them.
+* No register sub-allocations, no aggressive use of caller-saved `R1-5`, no aggressive narrowing (this would require variable range assertions and variable relationships)
+* Slices with not 1/2/4/8 length are NYI (requires allocating a memory on stack and using pointer type)
+
+
+[bcc]: https://github.com/iovisor/bcc
+[tracing]: http://www.brendangregg.com/blog/2016-03-05/linux-bpf-superpowers.html
+[bashreadline]: http://www.brendangregg.com/blog/2016-02-08/linux-ebpf-bcc-uprobes.html
\ No newline at end of file
diff --git a/src/lua/bcc/vendor/posix.lua b/src/lua/bcc/vendor/posix.lua
index 189097c..8e46713 100644
--- a/src/lua/bcc/vendor/posix.lua
+++ b/src/lua/bcc/vendor/posix.lua
@@ -15,21 +15,25 @@
 ]]
 local ffi = require("ffi")
 
-ffi.cdef[[
-typedef int clockid_t;
-typedef long time_t;
+-- Avoid duplicate declarations if syscall library is present
+local has_syscall, _ = pcall(require, "syscall")
+if not has_syscall then
+  ffi.cdef [[
+  typedef int clockid_t;
+  typedef long time_t;
 
-struct timespec {
-  time_t tv_sec;
-  long tv_nsec;
-};
+  struct timespec {
+    time_t tv_sec;
+    long tv_nsec;
+  };
 
-int clock_gettime(clockid_t clk_id, struct timespec *tp);
-int clock_nanosleep(clockid_t clock_id, int flags,
-  const struct timespec *request, struct timespec *remain);
-
+  int clock_gettime(clockid_t clk_id, struct timespec *tp);
+  int clock_nanosleep(clockid_t clock_id, int flags,
+    const struct timespec *request, struct timespec *remain);
+  ]]
+end
+ffi.cdef [[
 int get_nprocs(void);
-
 uint64_t strtoull(const char *nptr, char **endptr, int base);
 ]]
 
diff --git a/src/lua/bpf-scm-1.rockspec b/src/lua/bpf-scm-1.rockspec
new file mode 100644
index 0000000..7f6ba63
--- /dev/null
+++ b/src/lua/bpf-scm-1.rockspec
@@ -0,0 +1,37 @@
+package = "bpf"
+version = "scm-1"
+source = {
+   url = "git://github.com/iovisor/bcc.git"
+}
+description = {
+   summary = "BCC - LuaJIT to BPF compiler.",
+   detailed = [[
+   ]],
+   homepage = "https://github.com/iovisor/bcc",
+   license = "BSD"
+}
+dependencies = {
+   "lua >= 5.1",
+   "ljsyscall >= 0.12",
+}
+external_dependencies = {
+    LIBELF = {
+       library = "elf"
+    }
+}
+build = {
+  type = "builtin",
+  install = {
+    bin = {
+    }
+  },
+  modules = {
+    bpf = "src/lua/bpf/bpf.lua",
+    ["bpf.builtins"] = "src/lua/bpf/builtins.lua",
+    ["bpf.cdef"] = "src/lua/bpf/cdef.lua",
+    ["bpf.elf"] = "src/lua/bpf/elf.lua",
+    ["bpf.init"] = "src/lua/bpf/init.lua",
+    ["bpf.ljbytecode"] = "src/lua/bpf/ljbytecode.lua",
+    ["bpf.proto"] = "src/lua/bpf/proto.lua",
+  }
+}
diff --git a/src/lua/bpf/bpf.lua b/src/lua/bpf/bpf.lua
new file mode 100644
index 0000000..2661c9a
--- /dev/null
+++ b/src/lua/bpf/bpf.lua
@@ -0,0 +1,1282 @@
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+-- LuaJIT to BPF bytecode compiler.
+--
+-- The code generation phase is currently one-pass and produces:
+-- * Compiled code in BPF bytecode format (https://www.kernel.org/doc/Documentation/networking/filter.txt)
+-- * Variables with liveness analysis and other meta (spill information, compile-time value)
+--
+-- The code generator optimises as much as possible in single pass:
+-- * Fold compile-time expressions and constant propagation
+-- * Basic control flow analysis with dead code elimination (based on compile-time expressions)
+-- * Single-pass optimistic register allocation
+--
+-- The first pass doesn't have variable lifetime visibility yet, so it relies on rewriter for further
+-- optimisations such as:
+-- * Dead store elimination (first-pass doesn't know if/when the variable is going to be used)
+-- * Common sub-expression elimination (relies on DCE and liveness analysis)
+-- * Orphan JMP elimination (removing this in first pass would break previous JMP targets)
+-- * Better register allocation (needs to be recomputed after optimisations)
+
+local ffi = require('ffi')
+local bit = require('bit')
+local S = require('syscall')
+local bytecode = require('bpf.ljbytecode')
+local cdef = require('bpf.cdef')
+local proto = require('bpf.proto')
+local builtins = require('bpf.builtins')
+
+-- Constants
+local ALWAYS, NEVER = -1, -2
+local BPF = ffi.typeof('struct bpf')
+local HELPER = ffi.typeof('struct bpf_func_id')
+
+-- Symbolic table of constant expressions over numbers
+local const_expr = {
+	ADD = function (a, b) return a + b end,
+	SUB = function (a, b) return a - b end,
+	DIV = function (a, b) return a / b end,
+	MOD = function (a, b) return a % b end,
+	JEQ = function (a, b) return a == b end,
+	JNE = function (a, b) return a ~= b end,
+	JGE = function (a, b) return a >= b end,
+	JGT = function (a, b) return a > b end,
+}
+local const_width = {
+	[1] = BPF.B, [2] = BPF.H, [4] = BPF.W, [8] = BPF.DW,
+}
+
+-- Built-ins that are strict only (never compile-time expandable)
+local builtins_strict = {
+	[ffi.new] = true,
+	[print]   = true,
+}
+
+-- Return struct member size/type (requires LuaJIT 2.1+)
+-- I am ashamed that there's no easier way around it.
+local function sizeofattr(ct, name)
+	if not ffi.typeinfo then error('LuaJIT 2.1+ is required for ffi.typeinfo') end
+	local cinfo = ffi.typeinfo(ct)
+	while true do
+		cinfo = ffi.typeinfo(cinfo.sib)
+		if not cinfo then return end
+		if cinfo.name == name then break end
+	end
+	local size = math.max(1, ffi.typeinfo(cinfo.sib or ct).size - cinfo.size)
+	-- Guess type name
+	return size, builtins.width_type(size)
+end
+
+-- Return true if the constant part is a proxy
+local function is_proxy(x)
+	return type(x) == 'table' and (x.__dissector or x.__map or x.__base)
+end
+
+-- Create compiler closure
+local function create_emitter(env, stackslots, params, param_types)
+
+local V = {}   -- Variable tracking / register allocator
+local code = { -- Generated code
+	pc = 0, bc_pc = 0,
+	insn = ffi.new('struct bpf_insn[4096]'),
+	fixup = {},
+	reachable = true,
+	seen_cmp = nil,
+}
+local Vstate = {} -- Track variable layout at basic block exits
+
+-- Anything below this stack offset is free to use by caller
+-- @note: There is no tracking memory allocator, so the caller may
+-- lower it for persistent objects, but such memory will never
+-- be reclaimed and the caller is responsible for resetting stack
+-- top whenever the memory below is free to be reused
+local stack_top = (stackslots + 1) * ffi.sizeof('uint64_t')
+
+local function emit(op, dst, src, off, imm)
+	local ins = code.insn[code.pc]
+	ins.code = op
+	ins.dst_reg = dst
+	ins.src_reg = src
+	ins.off = off
+	ins.imm = imm
+	code.pc = code.pc + 1
+end
+
+local function reg_spill(var)
+	local vinfo = V[var]
+	vinfo.spill = (var + 1) * ffi.sizeof('uint64_t') -- Index by (variable number) * (register width)
+	emit(BPF.MEM + BPF.STX + BPF.DW, 10, vinfo.reg, -vinfo.spill, 0)
+	vinfo.reg = nil
+end
+
+local function reg_fill(var, reg)
+	local vinfo = V[var]
+	assert(vinfo.spill, 'attempt to fill register with a VAR that isn\'t spilled')
+	emit(BPF.MEM + BPF.LDX + BPF.DW, reg, 10, -vinfo.spill, 0)
+	vinfo.reg = reg
+	vinfo.spill = nil
+end
+
+-- Allocate a register (lazy simple allocator)
+local function reg_alloc(var, reg)
+	-- Specific register requested, must spill/move existing variable
+	if reg then
+		for k,v in pairs(V) do -- Spill any variable that has this register
+			if v.reg == reg and not v.shadow then
+				reg_spill(k)
+				break
+			end
+		end
+		return reg
+	end
+	-- Find free or least recently used slot
+	local last, last_seen, used = nil, 0xffff, 0
+	for k,v in pairs(V) do
+		if v.reg then
+			if not v.live_to or v.live_to < last_seen then
+				last, last_seen = k, v.live_to or last_seen
+			end
+			used = bit.bor(used, bit.lshift(1, v.reg))
+		end
+	end
+	-- Attempt to select a free register from R7-R9 (callee saved)
+	local free = bit.bnot(used)
+	if     bit.band(free, 0x80) ~= 0 then reg = 7
+	elseif bit.band(free,0x100) ~= 0 then reg = 8
+	elseif bit.band(free,0x200) ~= 0 then reg = 9
+	end
+	-- Select another variable to be spilled
+	if not reg then
+		assert(last)
+		reg = V[last].reg
+		reg_spill(last)
+	end
+	assert(reg, 'VAR '..var..'fill/spill failed')
+	return reg
+end
+
+-- Set new variable
+local function vset(var, reg, const, vtype)
+	-- Must materialise all variables shadowing this variable slot, as it will be overwritten
+	if V[var] and V[var].reg then
+		for _, vinfo in pairs(V) do
+			-- Shadowing variable MUST share the same type and attributes,
+			-- but the register assignment may have changed
+			if vinfo.shadow == var then
+				vinfo.reg = V[var].reg
+				vinfo.shadow = nil
+			end
+		end
+	end
+	-- Get precise type for CDATA or attempt to narrow numeric constant
+	if not vtype and type(const) == 'cdata' then vtype = ffi.typeof(const) end
+	V[var] = {reg=reg, const=const, type=vtype}
+end
+
+-- Materialize (or register) a variable in a register
+-- If the register is nil, then the a new register is assigned (if not already assigned)
+local function vreg(var, reg, reserve, vtype)
+	local vinfo = V[var]
+	assert(vinfo, 'VAR '..var..' not registered')
+	vinfo.live_to = code.pc-1
+	if (vinfo.reg and not reg) and not vinfo.shadow then return vinfo.reg end
+	reg = reg_alloc(var, reg)
+	-- Materialize variable shadow copy
+	local src = vinfo
+	while src.shadow do src = V[src.shadow] end
+	if reserve then
+		-- No load to register occurs
+	elseif src.reg then
+		emit(BPF.ALU64 + BPF.MOV + BPF.X, reg, src.reg, 0, 0)
+	elseif src.spill then
+		vinfo.spill = src.spill
+		reg_fill(var, reg)
+	elseif src.const then
+		vtype = vtype or src.type
+		if type(src.const) == 'table' and src.const.__base then
+			-- Load pointer type
+			emit(BPF.ALU64 + BPF.MOV + BPF.X, reg, 10, 0, 0)
+			emit(BPF.ALU64 + BPF.ADD + BPF.K, reg, 0, 0, -src.const.__base)
+		elseif type(src.const) == 'table' and src.const.__dissector then
+			-- Load dissector offset (imm32), but keep the constant part (dissector proxy)
+			emit(BPF.ALU64 + BPF.MOV + BPF.K, reg, 0, 0, src.const.off or 0)
+		elseif vtype and ffi.sizeof(vtype) == 8 then
+			-- IMM64 must be done in two instructions with imm64 = (lo(imm32), hi(imm32))
+			emit(BPF.LD + BPF.DW, reg, 0, 0, ffi.cast('uint32_t', src.const))
+			emit(0, 0, 0, 0, ffi.cast('uint32_t', bit.rshift(bit.rshift(src.const, 16), 16)))
+			vinfo.const = nil -- The variable is live
+		else
+			emit(BPF.ALU64 + BPF.MOV + BPF.K, reg, 0, 0, src.const)
+			vinfo.const = nil -- The variable is live
+		end
+	else assert(false, 'VAR '..var..' has neither register nor constant value') end
+	vinfo.reg = reg
+	vinfo.shadow = nil
+	vinfo.live_from = code.pc-1
+	vinfo.type = vtype or vinfo.type
+	return reg
+end
+
+-- Copy variable
+local function vcopy(dst, src)
+	if dst == src then return end
+	V[dst] = {reg=V[src].reg, const=V[src].const, shadow=src, source=V[src].source, type=V[src].type}
+end
+
+-- Dereference variable of pointer type
+local function vderef(dst_reg, src_reg, vtype)
+	-- Dereference map pointers for primitive types
+	-- BPF doesn't allow pointer arithmetics, so use the entry value
+	local w = ffi.sizeof(vtype)
+	assert(const_width[w], 'NYI: sizeof('..tostring(vtype)..') not 1/2/4/8 bytes')
+	if dst_reg ~= src_reg then
+		emit(BPF.ALU64 + BPF.MOV + BPF.X, dst_reg, src_reg, 0, 0)    -- dst = src
+	end
+	emit(BPF.JMP + BPF.JEQ + BPF.K, src_reg, 0, 1, 0)                -- if (src != NULL)
+	emit(BPF.MEM + BPF.LDX + const_width[w], dst_reg, src_reg, 0, 0) --     dst = *src;
+end
+
+-- Allocate a space for variable
+-- size  - number of bytes to reserve on the BPF stack
+-- blank - optional initialiser: a string is copied onto the slot in
+--         u32-sized chunks, boolean true zero-fills the whole slot
+-- Returns the new stack top (the slot spans [base, stack_top]).
+local function valloc(size, blank)
+	local base = stack_top
+	assert(stack_top + size < 512 * 1024, 'exceeded maximum stack size of 512kB')
+	stack_top = stack_top + size
+	-- Align to 8 byte boundary
+	stack_top = math.ceil(stack_top/8)*8
+	-- Current kernel version doesn't support ARG_PTR_TO_RAW_STACK
+	-- so we always need to have memory initialized, remove this when supported
+	if blank then
+		if type(blank) == 'string' then
+			local sp = 0
+			while sp < size do
+				-- TODO: no BPF_ST + BPF_DW instruction yet
+				local as_u32 = ffi.new('uint32_t [1]')
+				local sub = blank:sub(sp+1, sp+ffi.sizeof(as_u32))
+				ffi.copy(as_u32, sub, #sub)
+				emit(BPF.MEM + BPF.ST + BPF.W, 10, 0, -(stack_top-sp), as_u32[0])
+				sp = sp + ffi.sizeof(as_u32)
+			end
+		elseif type(blank) == 'boolean' then
+			-- Zero R0 once, then store it over every 8-byte slot of the allocation
+			reg_alloc(stackslots, 0)
+			emit(BPF.ALU64 + BPF.MOV + BPF.K, 0, 0, 0, 0)
+			for sp = base+8,stack_top,8 do
+				emit(BPF.MEM + BPF.STX + BPF.DW, 10, 0, -sp, 0)
+			end
+		-- Fixed message typo: 'will' -> 'fill'
+		else error('NYI: fill with unknown type '..type(blank)) end
+	end
+	return stack_top
+end
+
+-- Emit compensation code at the end of a basic block so the variable set
+-- layout matches every other exit from this block:
+--   1. spill registers that the reference layout (Vcomp) has spilled
+--   2. refill registers the reference layout keeps live
+local function bb_end(Vcomp)
+	-- Pass 1: spilling frees registers, so it must run before refilling.
+	for slot, var in pairs(V) do
+		local ref = Vcomp[slot]
+		if ref and ref.spill and not var.spill then
+			reg_spill(slot)
+		end
+	end
+	-- Pass 2: materialise variables into the registers the layout expects.
+	for slot, var in pairs(V) do
+		local ref = Vcomp[slot]
+		if ref and ref.reg and not var.reg then
+			vreg(slot, ref.reg)
+		end
+	end
+end
+
+-- Emit LD_ABS (direct packet load at absolute offset `off`, width `w`);
+-- the result is materialised in R0, which is reserved for `dst`.
+local function LD_ABS(dst, off, w)
+	local dst_reg = vreg(dst, 0, true, builtins.width_type(w)) -- Reserve R0
+	-- assert(w < 8, 'NYI: LD_ABS64 is not supported') -- IMM64 has two IMM32 insns fused together
+	emit(BPF.LD + BPF.ABS + const_width[w], dst_reg, 0, 0, off)
+end
+
+-- Emit LD_IND (packet load at `src`-relative offset, width `w`);
+-- the result lands in R0, which is reserved for `dst`.
+local function LD_IND(dst, src, w, off)
+	local src_reg = vreg(src) -- Must materialize first in case dst == src
+	local dst_reg = vreg(dst, 0, true, builtins.width_type(w)) -- Reserve R0
+	emit(BPF.LD + BPF.IND + const_width[w], dst_reg, src_reg, 0, off or 0)
+end
+
+-- Load a packet field of width `w`: an absolute load when a constant
+-- offset `imm` is known, otherwise an indirect load relative to slot `d`.
+local function LD_FIELD(a, d, w, imm)
+	if not imm then
+		LD_IND(a, d, w)
+	else
+		LD_ABS(a, imm, w)
+	end
+end
+
+-- @note: This is specific now as it expects registers reserved
+-- Load an immediate into dst_reg; for w == 8 this emits the two-insn
+-- BPF_LD_IMM64 form where imm64 = (lo(imm32), hi(imm32)).
+local function LD_IMM_X(dst_reg, src_type, imm, w)
+	if w == 8 then -- IMM64 must be done in two instructions with imm64 = (lo(imm32), hi(imm32))
+		emit(BPF.LD + const_width[w], dst_reg, src_type, 0, ffi.cast('uint32_t', imm))
+		-- The second insn carries the HIGH 32 bits (imm >> 32); must shift in
+		-- two steps as bit.rshift supports [0..31]. (Was lshift, which placed
+		-- the low bits here — see the correct rshift form used by vreg.)
+		emit(0, 0, 0, 0, ffi.cast('uint32_t', bit.rshift(bit.rshift(imm, 16), 16)))
+	else
+		emit(BPF.LD + const_width[w], dst_reg, src_type, 0, imm)
+	end
+end
+
+-- A = *(SRC + off), optionally cast to vtype; only 1/2/4-byte payload
+-- loads are supported (absolute when the dissector carries a fixed
+-- offset, otherwise indirect off the source register).
+local function LOAD(dst, src, off, vtype)
+	local base = V[src].const
+	assert(base.__dissector, 'NYI: load() on variable that doesnt have dissector')
+	-- Cast to different type if requested
+	vtype = vtype or base.__dissector
+	local w = ffi.sizeof(vtype)
+	assert(w <= 4, 'NYI: load() supports 1/2/4 bytes at a time only')
+	if base.off then -- Absolute address to payload
+		LD_ABS(dst, off + base.off, w)
+	else -- Indirect address to payload
+		LD_IND(dst, src, w, off)
+	end
+	V[dst].type = vtype
+	V[dst].const = nil -- Dissected value is not constant anymore
+end
+
+-- Compare stack-resident string in slot `a` against Lua string `b`.
+-- Emits an unrolled XOR/OR accumulator loop and a fused-JMP placeholder;
+-- only equality/inequality are supported.
+local function CMP_STR(a, b, op)
+	assert(op == 'JEQ' or op == 'JNE', 'NYI: only equivallence stack/string only supports == or ~=')
+	-- I have no better idea how to implement it than unrolled XOR loop, as we can fixup only one JMP
+	-- So: X(a,b) = a[0] ^ b[0] | a[1] ^ b[1] | ...
+	--     EQ(a,b) <=> X == 0
+	-- This could be optimised by placing early exits by rewriter in second phase for long strings
+	local base, size = V[a].const.__base, math.min(#b, ffi.sizeof(V[a].type))
+	local acc, tmp = reg_alloc(stackslots, 0), reg_alloc(stackslots+1, 1)
+	local sp = 0
+	emit(BPF.ALU64 + BPF.MOV + BPF.K, acc, 0, 0, 0)
+	while sp < size do
+		-- Load string chunk as imm32
+		local as_u32 = ffi.new('uint32_t [1]')
+		local sub = b:sub(sp+1, sp+ffi.sizeof(as_u32))
+		ffi.copy(as_u32, sub, #sub)
+		-- TODO: make this faster by interleaved load/compare steps with DW length
+		emit(BPF.MEM + BPF.LDX + BPF.W, tmp, 10, -(base-sp), 0)
+		emit(BPF.ALU64 + BPF.XOR + BPF.K, tmp, 0, 0, as_u32[0])
+		emit(BPF.ALU64 + BPF.OR + BPF.X, acc, tmp, 0, 0)
+		sp = sp + ffi.sizeof(as_u32)
+	end
+	-- 0xffff is a JMP-target placeholder, patched during JMP fixup
+	emit(BPF.JMP + BPF[op] + BPF.K, acc, 0, 0xffff, 0)
+	code.seen_cmp = code.pc-1
+end
+
+-- Emit (or constant-fold) a comparison between two variable slots.
+-- Records the CMP insn PC in code.seen_cmp so the following JMP opcode
+-- can be fused into a single BPF conditional jump.
+local function CMP_REG(a, b, op)
+	-- Fold compile-time expressions
+	if V[a].const and V[b].const and not (is_proxy(V[a].const) or is_proxy(V[b].const)) then
+		code.seen_cmp = const_expr[op](V[a].const, V[b].const) and ALWAYS or NEVER
+	else
+		-- Comparison against compile-time string or stack memory
+		if V[b].const and type(V[b].const) == 'string' then
+			return CMP_STR(a, V[b].const, op)
+		end
+		-- The 0xFFFF target here has no significance, it's just a placeholder for
+		-- compiler to replace it's absolute offset to LJ bytecode insn with a relative
+		-- offset in BPF program code, verifier will accept only programs with valid JMP targets
+		local a_reg, b_reg = vreg(a), vreg(b)
+		-- Migrate operands from R0-5 as it will be spilled in compensation code when JMP out of BB
+		if a_reg == 0 then a_reg = vreg(a, 7) end
+		emit(BPF.JMP + BPF[op] + BPF.X, a_reg, b_reg, 0xffff, 0)
+		code.seen_cmp = code.pc-1
+	end
+end
+
+-- Emit (or constant-fold) a comparison between variable slot `a` and an
+-- immediate `b` (numbers, single chars, or short strings as network-order
+-- u32). Records the CMP insn PC for JMP fusion like CMP_REG.
+local function CMP_IMM(a, b, op)
+	if V[a].const and not is_proxy(V[a].const) then -- Fold compile-time expressions
+		code.seen_cmp = const_expr[op](V[a].const, b) and ALWAYS or NEVER
+	else
+		-- Convert imm32 to number
+		if type(b) == 'string' then
+			if     #b == 1 then b = b:byte()
+			elseif cdef.isptr(V[a].type) then
+				-- String comparison between stack/constant string
+				return CMP_STR(a, b, op)
+			elseif #b <= 4 then
+				-- Convert to u32 with network byte order
+				local imm = ffi.new('uint32_t[1]')
+				ffi.copy(imm, b, #b)
+				b = builtins.hton(imm[0])
+			else error('NYI: compare register with string, where #string > sizeof(u32)') end
+		end
+		-- The 0xFFFF target here has no significance, it's just a placeholder for
+		-- compiler to replace it's absolute offset to LJ bytecode insn with a relative
+		-- offset in BPF program code, verifier will accept only programs with valid JMP targets
+		local reg = vreg(a)
+		-- Migrate operands from R0-5 as it will be spilled in compensation code when JMP out of BB
+		if reg == 0 then reg = vreg(a, 7) end
+		emit(BPF.JMP + BPF[op] + BPF.K, reg, 0, 0xffff, b)
+		code.seen_cmp = code.pc-1
+	end
+end
+
+-- DST = A op IMM(b): fold when A is a plain compile-time constant,
+-- otherwise materialise A (dereferencing first if it is a pointer) and
+-- emit an ALU64 immediate instruction.
+local function ALU_IMM(dst, a, b, op)
+	-- Fold compile-time expressions
+	if V[a].const and not is_proxy(V[a].const) then
+			assert(type(V[a].const) == 'number', 'VAR '..a..' must be numeric')
+			vset(dst, nil, const_expr[op](V[a].const, b))
+	-- Now we need to materialize dissected value at DST, and add it
+	else
+		vcopy(dst, a)
+		local dst_reg = vreg(dst)
+		if cdef.isptr(V[a].type) then
+			vderef(dst_reg, dst_reg, V[a].const.__dissector)
+			V[dst].type = V[a].const.__dissector
+		else
+			V[dst].type = V[a].type
+		end
+		emit(BPF.ALU64 + BPF[op] + BPF.K, dst_reg, 0, 0, b)
+	end
+end
+
+-- DST = A op B between variable slots. B is optional (nil/false) for
+-- unary operations, so every access to V[b] must be guarded — CALL may
+-- pass b = (V[a+2] and a+2), which can be false.
+local function ALU_REG(dst, a, b, op)
+	-- Fold compile-time expressions: only when BOTH operands are plain
+	-- constants. (Previously V[b] was indexed unguarded, crashing when
+	-- b was absent, and folding proceeded even with V[b].const == nil.)
+	if V[a].const and b and V[b] and V[b].const and
+	   not (is_proxy(V[a].const) or is_proxy(V[b].const)) then
+		assert(type(V[a].const) == 'number', 'VAR '..a..' must be numeric')
+		assert(type(V[b].const) == 'number', 'VAR '..b..' must be numeric')
+		if type(op) == 'string' then op = const_expr[op] end
+		vcopy(dst, a)
+		V[dst].const = op(V[a].const, V[b].const)
+	else
+		local src_reg = b and vreg(b) or 0 -- SRC is optional for unary operations
+		if b and cdef.isptr(V[b].type) then
+			-- We have to allocate a temporary register for dereferencing to preserve
+			-- pointer in source variable that MUST NOT be altered
+			reg_alloc(stackslots, 2)
+			vderef(2, src_reg, V[b].const.__dissector)
+			src_reg = 2
+		end
+		vcopy(dst, a) -- DST may alias B, so copy must occur after we materialize B
+		local dst_reg = vreg(dst)
+		if cdef.isptr(V[a].type) then
+			vderef(dst_reg, dst_reg, V[a].const.__dissector)
+			V[dst].type = V[a].const.__dissector
+		end
+		emit(BPF.ALU64 + BPF[op] + BPF.X, dst_reg, src_reg, 0, 0)
+		V[stackslots].reg = nil  -- Free temporary registers
+	end
+end
+
+
+-- Non-commutative IMM-on-the-left form: materialise the immediate into a
+-- temporary slot and fall back to register-register ALU.
+local function ALU_IMM_NV(dst, a, b, op)
+	-- Do DST = IMM(a) op VAR(b) where we can't invert because
+	-- the registers are u64 but immediates are u32, so complement
+	-- arithmetics wouldn't work
+	vset(stackslots+1, nil, a)
+	ALU_REG(dst, stackslots+1, b, op)
+end
+
+-- Invoke a builtin implementation, handing it a table of compiler
+-- primitives so it can allocate registers and emit instructions itself.
+local function BUILTIN(func, ...)
+	local e = {}
+	-- Compiler primitives (work with variable slots, emit instructions)
+	e.V, e.vreg, e.vset, e.vcopy, e.vderef, e.valloc, e.emit =
+		V, vreg, vset, vcopy, vderef, valloc, emit
+	e.reg_alloc, e.reg_spill, e.tmpvar, e.const_width =
+		reg_alloc, reg_spill, stackslots, const_width
+	-- Extensions and helpers (use with care)
+	e.LD_IMM_X = LD_IMM_X
+	func(e, ...)
+end
+
+-- Translate a LuaJIT CALL: dispatch to a builtin, an ALU fallback for
+-- operator-like builtins, a dissector slice, or a compile-time expansion
+-- when every argument is a plain constant. Result lands in slot `a`.
+local function CALL(a, b, d)
+	assert(b-1 <= 1, 'NYI: CALL with >1 return values')
+	-- Perform either compile-time, helper, or builtin
+	local func = V[a].const
+	-- Gather all arguments and check if they're constant
+	local args, const, nargs = {}, true, d - 1
+	for i = a+1, a+d-1 do
+		table.insert(args, V[i].const)
+		if not V[i].const or is_proxy(V[i].const) then const = false end
+	end
+	local builtin = builtins[func]
+	if not const or nargs == 0 then
+		if builtin and type(builtin) == 'function' then
+			args = {a}
+			for i = a+1, a+nargs do table.insert(args, i) end
+			BUILTIN(builtin, unpack(args))
+		elseif V[a+2] and V[a+2].const then -- var OP imm
+			ALU_IMM(a, a+1, V[a+2].const, builtin)
+		elseif nargs <= 2 then              -- var OP var
+			ALU_REG(a, a+1, V[a+2] and a+2, builtin)
+		else
+			error('NYI: CALL non-builtin with 3 or more arguments')
+		end
+	-- Call on dissector implies slice retrieval
+	elseif type(func) == 'table' and func.__dissector then
+		assert(nargs >= 2, 'NYI: <dissector>.slice(a, b) must have at least two arguments')
+		assert(V[a+1].const and V[a+2].const, 'NYI: slice() arguments must be constant')
+		local off = V[a+1].const
+		local vtype = builtins.width_type(V[a+2].const - off)
+		LOAD(a, a, off, vtype)
+	-- Strict builtins cannot be expanded on compile-time
+	elseif builtins_strict[func] and builtin then
+		args = {a}
+		for i = a+1, a+nargs do table.insert(args, i) end
+		BUILTIN(builtin, unpack(args))
+	-- Attempt compile-time call expansion (expects all argument compile-time known)
+	else
+		V[a].const = func(unpack(args))
+	end
+end
+
+-- Prepare the argument registers common to all map helper calls:
+-- R1 = pseudo map fd, R2 = pointer to the key on the stack.
+-- The key may arrive as an immediate, a live register, a spill slot, or
+-- an existing stack base; `sp` ends up as the FP-relative key address.
+local function MAP_INIT(map_var, key, imm)
+	local map = V[map_var].const
+	vreg(map_var, 1, true, ffi.typeof('uint64_t'))
+	-- Reserve R1 and load ptr for process-local map fd
+	LD_IMM_X(1, BPF.PSEUDO_MAP_FD, map.fd, ffi.sizeof(V[map_var].type))
+	V[map_var].reg = nil -- R1 will be invalidated after CALL, forget register allocation
+	-- Reserve R2 and load R2 = key pointer
+	local key_size = ffi.sizeof(map.key_type)
+	local w = const_width[key_size] or BPF.DW
+	local pod_type = const_width[key_size]
+	local sp = stack_top + key_size -- Must use stack below spill slots
+	-- Store immediate value on stack
+	reg_alloc(stackslots, 2) -- Spill anything in R2 (unnamed tmp variable)
+	local key_base = key and V[key].const
+	imm = imm or key_base
+	if imm and (not key or not is_proxy(key_base)) then
+		assert(pod_type, 'NYI: map[const K], K width must be 1/2/4/8')
+		emit(BPF.MEM + BPF.ST + w, 10, 0, -sp, imm)
+	-- Key is in register, spill it
+	elseif V[key].reg and pod_type then
+		if cdef.isptr(V[key].type) then
+			-- There is already pointer in register, dereference before spilling
+			emit(BPF.MEM + BPF.LDX + w, 2, V[key].reg, 0, 0)
+			emit(BPF.MEM + BPF.STX + w, 10, 2, -sp, 0)
+		else -- Variable in register is POD, spill it on the stack
+			emit(BPF.MEM + BPF.STX + w, 10, V[key].reg, -sp, 0)
+		end
+	-- Key is spilled from register to stack
+	elseif V[key].spill then
+		sp = V[key].spill
+	-- Key is already on stack, write to base-relative address
+	elseif key_base.__base then
+		assert(key_size == ffi.sizeof(V[key].type), 'VAR '..key..' type incompatible with BPF map key type')
+		sp = key_base.__base
+	else
+		error('VAR '..key..' is neither const-expr/register/stack/spilled')
+	end
+	-- If [FP+K] addressing, emit it
+	if sp then
+		emit(BPF.ALU64 + BPF.MOV + BPF.X, 2, 10, 0, 0)
+		emit(BPF.ALU64 + BPF.ADD + BPF.K, 2, 0, 0, -sp)
+	end
+end
+
+-- A = MAP[K]: set up (fd, key ptr) via MAP_INIT, call map_lookup_elem,
+-- and annotate A as a pointer to the map value (may be NULL at runtime).
+local function MAP_GET(dst, map_var, key, imm)
+	local map = V[map_var].const
+	MAP_INIT(map_var, key, imm)
+	-- Flag as pointer type and associate dissector for map value type
+	vreg(dst, 0, true, ffi.typeof('uint8_t *'))
+	V[dst].const = {__dissector=map.val_type}
+	emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_lookup_elem)
+	V[stackslots].reg = nil -- Free temporary registers
+end
+
+-- delete MAP[K]: set up (fd, key ptr) and call map_delete_elem.
+local function MAP_DEL(map_var, key, key_imm)
+	-- Set R0, R1 (map fd, preempt R0)
+	reg_alloc(stackslots, 0) -- Spill anything in R0 (unnamed tmp variable)
+	MAP_INIT(map_var, key, key_imm)
+	emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_delete_elem)
+	V[stackslots].reg = nil -- Free temporary registers
+end
+
+-- MAP[K] = V: key/key_imm select the key (register/stack/immediate),
+-- `src` is the slot holding the value. Storing nil deletes the element.
+local function MAP_SET(map_var, key, key_imm, src)
+	local map = V[map_var].const
+	-- Delete when setting nil
+	if V[src].type == ffi.typeof('void') then
+		return MAP_DEL(map_var, key, key_imm)
+	end
+	-- Set R0, R1 (map fd, preempt R0)
+	reg_alloc(stackslots, 0) -- Spill anything in R0 (unnamed tmp variable)
+	MAP_INIT(map_var, key, key_imm)
+	reg_alloc(stackslots, 4) -- Spill anything in R4 (unnamed tmp variable)
+	emit(BPF.ALU64 + BPF.MOV + BPF.K, 4, 0, 0, 0) -- BPF_ANY, create new element or update existing
+	-- Reserve R3 for value pointer
+	local val_size = ffi.sizeof(map.val_type)
+	local w = const_width[val_size] or BPF.DW
+	local pod_type = const_width[val_size]
+	-- Stack pointer must be aligned to both key/value size and have enough headroom for (key, value)
+	local sp = stack_top + ffi.sizeof(map.key_type) + val_size
+	sp = sp + (sp % val_size)
+	local base = V[src].const
+	if base and not is_proxy(base) then
+		assert(pod_type, 'NYI: MAP[K] = imm V; V width must be 1/2/4/8')
+		emit(BPF.MEM + BPF.ST + w, 10, 0, -sp, base)
+	-- Value is in register, spill it
+	elseif V[src].reg and pod_type then
+		emit(BPF.MEM + BPF.STX + w, 10, V[src].reg, -sp, 0)
+	-- We get a pointer to spilled register on stack
+	elseif V[src].spill then
+		-- If variable is a pointer, we can load it to R3 directly (save "LEA")
+		if cdef.isptr(V[src].type) then
+			reg_fill(src, 3)
+			emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_update_elem)
+			return
+		else
+			sp = V[src].spill
+		end
+	-- Value is already on stack, write to base-relative address
+	elseif base.__base then
+		-- Validate the VALUE variable (src), not the key: `base` is
+		-- V[src].const and `key` may legitimately be nil here (constant-key
+		-- stores), so the old 'VAR '..key message would itself crash.
+		assert(val_size == ffi.sizeof(V[src].type), 'VAR '..src..' type incompatible with BPF map value type')
+		sp = base.__base
+	-- Value is constant, materialize it on stack
+	else
+		error('VAR '..src..' is neither const-expr/register/stack/spilled')
+	end
+	reg_alloc(stackslots, 3) -- Spill anything in R3 (unnamed tmp variable)
+	emit(BPF.ALU64 + BPF.MOV + BPF.X, 3, 10, 0, 0)
+	emit(BPF.ALU64 + BPF.ADD + BPF.K, 3, 0, 0, -sp)
+	emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_update_elem)
+	V[stackslots].reg = nil -- Free temporary registers
+end
+
+-- Finally - this table translates LuaJIT bytecode into code emitter actions.
+-- Each handler receives the decoded LuaJIT operands (a, b, c, d).
+local BC = {
+	-- Constants
+	KNUM = function(a, _, c, _) -- KNUM
+		vset(a, nil, c, ffi.typeof('int32_t')) -- TODO: only 32bit immediates are supported now
+	end,
+	KSHORT = function(a, _, _, d) -- KSHORT
+		vset(a, nil, d, ffi.typeof('int16_t'))
+	end,
+	KPRI = function(a, _, _, d) -- KPRI
+		-- KNIL is 0, must create a special type to identify it
+		local vtype = (d < 1) and ffi.typeof('void') or ffi.typeof('uint8_t')
+		vset(a, nil, (d < 2) and 0 or 1, vtype)
+	end,
+	KSTR = function(a, _, c, _) -- KSTR
+		vset(a, nil, c, ffi.typeof('const char[?]'))
+	end,
+	MOV = function(a, _, _, d) -- MOV var, var
+		vcopy(a, d)
+	end,
+
+	-- Comparison ops
+	-- Note: comparisons are always followed by JMP opcode, that
+	--       will fuse following JMP to JMP+CMP instruction in BPF
+	-- Note:  we're narrowed to integers, so operand/operator inversion is legit
+	ISLT = function(a, _, _, d) return CMP_REG(d, a, 'JGE') end, -- (a < d) (inverted)
+	ISGE = function(a, _, _, d) return CMP_REG(a, d, 'JGE') end, -- (a >= d)
+	ISGT = function(a, _, _, d) return CMP_REG(a, d, 'JGT') end, -- (a > d)
+	ISEQV = function(a, _, _, d) return CMP_REG(a, d, 'JEQ') end, -- (a == d)
+	ISNEV = function(a, _, _, d) return CMP_REG(a, d, 'JNE') end, -- (a ~= d)
+	ISEQS = function(a, _, c, _) return CMP_IMM(a, c, 'JEQ') end, -- (a == str(c))
+	ISNES = function(a, _, c, _) return CMP_IMM(a, c, 'JNE') end, -- (a ~= str(c))
+	ISEQN = function(a, _, c, _) return CMP_IMM(a, c, 'JEQ') end, -- (a == c)
+	ISNEN = function(a, _, c, _) return CMP_IMM(a, c, 'JNE') end, -- (a ~= c)
+	IST = function(_, _, _, d) return CMP_IMM(d, 0, 'JNE') end, -- (d)
+	ISF = function(_, _, _, d) return CMP_IMM(d, 0, 'JEQ') end, -- (not d)
+	ISEQP = function(a, _, c, _) return CMP_IMM(a, c, 'JEQ') end, -- ISEQP (a == c)
+	-- Binary operations with RHS constants
+	ADDVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'ADD') end,
+	SUBVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'SUB') end,
+	MULVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'MUL') end,
+	DIVVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'DIV') end,
+	MODVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'MOD') end,
+	-- Binary operations with LHS constants
+	-- Cheat code: we're narrowed to integer arithmetic, so MUL+ADD are commutative
+	ADDNV = function(a, b, c, _) return ALU_IMM(a, b, c, 'ADD') end, -- ADDNV
+	MULNV = function(a, b, c, _) return ALU_IMM(a, b, c, 'MUL') end, -- MULNV
+	SUBNV = function(a, b, c, _) return ALU_IMM_NV(a, c, b, 'SUB') end, -- SUBNV
+	DIVNV = function(a, b, c, _) return ALU_IMM_NV(a, c, b, 'DIV') end, -- DIVNV
+	-- Binary operations between registers
+	ADDVV = function(a, b, _, d) return ALU_REG(a, b, d, 'ADD') end,
+	SUBVV = function(a, b, _, d) return ALU_REG(a, b, d, 'SUB') end,
+	MULVV = function(a, b, _, d) return ALU_REG(a, b, d, 'MUL') end,
+	DIVVV = function(a, b, _, d) return ALU_REG(a, b, d, 'DIV') end,
+	MODVV = function(a, b, _, d) return ALU_REG(a, b, d, 'MOD') end,
+	-- Strings
+	CAT = function(a, b, _, d) -- CAT A = B ~ D
+		assert(V[b].const and V[d].const, 'NYI: CAT only works on compile-time expressions')
+		assert(type(V[b].const) == 'string' and type(V[d].const) == 'string',
+			'NYI: CAT only works on compile-time strings')
+		vset(a, nil, V[b].const .. V[d].const)
+	end,
+	-- Tables
+	GGET = function (a, _, c, _) -- GGET (A = GLOBAL[c])
+		if env[c] ~= nil then
+			vset(a, nil, env[c])
+		else error(string.format("undefined global '%s'", c)) end
+	end,
+	UGET = function (a, _, c, _) -- UGET (A = UPVALUE[c])
+		if env[c] ~= nil then
+			vset(a, nil, env[c])
+		else error(string.format("undefined upvalue '%s'", c)) end
+	end,
+	TGETB = function (a, b, _, d) -- TGETB (A = B[D])
+		if a ~= b then vset(a) end
+		local base = V[b].const
+		if base.__map then -- BPF map read (constant)
+			MAP_GET(a, b, nil, d)
+		else
+			LOAD(a, b, d, ffi.typeof('uint8_t'))
+		end
+	end,
+	TSETB = function (a, b, _, d) -- TSETB (B[D] = A)
+		if V[b].const.__map then -- BPF map read (constant)
+			return MAP_SET(b, nil, d, a) -- D is literal
+		elseif V[b].const and V[b].const and V[a].const then -- NOTE(review): duplicated V[b].const test; TSETV checks V[d].const here — confirm intent
+			V[b].const[V[d].const] = V[a].const
+		else error('NYI: B[D] = A, where B is not Lua table or BPF map')
+		end
+	end,
+	TSETV = function (a, b, _, d) -- TSETV (B[D] = A)
+		if V[b].const.__map then -- BPF map read (constant)
+			return MAP_SET(b, d, nil, a) -- D is variable
+		elseif V[b].const and V[d].const and V[a].const then
+			V[b].const[V[d].const] = V[a].const
+		else error('NYI: B[D] = A, where B is not Lua table or BPF map')
+		end
+	end,
+	TSETS = function (a, b, c, _) -- TSETS (B[C] = A)
+		assert(V[b] and V[b].const, 'NYI: B[D] where B is not Lua table or BPF map')
+		local base = V[b].const
+		if base.__dissector then
+			local ofs,bpos = ffi.offsetof(base.__dissector, c)
+			assert(not bpos, 'NYI: B[C] = A, where C is a bitfield')
+			local w = sizeofattr(base.__dissector, c)
+			-- TODO: support vectorized moves larger than register width
+			assert(const_width[w], 'B[C] = A, sizeof(A) must be 1/2/4/8')
+			local src_reg = vreg(a)
+			-- If source is a pointer, we must dereference it first
+			if cdef.isptr(V[a].type) then
+				local tmp_reg = reg_alloc(stackslots, 1) -- Clone variable in tmp register
+				emit(BPF.ALU64 + BPF.MOV + BPF.X, tmp_reg, src_reg, 0, 0)
+				vderef(tmp_reg, tmp_reg, V[a].const.__dissector)
+				src_reg = tmp_reg -- Materialize and dereference it
+			-- Source is a value on stack, we must load it first
+			elseif V[a].const and V[a].const.__base > 0 then
+				emit(BPF.MEM + BPF.LDX + const_width[w], src_reg, 10, -V[a].const.__base, 0)
+				V[a].type = V[a].const.__dissector
+				V[a].const = nil -- Value is dereferenced
+			end
+			-- If the table is not on stack, it must be checked for NULL
+			if not base.__base then
+				emit(BPF.JMP + BPF.JEQ + BPF.K, V[b].reg, 0, 1, 0) -- if (map[x] != NULL)
+				emit(BPF.MEM + BPF.STX + const_width[w], V[b].reg, src_reg, ofs, 0)
+			else -- Table is already on stack, write to base-relative address
+				emit(BPF.MEM + BPF.STX + const_width[w], 10, src_reg, -base.__base + ofs, 0)
+			end
+		elseif V[a].const then
+			base[c] = V[a].const
+		else error('NYI: B[C] = A, where B is not Lua table or BPF map')
+		end
+	end,
+	TGETV = function (a, b, _, d) -- TGETV (A = B[D])
+		assert(V[b] and V[b].const, 'NYI: B[D] where B is not Lua table or BPF map')
+		if a ~= b then vset(a) end
+		if V[b].const.__map then -- BPF map read
+			MAP_GET(a, b, d)
+		elseif V[b].const == env.pkt then  -- Raw packet, no offset
+			LD_FIELD(a, d, 1, V[d].const)
+		else V[a].const = V[b].const[V[d].const] end
+	end,
+	TGETS = function (a, b, c, _) -- TGETS (A = B[C])
+		assert(V[b] and V[b].const, 'NYI: B[C] where C is string and B not Lua table or BPF map')
+		local base = V[b].const
+		if type(base) == 'table' and base.__dissector then
+			local ofs,bpos,bsize = ffi.offsetof(base.__dissector, c)
+			-- Resolve table key using metatable
+			if not ofs and type(base.__dissector[c]) == 'string' then
+				c = base.__dissector[c]
+				ofs,bpos,bsize = ffi.offsetof(base.__dissector, c)
+			end
+			if not ofs and proto[c] then -- Load new dissector on given offset
+				BUILTIN(proto[c], a, b, c)
+			else
+				assert(ofs, tostring(base.__dissector)..'.'..c..' attribute not exists')
+				if a ~= b then vset(a) end
+				-- Dissected value is probably not constant anymore
+				local new_const = nil
+				-- Simple register load, get absolute offset or R-relative
+				local w, atype = sizeofattr(base.__dissector, c)
+				if base.__base == true then -- R-relative addressing
+					local dst_reg = vreg(a, nil, true)
+					assert(const_width[w], 'NYI: sizeof('..tostring(base.__dissector)..'.'..c..') not 1/2/4/8 bytes')
+					emit(BPF.MEM + BPF.LDX + const_width[w], dst_reg, V[b].reg, ofs, 0)
+				elseif not base.source and base.__base and base.__base > 0 then -- [FP+K] addressing
+					if cdef.isptr(atype) then -- If the member is pointer type, update base pointer with offset
+						new_const = {__base = base.__base-ofs}
+					else
+						local dst_reg = vreg(a, nil, true)
+						emit(BPF.MEM + BPF.LDX + const_width[w], dst_reg, 10, -base.__base+ofs, 0)
+					end
+				elseif base.off then -- Absolute address to payload
+					LD_ABS(a, ofs + base.off, w)
+				elseif base.source == 'probe' then -- Indirect read using probe
+					BUILTIN(builtins[builtins.probe_read], nil, a, b, atype, ofs)
+					V[a].source = V[b].source -- Builtin handles everything
+					return
+				else -- Indirect address to payload
+					LD_IND(a, b, w, ofs)
+				end
+				-- Bitfield, must be further narrowed with a bitmask/shift
+				if bpos then
+					local mask = 0
+					for i=bpos+1,bpos+bsize do
+						mask = bit.bor(mask, bit.lshift(1, w*8-i))
+					end
+					emit(BPF.ALU64 + BPF.AND + BPF.K, vreg(a), 0, 0, mask)
+					-- Free optimization: single-bit values need just boolean result
+					if bsize > 1 then
+						local shift = w*8-bsize-bpos
+						if shift > 0 then
+							emit(BPF.ALU64 + BPF.RSH + BPF.K, vreg(a), 0, 0, shift)
+						end
+					end
+				end
+				V[a].type = atype
+				V[a].const = new_const
+				V[a].source = V[b].source
+			end
+		else V[a].const = base[c] end
+	end,
+	-- Loops and branches
+	CALLM = function (a, b, _, d) -- A = A(A+1, ..., A+D+MULTRES)
+		-- NYI: Support single result only
+		CALL(a, b, d+2)
+	end,
+	CALL = function (a, b, _, d) -- A = A(A+1, ..., A+D-1)
+		CALL(a, b, d)
+	end,
+	JMP = function (a, _, c, d) -- JMP
+		-- Discard unused slots after jump
+		for i, _ in pairs(V) do
+			if i >= a then V[i] = {} end
+		end
+		local val = code.fixup[c] or {}
+		if code.seen_cmp and code.seen_cmp ~= ALWAYS then
+			if code.seen_cmp ~= NEVER then -- Do not emit the jump or fixup
+				-- Store previous CMP insn for reemitting after compensation code
+				local jmpi = ffi.new('struct bpf_insn', code.insn[code.pc-1])
+				code.pc = code.pc - 1
+				-- First branch point, emit compensation code
+				local Vcomp = Vstate[c]
+				if not Vcomp then
+					for i,v in pairs(V) do
+						if not v.reg and v.const and not is_proxy(v.const) then
+							vreg(i, 0)   -- Load to TMP register (not saved)
+						end
+						if v.reg and v.reg <= 5 then
+							reg_spill(i) -- Spill caller-saved registers
+						end
+					end
+					-- Record variable state
+					Vstate[c] = V
+					V = {}
+					for i,v in pairs(Vstate[c]) do
+						V[i] = {}
+						for k,e in pairs(v) do
+							V[i][k] = e
+						end
+					end
+				-- Variable state already set, emit specific compensation code
+				else bb_end(Vcomp) end
+				-- Reemit CMP insn
+				emit(jmpi.code, jmpi.dst_reg, jmpi.src_reg, jmpi.off, jmpi.imm)
+				-- Fuse JMP into previous CMP opcode, mark JMP target for fixup
+				-- as we don't know the relative offset in generated code yet
+				table.insert(val, code.pc-1)
+				code.fixup[c] = val
+			end
+			code.seen_cmp = nil
+		else
+			emit(BPF.JMP + BPF.JEQ + BPF.X, 6, 6, 0xffff, 0) -- Always true
+			table.insert(val, code.pc-1) -- Fixup JMP target
+			code.reachable = false -- Code following the JMP is not reachable
+			code.fixup[c] = val
+		end
+	end,
+	RET1 = function (a, _, _, _) -- RET1
+		if V[a].reg ~= 0 then vreg(a, 0) end
+		emit(BPF.JMP + BPF.EXIT, 0, 0, 0, 0)
+		-- Free optimisation: spilled variable will not be filled again
+		for _,v in pairs(V) do if v.reg == 0 then v.reg = nil end end
+		code.reachable = false
+	end,
+	RET0 = function (_, _, _, _) -- RET0
+		emit(BPF.ALU64 + BPF.MOV + BPF.K, 0, 0, 0, 0)
+		emit(BPF.JMP + BPF.EXIT, 0, 0, 0, 0)
+		code.reachable = false
+	end,
+	compile = function ()
+		return code
+	end
+}
+-- Always initialize R6 with R1 context
+emit(BPF.ALU64 + BPF.MOV + BPF.X, 6, 1, 0, 0)
+-- Register R6 as context variable (first argument)
+if params and params > 0 then
+	vset(0, 6, param_types[1] or proto.skb)
+	V[0].source = V[0].const.source -- Propagate source annotation from typeinfo
+end
+-- Register tmpvars
+vset(stackslots)
+vset(stackslots+1)
+-- The returned object is callable: invoking it with a decoded LuaJIT
+-- opcode advances the emitter; unknown numeric opcodes raise NYI errors.
+return setmetatable(BC, {
+	__index = function (t, k, v)
+		if type(k) == 'number' then
+			-- Decode the opcode name from LuaJIT's packed bcnames string
+			local op_str = string.sub(require('jit.vmdef').bcnames, 6*k+1, 6*k+6)
+			error(string.format("NYI: opcode '0x%02x' (%-04s)", k, op_str))
+		end
+	end,
+	__call = function (t, op, a, b, c, d)
+		code.bc_pc = code.bc_pc + 1
+		-- Exiting BB straight through, emit compensation code
+		if Vstate[code.bc_pc] and code.reachable then
+			bb_end(Vstate[code.bc_pc])
+		end
+		-- Perform fixup of jump targets
+		-- We need to do this because the number of consumed and emited
+		-- bytecode instructions is different
+		local fixup = code.fixup[code.bc_pc]
+		if fixup ~= nil then
+			-- Patch JMP source insn with relative offset
+			for _,pc in ipairs(fixup) do
+				code.insn[pc].off = code.pc - 1 - pc
+			end
+			code.fixup[code.bc_pc] = nil
+			code.reachable = true
+		end
+		-- Execute
+		if code.reachable then
+			assert(t[op], string.format('NYI: instruction %s, parameters: %s,%s,%s,%s', op,a,b,c,d))
+			return t[op](a, b, c, d)
+		end
+	end,
+})
+end
+
+-- Emitted code dump
+-- Disassemble one memory-class instruction (LD/LDX/ST/STX/XADD) into
+-- "NAME dst src" text; bits 0xe0 of `code` select the mode, 0x18 the width.
+local function dump_mem(cls, ins)
+	local mode = bit.band(ins.code, 0xe0)
+	if mode == BPF.XADD then cls = 5 end -- The only mode
+	local op_1 = {'LD', 'LDX', 'ST', 'STX', '', 'XADD'}
+	local op_2 = {[0]='W', [8]='H', [16]='B', [24]='DW'}
+	local name = op_1[cls+1] .. op_2[bit.band(ins.code, 0x18)]
+	local off = tonumber(ffi.cast('int16_t', ins.off)) -- Reinterpret as signed
+	local dst = cls < 2 and 'R'..ins.dst_reg or string.format('[R%d%+d]', ins.dst_reg, off)
+	local src = cls % 2 == 0 and '#'..ins.imm or 'R'..ins.src_reg
+	if cls == BPF.LDX then src = string.format('[R%d%+d]', ins.src_reg, off) end
+	if mode == BPF.ABS then src = string.format('[%d]', ins.imm) end
+	if mode == BPF.IND then src = string.format('[R%d%+d]', ins.src_reg, ins.imm) end
+	return string.format('%s\t%s\t%s', name, dst, src)
+end
+
+-- Disassemble one ALU/JMP-class instruction; bits 0xf0 of `code` pick the
+-- operation, and CALL targets are resolved to helper names where known.
+local function dump_alu(cls, ins, pc)
+	local alu = {'ADD', 'SUB', 'MUL', 'DIV', 'OR', 'AND', 'LSH', 'RSH', 'NEG', 'MOD', 'XOR', 'MOV', 'ARSH', 'END' }
+	local jmp = {'JA', 'JEQ', 'JGT', 'JGE', 'JSET', 'JNE', 'JSGT', 'JSGE', 'CALL', 'EXIT'}
+	local helper = {'unspec', 'map_lookup_elem', 'map_update_elem', 'map_delete_elem', 'probe_read', 'ktime_get_ns',
+					'trace_printk', 'get_prandom_u32', 'get_smp_processor_id', 'skb_store_bytes',
+					'l3_csum_replace', 'l4_csum_replace', 'tail_call', 'clone_redirect', 'get_current_pid_tgid',
+					'get_current_uid_gid', 'get_current_comm', 'get_cgroup_classid', 'skb_vlan_push', 'skb_vlan_pop',
+					'skb_get_tunnel_key', 'skb_set_tunnel_key', 'perf_event_read', 'redirect', 'get_route_realm',
+					'perf_event_output', 'skb_load_bytes'}
+	local op = 0
+	-- Map the 4-bit operation field to a 1-based table index
+	for i = 0,13 do if 0x10 * i == bit.band(ins.code, 0xf0) then op = i + 1 break end end
+	local name = (cls == 5) and jmp[op] or alu[op]
+	local src = (bit.band(ins.code, 0x08) == BPF.X) and 'R'..ins.src_reg or '#'..ins.imm
+	local target = (cls == 5 and op < 9) and string.format('\t=> %04d', pc + ins.off + 1) or ''
+	if cls == 5 and op == 9 then target = string.format('\t; %s', helper[ins.imm + 1] or tostring(ins.imm)) end
+	return string.format('%s\t%s\t%s%s', name, 'R'..ins.dst_reg, src, target)
+end
+
+-- Pretty-print a compiled program: one line per instruction with its PC,
+-- decoded via dump_mem (memory classes) or dump_alu (ALU/JMP classes).
+local function dump(code)
+	if not code then return end
+	print(string.format('-- BPF %s:0-%u', code.insn, code.pc))
+	local decoder = {
+		[0] = dump_mem, [1] = dump_mem, [2] = dump_mem, [3] = dump_mem,
+		[4] = dump_alu, [5] = dump_alu, [7] = dump_alu,
+	}
+	local pc = 0
+	while pc < code.pc do
+		local ins = code.insn[pc]
+		local cls = bit.band(ins.code, 0x07)
+		print(string.format('%04u\t%s', pc, decoder[cls](cls, ins, pc)))
+		pc = pc + 1
+	end
+end
+
+-- Compile a Lua function (or chunk string) into a BPF program.
+-- Builds a sandboxed environment from the caller's locals/upvalues, then
+-- feeds decoded LuaJIT bytecode to the emitter; returns the code object,
+-- or nil plus error info on failure.
+local function compile(prog, params)
+	-- Create code emitter sandbox, include caller locals
+	local env = { pkt=proto.pkt, BPF=BPF }
+	-- Include upvalues up to 4 nested scopes back
+	-- the narrower scope overrides broader scope
+	for k = 5, 2, -1 do
+		local i = 1
+		while true do
+			-- pcall: debug.getlocal errors once the stack level is out of range
+			local ok, n, v = pcall(debug.getlocal, k, i)
+			if not ok or not n then break end
+			env[n] = v
+			i = i + 1
+		end
+	end
+	setmetatable(env, {
+		__index = function (_, k)
+			return proto[k] or builtins[k] or _G[k]
+		end
+	})
+	-- Create code emitter and compile LuaJIT bytecode
+	if type(prog) == 'string' then prog = loadstring(prog) end
+	-- Create error handler to print traceback
+	local funci, pc = bytecode.funcinfo(prog), 0
+	local E = create_emitter(env, funci.stackslots, funci.params, params or {})
+	local on_err = function (e)
+			funci = bytecode.funcinfo(prog, pc)
+			local from, to = 0, 0
+			for _ = 1, funci.currentline do
+				from = to
+				to = string.find(funci.source, '\n', from+1, true) or 0
+			end
+			print(funci.loc..':'..string.sub(funci.source, from+1, to-1))
+			print('error: '..e)
+			print(debug.traceback())
+	end
+	for _,op,a,b,c,d in bytecode.decoder(prog) do
+		-- xpcall with extra arguments is a LuaJIT (5.2-style) extension
+		local ok, res, err = xpcall(E,on_err,op,a,b,c,d)
+		if not ok then
+			return nil, res, err
+		end
+	end
+	return E:compile()
+end
+
+-- BPF map interface
+-- Metatable for map objects returned by the module's map() constructor.
+-- Indexing reads/writes elements via the bpf() syscall, the 'pairs' key
+-- yields an iterator, 'reader' builds a perf event reader, and #map
+-- returns max_entries.
+local bpf_map_mt = {
+	-- NOTE(review): __gc on plain tables only fires on Lua 5.2+; on LuaJIT
+	-- (5.1 semantics) this finalizer may never run - confirm target runtime.
+	__gc = function (map) S.close(map.fd) end,
+	__len = function(map) return map.max_entries end,
+	__index = function (map, k)
+		if type(k) == 'string' then
+			-- Return iterator
+			if k == 'pairs' then
+				return function(t, key)
+					-- Get next key
+					local next_key = ffi.new(ffi.typeof(t.key))
+					local cur_key
+					if key then
+						cur_key = t.key
+						t.key[0] = key
+					else
+						cur_key = ffi.new(ffi.typeof(t.key))
+					end
+					local ok, err = S.bpf_map_op(S.c.BPF_CMD.MAP_GET_NEXT_KEY, map.fd, cur_key, next_key)
+					if not ok then return nil end
+					-- Get next value
+					assert(S.bpf_map_op(S.c.BPF_CMD.MAP_LOOKUP_ELEM, map.fd, next_key, map.val))
+					return next_key[0], map.val[0]
+				end, map, nil
+			-- Read for perf event map
+			elseif k == 'reader' then
+				return function (pmap, pid, cpu, event_type)
+					-- Caller must either specify PID or CPU
+					if not pid or pid < 0 then
+						assert((cpu and cpu >= 0), 'NYI: creating composed reader for all CPUs')
+						pid = -1
+					end
+					-- Create BPF output reader
+					local pe = S.t.perf_event_attr1()
+					pe[0].type = 'software'
+					pe[0].config = 'sw_bpf_output'
+					pe[0].sample_type = 'raw'
+					pe[0].sample_period = 1
+					pe[0].wakeup_events = 1
+					local reader, err = S.t.perf_reader(S.perf_event_open(pe, pid, cpu or -1))
+					if not reader then return nil, tostring(err) end
+					-- Register event reader fd in BPF map
+					assert(cpu < pmap.max_entries, string.format('BPF map smaller than read CPU %d', cpu))
+					pmap[cpu] = reader.fd
+					-- Open memory map and start reading
+					local ok, err = reader:start()
+					assert(ok, tostring(err))
+					ok, err = reader:mmap()
+					assert(ok, tostring(err))
+					return cdef.event_reader(reader, event_type)
+				end
+			-- Signalise this is a map type
+			end
+			return k == '__map'
+		end
+		-- Retrieve key
+		map.key[0] = k
+		local ok, err = S.bpf_map_op(S.c.BPF_CMD.MAP_LOOKUP_ELEM, map.fd, map.key, map.val)
+		if not ok then return nil, err end
+		return ffi.new(map.val_type, map.val[0])
+	end,
+	__newindex = function (map, k, v)
+		map.key[0] = k
+		if v == nil then
+			-- Fixed: command and fd were swapped (map.fd was passed first),
+			-- so deletes always failed; bpf_map_op takes the BPF_CMD first,
+			-- consistent with every other call in this metatable.
+			return S.bpf_map_op(S.c.BPF_CMD.MAP_DELETE_ELEM, map.fd, map.key, nil)
+		end
+		map.val[0] = v
+		return S.bpf_map_op(S.c.BPF_CMD.MAP_UPDATE_ELEM, map.fd, map.key, map.val)
+	end,
+}
+
+-- Linux tracing interface
+-- Verify that the tracing debugfs tree is mounted and reachable.
+-- Returns true on success, otherwise nil plus a hint on how to mount it.
+local function trace_check_enabled(path)
+	if S.statfs(path or '/sys/kernel/debug/tracing') then
+		return true
+	end
+	return nil, 'debugfs not accessible: "mount -t debugfs nodev /sys/kernel/debug"? missing sudo?'
+end
+
+-- Tracepoint interface
+-- Metatable for tracepoint handles returned by tracepoint_open().
+-- t:bpf(prog) compiles (if needed) and loads prog as a TRACEPOINT program,
+-- attaches it to the tracepoint's reader, and retains the fd in t.progs.
+local tracepoint_mt = {
+	__index = {
+		bpf = function (t, prog)
+			if type(prog) ~= 'table' then
+				-- Create protocol parser with source=probe
+				prog = compile(prog, {proto.type(t.type, {source='probe'})})
+			end
+			-- Load the BPF program
+			local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.TRACEPOINT, prog.insn, prog.pc)
+			assert(prog_fd, tostring(err)..': '..tostring(log))
+			-- Open tracepoint and attach
+			t.reader:setbpf(prog_fd:getfd())
+			-- Keep the fd alive for the lifetime of the tracepoint handle
+			table.insert(t.progs, prog_fd)
+			return prog_fd
+		end,
+	}
+}
+-- Open tracepoint
+-- Attach a perf reader to the tracepoint at the given debugfs event path
+-- and return a handle wrapped in tracepoint_mt with its parsed event type.
+local function tracepoint_open(path, pid, cpu, group_fd)
+	-- Open tracepoint and compile tracepoint type
+	local events_dir = '/sys/kernel/debug/tracing/events/'
+	local tp = assert(S.perf_tracepoint(events_dir..path))
+	local tp_type = assert(cdef.tracepoint_type(path))
+	-- Open tracepoint reader and create interface
+	local reader = assert(S.perf_attach_tracepoint(tp, pid, cpu, group_fd))
+	return setmetatable({
+		tp = tp,
+		type = tp_type,
+		reader = reader,
+		progs = {},
+	}, tracepoint_mt)
+end
+
+-- Load a BPF program and attach it to a kprobe/uprobe tracepoint.
+-- ptype is 'kprobe' or 'uprobe', pname/pdef name and define the probe,
+-- retprobe selects the return-probe variant. Returns a handle table
+-- {reader, prog, probe, probe_type}, or nil plus an error message; on any
+-- failure the resources created so far are torn down in reverse order.
+local function trace_bpf(ptype, pname, pdef, retprobe, prog, pid, cpu, group_fd)
+	-- Load BPF program
+	if type(prog) ~= 'table' then
+		prog = compile(prog, {proto.pt_regs})
+	end
+	local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.KPROBE, prog.insn, prog.pc)
+	assert(prog_fd, tostring(err)..': '..tostring(log))
+	-- Open tracepoint and attach
+	local tp, err = S.perf_probe(ptype, pname, pdef, retprobe)
+	if not tp then
+		prog_fd:close()
+		return nil, tostring(err)
+	end
+	local reader, err = S.perf_attach_tracepoint(tp, pid, cpu, group_fd, {sample_type='raw, callchain'})
+	if not reader then
+		prog_fd:close()
+		-- NOTE(review): pdef=false presumably removes the probe again -
+		-- confirm against the ljsyscall perf_probe() API
+		S.perf_probe(ptype, pname, false)
+		return nil, tostring(err)
+	end
+	local ok, err = reader:setbpf(prog_fd:getfd())
+	if not ok then
+		prog_fd:close()
+		reader:close()
+		S.perf_probe(ptype, pname, false)
+		return nil, tostring(err)..' (kernel version should be at least 4.1)'
+	end
+	-- Create GC closure for reader to close BPF program
+	-- and detach probe in correct order
+	ffi.gc(reader, function ()
+		prog_fd:close()
+		reader:close()
+		S.perf_probe(ptype, pname, false)
+	end)
+	return {reader=reader, prog=prog_fd, probe=pname, probe_type=ptype}
+end
+
+-- Module interface
+-- Public API: map/socket/tracepoint/kprobe/uprobe constructors plus dump
+-- and tracelog helpers; calling the module itself compiles a program.
+return setmetatable({
+	new = create_emitter,
+	dump = dump,
+	maps = {},
+	-- Create a BPF map of the given type; key/value ctypes default to
+	-- uint32_t and max_entries to 4096.
+	map = function (type, max_entries, key_ctype, val_ctype)
+		if not key_ctype then key_ctype = ffi.typeof('uint32_t') end
+		if not val_ctype then val_ctype = ffi.typeof('uint32_t') end
+		if not max_entries then max_entries = 4096 end
+		-- Special case for BPF_MAP_STACK_TRACE
+		if S.c.BPF_MAP[type] == S.c.BPF_MAP.STACK_TRACE then
+			key_ctype = ffi.typeof('int32_t')
+			val_ctype = ffi.typeof('struct bpf_stacktrace')
+		end
+		local fd, err = S.bpf_map_create(S.c.BPF_MAP[type], ffi.sizeof(key_ctype), ffi.sizeof(val_ctype), max_entries)
+		if not fd then return nil, tostring(err) end
+		-- key/val are single-element arrays so they can be passed by pointer
+		local map = setmetatable({
+			max_entries = max_entries,
+			key = ffi.new(ffi.typeof('$ [1]', key_ctype)),
+			val = ffi.new(ffi.typeof('$ [1]', val_ctype)),
+			map_type = S.c.BPF_MAP[type],
+			key_type = key_ctype,
+			val_type = val_ctype,
+			fd = fd:nogc():getfd(),
+		}, bpf_map_mt)
+		return map
+	end,
+	-- Compile (if needed) and attach a SOCKET_FILTER program to a socket.
+	socket = function (sock, prog)
+		-- Expect socket type, if sock is string then assume it's
+		-- an interface name (e.g. 'lo'), if it's a number then typecast it as a socket
+		local ok, err
+		if type(sock) == 'string' then
+			local iface = assert(S.nl.getlink())[sock]
+			assert(iface, sock..' is not interface name')
+			sock, err = S.socket('packet', 'raw')
+			assert(sock, tostring(err))
+			ok, err = sock:bind(S.t.sockaddr_ll({protocol='all', ifindex=iface.index}))
+			assert(ok, tostring(err))
+		elseif type(sock) == 'number' then
+			sock = assert(S.t.socket(sock))
+		end
+		-- Load program and attach it to socket
+		if type(prog) ~= 'table' then
+			prog = compile(prog, {proto.skb})
+		end
+		local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.SOCKET_FILTER, prog.insn, prog.pc)
+		assert(prog_fd, tostring(err)..': '..tostring(log))
+		assert(sock:setsockopt('socket', 'attach_bpf', prog_fd:getfd()))
+		return prog_fd, err
+	end,
+	-- Open a tracepoint handle, optionally attaching prog immediately.
+	tracepoint = function(tp, prog, pid, cpu, group_fd)
+		assert(trace_check_enabled())
+		-- Return tracepoint instance if no program specified
+		-- this allows free specialisation of arg0 to tracepoint type
+		local probe = tracepoint_open(tp, pid, cpu, group_fd)
+		-- Load the BPF program
+		if prog then
+			probe:bpf(prog)
+		end
+		return probe
+	end,
+	-- Attach prog to a kprobe given as 'name:definition'.
+	kprobe = function(tp, prog, retprobe, pid, cpu, group_fd)
+		assert(trace_check_enabled())
+		-- Open tracepoint and attach
+		local pname, pdef = tp:match('([^:]+):(.+)')
+		return trace_bpf('kprobe', pname, pdef, retprobe, prog, pid, cpu, group_fd)
+	end,
+	-- Attach prog to a uprobe given as 'objpath:symbol'; the symbol is
+	-- resolved to a load-address offset via the ELF reader.
+	uprobe = function(tp, prog, retprobe, pid, cpu, group_fd)
+		assert(trace_check_enabled())
+		-- Translate symbol to address
+		local obj, sym_want = tp:match('([^:]+):(.+)')
+		if not S.statfs(obj) then return nil, S.t.error(S.c.E.NOENT) end
+		-- Resolve Elf object (no support for anything else)
+		local elf = require('bpf.elf').open(obj)
+		local sym = elf:resolve(sym_want)
+		if not sym then return nil, 'no such symbol' end
+		sym = sym.st_value - elf:loadaddr()
+		-- Format the 64-bit offset as hex in two 32-bit halves (LuaJIT
+		-- numbers cannot format a full 64-bit integer directly)
+		local sym_addr = string.format('%x%04x', tonumber(bit.rshift(sym, 32)),
+		                                         tonumber(ffi.cast('uint32_t', sym)))
+		-- Convert it to expected uprobe format
+		local pname = string.format('%s_%s', obj:gsub('.*/', ''), sym_addr)
+		local pdef = obj..':0x'..sym_addr
+		return trace_bpf('uprobe', pname, pdef, retprobe, prog, pid, cpu, group_fd)
+	end,
+	-- Open the kernel trace pipe for reading printk output.
+	tracelog = function(path)
+		assert(trace_check_enabled())
+		path = path or '/sys/kernel/debug/tracing/trace_pipe'
+		return io.open(path, 'r')
+	end,
+	ntoh = builtins.ntoh, hton = builtins.hton,
+}, {
+	__call = function (t, prog) return compile(prog) end,
+})
diff --git a/src/lua/bpf/builtins.lua b/src/lua/bpf/builtins.lua
new file mode 100644
index 0000000..1153570
--- /dev/null
+++ b/src/lua/bpf/builtins.lua
@@ -0,0 +1,392 @@
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+local ffi = require('ffi')
+local bit = require('bit')
+local cdef = require('bpf.cdef')
+
+-- Instruction/helper constant namespaces declared via ffi.cdef in bpf.cdef
+local BPF, HELPER = ffi.typeof('struct bpf'), ffi.typeof('struct bpf_func_id')
+-- Map byte width -> BPF load/store size flag
+local const_width = {
+	[1] = BPF.B, [2] = BPF.H, [4] = BPF.W, [8] = BPF.DW,
+}
+-- Map byte width -> unsigned integer ctype of that width
+local const_width_type = {
+	[1] = ffi.typeof('uint8_t'), [2] = ffi.typeof('uint16_t'), [4] = ffi.typeof('uint32_t'), [8] = ffi.typeof('uint64_t'),
+}
+
+-- Built-ins that will be translated into BPF instructions
+-- i.e. bit.bor(0xf0, 0x0f) becomes {'alu64, or, k', reg(0xf0), reg(0x0f), 0, 0}
+local builtins = {
+	[bit.lshift]  = 'LSH',
+	[bit.rshift]  = 'RSH',
+	[bit.band]    = 'AND',
+	[bit.bnot]    = 'NEG',
+	[bit.bor]     = 'OR',
+	[bit.bxor]    = 'XOR',
+	[bit.arshift] = 'ARSH',
+	-- Extensions and intrinsics
+}
+
+-- Return the unsigned integer ctype for width w (bytes), falling back to
+-- a byte array ctype for widths without a native integer type.
+local function width_type(w)
+	-- Note: ffi.typeof doesn't accept '?' as template
+	return const_width_type[w] or ffi.typeof(string.format('uint8_t [%d]', w))
+end
+builtins.width_type = width_type
+
+-- Byte-order conversions for little endian
+-- ntoh/hton are the host-side implementations; builtins[ntoh]/builtins[hton]
+-- are the compile-time emitters that translate calls inside BPF programs
+-- into BPF endianness-conversion instructions.
+local function ntoh(x, w)
+	if w then x = ffi.cast(const_width_type[w/8], x) end
+	return bit.bswap(x)
+end
+local function hton(x, w) return ntoh(x, w) end
+builtins.ntoh = ntoh
+builtins.hton = hton
+builtins[ntoh] = function (e, dst, a, w)
+	-- This is trickery, but TO_LE means cpu_to_le(),
+	-- and we want exactly the opposite as network is always 'be'
+	w = w or ffi.sizeof(e.V[a].type)*8
+	if w == 8 then return end -- NOOP
+	assert(w <= 64, 'NYI: hton(a[, width]) - operand larger than register width')
+	-- Allocate registers and execute
+	e.vcopy(dst, a)
+	e.emit(BPF.ALU + BPF.END + BPF.TO_BE, e.vreg(dst), 0, 0, w)
+end
+builtins[hton] = function (e, dst, a, w)
+	w = w or ffi.sizeof(e.V[a].type)*8
+	if w == 8 then return end -- NOOP
+	assert(w <= 64, 'NYI: hton(a[, width]) - operand larger than register width')
+	-- Allocate registers and execute
+	e.vcopy(dst, a)
+	e.emit(BPF.ALU + BPF.END + BPF.TO_LE, e.vreg(dst), 0, 0, w)
+end
+-- Byte-order conversions for big endian are no-ops
+if ffi.abi('be') then
+	-- Host implementation only truncates to the requested width
+	ntoh = function (x, w)
+		return w and ffi.cast(const_width_type[w/8], x) or x
+	end
+	hton = ntoh
+	-- Emitters generate no instructions on big-endian hosts
+	builtins[ntoh] = function(a, b, w) return end
+	builtins[hton] = function(a, b, w) return end
+end
+-- Other built-ins
+-- xadd(a, b) - atomic add of b into the memory a points to; only valid
+-- inside compiled BPF programs (host call raises NYI).
+local function xadd(a, b) error('NYI') end
+builtins.xadd = xadd
+builtins[xadd] = function (e, dst, a, b, off)
+	assert(e.V[a].const.__dissector, 'xadd(a, b) called on non-pointer')
+	local w = ffi.sizeof(e.V[a].const.__dissector)
+	assert(w == 4 or w == 8, 'NYI: xadd() - 1 and 2 byte atomic increments are not supported')
+	-- Allocate registers and execute
+	e.vcopy(dst, a)
+	local src_reg = e.vreg(b)
+	local dst_reg = e.vreg(dst)
+	-- Guard against NULL pointer before the atomic add
+	e.emit(BPF.JMP + BPF.JEQ + BPF.K, dst_reg, 0, 1, 0) -- if (dst != NULL)
+	e.emit(BPF.XADD + BPF.STX + const_width[w], dst_reg, src_reg, off or 0, 0)
+end
+
+-- probe_read(ret, dst, src, vtype, ofs) - emit bpf_probe_read(dst, size,
+-- src+ofs), copying kernel memory at src into stack-backed dst. Host call
+-- raises NYI; only the emitter variant below is functional.
+local function probe_read() error('NYI') end
+builtins.probe_read = probe_read
+builtins[probe_read] = function (e, ret, dst, src, vtype, ofs)
+	e.reg_alloc(e.tmpvar, 1)
+	-- Load stack pointer to dst, since only load to stack memory is supported
+	-- we have to use allocated stack memory or create a new allocation and convert
+	-- to pointer type
+	e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 1, 10, 0, 0)
+	-- Fixed precedence bug: 'not x > 0' parses as '(not x) > 0' in Lua,
+	-- raising 'attempt to compare boolean with number' whenever const is set.
+	local base = e.V[dst].const and e.V[dst].const.__base
+	if not (base and base > 0) then
+		builtins[ffi.new](e, dst, vtype) -- Allocate stack memory
+	end
+	e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 1, 0, 0, -e.V[dst].const.__base)
+	-- Set stack memory maximum size bound
+	e.reg_alloc(e.tmpvar, 2)
+	if not vtype then
+		vtype = cdef.typename(e.V[dst].type)
+		-- Dereference pointer type to pointed type for size calculation
+		if vtype:sub(-1) == '*' then vtype = vtype:sub(0, -2) end
+	end
+	local w = ffi.sizeof(vtype)
+	e.emit(BPF.ALU64 + BPF.MOV + BPF.K, 2, 0, 0, w)
+	-- Set source pointer
+	if e.V[src].reg then
+		e.reg_alloc(e.tmpvar, 3) -- Copy from original register
+		e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 3, e.V[src].reg, 0, 0)
+	else
+		e.vreg(src, 3) -- Materialize src in R3
+		e.reg_spill(src) -- Spill to avoid overwriting
+	end
+	if ofs and ofs > 0 then
+		e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 3, 0, 0, ofs)
+	end
+	-- Call probe read helper
+	ret = ret or e.tmpvar
+	e.vset(ret)
+	e.vreg(ret, 0, true, ffi.typeof('int32_t'))
+	e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.probe_read)
+	e.V[e.tmpvar].reg = nil  -- Free temporary registers
+end
+-- ffi.cast(ctype, x) - retype variable x to the given ctype at compile time.
+-- For non-const variables the declared type changes; for const ones only
+-- the dissector (pointed-to layout) changes.
+builtins[ffi.cast] = function (e, dst, ct, x)
+	assert(e.V[ct].const, 'ffi.cast(ctype, x) called with bad ctype')
+	e.vcopy(dst, x)
+	if not e.V[x].const then
+		e.V[dst].type = ffi.typeof(e.V[ct].const)
+	else
+		e.V[dst].const.__dissector = ffi.typeof(e.V[ct].const)
+	end
+	-- Specific types also encode source of the data
+	-- This is because BPF has different helpers for reading
+	-- different data sources, so variables must track origins.
+	-- struct pt_regs - source of the data is probe
+	-- struct skb     - source of the data is socket buffer
+	-- struct X       - source of the data is probe/tracepoint
+	if ffi.typeof(e.V[ct].const) == ffi.typeof('struct pt_regs') then
+		e.V[dst].source = 'probe'
+	end
+end
+-- ffi.new(ctype) - allocate stack memory for a value of the given ctype and
+-- bind dst to it (records stack offset in __base, layout in __dissector).
+builtins[ffi.new] = function (e, dst, ct, x)
+	if type(ct) == 'number' then
+		ct = ffi.typeof(e.V[ct].const) -- Get ctype from variable
+	end
+	assert(not x, 'NYI: ffi.new(ctype, ...) - initializer is not supported')
+	assert(not cdef.isptr(ct, true), 'NYI: ffi.new(ctype, ...) - ctype MUST NOT be a pointer')
+	e.vset(dst, nil, ct)
+	e.V[dst].const = {__base = e.valloc(ffi.sizeof(ct), true), __dissector = ct}
+end
+-- ffi.copy(dst, src) - copy pointed-to data from src into stack-backed dst.
+-- Currently only supported when src originates from a probe (pt_regs),
+-- in which case the copy is emitted as a bpf_probe_read() call.
+builtins[ffi.copy] = function (e,ret, dst, src)
+	assert(cdef.isptr(e.V[dst].type), 'ffi.copy(dst, src) - dst MUST be a pointer type')
+	assert(cdef.isptr(e.V[src].type), 'ffi.copy(dst, src) - src MUST be a pointer type')
+	-- Specific types also encode source of the data
+	-- struct pt_regs - source of the data is probe
+	-- struct skb     - source of the data is socket buffer
+	if e.V[src].source == 'probe' then
+		e.reg_alloc(e.tmpvar, 1)
+		-- Load stack pointer to dst, since only load to stack memory is supported
+		-- we have to either use spilled variable or allocated stack memory offset
+		e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 1, 10, 0, 0)
+		if e.V[dst].spill then
+			e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 1, 0, 0, -e.V[dst].spill)
+		elseif e.V[dst].const.__base then
+			e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 1, 0, 0, -e.V[dst].const.__base)
+		else error('ffi.copy(dst, src) - can\'t get stack offset of dst') end
+		-- Set stack memory maximum size bound
+		local dst_tname = cdef.typename(e.V[dst].type)
+		if dst_tname:sub(-1) == '*' then dst_tname = dst_tname:sub(0, -2) end
+		e.reg_alloc(e.tmpvar, 2)
+		e.emit(BPF.ALU64 + BPF.MOV + BPF.K, 2, 0, 0, ffi.sizeof(dst_tname))
+		-- Set source pointer
+		if e.V[src].reg then
+			e.reg_alloc(e.tmpvar, 3) -- Copy from original register
+			e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 3, e.V[src].reg, 0, 0)
+		else
+			local src_reg = e.vreg(src, 3)
+			e.reg_spill(src) -- Spill to avoid overwriting
+		end
+		-- Call probe read helper
+		e.vset(ret)
+		e.vreg(ret, 0, true, ffi.typeof('int32_t'))
+		e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.probe_read)
+		e.V[e.tmpvar].reg = nil  -- Free temporary registers
+	elseif e.V[src].const and e.V[src].const.__map then
+		error('NYI: ffi.copy(dst, src) - src is backed by BPF map')
+	elseif e.V[src].const and e.V[src].const.__dissector then
+		error('NYI: ffi.copy(dst, src) - src is backed by socket buffer')
+	else
+		-- TODO: identify cheap register move
+		-- TODO: identify copy to/from stack
+		error('NYI: ffi.copy(dst, src) - src is neither BPF map/socket buffer or probe')
+	end
+end
+-- print(format, ...) builtin changes semantics from Lua print(...)
+-- the first parameter has to be format and only reduced set of conversion specificers
+-- is allowed: %d %u %x %ld %lu %lx %lld %llu %llx %p %s
+-- Emitted as a bpf_trace_printk(fmt, fmt_size, a1, a2, a3) helper call.
+builtins[print] = function (e, ret, fmt, a1, a2, a3)
+	-- Load format string and length
+	-- NOTE(review): both reg_alloc calls target register 1 and pass e.V[...]
+	-- where other emitters pass the variable index; verify whether
+	-- e.reg_alloc(e.tmpvar, 1) / (e.tmpvar+1, 2) was intended.
+	e.reg_alloc(e.V[e.tmpvar], 1)
+	e.reg_alloc(e.V[e.tmpvar+1], 1)
+	if type(e.V[fmt].const) == 'string' then
+		-- Materialize the literal format string into stack memory
+		local src = e.V[fmt].const
+		local len = #src + 1
+		local dst = e.valloc(len, src)
+		-- TODO: this is materialize step
+		e.V[fmt].const = {__base=dst}
+		e.V[fmt].type = ffi.typeof('char ['..len..']')
+	elseif e.V[fmt].const.__base then -- NOP
+	else error('NYI: print(fmt, ...) - format variable is not literal/stack memory') end
+	-- Prepare helper call
+	e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 1, 10, 0, 0)
+	e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 1, 0, 0, -e.V[fmt].const.__base)
+	e.emit(BPF.ALU64 + BPF.MOV + BPF.K, 2, 0, 0, ffi.sizeof(e.V[fmt].type))
+	if a1 then
+		local args = {a1, a2, a3}
+		assert(#args <= 3, 'print(fmt, ...) - maximum of 3 arguments supported')
+		for i, arg in ipairs(args) do
+			e.vcopy(e.tmpvar, arg)  -- Copy variable
+			e.vreg(e.tmpvar, 3+i-1) -- Materialize it in arg register
+		end
+	end
+	-- Call helper
+	e.vset(ret)
+	e.vreg(ret, 0, true, ffi.typeof('int32_t')) -- Return is integer
+	e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.trace_printk)
+	e.V[e.tmpvar].reg = nil  -- Free temporary registers
+end
+
+-- Implements bpf_perf_event_output(ctx, map, flags, var, vlen) on perf event map
+-- map_var must be a perf event map variable, src a stack-backed variable;
+-- dst receives the helper's integer return value.
+local function perf_submit(e, dst, map_var, src)
+	-- Set R2 = map fd (indirect load)
+	local map = e.V[map_var].const
+	e.vcopy(e.tmpvar, map_var)
+	e.vreg(e.tmpvar, 2, true, ffi.typeof('uint64_t'))
+	e.LD_IMM_X(2, BPF.PSEUDO_MAP_FD, map.fd, ffi.sizeof('uint64_t'))
+	-- Set R1 = ctx
+	e.reg_alloc(e.tmpvar, 1) -- Spill anything in R1 (unnamed tmp variable)
+	e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 1, 6, 0, 0) -- CTX is always in R6, copy
+	-- Set R3 = flags
+	e.vset(e.tmpvar, nil, 0) -- BPF_F_CURRENT_CPU
+	e.vreg(e.tmpvar, 3, false, ffi.typeof('uint64_t'))
+	-- Set R4 = pointer to src on stack
+	assert(e.V[src].const.__base, 'NYI: submit(map, var) - variable is not on stack')
+	e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 4, 10, 0, 0)
+	e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 4, 0, 0, -e.V[src].const.__base)
+	-- Set R5 = src length
+	e.emit(BPF.ALU64 + BPF.MOV + BPF.K, 5, 0, 0, ffi.sizeof(e.V[src].type))
+	-- Set R0 = ret and call
+	e.vset(dst)
+	e.vreg(dst, 0, true, ffi.typeof('int32_t')) -- Return is integer
+	e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.perf_event_output)
+	e.V[e.tmpvar].reg = nil  -- Free temporary registers
+end
+
+-- Implements bpf_get_stack_id()
+-- map_var must be a STACK_TRACE map variable, key a constant-number flags
+-- variable; ret receives the signed stack id returned by the helper.
+local function stack_id(e, ret, map_var, key)
+	-- Set R2 = map fd (indirect load)
+	local map = e.V[map_var].const
+	e.vcopy(e.tmpvar, map_var)
+	e.vreg(e.tmpvar, 2, true, ffi.typeof('uint64_t'))
+	e.LD_IMM_X(2, BPF.PSEUDO_MAP_FD, map.fd, ffi.sizeof('uint64_t'))
+	-- Set R1 = ctx
+	e.reg_alloc(e.tmpvar, 1) -- Spill anything in R1 (unnamed tmp variable)
+	e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 1, 6, 0, 0) -- CTX is always in R6, copy
+	-- Load flags in R3 (immediate value or key)
+	local imm = e.V[key].const
+	assert(tonumber(imm), 'NYI: stack_id(map, var), var must be constant number')
+	e.reg_alloc(e.tmpvar, 3) -- Spill anything in R3 (unnamed tmp variable)
+	e.LD_IMM_X(3, 0, imm, 8)
+	-- Return R0 as signed integer
+	e.vset(ret)
+	e.vreg(ret, 0, true, ffi.typeof('int32_t'))
+	e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.get_stackid)
+	e.V[e.tmpvar].reg = nil  -- Free temporary registers
+end
+
+-- table.insert(table, value) keeps semantics with the exception of BPF maps
+-- map `perf_event` -> submit inserted value
+-- Only BPF map targets are supported; plain Lua tables are rejected.
+builtins[table.insert] = function (e, dst, map_var, value)
+	assert(e.V[map_var].const.__map, 'NYI: table.insert() supported only on BPF maps')
+	return perf_submit(e, dst, map_var, value)
+end
+
+-- bpf_get_current_comm(buffer) - write current process name to byte buffer
+-- Host call raises NYI; the emitter variant requires dst to be stack-backed.
+local function comm() error('NYI') end
+builtins[comm] = function (e, ret, dst)
+	-- Set R1 = buffer
+	assert(e.V[dst].const.__base, 'NYI: comm(buffer) - buffer variable is not on stack')
+	e.reg_alloc(e.tmpvar, 1) -- Spill
+	e.emit(BPF.ALU64 + BPF.MOV + BPF.X, 1, 10, 0, 0)
+	e.emit(BPF.ALU64 + BPF.ADD + BPF.K, 1, 0, 0, -e.V[dst].const.__base)
+	-- Set R2 = length
+	e.reg_alloc(e.tmpvar, 2) -- Spill
+	e.emit(BPF.ALU64 + BPF.MOV + BPF.K, 2, 0, 0, ffi.sizeof(e.V[dst].type))
+	-- Return is integer
+	e.vset(ret)
+	e.vreg(ret, 0, true, ffi.typeof('int32_t'))
+	e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.get_current_comm)
+	e.V[e.tmpvar].reg = nil  -- Free temporary registers
+end
+
+-- Math library built-ins
+-- Note: this monkey-patches the global math table with a log2 stub so that
+-- sandboxed programs can reference math.log2.
+math.log2 = function (x) error('NYI') end
+builtins[math.log2] = function (e, dst, x)
+	-- Classic integer bits subdivison algorithm to find the position
+	-- of the highest bit set, adapted for BPF bytecode-friendly operations.
+	-- https://graphics.stanford.edu/~seander/bithacks.html
+	-- r = 0
+	local r = e.vreg(dst, nil, true)
+	e.emit(BPF.ALU64 + BPF.MOV + BPF.K, r, 0, 0, 0)
+	-- v = x
+	e.vcopy(e.tmpvar, x)
+	local v = e.vreg(e.tmpvar, 2)
+	-- NOTE(review): isptr is called on .const here while other call sites
+	-- pass a type - confirm the intended argument
+	if cdef.isptr(e.V[x].const) then -- No pointer arithmetics, dereference
+		e.vderef(v, v, ffi.typeof('uint64_t'))
+	end
+	-- Invert value to invert all tests, otherwise we would need and+jnz
+	e.emit(BPF.ALU64 + BPF.NEG + BPF.K, v, 0, 0, 0)        -- v = ~v
+	-- Unrolled test cases, converted masking to arithmetic as we don't have "if !(a & b)"
+	-- As we're testing inverted value, we have to use arithmetic shift to copy MSB
+	for i=4,0,-1 do
+		local k = bit.lshift(1, i)
+		e.emit(BPF.JMP + BPF.JGT + BPF.K, v, 0, 2, bit.bnot(bit.lshift(1, k))) -- if !upper_half(x)
+		e.emit(BPF.ALU64 + BPF.ARSH + BPF.K, v, 0, 0, k)                       --     v >>= k
+		e.emit(BPF.ALU64 + BPF.OR + BPF.K, r, 0, 0, k)                         --     r |= k
+	end
+	-- No longer constant, cleanup tmpvars
+	e.V[dst].const = nil
+	e.V[e.tmpvar].reg = nil
+end
+-- Integer approximation of log10(x) built on top of the log2 emitter.
+builtins[math.log10] = function (e, dst, x)
+	-- Compute log2(x) and transform
+	builtins[math.log2](e, dst, x)
+	-- Relationship: log10(v) = log2(v) / log2(10)
+	local r = e.V[dst].reg
+	e.emit(BPF.ALU64 + BPF.ADD + BPF.K, r, 0, 0, 1)    -- Compensate round-down
+	e.emit(BPF.ALU64 + BPF.MUL + BPF.K, r, 0, 0, 1233) -- log2(10) ~ 1233>>12
+	e.emit(BPF.ALU64 + BPF.RSH + BPF.K, r, 0, 0, 12)
+end
+-- Integer approximation of the natural logarithm, built on the log2 emitter.
+builtins[math.log] = function (e, dst, x)
+	-- Compute log2(x) and transform
+	builtins[math.log2](e, dst, x)
+	-- Relationship: ln(v) = log2(v) / log2(e)
+	local r = e.V[dst].reg
+	e.emit(BPF.ALU64 + BPF.ADD + BPF.K, r, 0, 0, 1)    -- Compensate round-down
+	e.emit(BPF.ALU64 + BPF.MUL + BPF.K, r, 0, 0, 2839) -- log2(e) ~ 2839>>12
+	e.emit(BPF.ALU64 + BPF.RSH + BPF.K, r, 0, 0, 12)
+end
+
+-- Call-type helpers
+-- Emit a call to parameterless BPF helper 'h' and bind variable 'dst' to
+-- the return register R0.
+local function call_helper(e, dst, h)
+	e.vset(dst)
+	e.vreg(dst, 0, true) -- Materialize dst in R0 (return value was unused)
+	e.emit(BPF.JMP + BPF.CALL, 0, 0, 0, h)
+	e.V[dst].const = nil -- Target is not a function anymore
+end
+-- Helper stubs: calling them outside of a compiled BPF program raises NYI;
+-- inside a program each is translated below into the matching helper call.
+local function cpu() error('NYI') end
+local function rand() error('NYI') end
+local function time() error('NYI') end
+local function pid_tgid() error('NYI') end
+local function uid_gid() error('NYI') end
+
+-- Export helpers and builtin variants
+builtins.cpu = cpu
+builtins.rand = rand -- Fixed: rand had an emitter below but was never exported
+builtins.time = time
+builtins.pid_tgid = pid_tgid
+builtins.uid_gid = uid_gid
+builtins.comm = comm
+builtins.perf_submit = perf_submit
+builtins.stack_id = stack_id
+builtins[cpu] = function (e, dst) return call_helper(e, dst, HELPER.get_smp_processor_id) end
+builtins[rand] = function (e, dst) return call_helper(e, dst, HELPER.get_prandom_u32) end
+builtins[time] = function (e, dst) return call_helper(e, dst, HELPER.ktime_get_ns) end
+builtins[pid_tgid] = function (e, dst) return call_helper(e, dst, HELPER.get_current_pid_tgid) end
+builtins[uid_gid] = function (e, dst) return call_helper(e, dst, HELPER.get_current_uid_gid) end
+builtins[perf_submit] = function (e, dst, map, value) return perf_submit(e, dst, map, value) end
+builtins[stack_id] = function (e, dst, map, key) return stack_id(e, dst, map, key) end
+
+return builtins
diff --git a/src/lua/bpf/cdef.lua b/src/lua/bpf/cdef.lua
new file mode 100644
index 0000000..07749c1
--- /dev/null
+++ b/src/lua/bpf/cdef.lua
@@ -0,0 +1,233 @@
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+local ffi = require('ffi')
+local bit = require('bit')
+local S = require('syscall')
+local M = {}
+
+ffi.cdef [[
+struct bpf {
+	/* Instruction classes */
+	static const int LD   = 0x00;
+	static const int LDX  = 0x01;
+	static const int ST   = 0x02;
+	static const int STX  = 0x03;
+	static const int ALU  = 0x04;
+	static const int JMP  = 0x05;
+	static const int ALU64 = 0x07;
+	/* ld/ldx fields */
+	static const int W    = 0x00;
+	static const int H    = 0x08;
+	static const int B    = 0x10;
+	static const int ABS  = 0x20;
+	static const int IND  = 0x40;
+	static const int MEM  = 0x60;
+	static const int LEN  = 0x80;
+	static const int MSH  = 0xa0;
+	/* alu/jmp fields */
+	static const int ADD  = 0x00;
+	static const int SUB  = 0x10;
+	static const int MUL  = 0x20;
+	static const int DIV  = 0x30;
+	static const int OR   = 0x40;
+	static const int AND  = 0x50;
+	static const int LSH  = 0x60;
+	static const int RSH  = 0x70;
+	static const int NEG  = 0x80;
+	static const int MOD  = 0x90;
+	static const int XOR  = 0xa0;
+	static const int JA   = 0x00;
+	static const int JEQ  = 0x10;
+	static const int JGT  = 0x20;
+	static const int JGE  = 0x30;
+	static const int JSET = 0x40;
+	static const int K    = 0x00;
+	static const int X    = 0x08;
+	static const int JNE  = 0x50;	/* jump != */
+	static const int JSGT = 0x60;	/* SGT is signed '>', GT in x86 */
+	static const int JSGE = 0x70;	/* SGE is signed '>=', GE in x86 */
+	static const int CALL = 0x80;	/* function call */
+	static const int EXIT = 0x90;	/* function return */
+	/* ld/ldx fields */
+	static const int DW    = 0x18;	/* double word */
+	static const int XADD  = 0xc0;	/* exclusive add */
+	/* alu/jmp fields */
+	static const int MOV   = 0xb0;	/* mov reg to reg */
+	static const int ARSH  = 0xc0;	/* sign extending arithmetic shift right */
+	/* change endianness of a register */
+	static const int END   = 0xd0;	/* flags for endianness conversion: */
+	static const int TO_LE = 0x00;	/* convert to little-endian */
+	static const int TO_BE = 0x08;	/* convert to big-endian */
+	/* misc */
+	static const int PSEUDO_MAP_FD = 0x01;
+	/* helper functions */
+	static const int F_CURRENT_CPU    = 0xffffffff;
+	static const int F_USER_STACK     = 1 << 8;
+	static const int F_FAST_STACK_CMP = 1 << 9;
+	static const int F_REUSE_STACKID  = 1 << 10;
+};
+/* eBPF commands */
+struct bpf_cmd {
+	static const int MAP_CREATE       = 0;
+	static const int MAP_LOOKUP_ELEM  = 1;
+	static const int MAP_UPDATE_ELEM  = 2;
+	static const int MAP_DELETE_ELEM  = 3;
+	static const int MAP_GET_NEXT_KEY = 4;
+	static const int PROG_LOAD        = 5;
+	static const int OBJ_PIN          = 6;
+	static const int OBJ_GET          = 7;
+};
+/* eBPF helpers */
+struct bpf_func_id {
+	static const int unspec               = 0;
+	static const int map_lookup_elem      = 1;
+	static const int map_update_elem      = 2;
+	static const int map_delete_elem      = 3;
+	static const int probe_read           = 4;
+	static const int ktime_get_ns         = 5;
+	static const int trace_printk         = 6;
+	static const int get_prandom_u32      = 7;
+	static const int get_smp_processor_id = 8;
+	static const int skb_store_bytes      = 9;
+	static const int l3_csum_replace      = 10;
+	static const int l4_csum_replace      = 11;
+	static const int tail_call            = 12;
+	static const int clone_redirect       = 13;
+	static const int get_current_pid_tgid = 14;
+	static const int get_current_uid_gid  = 15;
+	static const int get_current_comm     = 16;
+	static const int get_cgroup_classid   = 17;
+	static const int skb_vlan_push        = 18;
+	static const int skb_vlan_pop         = 19;
+	static const int skb_get_tunnel_key   = 20;
+	static const int skb_set_tunnel_key   = 21;
+	static const int perf_event_read      = 22;
+	static const int redirect             = 23;
+	static const int get_route_realm      = 24;
+	static const int perf_event_output    = 25;
+	static const int skb_load_bytes       = 26;
+	static const int get_stackid          = 27;
+};
+/* BPF_MAP_STACK_TRACE structures and constants */
+static const int BPF_MAX_STACK_DEPTH = 127;
+struct bpf_stacktrace {
+	uint64_t ip[BPF_MAX_STACK_DEPTH];
+};
+]]
+
+-- Compatibility: ljsyscall doesn't have support for BPF syscall
+-- Require bpf() support, then backfill enum values missing from older
+-- ljsyscall releases so the rest of the module can use them uniformly.
+if not S.bpf then
+	error("ljsyscall doesn't support bpf(), must be updated")
+else
+	-- Compatibility: ljsyscall<=0.12
+	if not S.c.BPF_MAP.PERCPU_HASH then
+		S.c.BPF_MAP.PERCPU_HASH  = 5
+		S.c.BPF_MAP.PERCPU_ARRAY = 6
+		S.c.BPF_MAP.STACK_TRACE  = 7
+		S.c.BPF_MAP.CGROUP_ARRAY = 8
+	end
+	if not S.c.BPF_PROG.TRACEPOINT then
+		S.c.BPF_PROG.TRACEPOINT  = 5
+	end
+end
+
+-- Compatibility: metatype for stacktrace
+-- Iterator over the ip[] array; stops at the first zero address or at the
+-- end of the fixed-size array, whichever comes first.
+local function stacktrace_iter(t, i)
+	i = i + 1
+	if i < #t and t.ip[i] > 0 then
+		return i, t.ip[i]
+	end
+end
+ffi.metatype('struct bpf_stacktrace', {
+	-- #t yields the ip[] array capacity (BPF_MAX_STACK_DEPTH)
+	__len = function (t) return ffi.sizeof(t.ip) / ffi.sizeof(t.ip[0]) end,
+	-- ipairs(t) starts at index 0 (iterator pre-increments from -1)
+	__ipairs = function (t) return stacktrace_iter, t, -1 end,
+})
+
+-- Reflect cdata type
+-- Return the C type name of a cdata value (text between '<' and '>' in
+-- tostring(ctype)), or nil for non-cdata input.
+function M.typename(v)
+	if not v or type(v) ~= 'cdata' then return nil end
+	return string.match(tostring(ffi.typeof(v)), '<([^>]+)')
+end
+
+-- Reflect if cdata type can be pointer (accepts array or pointer)
+-- With noarray=true only '*' types qualify; otherwise ']'-terminated
+-- (array) type names qualify too. Returns a truthy/falsy value.
+function M.isptr(v, noarray)
+	local ctname = M.typename(v)
+	if ctname then
+		-- Inspect only the last character of the type name
+		ctname = string.sub(ctname, -1)
+		ctname = ctname == '*' or (not noarray and ctname == ']')
+	end
+	return ctname
+end
+
+-- Return the running kernel version packed as 0xXXYYZZ (major/minor/patch),
+-- parsed from sysctl kernel.version with kernel.osrelease as fallback.
+function M.osversion()
+	-- We have no better way to extract current kernel hex-string other
+	-- than parsing headers, compiling a helper function or reading /proc
+	-- Dots are escaped (%.) so only a literal 'X.Y.Z' matches; a bare '.'
+	-- in a Lua pattern matches any character.
+	local ver_str, count = S.sysctl('kernel.version'):match('%d+%.%d+%.%d+'), 2
+	if not ver_str then -- kernel.version is freeform, fallback to kernel.osrelease
+		ver_str = S.sysctl('kernel.osrelease'):match('%d+%.%d+%.%d+')
+	end
+	local version = 0
+	for i in ver_str:gmatch('%d+') do -- Convert 'X.Y.Z' to 0xXXYYZZ
+		version = bit.bor(version, bit.lshift(tonumber(i), 8*count))
+		count = count - 1
+	end
+	return version
+end
+
+-- Wrap a perf reader in an object that yields decoded sample events.
+-- event_type (optional) is a C type name string; sample payloads are then
+-- cast to a pointer of that type, otherwise raw frames are returned.
+function M.event_reader(reader, event_type)
+	-- Caller can specify event message binary format
+	if event_type then
+		assert(type(event_type) == 'string' and ffi.typeof(event_type), 'not a valid type for event reader')
+		event_type = ffi.typeof(event_type .. '*') -- Convert type to pointer-to-type
+	end
+	-- Wrap reader in interface that can interpret read event messages
+	return setmetatable({reader=reader,type=event_type}, {__index = {
+		-- Block until the reader fd becomes readable
+		block = function(self)
+			return S.select { readfds = {reader.fd} }
+		end,
+		-- Return the next PERF_RECORD_SAMPLE frame (skipping other records)
+		next = function(self, k)
+			local len, ev = reader:next(k)
+			-- Filter out only sample frames
+			while ev and ev.type ~= S.c.PERF_RECORD.SAMPLE do
+				len, ev = reader:next(len)
+			end
+			if ev and event_type then
+				-- The perf event reader returns framed data with header and variable length
+				-- This is going skip the frame header and cast data to given type
+				ev = ffi.cast(event_type, ffi.cast('char *', ev) + ffi.sizeof('struct perf_event_header') + ffi.sizeof('uint32_t'))
+			end
+			return len, ev
+		end,
+		-- Generic-for iterator: for len, ev in r:read() do ... end
+		read = function(self)
+			return self.next, self, nil
+		end,
+	}})
+end
+
+-- Build an anonymous C struct declaration string from a tracepoint's
+-- debugfs format file (one member per 'field:' entry).
+function M.tracepoint_type(tp)
+	-- Read tracepoint format string
+	local fp = assert(io.open('/sys/kernel/debug/tracing/events/'..tp..'/format', 'r'))
+	local fmt = fp:read '*a'
+	fp:close()
+	-- Parse struct fields
+	local fields = {}
+	for f in fmt:gmatch 'field:([^;]+;)' do
+		table.insert(fields, f)
+	end
+	return string.format('struct { %s }', table.concat(fields))
+end
+
+return M
\ No newline at end of file
diff --git a/src/lua/bpf/elf.lua b/src/lua/bpf/elf.lua
new file mode 100644
index 0000000..6783827
--- /dev/null
+++ b/src/lua/bpf/elf.lua
@@ -0,0 +1,261 @@
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+-- This is a tiny wrapper over libelf to extract load address
+-- and offsets of dynamic symbols
+
+local S = require('syscall')
+local ffi = require('ffi')
+ffi.cdef [[
+/* Type for a 16-bit quantity.  */
+typedef uint16_t Elf32_Half;
+typedef uint16_t Elf64_Half;
+
+/* Types for signed and unsigned 32-bit quantities.  */
+typedef uint32_t Elf32_Word;
+typedef int32_t  Elf32_Sword;
+typedef uint32_t Elf64_Word;
+typedef int32_t  Elf64_Sword;
+
+/* Types for signed and unsigned 64-bit quantities.  */
+typedef uint64_t Elf32_Xword;
+typedef int64_t  Elf32_Sxword;
+typedef uint64_t Elf64_Xword;
+typedef int64_t  Elf64_Sxword;
+
+/* Type of addresses.  */
+typedef uint32_t Elf32_Addr;
+typedef uint64_t Elf64_Addr;
+
+/* Type of file offsets.  */
+typedef uint32_t Elf32_Off;
+typedef uint64_t Elf64_Off;
+
+/* Type for section indices, which are 16-bit quantities.  */
+typedef uint16_t Elf32_Section;
+typedef uint16_t Elf64_Section;
+
+/* Constants */
+struct Elf_Cmd
+{
+  static const int READ              = 1;
+  static const int RDWR              = 2;
+  static const int WRITE             = 3;
+  static const int CLR               = 4;
+  static const int SET               = 5;
+  static const int FDDONE            = 6;
+  static const int FDREAD            = 7;
+  static const int READ_MMAP         = 8;
+  static const int RDWR_MMAP         = 9;
+  static const int WRITE_MMAP        =10;
+  static const int READ_MMAP_PRIVATE =11;
+  static const int EMPTY             =12;
+  static const int NUM               =13;
+};
+
+/* Descriptor for the ELF file.  */
+typedef struct Elf Elf;
+/* Descriptor for ELF file section.  */
+typedef struct Elf_Scn Elf_Scn;
+/* Container type for metatable */
+struct Elf_object { int fd; Elf *elf; };
+/* Program segment header.  */
+typedef struct
+{
+  Elf64_Word    p_type;                 /* Segment type */
+  Elf64_Word    p_flags;                /* Segment flags */
+  Elf64_Off     p_offset;               /* Segment file offset */
+  Elf64_Addr    p_vaddr;                /* Segment virtual address */
+  Elf64_Addr    p_paddr;                /* Segment physical address */
+  Elf64_Xword   p_filesz;               /* Segment size in file */
+  Elf64_Xword   p_memsz;                /* Segment size in memory */
+  Elf64_Xword   p_align;                /* Segment alignment */
+} Elf64_Phdr;
+typedef Elf64_Phdr GElf_Phdr;
+/* Section header.  */
+typedef struct
+{
+  Elf64_Word    sh_name;                /* Section name (string tbl index) */
+  Elf64_Word    sh_type;                /* Section type */
+  Elf64_Xword   sh_flags;               /* Section flags */
+  Elf64_Addr    sh_addr;                /* Section virtual addr at execution */
+  Elf64_Off     sh_offset;              /* Section file offset */
+  Elf64_Xword   sh_size;                /* Section size in bytes */
+  Elf64_Word    sh_link;                /* Link to another section */
+  Elf64_Word    sh_info;                /* Additional section information */
+  Elf64_Xword   sh_addralign;           /* Section alignment */
+  Elf64_Xword   sh_entsize;             /* Entry size if section holds table */
+} Elf64_Shdr;
+typedef Elf64_Shdr GElf_Shdr;
+/* Descriptor for data to be converted to or from memory format.  */
+typedef struct
+{
+  void *d_buf;                  /* Pointer to the actual data.  */
+  int d_type;                   /* Type of this piece of data.  */
+  unsigned int d_version;       /* ELF version.  */
+  size_t d_size;                /* Size in bytes.  */
+  uint64_t d_off;               /* Offset into section.  */
+  size_t d_align;               /* Alignment in section.  */
+} Elf_Data;
+/* Symbol table entry.  */
+typedef struct
+{
+  Elf64_Word    st_name;                /* Symbol name (string tbl index) */
+  unsigned char st_info;                /* Symbol type and binding */
+  unsigned char st_other;               /* Symbol visibility */
+  Elf64_Section st_shndx;               /* Section index */
+  Elf64_Addr    st_value;               /* Symbol value */
+  Elf64_Xword   st_size;                /* Symbol size */
+} Elf64_Sym;
+typedef Elf64_Sym GElf_Sym;
+
+/* Coordinate ELF library and application versions.  */
+unsigned int elf_version (unsigned int __version);
+/* Return descriptor for ELF file to work according to CMD.  */
+Elf *elf_begin (int __fildes, int __cmd, Elf *__ref);
+/* Free resources allocated for ELF.  */
+int elf_end (Elf *__elf);
+/* Get the number of program headers in the ELF file.  If the file uses
+   more headers than can be represented in the e_phnum field of the ELF
+   header the information from the sh_info field in the zeroth section
+   header is used.  */
+int elf_getphdrnum (Elf *__elf, size_t *__dst);
+/* Retrieve program header table entry.  */
+GElf_Phdr *gelf_getphdr (Elf *__elf, int __ndx, GElf_Phdr *__dst);
+/* Retrieve section header.  */
+GElf_Shdr *gelf_getshdr (Elf_Scn *__scn, GElf_Shdr *__dst);
+/* Retrieve symbol information from the symbol table at the given index.  */
+GElf_Sym *gelf_getsym (Elf_Data *__data, int __ndx, GElf_Sym *__dst);
+/* Get section with next section index.  */
+Elf_Scn *elf_nextscn (Elf *__elf, Elf_Scn *__scn);
+/* Get data from section while translating from file representation
+   to memory representation.  */
+Elf_Data *elf_getdata (Elf_Scn *__scn, Elf_Data *__data);
+/* Return pointer to string at OFFSET in section INDEX.  */
+char *elf_strptr (Elf *__elf, size_t __index, size_t __offset);
+]]
+
+local elf = ffi.load('elf')
+local EV = { NONE=0, CURRENT=1, NUM=2 }
+local PT = { NULL=0, LOAD=1, DYNAMIC=2, INTERP=3, NOTE=4, SHLIB=5, PHDR=6, TLS=7, NUM=8 }
+local SHT = { NULL=0, PROGBITS=1, SYMTAB=2, STRTAB=3, RELA=4, HASH=5, DYNAMIC=6, NOTE=7,
+              NOBITS=8, REL=9, SHLIB=10, DYNSYM=11, INIT_ARRAY=14, FINI_ARRAY=15, PREINIT_ARRAY=16,
+              GROUP=17, SYMTAB_SHNDX=18, NUM=19 }
+local ELF_C = ffi.new('struct Elf_Cmd')
+local M = {}
+
+-- Optional poor man's C++ demangler
+local cpp_demangler = os.getenv('CPP_DEMANGLER')
+if not cpp_demangler then
+	for prefix in string.gmatch(os.getenv('PATH') or '', '[^;:]+') do
+		if S.statfs(prefix..'/c++filt') then
+			cpp_demangler = prefix..'/c++filt'
+			break
+		end
+	end
+end
+local cpp_demangle = function (name) return name end
+if cpp_demangler then
+	cpp_demangle = function (name)
+		local cmd = string.format('%s -p %s', cpp_demangler, name)
+		local fp = assert(io.popen(cmd, 'r'))
+		local output = fp:read('*all')
+		fp:close()
+		return output:match '^(.-)%s*$'
+	end
+end
+
+-- Metatable for ELF object
+ffi.metatype('struct Elf_object', {
+	__gc = function (t) t:close() end,
+	__index = {
+		close = function (t)
+			if t.elf ~= nil then
+				elf.elf_end(t.elf)
+				S.close(t.fd)
+				t.elf = nil
+			end
+		end,
+		-- Load library load address
+		loadaddr = function(t)
+			local phnum = ffi.new('size_t [1]')
+			if elf.elf_getphdrnum(t.elf, phnum) ~= 0 then
+				return nil, 'cannot get phdrnum'
+			end
+			local header = ffi.new('GElf_Phdr [1]')
+			for i = 0, tonumber(phnum[0])-1 do
+				if elf.gelf_getphdr(t.elf, i, header) ~= nil
+				   and header[0].p_type == PT.LOAD then
+				   return header[0].p_vaddr
+				end
+			end
+		end,
+		-- Resolve symbol address
+		resolve = function (t, k, pattern)
+			local section = elf.elf_nextscn(t.elf, nil)
+			while section ~= nil do
+				local header = ffi.new('GElf_Shdr [1]')
+				if elf.gelf_getshdr(section, header) ~= nil then
+					if header[0].sh_type == SHT.SYMTAB or header[0].sh_type == SHT.DYNSYM then
+						local data = elf.elf_getdata(section, nil)
+						while data ~= nil do
+							if data.d_size % header[0].sh_entsize > 0 then
+								return nil, 'bad section header entity size'
+							end
+							local symcount = tonumber(data.d_size / header[0].sh_entsize)
+							local sym = ffi.new('GElf_Sym [1]')
+							for i = 0, symcount - 1 do
+								if elf.gelf_getsym(data, i, sym) ~= nil then
+									local name = elf.elf_strptr(t.elf, header[0].sh_link, sym[0].st_name)
+									if name ~= nil then
+										-- Demangle C++ symbols if necessary
+										name = ffi.string(name)
+										if name:sub(1,2) == '_Z' then
+											name = cpp_demangle(name)
+										end
+										-- Match symbol name against pattern
+										if pattern and string.match(name, k) or k == name then
+											return sym[0]	
+										end
+									end
+								end
+							end
+							data = elf.elf_getdata(section, data)
+						end
+					end
+				end
+				section = elf.elf_nextscn(t.elf, section)
+			end
+		end,
+	}
+})
+
+-- Open an ELF object
+function M.open(path)
+	if elf.elf_version(EV.CURRENT) == EV.NONE then
+		return nil, 'bad version'
+	end
+	local fd, err = S.open(path, 'rdonly')
+	if not fd then return nil, err end
+	local pt = ffi.new('Elf *')
+	pt = elf.elf_begin(fd:getfd(), ELF_C.READ, pt)
+	if pt == nil then
+		fd:close()
+		return nil, 'cannot open elf object'
+	end
+	return ffi.new('struct Elf_object', fd:nogc():getfd(), pt)
+end
+
+return M
\ No newline at end of file
diff --git a/src/lua/bpf/init.lua b/src/lua/bpf/init.lua
new file mode 100644
index 0000000..1cccbd7
--- /dev/null
+++ b/src/lua/bpf/init.lua
@@ -0,0 +1,16 @@
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+return require('bpf.bpf')
diff --git a/src/lua/bpf/ljbytecode.lua b/src/lua/bpf/ljbytecode.lua
new file mode 100644
index 0000000..75d18e1
--- /dev/null
+++ b/src/lua/bpf/ljbytecode.lua
@@ -0,0 +1,72 @@
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+local jutil = require("jit.util")
+local vmdef = require("jit.vmdef")
+local bit = require('bit')
+local shr, band = bit.rshift, bit.band
+
+-- Decode LuaJIT 2.0 Byte Format
+-- Reference: http://wiki.luajit.org/Bytecode-2.0
+-- Thanks to LJ, we get code in portable bytecode with constants folded, basic
+-- virtual registers allocated etc.
+-- No SSA IR, type inference or advanced optimizations because the code wasn't traced yet.
+local function decode_ins(func, pc)
+	local ins, m = jutil.funcbc(func, pc)
+	if not ins then return nil end
+	local op, ma, mb, mc = band(ins, 0xff), band(m, 7), band(m, 15*8), band(m, 15*128)
+	local a, b, c, d = band(shr(ins, 8), 0xff), nil, nil, shr(ins, 16)
+	if mb ~= 0 then
+		d = band(d, 0xff)
+		b = shr(ins, 24)
+	end
+	if ma == 5 then          -- BCMuv
+	    a = jutil.funcuvname(func, a)
+	end
+	if mc == 13*128 then     -- BCMjump
+		c = pc+d-0x7fff
+	elseif mc == 9*128 then  -- BCMint
+		c = jutil.funck(func, d)
+	elseif mc == 10*128 then -- BCMstr
+		c = jutil.funck(func, -d-1)
+	elseif mc == 5*128 then  -- BCMuv
+	    c = jutil.funcuvname(func, d)
+	end
+	-- Convert version-specific opcode to string
+	op = 6*op
+	op = string.sub(vmdef.bcnames, op+1, op+6):match('[^%s]+')
+	return pc, op, a, b, c, d
+end
+
+-- Decoder closure
+local function decoder(func)
+	local pc = 0
+	return function ()
+		pc = pc + 1
+		return decode_ins(func, pc)
+	end
+end
+
+-- Hexdump generated code
+local function dump(func)
+	return require('jit.bc').dump(func)
+end
+
+return {
+	decode = decode_ins,
+	decoder = decoder,
+	dump = dump,
+	funcinfo = function (...) return jutil.funcinfo(...) end,
+}
\ No newline at end of file
diff --git a/src/lua/bpf/proto.lua b/src/lua/bpf/proto.lua
new file mode 100644
index 0000000..69b3fbf
--- /dev/null
+++ b/src/lua/bpf/proto.lua
@@ -0,0 +1,490 @@
+--[[
+Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]
+local ffi = require('ffi')
+local BPF = ffi.typeof('struct bpf')
+
+ffi.cdef [[
+struct sk_buff {
+	uint32_t len;
+	uint32_t pkt_type;
+	uint32_t mark;
+	uint32_t queue_mapping;
+	uint32_t protocol;
+	uint32_t vlan_present;
+	uint32_t vlan_tci;
+	uint32_t vlan_proto;
+	uint32_t priority;
+	uint32_t ingress_ifindex;
+	uint32_t ifindex;
+	uint32_t tc_index;
+	uint32_t cb[5];
+	uint32_t hash;
+	uint32_t tc_classid;
+};
+
+struct eth_t {
+	uint8_t  dst[6];
+	uint8_t  src[6];
+	uint16_t type;
+} __attribute__((packed));
+
+struct dot1q_t {
+	uint16_t pri:3;
+	uint16_t cfi:1;
+	uint16_t vlanid:12;
+	uint16_t type;
+} __attribute__((packed));
+
+struct arp_t {
+	uint16_t htype;
+	uint16_t ptype;
+	uint8_t  hlen;
+	uint8_t  plen;
+	uint16_t oper;
+	uint8_t  sha[6];
+	uint32_t spa;
+	uint8_t  tha[6];
+	uint32_t tpa;
+} __attribute__((packed));
+
+struct ip_t {
+	uint8_t  ver:4;
+	uint8_t  hlen:4;
+	uint8_t  tos;
+	uint16_t tlen;
+	uint16_t identification;
+	uint16_t ffo_unused:1;
+	uint16_t df:1;
+	uint16_t mf:1;
+	uint16_t foffset:13;
+	uint8_t  ttl;
+	uint8_t  proto;
+	uint16_t hchecksum;
+	uint32_t src;
+	uint32_t dst;
+} __attribute__((packed));
+
+struct icmp_t {
+	uint8_t  type;
+	uint8_t  code;
+	uint16_t checksum;
+} __attribute__((packed));
+
+struct ip6_t {
+	uint32_t ver:4;
+	uint32_t priority:8;
+	uint32_t flow_label:20;
+	uint16_t payload_len;
+	uint8_t  next_header;
+	uint8_t  hop_limit;
+	uint64_t src_hi;
+	uint64_t src_lo;
+	uint64_t dst_hi;
+	uint64_t dst_lo;
+} __attribute__((packed));
+
+struct ip6_opt_t {
+	uint8_t  next_header;
+	uint8_t  ext_len;
+	uint8_t  pad[6];
+} __attribute__((packed));
+
+struct icmp6_t {
+	uint8_t  type;
+	uint8_t  code;
+	uint16_t checksum;
+} __attribute__((packed));
+
+struct udp_t {
+	uint16_t src_port;
+	uint16_t dst_port;
+	uint16_t length;
+	uint16_t crc;
+} __attribute__((packed));
+
+struct tcp_t {
+	uint16_t src_port;
+	uint16_t dst_port;
+	uint32_t seq_num;
+	uint32_t ack_num;
+	uint8_t  offset:4;
+	uint8_t  reserved:4;
+	uint8_t  flag_cwr:1;
+	uint8_t  flag_ece:1;
+	uint8_t  flag_urg:1;
+	uint8_t  flag_ack:1;
+	uint8_t  flag_psh:1;
+	uint8_t  flag_rst:1;
+	uint8_t  flag_syn:1;
+	uint8_t  flag_fin:1;
+	uint16_t rcv_wnd;
+	uint16_t cksum;
+	uint16_t urg_ptr;
+} __attribute__((packed));
+
+struct vxlan_t {
+	uint32_t rsv1:4;
+	uint32_t iflag:1;
+	uint32_t rsv2:3;
+	uint32_t rsv3:24;
+	uint32_t key:24;
+	uint32_t rsv4:8;
+} __attribute__((packed));
+]]
+
+
+-- Architecture-specific ptrace register layout
+local S = require('syscall')
+local arch = S.abi.arch
+local parm_to_reg = {}
+if arch == 'x64' then
+	ffi.cdef [[
+	struct pt_regs {
+		unsigned long r15;
+		unsigned long r14;
+		unsigned long r13;
+		unsigned long r12;
+		unsigned long bp;
+		unsigned long bx;
+		unsigned long r11;
+		unsigned long r10;
+		unsigned long r9;
+		unsigned long r8;
+		unsigned long ax;
+		unsigned long cx;
+		unsigned long dx;
+		unsigned long si;
+		unsigned long di;
+		unsigned long orig_ax;
+		unsigned long ip;
+		unsigned long cs;
+		unsigned long flags;
+		unsigned long sp;
+		unsigned long ss;
+	};]]
+	parm_to_reg = {parm1='di', parm2='si', parm3='dx', parm4='cx', parm5='r8', ret='sp', fp='bp'}
+else
+	ffi.cdef 'struct pt_regs {};'
+end
+-- Map symbolic registers to architecture ABI
+ffi.metatype('struct pt_regs', {
+		__index = function (t,k)
+			return assert(parm_to_reg[k], 'no such register: '..k)
+		end,
+})
+
+local M = {}
+
+-- Dissector interface
+local function dissector(type, e, dst, src, field)
+	local parent = e.V[src].const
+	-- Create new dissector variable
+	e.vcopy(dst, src)
+	-- Compute and materialize new dissector offset from parent
+	e.V[dst].const = {off=e.V[src].const.off, __dissector=e.V[src].const.__dissector}
+	parent.__dissector[field](e, dst)
+	e.V[dst].const.__dissector = type
+end
+M.dissector = dissector
+
+-- Get current effective offset, load field value at an offset relative to it and
+-- add its value to compute next effective offset (e.g. udp_off = ip_off + pkt[ip_off].hlen)
+local function next_offset(e, var, type, off, mask, shift)
+	local d = e.V[var].const
+	-- Materialize relative offset value in R0
+	local dst_reg, tmp_reg
+	if d.off then
+		dst_reg = e.vreg(var, 0, true)
+		tmp_reg = dst_reg -- Use target register to avoid copy
+		e.emit(BPF.LD + BPF.ABS + e.const_width[ffi.sizeof(type)], tmp_reg, 0, 0, d.off + (off or 0))
+	else
+		tmp_reg = e.vreg(e.tmpvar, 0, true, type) -- Reserve R0 for temporary relative offset
+		dst_reg = e.vreg(var) -- Must rematerialize (if it was spilled by tmp var)
+		e.emit(BPF.LD + BPF.IND + e.const_width[ffi.sizeof(type)], tmp_reg, dst_reg, 0, off or 0)
+	end
+	-- Finalize relative offset
+	if mask then
+		e.emit(BPF.ALU + BPF.AND + BPF.K, tmp_reg, 0, 0, mask)
+	end	
+	if shift then
+		local op = BPF.LSH
+		if shift < 0 then
+			op = BPF.RSH
+			shift = -shift
+		end
+		e.emit(BPF.ALU + op + BPF.K, tmp_reg, 0, 0, shift)
+	end
+	-- Add to base offset to turn it into effective address
+	if dst_reg ~= tmp_reg then
+		e.emit(BPF.ALU + BPF.ADD + BPF.X, dst_reg, tmp_reg, 0, 0)
+	else
+		e.emit(BPF.ALU + BPF.ADD + BPF.K, dst_reg, 0, 0, d.off)
+	end
+	-- Discard temporary allocations
+	d.off = nil
+	e.V[e.tmpvar].reg = nil
+end
+
+local function next_skip(e, var, off)
+	local d = e.V[var].const
+	if not d.off then
+		local dst_reg = e.vreg(var)
+		e.emit(BPF.ALU64 + BPF.ADD + BPF.K, dst_reg, 0, 0, off)
+	else
+		d.off = d.off + off
+	end
+end
+
+local function skip_eth(e, dst)
+	-- IP starts right after ETH header (fixed size)
+	local d = e.V[dst].const
+	d.off = d.off + ffi.sizeof('struct eth_t')
+end
+
+-- Export types
+M.type = function(typestr, t)
+	t = t or {}
+	t.__dissector=ffi.typeof(typestr)
+	return t
+end
+M.skb     = M.type('struct sk_buff', {__base=true})
+M.pt_regs = M.type('struct pt_regs', {__base=true, source='probe'})
+M.pkt     = {off=0, __dissector=ffi.typeof('struct eth_t')} -- skb needs special accessors
+-- M.eth     = function (...) return dissector(ffi.typeof('struct eth_t'), ...) end
+M.dot1q   = function (...) return dissector(ffi.typeof('struct dot1q_t'), ...) end
+M.arp     = function (...) return dissector(ffi.typeof('struct arp_t'), ...) end
+M.icmp    = function (...) return dissector(ffi.typeof('struct icmp_t'), ...) end
+M.ip      = function (...) return dissector(ffi.typeof('struct ip_t'), ...) end
+M.icmp6   = function (...) return dissector(ffi.typeof('struct icmp6_t'), ...) end
+M.ip6     = function (...) return dissector(ffi.typeof('struct ip6_t'), ...) end
+M.ip6_opt = function (...) return dissector(ffi.typeof('struct ip6_opt_t'), ...) end
+M.udp     = function (...) return dissector(ffi.typeof('struct udp_t'), ...) end
+M.tcp     = function (...) return dissector(ffi.typeof('struct tcp_t'), ...) end
+M.vxlan   = function (...) return dissector(ffi.typeof('struct vxlan_t'), ...) end
+M.data    = function (...) return dissector(ffi.typeof('uint8_t'), ...) end
+
+-- Metatables
+ffi.metatype(ffi.typeof('struct eth_t'), {
+	__index = {
+		ip = skip_eth,
+		ip6 = skip_eth,
+	}
+})
+
+ffi.metatype(ffi.typeof('struct ip_t'), {
+	__index = {
+		-- Skip IP header length (stored as number of words)
+		-- e.g. hlen = 5, Header Length = 5 x sizeof(u32) = 20 octets
+		-- Mask first nibble and shift by 2 (multiplication by 4)
+		icmp = function(e, dst) next_offset(e, dst, ffi.typeof('uint8_t'), 0, 0x0f, 2) end,
+		udp  = function(e, dst) next_offset(e, dst, ffi.typeof('uint8_t'), 0, 0x0f, 2) end,
+		tcp  = function(e, dst) next_offset(e, dst, ffi.typeof('uint8_t'), 0, 0x0f, 2) end,
+	}
+})
+
+ffi.metatype(ffi.typeof('struct tcp_t'), {
+	__index = {
+		-- Skip TCP header length (stored as number of words)
+		-- e.g. hlen = 5, Header Length = 5 x sizeof(u32) = 20 octets
+		data = function(e, dst)
+			next_offset(e, dst, ffi.typeof('uint8_t'), ffi.offsetof('struct tcp_t', 'offset'), 0xf0, -2)
+		end,
+	}
+})
+
+ffi.metatype(ffi.typeof('struct udp_t'), {
+	__index = {
+		-- Skip UDP header length (8 octets)
+		data = function(e, dst)
+			next_skip(e, dst, ffi.sizeof('struct udp_t'))
+		end,
+	}
+})
+
+-- Constants
+M.c = {
+	eth = { -- Constants http://standards.ieee.org/regauth/ethertype
+		ip     = 0x0800, -- IP (v4) protocol
+		ip6    = 0x86dd, -- IP (v6) protocol
+		arp    = 0x0806, -- Address resolution protocol
+		revarp = 0x8035, -- Reverse addr resolution protocol
+		vlan   = 0x8100, -- IEEE 802.1Q VLAN tagging
+	},
+	ip = {
+		-- Reserved Addresses
+		addr_any         = 0x00000000, -- 0.0.0.0
+		addr_broadcast   = 0xffffffff, -- 255.255.255.255
+		addr_loopback    = 0x7f000001, -- 127.0.0.1
+		addr_mcast_all   = 0xe0000001, -- 224.0.0.1
+		addr_mcast_local = 0xe00000ff, -- 224.0.0.255
+		-- Type of service (ip_tos), RFC 1349 ("obsoleted by RFC 2474")
+		tos_default      = 0x00, -- default
+		tos_lowdelay     = 0x10, -- low delay
+		tos_throughput   = 0x08, -- high throughput
+		tos_reliability  = 0x04, -- high reliability
+		tos_lowcost      = 0x02, -- low monetary cost - XXX
+		tos_ect          = 0x02, -- ECN-capable transport
+		tos_ce           = 0x01, -- congestion experienced
+		-- Fragmentation flags (ip_off)
+		rf = 0x8000, -- reserved
+		df = 0x4000, -- don't fragment
+		mf = 0x2000, -- more fragments (not last frag)
+		offmask  = 0x1fff, -- mask for fragment offset
+		-- Time-to-live (ip_ttl), seconds
+		ttl_default = 64,  -- default ttl, RFC 1122, RFC 1340
+		ttl_max     = 255, -- maximum ttl
+		-- Protocol (ip_p) - http://www.iana.org/assignments/protocol-numbers
+		proto_ip      = 0,  -- dummy for IP
+		proto_hopopts = 0,  -- IPv6 hop-by-hop options
+		proto_icmp    = 1,  -- ICMP
+		proto_igmp    = 2,  -- IGMP
+		proto_ggp     = 3,  -- gateway-gateway protocol
+		proto_ipip    = 4,  -- IP in IP
+		proto_st      = 5,  -- ST datagram mode
+		proto_tcp     = 6,  -- TCP
+		proto_cbt     = 7,  -- CBT
+		proto_egp     = 8,  -- exterior gateway protocol
+		proto_igp     = 9,  -- interior gateway protocol
+		proto_bbnrcc  = 10,  -- BBN RCC monitoring
+		proto_nvp     = 11,  -- Network Voice Protocol
+		proto_pup     = 12,  -- PARC universal packet
+		proto_argus   = 13,  -- ARGUS
+		proto_emcon   = 14,  -- EMCON
+		proto_xnet    = 15,  -- Cross Net Debugger
+		proto_chaos   = 16,  -- Chaos
+		proto_udp     = 17,  -- UDP
+		proto_mux     = 18,  -- multiplexing
+		proto_dcnmeas = 19,  -- DCN measurement
+		proto_hmp     = 20,  -- Host Monitoring Protocol
+		proto_prm     = 21,  -- Packet Radio Measurement
+		proto_idp     = 22,  -- Xerox NS IDP
+		proto_trunk1  = 23,  -- Trunk-1
+		proto_trunk2  = 24,  -- Trunk-2
+		proto_leaf1   = 25,  -- Leaf-1
+		proto_leaf2   = 26,  -- Leaf-2
+		proto_rdp     = 27,  -- "Reliable Datagram" proto
+		proto_irtp    = 28,  -- Inet Reliable Transaction
+		proto_tp      = 29,  -- ISO TP class 4
+		proto_netblt  = 30,  -- Bulk Data Transfer
+		proto_mfpnsp  = 31,  -- MFE Network Services
+		proto_meritinp= 32,  -- Merit Internodal Protocol
+		proto_sep     = 33,  -- Sequential Exchange proto
+		proto_3pc     = 34,  -- Third Party Connect proto
+		proto_idpr    = 35,  -- Interdomain Policy Route
+		proto_xtp     = 36,  -- Xpress Transfer Protocol
+		proto_ddp     = 37,  -- Datagram Delivery Proto
+		proto_cmtp    = 38,  -- IDPR Ctrl Message Trans
+		proto_tppp    = 39,  -- TP++ Transport Protocol
+		proto_il      = 40,  -- IL Transport Protocol
+		proto_ip6     = 41,  -- IPv6
+		proto_sdrp    = 42,  -- Source Demand Routing
+		proto_routing = 43,  -- IPv6 routing header
+		proto_fragment= 44,  -- IPv6 fragmentation header
+		proto_rsvp    = 46,  -- Reservation protocol
+		proto_gre     = 47,  -- General Routing Encap
+		proto_mhrp    = 48,  -- Mobile Host Routing
+		proto_ena     = 49,  -- ENA
+		proto_esp     = 50,  -- Encap Security Payload
+		proto_ah      = 51,  -- Authentication Header
+		proto_inlsp   = 52,  -- Integrated Net Layer Sec
+		proto_swipe   = 53,  -- SWIPE
+		proto_narp    = 54,  -- NBMA Address Resolution
+		proto_mobile  = 55,  -- Mobile IP, RFC 2004
+		proto_tlsp    = 56,  -- Transport Layer Security
+		proto_skip    = 57,  -- SKIP
+		proto_icmp6   = 58,  -- ICMP for IPv6
+		proto_none    = 59,  -- IPv6 no next header
+		proto_dstopts = 60,  -- IPv6 destination options
+		proto_anyhost = 61,  -- any host internal proto
+		proto_cftp    = 62,  -- CFTP
+		proto_anynet  = 63,  -- any local network
+		proto_expak   = 64,  -- SATNET and Backroom EXPAK
+		proto_kryptolan = 65,  -- Kryptolan
+		proto_rvd     = 66,  -- MIT Remote Virtual Disk
+		proto_ippc    = 67,  -- Inet Pluribus Packet Core
+		proto_distfs  = 68,  -- any distributed fs
+		proto_satmon  = 69,  -- SATNET Monitoring
+		proto_visa    = 70,  -- VISA Protocol
+		proto_ipcv    = 71,  -- Inet Packet Core Utility
+		proto_cpnx    = 72,  -- Comp Proto Net Executive
+		proto_cphb    = 73,  -- Comp Protocol Heart Beat
+		proto_wsn     = 74,  -- Wang Span Network
+		proto_pvp     = 75,  -- Packet Video Protocol
+		proto_brsatmon= 76,  -- Backroom SATNET Monitor
+		proto_sunnd   = 77,  -- SUN ND Protocol
+		proto_wbmon   = 78,  -- WIDEBAND Monitoring
+		proto_wbexpak = 79,  -- WIDEBAND EXPAK
+		proto_eon     = 80,  -- ISO CNLP
+		proto_vmtp    = 81,  -- Versatile Msg Transport
+		proto_svmtp   = 82,  -- Secure VMTP
+		proto_vines   = 83,  -- VINES
+		proto_ttp     = 84,  -- TTP
+		proto_nsfigp  = 85,  -- NSFNET-IGP
+		proto_dgp     = 86,  -- Dissimilar Gateway Proto
+		proto_tcf     = 87,  -- TCF
+		proto_eigrp   = 88,  -- EIGRP
+		proto_ospf    = 89,  -- Open Shortest Path First
+		proto_spriterpc= 90,  -- Sprite RPC Protocol
+		proto_larp    = 91,  -- Locus Address Resolution
+		proto_mtp     = 92,  -- Multicast Transport Proto
+		proto_ax25    = 93,  -- AX.25 Frames
+		proto_ipipencap= 94,  -- yet-another IP encap
+		proto_micp    = 95,  -- Mobile Internet Ctrl
+		proto_sccsp   = 96,  -- Semaphore Comm Sec Proto
+		proto_etherip = 97,  -- Ethernet in IPv4
+		proto_encap   = 98,  -- encapsulation header
+		proto_anyenc  = 99,  -- private encryption scheme
+		proto_gmtp    = 100,  -- GMTP
+		proto_ifmp    = 101,  -- Ipsilon Flow Mgmt Proto
+		proto_pnni    = 102,  -- PNNI over IP
+		proto_pim     = 103,  -- Protocol Indep Multicast
+		proto_aris    = 104,  -- ARIS
+		proto_scps    = 105,  -- SCPS
+		proto_qnx     = 106,  -- QNX
+		proto_an      = 107,  -- Active Networks
+		proto_ipcomp  = 108,  -- IP Payload Compression
+		proto_snp     = 109,  -- Sitara Networks Protocol
+		proto_compaqpeer= 110,  -- Compaq Peer Protocol
+		proto_ipxip   = 111,  -- IPX in IP
+		proto_vrrp    = 112,  -- Virtual Router Redundancy
+		proto_pgm     = 113,  -- PGM Reliable Transport
+		proto_any0hop = 114,  -- 0-hop protocol
+		proto_l2tp    = 115,  -- Layer 2 Tunneling Proto
+		proto_ddx     = 116,  -- D-II Data Exchange (DDX)
+		proto_iatp    = 117,  -- Interactive Agent Xfer
+		proto_stp     = 118,  -- Schedule Transfer Proto
+		proto_srp     = 119,  -- SpectraLink Radio Proto
+		proto_uti     = 120,  -- UTI
+		proto_smp     = 121,  -- Simple Message Protocol
+		proto_sm      = 122,  -- SM
+		proto_ptp     = 123,  -- Performance Transparency
+		proto_isis    = 124,  -- ISIS over IPv4
+		proto_fire    = 125,  -- FIRE
+		proto_crtp    = 126,  -- Combat Radio Transport
+		proto_crudp   = 127,  -- Combat Radio UDP
+		proto_sscopmce= 128,  -- SSCOPMCE
+		proto_iplt    = 129,  -- IPLT
+		proto_sps     = 130,  -- Secure Packet Shield
+		proto_pipe    = 131,  -- Private IP Encap in IP
+		proto_sctp    = 132,  -- Stream Ctrl Transmission
+		proto_fc      = 133,  -- Fibre Channel
+		proto_rsvpign = 134,  -- RSVP-E2E-IGNORE
+		proto_raw     = 255,  -- Raw IP packets
+		proto_reserved= 255,  -- Reserved
+	},
+}
+
+return M
\ No newline at end of file
diff --git a/src/lua/squishy b/src/lua/squishy
index bf83d79..a642005 100644
--- a/src/lua/squishy
+++ b/src/lua/squishy
@@ -13,5 +13,13 @@
 Module "bcc.table" "bcc/table.lua"
 Module "bcc.usdt" "bcc/usdt.lua"
 
+Module "bpf" "bpf/init.lua"
+Module "bpf.bpf" "bpf/bpf.lua"
+Module "bpf.builtins" "bpf/builtins.lua"
+Module "bpf.cdef" "bpf/cdef.lua"
+Module "bpf.elf" "bpf/elf.lua"
+Module "bpf.ljbytecode" "bpf/ljbytecode.lua"
+Module "bpf.proto" "bpf/proto.lua"
+
 Main "bcc/run.lua"
 Output "bcc.lua"
diff --git a/src/python/bcc/__init__.py b/src/python/bcc/__init__.py
index 1c60d1b..3d258a6 100644
--- a/src/python/bcc/__init__.py
+++ b/src/python/bcc/__init__.py
@@ -25,7 +25,7 @@
 import sys
 basestring = (unicode if sys.version_info[0] < 3 else str)
 
-from .libbcc import lib, _CB_TYPE, bcc_symbol
+from .libbcc import lib, _CB_TYPE, bcc_symbol, _SYM_CB_TYPE
 from .table import Table
 from .perf import Perf
 from .usyms import ProcessSymbols
@@ -531,8 +531,25 @@
         res = lib.bcc_procutils_which_so(libname.encode("ascii"))
         return res if res is None else res.decode()
 
-    def attach_tracepoint(self, tp="", fn_name="", pid=-1, cpu=0, group_fd=-1):
-        """attach_tracepoint(tp="", fn_name="", pid=-1, cpu=0, group_fd=-1)
+    def _get_tracepoints(self, tp_re):
+        results = []
+        events_dir = os.path.join(TRACEFS, "events")
+        for category in os.listdir(events_dir):
+            cat_dir = os.path.join(events_dir, category)
+            if not os.path.isdir(cat_dir):
+                continue
+            for event in os.listdir(cat_dir):
+                evt_dir = os.path.join(cat_dir, event)
+                if os.path.isdir(evt_dir):
+                    tp = ("%s:%s" % (category, event))
+                    if re.match(tp_re, tp):
+                        results.append(tp)
+        return results
+
+    def attach_tracepoint(self, tp="", tp_re="", fn_name="", pid=-1,
+                          cpu=0, group_fd=-1):
+        """attach_tracepoint(tp="", tp_re="", fn_name="", pid=-1,
+                             cpu=0, group_fd=-1)
 
         Run the bpf function denoted by fn_name every time the kernel tracepoint
         specified by 'tp' is hit. The optional parameters pid, cpu, and group_fd
@@ -540,12 +557,24 @@
         the tracepoint category and the tracepoint name, separated by a colon.
         For example: sched:sched_switch, syscalls:sys_enter_bind, etc.
 
+        Instead of a tracepoint name, a regular expression can be provided in
+        tp_re. The program will then attach to tracepoints that match the
+        provided regular expression.
+
         To obtain a list of kernel tracepoints, use the tplist tool or cat the
         file /sys/kernel/debug/tracing/available_events.
 
-        Example: BPF(text).attach_tracepoint("sched:sched_switch", "on_switch")
+        Examples:
+            BPF(text).attach_tracepoint(tp="sched:sched_switch", fn_name="on_switch")
+            BPF(text).attach_tracepoint(tp_re="sched:.*", fn_name="on_switch")
         """
 
+        if tp_re:
+            for tp in self._get_tracepoints(tp_re):
+                self.attach_tracepoint(tp=tp, fn_name=fn_name, pid=pid,
+                                       cpu=cpu, group_fd=group_fd)
+            return
+
         fn = self.load_func(fn_name, BPF.TRACEPOINT)
         (tp_category, tp_name) = tp.split(':')
         res = lib.bpf_attach_tracepoint(fn.fd, tp_category.encode("ascii"),
@@ -586,9 +615,29 @@
         del self.open_uprobes[name]
         _num_open_probes -= 1
 
-    def attach_uprobe(self, name="", sym="", addr=None,
+    def _get_user_functions(self, name, sym_re):
+        """
+        We are returning addresses here instead of symbol names because it
+        turns out that the same name may appear multiple times with different
+        addresses, and the same address may appear multiple times with the same
+        name. We can't attach a uprobe to the same address more than once, so
+        it makes sense to return the unique set of addresses that are mapped to
+        a symbol that matches the provided regular expression.
+        """
+        addresses = []
+        def sym_cb(sym_name, addr):
+            if re.match(sym_re, sym_name) and addr not in addresses:
+                addresses.append(addr)
+            return 0
+
+        res = lib.bcc_foreach_symbol(name.encode("ascii"), _SYM_CB_TYPE(sym_cb))
+        if res < 0:
+            raise Exception("Error %d enumerating symbols in %s" % (res, name))
+        return addresses
+
+    def attach_uprobe(self, name="", sym="", sym_re="", addr=None,
             fn_name="", pid=-1, cpu=0, group_fd=-1):
-        """attach_uprobe(name="", sym="", addr=None, fn_name=""
+        """attach_uprobe(name="", sym="", sym_re="", addr=None, fn_name=""
                          pid=-1, cpu=0, group_fd=-1)
 
         Run the bpf function denoted by fn_name every time the symbol sym in
@@ -596,6 +645,10 @@
         be supplied in place of sym. Optional parameters pid, cpu, and group_fd
         can be used to filter the probe.
 
+        Instead of a symbol name, a regular expression can be provided in
+        sym_re. The uprobe will then attach to symbols that match the provided
+        regular expression.
+
         Libraries can be given in the name argument without the lib prefix, or
         with the full path (/usr/lib/...). Binaries can be given only with the
         full path (/bin/sh).
@@ -605,6 +658,14 @@
         """
 
         name = str(name)
+
+        if sym_re:
+            for sym_addr in self._get_user_functions(name, sym_re):
+                self.attach_uprobe(name=name, addr=sym_addr,
+                                   fn_name=fn_name, pid=pid, cpu=cpu,
+                                   group_fd=group_fd)
+            return
+
         (path, addr) = BPF._check_path_symbol(name, sym, addr)
 
         self._check_probe_quota(1)
@@ -639,9 +700,9 @@
             raise Exception("Failed to detach BPF from uprobe")
         self._del_uprobe(ev_name)
 
-    def attach_uretprobe(self, name="", sym="", addr=None,
+    def attach_uretprobe(self, name="", sym="", sym_re="", addr=None,
             fn_name="", pid=-1, cpu=0, group_fd=-1):
-        """attach_uretprobe(name="", sym="", addr=None, fn_name=""
+        """attach_uretprobe(name="", sym="", sym_re="", addr=None, fn_name=""
                             pid=-1, cpu=0, group_fd=-1)
 
         Run the bpf function denoted by fn_name every time the symbol sym in
@@ -649,6 +710,13 @@
         meaning of additional parameters.
         """
 
+        if sym_re:
+            for sym_addr in self._get_user_functions(name, sym_re):
+                self.attach_uretprobe(name=name, addr=sym_addr,
+                                      fn_name=fn_name, pid=pid, cpu=cpu,
+                                      group_fd=group_fd)
+            return
+
         name = str(name)
         (path, addr) = BPF._check_path_symbol(name, sym, addr)
 
@@ -799,6 +867,17 @@
         return name
 
     @staticmethod
+    def symaddr(addr, pid):
+        """symaddr(addr, pid)
+
+        Translate a memory address into a function name plus the instruction
+        offset as a hexadecimal number, which is returned as a string.
+        A pid of less than zero will access the kernel symbol cache.
+        """
+        name, offset = BPF._sym_cache(pid).resolve(addr)
+        return "%s+0x%x" % (name, offset)
+
+    @staticmethod
     def ksym(addr):
         """ksym(addr)
 
@@ -815,8 +894,7 @@
         instruction offset as a hexidecimal number, which is returned as a
         string.
         """
-        name, offset = BPF._sym_cache(-1).resolve(addr)
-        return "%s+0x%x" % (name, offset)
+        return BPF.symaddr(addr, -1)
 
     @staticmethod
     def ksymname(name):
@@ -835,6 +913,20 @@
         """
         return len([k for k in self.open_kprobes.keys() if isinstance(k, str)])
 
+    def num_open_uprobes(self):
+        """num_open_uprobes()
+
+        Get the number of open U[ret]probes.
+        """
+        return len(self.open_uprobes)
+
+    def num_open_tracepoints(self):
+        """num_open_tracepoints()
+
+        Get the number of open tracepoints.
+        """
+        return len(self.open_tracepoints)
+
     def kprobe_poll(self, timeout = -1):
         """kprobe_poll(self)
 
diff --git a/src/python/bcc/libbcc.py b/src/python/bcc/libbcc.py
index 868b994..7dfc2a2 100644
--- a/src/python/bcc/libbcc.py
+++ b/src/python/bcc/libbcc.py
@@ -129,6 +129,10 @@
 lib.bcc_resolve_symname.argtypes = [
     ct.c_char_p, ct.c_char_p, ct.c_ulonglong, ct.POINTER(bcc_symbol)]
 
+_SYM_CB_TYPE = ct.CFUNCTYPE(ct.c_int, ct.c_char_p, ct.c_ulonglong)
+lib.bcc_foreach_symbol.restype = ct.c_int
+lib.bcc_foreach_symbol.argtypes = [ct.c_char_p, _SYM_CB_TYPE]
+
 lib.bcc_symcache_new.restype = ct.c_void_p
 lib.bcc_symcache_new.argtypes = [ct.c_int]
 
@@ -170,11 +174,41 @@
             ('num_arguments', ct.c_int),
         ]
 
+class bcc_usdt_location(ct.Structure):
+    _fields_ = [
+            ('address', ct.c_ulonglong)
+        ]
+
+class BCC_USDT_ARGUMENT_FLAGS(object):
+    NONE = 0x0
+    CONSTANT = 0x1
+    DEREF_OFFSET = 0x2
+    DEREF_IDENT = 0x4
+    REGISTER_NAME = 0x8
+
+class bcc_usdt_argument(ct.Structure):
+    _fields_ = [
+            ('size', ct.c_int),
+            ('valid', ct.c_int),
+            ('constant', ct.c_int),
+            ('deref_offset', ct.c_int),
+            ('deref_ident', ct.c_char_p),
+            ('register_name', ct.c_char_p)
+        ]
+
 _USDT_CB = ct.CFUNCTYPE(None, ct.POINTER(bcc_usdt))
 
 lib.bcc_usdt_foreach.restype = None
 lib.bcc_usdt_foreach.argtypes = [ct.c_void_p, _USDT_CB]
 
+lib.bcc_usdt_get_location.restype = ct.c_int
+lib.bcc_usdt_get_location.argtypes = [ct.c_void_p, ct.c_char_p, ct.c_int,
+                                      ct.POINTER(bcc_usdt_location)]
+
+lib.bcc_usdt_get_argument.restype = ct.c_int
+lib.bcc_usdt_get_argument.argtypes = [ct.c_void_p, ct.c_char_p, ct.c_int,
+                                      ct.c_int, ct.POINTER(bcc_usdt_argument)]
+
 _USDT_PROBE_CB = ct.CFUNCTYPE(None, ct.c_char_p, ct.c_char_p,
                               ct.c_ulonglong, ct.c_int)
 
diff --git a/src/python/bcc/table.py b/src/python/bcc/table.py
index f34cb7b..d395fce 100644
--- a/src/python/bcc/table.py
+++ b/src/python/bcc/table.py
@@ -243,16 +243,17 @@
         return next_key
 
     def print_log2_hist(self, val_type="value", section_header="Bucket ptr",
-            section_print_fn=None):
+            section_print_fn=None, bucket_fn=None):
         """print_log2_hist(val_type="value", section_header="Bucket ptr",
-                           section_print_fn=None)
+                           section_print_fn=None, bucket_fn=None)
 
         Prints a table as a log2 histogram. The table must be stored as
         log2. The val_type argument is optional, and is a column header.
         If the histogram has a secondary key, multiple tables will print
         and section_header can be used as a header description for each.
         If section_print_fn is not None, it will be passed the bucket value
-        to format into a string as it sees fit.
+        to format into a string as it sees fit. If bucket_fn is not None,
+        it will be used to produce a bucket value for the histogram keys.
         """
         if isinstance(self.Key(), ct.Structure):
             tmp = {}
@@ -260,6 +261,8 @@
             f2 = self.Key._fields_[1][0]
             for k, v in self.items():
                 bucket = getattr(k, f1)
+                if bucket_fn:
+                    bucket = bucket_fn(bucket)
                 vals = tmp[bucket] = tmp.get(bucket, [0] * 65)
                 slot = getattr(k, f2)
                 vals[slot] = v.value
diff --git a/src/python/bcc/usdt.py b/src/python/bcc/usdt.py
index 45bcfc8..adcd3d7 100644
--- a/src/python/bcc/usdt.py
+++ b/src/python/bcc/usdt.py
@@ -12,25 +12,100 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .libbcc import lib, _USDT_CB, _USDT_PROBE_CB
+import ctypes as ct
+from .libbcc import lib, _USDT_CB, _USDT_PROBE_CB, \
+                    bcc_usdt_location, bcc_usdt_argument, \
+                    BCC_USDT_ARGUMENT_FLAGS
 
-class USDTProbe(object):
-    def __init__(self, usdt):
-        self.provider = usdt.provider
-        self.name = usdt.name
-        self.bin_path = usdt.bin_path
-        self.semaphore = usdt.semaphore
-        self.num_locations = usdt.num_locations
-        self.num_arguments = usdt.num_arguments
+class USDTProbeArgument(object):
+    def __init__(self, argument):
+        self.signed = argument.size < 0
+        self.size = abs(argument.size)
+        self.valid = argument.valid
+        if self.valid & BCC_USDT_ARGUMENT_FLAGS.CONSTANT != 0:
+            self.constant = argument.constant
+        if self.valid & BCC_USDT_ARGUMENT_FLAGS.DEREF_OFFSET != 0:
+            self.deref_offset = argument.deref_offset
+        if self.valid & BCC_USDT_ARGUMENT_FLAGS.DEREF_IDENT != 0:
+            self.deref_ident = argument.deref_ident
+        if self.valid & BCC_USDT_ARGUMENT_FLAGS.REGISTER_NAME != 0:
+            self.register_name = argument.register_name
+
+    def _size_prefix(self):
+        return "%d %s bytes" % \
+                (self.size, "signed  " if self.signed else "unsigned")
+
+    def _format(self):
+        # This mimics the logic in cc/usdt_args.cc that gives meaning to the
+        # various argument settings. A change there will require a change here.
+        if self.valid & BCC_USDT_ARGUMENT_FLAGS.CONSTANT != 0:
+            return "%d" % self.constant
+        if self.valid & BCC_USDT_ARGUMENT_FLAGS.DEREF_OFFSET == 0:
+            return "%s" % self.register_name
+        if self.valid & BCC_USDT_ARGUMENT_FLAGS.DEREF_OFFSET != 0 and \
+           self.valid & BCC_USDT_ARGUMENT_FLAGS.DEREF_IDENT == 0:
+            sign = '+' if self.deref_offset >= 0 else '-'
+            return "*(%s %s %d)" % (self.register_name,
+                                    sign, abs(self.deref_offset))
+        if self.valid & BCC_USDT_ARGUMENT_FLAGS.DEREF_OFFSET != 0 and \
+           self.valid & BCC_USDT_ARGUMENT_FLAGS.DEREF_IDENT != 0 and \
+           self.valid & BCC_USDT_ARGUMENT_FLAGS.REGISTER_NAME != 0 and \
+           self.register_name == "ip":
+            sign = '+' if self.deref_offset >= 0 else '-'
+            return "*(&%s %s %d)" % (self.deref_ident,
+                                     sign, abs(self.deref_offset))
+        # If we got here, this is an unrecognized case. Doesn't mean it's
+        # necessarily bad, so just provide the raw data. It just means that
+        # other tools won't be able to work with this argument.
+        return "unrecognized argument format, flags %d" % self.valid
 
     def __str__(self):
-        return "%s %s:%s [sema 0x%x]\n  %d location(s)\n  %d argument(s)" % \
-               (self.bin_path, self.provider, self.name, self.semaphore,
-                self.num_locations, self.num_arguments)
+        return "%s @ %s" % (self._size_prefix(), self._format())
+
+class USDTProbeLocation(object):
+    def __init__(self, probe, index, location):
+        self.probe = probe
+        self.index = index
+        self.num_arguments = probe.num_arguments
+        self.address = location.address
+
+    def __str__(self):
+        return "0x%x" % self.address
+
+    def get_argument(self, index):
+        arg = bcc_usdt_argument()
+        res = lib.bcc_usdt_get_argument(self.probe.context, self.probe.name,
+                                        self.index, index, ct.pointer(arg))
+        if res != 0:
+            raise Exception("error retrieving probe argument %d location %d" %
+                            (index, self.index))
+        return USDTProbeArgument(arg)
+
+class USDTProbe(object):
+    def __init__(self, context, probe):
+        self.context = context
+        self.provider = probe.provider
+        self.name = probe.name
+        self.bin_path = probe.bin_path
+        self.semaphore = probe.semaphore
+        self.num_locations = probe.num_locations
+        self.num_arguments = probe.num_arguments
+
+    def __str__(self):
+        return "%s %s:%s [sema 0x%x]" % \
+               (self.bin_path, self.provider, self.name, self.semaphore)
 
     def short_name(self):
         return "%s:%s" % (self.provider, self.name)
 
+    def get_location(self, index):
+        loc = bcc_usdt_location()
+        res = lib.bcc_usdt_get_location(self.context, self.name,
+                                        index, ct.pointer(loc))
+        if res != 0:
+            raise Exception("error retrieving probe location %d" % index)
+        return USDTProbeLocation(self, index, loc)
+
 class USDT(object):
     def __init__(self, pid=None, path=None):
         if pid and pid != -1:
@@ -62,7 +137,7 @@
     def enumerate_probes(self):
         probes = []
         def _add_probe(probe):
-            probes.append(USDTProbe(probe.contents))
+            probes.append(USDTProbe(self.context, probe.contents))
 
         lib.bcc_usdt_foreach(self.context, _USDT_CB(_add_probe))
         return probes
diff --git a/tests/cc/test_usdt_args.cc b/tests/cc/test_usdt_args.cc
index 57ccb4f..ba91b42 100644
--- a/tests/cc/test_usdt_args.cc
+++ b/tests/cc/test_usdt_args.cc
@@ -48,12 +48,25 @@
 }
 
 TEST_CASE("test usdt argument parsing", "[usdt]") {
+  SECTION("parse failure") {
+    USDT::ArgumentParser_x64 parser("4@i%ra+1r");
+    USDT::Argument arg;
+    REQUIRE(!parser.parse(&arg));
+    int i;
+    for (i = 0; i < 10 && !parser.done(); ++i) {
+      parser.parse(&arg);
+    }
+    // Make sure we reach termination
+    REQUIRE(i < 10);
+  }
   SECTION("argument examples from the Python implementation") {
     USDT::ArgumentParser_x64 parser(
         "-4@$0 8@$1234 %rdi %rax %rsi "
         "-8@%rbx 4@%r12 8@-8(%rbp) 4@(%rax) "
         "-4@global_max_action(%rip) "
-        "8@24+mp_(%rip) ");
+        "8@24+mp_(%rip) "
+        "-4@CheckpointStats+40(%rip) "
+        "4@glob-2(%rip) ");
 
     verify_register(parser, -4, 0);
     verify_register(parser, 8, 1234);
@@ -69,6 +82,8 @@
 
     verify_register(parser, -4, "ip", 0, std::string("global_max_action"));
     verify_register(parser, 8, "ip", 24, std::string("mp_"));
+    verify_register(parser, -4, "ip", 40, std::string("CheckpointStats"));
+    verify_register(parser, 4, "ip", -2, std::string("glob"));
 
     REQUIRE(parser.done());
   }
diff --git a/tests/lua/.busted b/tests/lua/.busted
new file mode 100644
index 0000000..5a83208
--- /dev/null
+++ b/tests/lua/.busted
@@ -0,0 +1,8 @@
+-- Configuration for unit tests
+-- See: http://olivinelabs.com/busted/ 
+return {
+	default = {
+		lpath = "./?.lua",
+		["auto-insulate"] = false,
+	}
+}
\ No newline at end of file
diff --git a/tests/lua/.luacheckrc b/tests/lua/.luacheckrc
new file mode 100644
index 0000000..407cdbe
--- /dev/null
+++ b/tests/lua/.luacheckrc
@@ -0,0 +1,12 @@
+std = "luajit"
+ignore = { "211", "212", "411", "412", "421", "431", "542" }
+files["examples"] = {
+	new_globals = { "pkt", "time", "xadd", "c" }
+}
+files["bpf/builtins.lua"] = {
+	ignore = { "122" }
+}
+files["spec"] = {
+	std = "+busted",
+	new_globals = { "pkt", "time", "xadd", "c" }
+}
\ No newline at end of file
diff --git a/tests/lua/CMakeLists.txt b/tests/lua/CMakeLists.txt
index 0a01bfc..d3d7298 100644
--- a/tests/lua/CMakeLists.txt
+++ b/tests/lua/CMakeLists.txt
@@ -1,4 +1,5 @@
 find_program(LUAJIT luajit)
+find_program(BUSTED busted)
 
 if(LUAJIT)
 	add_test(NAME lua_test_clang WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
@@ -12,4 +13,9 @@
 
 	add_test(NAME lua_test_standalone WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
 		COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/test_standalone.sh)
+
+	if(BUSTED)
+		add_test(NAME lua_test_busted WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+			COMMAND busted --lua=${LUAJIT} -m "${CMAKE_CURRENT_SOURCE_DIR}/../../src/lua/?.lua" -m "${CMAKE_CURRENT_SOURCE_DIR}/../../src/lua/?/init.lua;")
+	endif()
 endif()
diff --git a/tests/lua/spec/README.md b/tests/lua/spec/README.md
new file mode 100644
index 0000000..e19305c
--- /dev/null
+++ b/tests/lua/spec/README.md
@@ -0,0 +1,5 @@
+# Unit test specs
+
+This directory contains spec files for Lua BPF in [Busted] unit test format.
+
+[Busted]: http://olivinelabs.com/busted/
diff --git a/tests/lua/spec/compile_spec.lua b/tests/lua/spec/compile_spec.lua
new file mode 100644
index 0000000..823a2e5
--- /dev/null
+++ b/tests/lua/spec/compile_spec.lua
@@ -0,0 +1,23 @@
+describe('compile', function()
+	local ffi = require('ffi')
+	local bpf = require('bpf')
+
+	it('can compile socket filter', function()
+		-- Create mock BPF map
+		local mock_map = {
+			max_entries = 16,
+			key_type = ffi.typeof('uint64_t [1]'),
+			val_type = ffi.typeof('uint64_t [1]'),
+			fd = 1,
+			__map = true,
+		}
+		-- Compile small code example
+		local code = bpf(function ()
+		   local proto = pkt.ip.proto
+		   xadd(mock_map[proto], 1)
+		end)
+		assert.truthy(code)
+		assert.same(type(code), 'table')
+		assert.same(code.pc, 15)
+	end)
+end)
diff --git a/tests/lua/spec/decoder_spec.lua b/tests/lua/spec/decoder_spec.lua
new file mode 100644
index 0000000..a175879
--- /dev/null
+++ b/tests/lua/spec/decoder_spec.lua
@@ -0,0 +1,31 @@
+describe('decoder', function()
+
+	-- Decode simple function
+	local bytecode = require('bpf.ljbytecode')
+	local f = function (x) return x + 1 end
+
+	it('should decode functions', function()
+		-- Make sure it calls LJ decoder
+		local bc = bytecode.decoder(f)
+		assert.truthy(bc)
+		-- Decode bytecode bytecode to instructions
+		local jutil = require("jit.util")
+		spy.on(jutil, 'funcbc')
+		local pc, op = bc()
+		-- Check bytecode for sanity (starts with ADDVN(x, 1))
+		assert.equal(pc, 1)
+		assert.equal(op, 'ADDVN')
+		for pc, op in bc do
+			assert.truthy(pc and op)
+		end
+		assert.spy(jutil.funcbc).was.called()
+	end)
+	it('should fail on bad input', function()
+		assert.has_error(function() bytecode.decoder(nil)() end)
+		assert.has_error(function() bytecode.decoder(5)() end)
+		assert.has_error(function() bytecode.decoder('test')() end)
+	end)
+	it('should dump bytecode', function()
+		bytecode.dump(f)
+	end)
+end)
diff --git a/tests/lua/spec/elf_spec.lua b/tests/lua/spec/elf_spec.lua
new file mode 100644
index 0000000..0be050d
--- /dev/null
+++ b/tests/lua/spec/elf_spec.lua
@@ -0,0 +1,24 @@
+describe('elf reader', function()
+
+	local ok, elf = pcall(require, 'bpf.elf')
+	if not ok then return end
+
+	it('should handle C library', function()
+		-- Open libc
+		local sh = elf.open('/bin/sh')
+		assert.truthy(sh)
+		-- Find load address
+		local base = sh:loadaddr()
+		assert.truthy(base)
+		-- Find something from ISO C
+		local malloc_addr = sh:resolve('malloc')
+		assert.truthy(malloc_addr)
+		-- Find something that doesn't exist
+		local bad_addr = sh:resolve('thisnotexists')
+		assert.falsy(bad_addr)
+	end)
+	it('should fail on bad input', function()
+		assert.falsy(elf.open(nil))
+		assert.falsy(elf.open('/tmp'):loadaddr())
+	end)
+end)
diff --git a/tools/btrfsslower.py b/tools/btrfsslower.py
index 111076c..0478fe1 100755
--- a/tools/btrfsslower.py
+++ b/tools/btrfsslower.py
@@ -22,6 +22,7 @@
 # Licensed under the Apache License, Version 2.0 (the "License")
 #
 # 15-Feb-2016   Brendan Gregg   Created this.
+# 16-Oct-2016   Dina Goldshtein -p to filter by process ID.
 
 from __future__ import print_function
 from bcc import BPF
@@ -87,7 +88,7 @@
     char file[DNAME_INLINE_LEN];
 };
 
-BPF_HASH(entryinfo, pid_t, struct val_t);
+BPF_HASH(entryinfo, u64, struct val_t);
 BPF_PERF_OUTPUT(events);
 
 //
@@ -99,8 +100,9 @@
 // I do by checking file->f_op.
 int trace_read_entry(struct pt_regs *ctx, struct kiocb *iocb)
 {
-    u32 pid;
-    pid = bpf_get_current_pid_tgid();
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
     if (FILTER_PID)
         return 0;
 
@@ -115,7 +117,7 @@
     val.fp = fp;
     val.offset = iocb->ki_pos;
     if (val.fp)
-        entryinfo.update(&pid, &val);
+        entryinfo.update(&id, &val);
 
     return 0;
 }
@@ -123,18 +125,19 @@
 // btrfs_file_write_iter():
 int trace_write_entry(struct pt_regs *ctx, struct kiocb *iocb)
 {
-    u32 pid;
-    pid = bpf_get_current_pid_tgid();
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
     if (FILTER_PID)
         return 0;
 
-    // store filep and timestamp by pid
+    // store filep and timestamp by id
     struct val_t val = {};
     val.ts = bpf_ktime_get_ns();
     val.fp = iocb->ki_filp;
     val.offset = iocb->ki_pos;
     if (val.fp)
-        entryinfo.update(&pid, &val);
+        entryinfo.update(&id, &val);
 
     return 0;
 }
@@ -144,8 +147,9 @@
 int trace_open_entry(struct pt_regs *ctx, struct inode *inode,
     struct file *file)
 {
-    u32 pid;
-    pid = bpf_get_current_pid_tgid();
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
     if (FILTER_PID)
         return 0;
 
@@ -153,13 +157,13 @@
     if ((u64)file->f_op != BTRFS_FILE_OPERATIONS)
         return 0;
 
-    // store filep and timestamp by pid
+    // store filep and timestamp by id
     struct val_t val = {};
     val.ts = bpf_ktime_get_ns();
     val.fp = file;
     val.offset = 0;
     if (val.fp)
-        entryinfo.update(&pid, &val);
+        entryinfo.update(&id, &val);
 
     return 0;
 }
@@ -167,18 +171,19 @@
 // btrfs_sync_file():
 int trace_fsync_entry(struct pt_regs *ctx, struct file *file)
 {
-    u32 pid;
-    pid = bpf_get_current_pid_tgid();
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
     if (FILTER_PID)
         return 0;
 
-    // store filep and timestamp by pid
+    // store filep and timestamp by id
     struct val_t val = {};
     val.ts = bpf_ktime_get_ns();
     val.fp = file;
     val.offset = 0;
     if (val.fp)
-        entryinfo.update(&pid, &val);
+        entryinfo.update(&id, &val);
 
     return 0;
 }
@@ -190,9 +195,10 @@
 static int trace_return(struct pt_regs *ctx, int type)
 {
     struct val_t *valp;
-    u32 pid = bpf_get_current_pid_tgid();
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
 
-    valp = entryinfo.lookup(&pid);
+    valp = entryinfo.lookup(&id);
     if (valp == 0) {
         // missed tracing issue or filtered
         return 0;
@@ -201,7 +207,7 @@
     // calculate delta
     u64 ts = bpf_ktime_get_ns();
     u64 delta_us = (ts - valp->ts) / 1000;
-    entryinfo.delete(&pid);
+    entryinfo.delete(&id);
     if (FILTER_US)
         return 0;
 
diff --git a/tools/ext4slower.py b/tools/ext4slower.py
index 848efbc..ceb2a8f 100755
--- a/tools/ext4slower.py
+++ b/tools/ext4slower.py
@@ -22,6 +22,7 @@
 # Licensed under the Apache License, Version 2.0 (the "License")
 #
 # 11-Feb-2016   Brendan Gregg   Created this.
+# 15-Oct-2016   Dina Goldshtein -p to filter by process ID.
 
 from __future__ import print_function
 from bcc import BPF
@@ -87,7 +88,7 @@
     char file[DNAME_INLINE_LEN];
 };
 
-BPF_HASH(entryinfo, pid_t, struct val_t);
+BPF_HASH(entryinfo, u64, struct val_t);
 BPF_PERF_OUTPUT(events);
 
 //
@@ -99,8 +100,9 @@
 // which I do by checking file->f_op.
 int trace_read_entry(struct pt_regs *ctx, struct kiocb *iocb)
 {
-    u32 pid;
-    pid = bpf_get_current_pid_tgid();
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
     if (FILTER_PID)
         return 0;
 
@@ -109,13 +111,13 @@
     if ((u64)fp->f_op != EXT4_FILE_OPERATIONS)
         return 0;
 
-    // store filep and timestamp by pid
+    // store filep and timestamp by id
     struct val_t val = {};
     val.ts = bpf_ktime_get_ns();
     val.fp = fp;
     val.offset = iocb->ki_pos;
     if (val.fp)
-        entryinfo.update(&pid, &val);
+        entryinfo.update(&id, &val);
 
     return 0;
 }
@@ -123,18 +125,19 @@
 // ext4_file_write_iter():
 int trace_write_entry(struct pt_regs *ctx, struct kiocb *iocb)
 {
-    u32 pid;
-    pid = bpf_get_current_pid_tgid();
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
     if (FILTER_PID)
         return 0;
 
-    // store filep and timestamp by pid
+    // store filep and timestamp by id
     struct val_t val = {};
     val.ts = bpf_ktime_get_ns();
     val.fp = iocb->ki_filp;
     val.offset = iocb->ki_pos;
     if (val.fp)
-        entryinfo.update(&pid, &val);
+        entryinfo.update(&id, &val);
 
     return 0;
 }
@@ -143,18 +146,19 @@
 int trace_open_entry(struct pt_regs *ctx, struct inode *inode,
     struct file *file)
 {
-    u32 pid;
-    pid = bpf_get_current_pid_tgid();
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
     if (FILTER_PID)
         return 0;
 
-    // store filep and timestamp by pid
+    // store filep and timestamp by id
     struct val_t val = {};
     val.ts = bpf_ktime_get_ns();
     val.fp = file;
     val.offset = 0;
     if (val.fp)
-        entryinfo.update(&pid, &val);
+        entryinfo.update(&id, &val);
 
     return 0;
 }
@@ -162,18 +166,19 @@
 // ext4_sync_file():
 int trace_fsync_entry(struct pt_regs *ctx, struct file *file)
 {
-    u32 pid;
-    pid = bpf_get_current_pid_tgid();
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
     if (FILTER_PID)
         return 0;
 
-    // store filep and timestamp by pid
+    // store filep and timestamp by id
     struct val_t val = {};
     val.ts = bpf_ktime_get_ns();
     val.fp = file;
     val.offset = 0;
     if (val.fp)
-        entryinfo.update(&pid, &val);
+        entryinfo.update(&id, &val);
 
     return 0;
 }
@@ -185,9 +190,10 @@
 static int trace_return(struct pt_regs *ctx, int type)
 {
     struct val_t *valp;
-    u32 pid = bpf_get_current_pid_tgid();
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
 
-    valp = entryinfo.lookup(&pid);
+    valp = entryinfo.lookup(&id);
     if (valp == 0) {
         // missed tracing issue or filtered
         return 0;
@@ -196,7 +202,7 @@
     // calculate delta
     u64 ts = bpf_ktime_get_ns();
     u64 delta_us = (ts - valp->ts) / 1000;
-    entryinfo.delete(&pid);
+    entryinfo.delete(&id);
     if (FILTER_US)
         return 0;
 
diff --git a/tools/funclatency.py b/tools/funclatency.py
index f5618e0..4e6407c 100755
--- a/tools/funclatency.py
+++ b/tools/funclatency.py
@@ -1,17 +1,16 @@
 #!/usr/bin/python
 # @lint-avoid-python-3-compatibility-imports
 #
-# funclatency   Time kernel funcitons and print latency as a histogram.
+# funclatency   Time functions and print latency as a histogram.
 #               For Linux, uses BCC, eBPF.
 #
-# USAGE: funclatency [-h] [-p PID] [-i INTERVAL] [-T] [-u] [-m] [-r] pattern
+# USAGE: funclatency [-h] [-p PID] [-i INTERVAL] [-T] [-u] [-m] [-F] [-r] [-v]
+#                    pattern
 #
 # Run "funclatency -h" for full usage.
 #
 # The pattern is a string with optional '*' wildcards, similar to file globbing.
-# If you'd prefer to use regular expressions, use the -r option. Matching
-# multiple functions is of limited use, since the output has one histogram for
-# everything. Future versions should split the output histogram by the function.
+# If you'd prefer to use regular expressions, use the -r option.
 #
 # Currently nested or recursive functions are not supported properly, and
 # timestamps will be overwritten, creating dubious output. Try to match single
@@ -21,7 +20,8 @@
 # Copyright (c) 2015 Brendan Gregg.
 # Licensed under the Apache License, Version 2.0 (the "License")
 #
-# 20-Sep-2015   Brendan Gregg   Created this.
+# 20-Sep-2015   Brendan Gregg       Created this.
+# 06-Oct-2016   Sasha Goldshtein    Added user function support.
 
 from __future__ import print_function
 from bcc import BPF
@@ -31,16 +31,18 @@
 
 # arguments
 examples = """examples:
-    ./funclatency do_sys_open       # time the do_sys_open() kenel function
+    ./funclatency do_sys_open       # time the do_sys_open() kernel function
+    ./funclatency c:read            # time the read() C library function
     ./funclatency -u vfs_read       # time vfs_read(), in microseconds
     ./funclatency -m do_nanosleep   # time do_nanosleep(), in milliseconds
     ./funclatency -mTi 5 vfs_read   # output every 5 seconds, with timestamps
     ./funclatency -p 181 vfs_read   # time process 181 only
     ./funclatency 'vfs_fstat*'      # time both vfs_fstat() and vfs_fstatat()
+    ./funclatency 'c:*printf'       # time the *printf family of functions
     ./funclatency -F 'vfs_r*'       # show one histogram per matched function
 """
 parser = argparse.ArgumentParser(
-    description="Time kernel funcitons and print latency as a histogram",
+    description="Time functions and print latency as a histogram",
     formatter_class=argparse.RawDescriptionHelpFormatter,
     epilog=examples)
 parser.add_argument("-p", "--pid",
@@ -57,31 +59,57 @@
     help="show a separate histogram per function")
 parser.add_argument("-r", "--regexp", action="store_true",
     help="use regular expressions. Default is \"*\" wildcards only.")
+parser.add_argument("-v", "--verbose", action="store_true",
+    help="print the BPF program (for debugging purposes)")
 parser.add_argument("pattern",
-    help="search expression for kernel functions")
+    help="search expression for functions")
 args = parser.parse_args()
-pattern = args.pattern
+
+def bail(error):
+    print("Error: " + error)
+    exit(1)
+
+parts = args.pattern.split(':')
+if len(parts) == 1:
+    library = None
+    pattern = args.pattern
+elif len(parts) == 2:
+    library = parts[0]
+    libpath = BPF.find_library(library) or BPF.find_exe(library)
+    if not libpath:
+        bail("can't resolve library %s" % library)
+    library = libpath
+    pattern = parts[1]
+else:
+    bail("unrecognized pattern format '%s'" % args.pattern)
+
 if not args.regexp:
     pattern = pattern.replace('*', '.*')
     pattern = '^' + pattern + '$'
-debug = 0
 
 # define BPF program
 bpf_text = """
 #include <uapi/linux/ptrace.h>
 #include <linux/blkdev.h>
 
-typedef struct ip_key {
+typedef struct ip_pid {
     u64 ip;
+    u64 pid;
+} ip_pid_t;
+
+typedef struct hist_key {
+    ip_pid_t key;
     u64 slot;
-} ip_key_t;
+} hist_key_t;
 
 BPF_HASH(start, u32);
 STORAGE
 
 int trace_func_entry(struct pt_regs *ctx)
 {
-    u32 pid = bpf_get_current_pid_tgid();
+    u64 pid_tgid = bpf_get_current_pid_tgid();
+    u32 pid = pid_tgid;
+    u32 tgid = pid_tgid >> 32;
     u64 ts = bpf_ktime_get_ns();
 
     FILTER
@@ -94,7 +122,9 @@
 int trace_func_return(struct pt_regs *ctx)
 {
     u64 *tsp, delta;
-    u32 pid = bpf_get_current_pid_tgid();
+    u64 pid_tgid = bpf_get_current_pid_tgid();
+    u32 pid = pid_tgid;
+    u32 tgid = pid_tgid >> 32;
 
     // calculate delta time
     tsp = start.lookup(&pid);
@@ -112,10 +142,13 @@
 }
 """
 
+# do we need to store the IP and pid for each invocation?
+need_key = args.function or (library and not args.pid)
+
 # code substitutions
 if args.pid:
     bpf_text = bpf_text.replace('FILTER',
-        'if (pid != %s) { return 0; }' % args.pid)
+        'if (tgid != %s) { return 0; }' % args.pid)
 else:
     bpf_text = bpf_text.replace('FILTER', '')
 if args.milliseconds:
@@ -127,22 +160,32 @@
 else:
     bpf_text = bpf_text.replace('FACTOR', '')
     label = "nsecs"
-if args.function:
+if need_key:
     bpf_text = bpf_text.replace('STORAGE', 'BPF_HASH(ipaddr, u32);\n' +
-        'BPF_HISTOGRAM(dist, ip_key_t);')
+        'BPF_HISTOGRAM(dist, hist_key_t);')
     # stash the IP on entry, as on return it's kretprobe_trampoline:
     bpf_text = bpf_text.replace('ENTRYSTORE',
         'u64 ip = PT_REGS_IP(ctx); ipaddr.update(&pid, &ip);')
+    pid = '-1' if not library else 'tgid'
     bpf_text = bpf_text.replace('STORE',
-        'u64 ip, *ipp = ipaddr.lookup(&pid); if (ipp) { ip = *ipp; ' +
-        'dist.increment((ip_key_t){ip, bpf_log2l(delta)}); ' +
-        'ipaddr.delete(&pid); }')
+        """
+    u64 ip, *ipp = ipaddr.lookup(&pid);
+    if (ipp) {
+        ip = *ipp;
+        hist_key_t key;
+        key.key.ip = ip;
+        key.key.pid = %s;
+        key.slot = bpf_log2l(delta);
+        dist.increment(key);
+        ipaddr.delete(&pid);
+    }
+        """ % pid)
 else:
     bpf_text = bpf_text.replace('STORAGE', 'BPF_HISTOGRAM(dist);')
     bpf_text = bpf_text.replace('ENTRYSTORE', '')
     bpf_text = bpf_text.replace('STORE',
         'dist.increment(bpf_log2l(delta));')
-if debug:
+if args.verbose:
     print(bpf_text)
 
 # signal handler
@@ -151,9 +194,17 @@
 
 # load BPF program
 b = BPF(text=bpf_text)
-b.attach_kprobe(event_re=pattern, fn_name="trace_func_entry")
-b.attach_kretprobe(event_re=pattern, fn_name="trace_func_return")
-matched = b.num_open_kprobes()
+
+# attach probes
+if not library:
+    b.attach_kprobe(event_re=pattern, fn_name="trace_func_entry")
+    b.attach_kretprobe(event_re=pattern, fn_name="trace_func_return")
+    matched = b.num_open_kprobes()
+else:
+    b.attach_uprobe(name=library, sym_re=pattern, fn_name="trace_func_entry")
+    b.attach_uretprobe(name=library, sym_re=pattern, fn_name="trace_func_return")
+    matched = b.num_open_uprobes()
+
 if matched == 0:
     print("0 functions matched by \"%s\". Exiting." % args.pattern)
     exit()
@@ -163,6 +214,12 @@
     (matched / 2, args.pattern))
 
 # output
+def print_section(key):
+    if not library:
+        return BPF.sym(key[0], -1)
+    else:
+        return "%s [%d]" % (BPF.sym(key[0], key[1]), key[1])
+
 exiting = 0 if args.interval else 1
 dist = b.get_table("dist")
 while (1):
@@ -177,8 +234,9 @@
     if args.timestamp:
         print("%-8s\n" % strftime("%H:%M:%S"), end="")
 
-    if args.function:
-        dist.print_log2_hist(label, "Function", BPF.ksym)
+    if need_key:
+        dist.print_log2_hist(label, "Function", section_print_fn=print_section,
+            bucket_fn=lambda k: (k.ip, k.pid))
     else:
         dist.print_log2_hist(label)
     dist.clear()
diff --git a/tools/funclatency_example.txt b/tools/funclatency_example.txt
index 2db8d2d..a4c6f88 100644
--- a/tools/funclatency_example.txt
+++ b/tools/funclatency_example.txt
@@ -47,6 +47,98 @@
 efficient way to time kernel functions and examine their latency distribution.
 
 
+Now trace a user function, pthread_mutex_lock in libpthread, to determine if
+there is considerable lock contention:
+
+# ./funclatency pthread:pthread_mutex_lock -p $(pidof contentions)
+Tracing 1 function for "pthread:pthread_mutex_lock"... Hit Ctrl-C to end.
+     nsecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 0        |                                        |
+      2048 -> 4095       : 508967   |****************************************|
+      4096 -> 8191       : 70072    |*****                                   |
+      8192 -> 16383      : 27686    |**                                      |
+     16384 -> 32767      : 5075     |                                        |
+     32768 -> 65535      : 2318     |                                        |
+     65536 -> 131071     : 581      |                                        |
+    131072 -> 262143     : 38       |                                        |
+    262144 -> 524287     : 5        |                                        |
+    524288 -> 1048575    : 1        |                                        |
+   1048576 -> 2097151    : 9        |                                        |
+Detaching...
+
+It seems that most calls to pthread_mutex_lock completed rather quickly (in 
+under 4us), but there were some cases of considerable contention, sometimes
+over a full millisecond.
+
+
+Run a quick-and-dirty profiler over all the functions in an executable:
+# ./funclatency /home/user/primes:* -p $(pidof primes) -F
+Tracing 15 functions for "/home/user/primes:*"... Hit Ctrl-C to end.
+^C
+
+Function = is_prime [6556]
+     nsecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 0        |                                        |
+      2048 -> 4095       : 1495322  |****************************************|
+      4096 -> 8191       : 95744    |**                                      |
+      8192 -> 16383      : 9926     |                                        |
+     16384 -> 32767      : 3070     |                                        |
+     32768 -> 65535      : 1415     |                                        |
+     65536 -> 131071     : 112      |                                        |
+    131072 -> 262143     : 9        |                                        |
+    262144 -> 524287     : 3        |                                        |
+    524288 -> 1048575    : 0        |                                        |
+   1048576 -> 2097151    : 8        |                                        |
+
+Function = insert_result [6556]
+     nsecs               : count     distribution
+         0 -> 1          : 0        |                                        |
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 0        |                                        |
+       128 -> 255        : 0        |                                        |
+       256 -> 511        : 0        |                                        |
+       512 -> 1023       : 0        |                                        |
+      1024 -> 2047       : 0        |                                        |
+      2048 -> 4095       : 111047   |****************************************|
+      4096 -> 8191       : 3998     |*                                       |
+      8192 -> 16383      : 720      |                                        |
+     16384 -> 32767      : 238      |                                        |
+     32768 -> 65535      : 106      |                                        |
+     65536 -> 131071     : 5        |                                        |
+    131072 -> 262143     : 4        |                                        |
+Detaching...
+
+From the results, we can see that the is_prime function has something resembling
+an exponential distribution -- very few primes take a very long time to test,
+while most numbers are verified as prime or composite in less than 4us. The
+insert_result function exhibits a similar phenomenon, likely due to contention
+over a shared results container.
+
+
 Now vfs_read() is traced, and a microseconds histogram printed:
 
 # ./funclatency -u vfs_read
@@ -239,13 +331,13 @@
 USAGE message:
 
 # ./funclatency -h
-usage: funclatency [-h] [-p PID] [-i INTERVAL] [-T] [-u] [-m] [-F] [-r]
+usage: funclatency [-h] [-p PID] [-i INTERVAL] [-T] [-u] [-m] [-F] [-r] [-v]
                    pattern
 
-Time kernel functions and print latency as a histogram
+Time functions and print latency as a histogram
 
 positional arguments:
-  pattern               search expression for kernel functions
+  pattern               search expression for functions
 
 optional arguments:
   -h, --help            show this help message and exit
@@ -258,12 +350,15 @@
   -F, --function        show a separate histogram per function
   -r, --regexp          use regular expressions. Default is "*" wildcards
                         only.
+  -v, --verbose         print the BPF program (for debugging purposes)
 
 examples:
     ./funclatency do_sys_open       # time the do_sys_open() kernel function
+    ./funclatency c:read            # time the read() C library function
     ./funclatency -u vfs_read       # time vfs_read(), in microseconds
     ./funclatency -m do_nanosleep   # time do_nanosleep(), in milliseconds
     ./funclatency -mTi 5 vfs_read   # output every 5 seconds, with timestamps
     ./funclatency -p 181 vfs_read   # time process 181 only
     ./funclatency 'vfs_fstat*'      # time both vfs_fstat() and vfs_fstatat()
+    ./funclatency 'c:*printf'       # time the *printf family of functions
     ./funclatency -F 'vfs_r*'       # show one histogram per matched function
diff --git a/tools/mountsnoop.py b/tools/mountsnoop.py
new file mode 100755
index 0000000..dfaafc3
--- /dev/null
+++ b/tools/mountsnoop.py
@@ -0,0 +1,411 @@
+#!/usr/bin/env python
+#
+# mountsnoop Trace mount() and umount() syscalls.
+#            For Linux, uses BCC, eBPF. Embedded C.
+#
+# USAGE: mountsnoop [-h]
+#
+# Copyright (c) 2016 Facebook, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 14-Oct-2016   Omar Sandoval   Created this.
+
+from __future__ import print_function
+import argparse
+import bcc
+import ctypes
+import errno
+import functools
+import sys
+
+
+bpf_text = r"""
+#include <uapi/linux/ptrace.h>
+#include <linux/sched.h>
+
+#include <linux/nsproxy.h>
+#include <linux/ns_common.h>
+
+/*
+ * XXX: struct mnt_namespace is defined in fs/mount.h, which is private to the
+ * VFS and not installed in any kernel-devel packages. So, let's duplicate the
+ * important part of the definition. There are actually more members in the
+ * real struct, but we don't need them, and they're more likely to change.
+ */
+struct mnt_namespace {
+    atomic_t count;
+    struct ns_common ns;
+};
+
+/*
+ * XXX: this could really use first-class string support in BPF. target is a
+ * NUL-terminated path up to PATH_MAX in length. source and type are
+ * NUL-terminated strings up to PAGE_SIZE in length. data is a weird case: it's
+ * almost always a NUL-terminated string, but for some filesystems (e.g., older
+ * NFS variants), it's a binary structure with plenty of NUL bytes, so the
+ * kernel always copies up to PAGE_SIZE bytes, stopping when it hits a fault.
+ *
+ * The best we can do with the existing BPF helpers is to copy as much of each
+ * argument as we can. Our stack space is limited, and we need to leave some
+ * headroom for the rest of the function, so this should be a decent value.
+ */
+#define MAX_STR_LEN 412
+
+enum event_type {
+    EVENT_MOUNT,
+    EVENT_MOUNT_SOURCE,
+    EVENT_MOUNT_TARGET,
+    EVENT_MOUNT_TYPE,
+    EVENT_MOUNT_DATA,
+    EVENT_MOUNT_RET,
+    EVENT_UMOUNT,
+    EVENT_UMOUNT_TARGET,
+    EVENT_UMOUNT_RET,
+};
+
+struct data_t {
+    enum event_type type;
+    pid_t pid, tgid;
+    union {
+        /* EVENT_MOUNT, EVENT_UMOUNT */
+        struct {
+            /* current->nsproxy->mnt_ns->ns.inum */
+            unsigned int mnt_ns;
+            char comm[TASK_COMM_LEN];
+            unsigned long flags;
+        } enter;
+        /*
+         * EVENT_MOUNT_SOURCE, EVENT_MOUNT_TARGET, EVENT_MOUNT_TYPE,
+         * EVENT_MOUNT_DATA, EVENT_UMOUNT_TARGET
+         */
+        char str[MAX_STR_LEN];
+        /* EVENT_MOUNT_RET, EVENT_UMOUNT_RET */
+        int retval;
+    };
+};
+
+BPF_PERF_OUTPUT(events);
+
+int kprobe__sys_mount(struct pt_regs *ctx, char __user *source,
+                      char __user *target, char __user *type,
+                      unsigned long flags)
+{
+    /* sys_mount takes too many arguments */
+    char __user *data = (char __user *)PT_REGS_PARM5(ctx);
+    struct data_t event = {};
+    struct task_struct *task;
+    struct nsproxy *nsproxy;
+    struct mnt_namespace *mnt_ns;
+
+    event.pid = bpf_get_current_pid_tgid() & 0xffffffff;
+    event.tgid = bpf_get_current_pid_tgid() >> 32;
+
+    event.type = EVENT_MOUNT;
+    bpf_get_current_comm(event.enter.comm, sizeof(event.enter.comm));
+    event.enter.flags = flags;
+    task = (struct task_struct *)bpf_get_current_task();
+    bpf_probe_read(&nsproxy, sizeof(nsproxy), &task->nsproxy);
+    bpf_probe_read(&mnt_ns, sizeof(mnt_ns), &nsproxy->mnt_ns);
+    bpf_probe_read(&event.enter.mnt_ns, sizeof(event.enter.mnt_ns),
+               &mnt_ns->ns.inum);
+    events.perf_submit(ctx, &event, sizeof(event));
+
+    event.type = EVENT_MOUNT_SOURCE;
+    memset(event.str, 0, sizeof(event.str));
+    bpf_probe_read(event.str, sizeof(event.str), source);
+    events.perf_submit(ctx, &event, sizeof(event));
+
+    event.type = EVENT_MOUNT_TARGET;
+    memset(event.str, 0, sizeof(event.str));
+    bpf_probe_read(event.str, sizeof(event.str), target);
+    events.perf_submit(ctx, &event, sizeof(event));
+
+    event.type = EVENT_MOUNT_TYPE;
+    memset(event.str, 0, sizeof(event.str));
+    bpf_probe_read(event.str, sizeof(event.str), type);
+    events.perf_submit(ctx, &event, sizeof(event));
+
+    event.type = EVENT_MOUNT_DATA;
+    memset(event.str, 0, sizeof(event.str));
+    bpf_probe_read(event.str, sizeof(event.str), data);
+    events.perf_submit(ctx, &event, sizeof(event));
+
+    return 0;
+}
+
+int kretprobe__sys_mount(struct pt_regs *ctx)
+{
+    struct data_t event = {};
+
+    event.type = EVENT_MOUNT_RET;
+    event.pid = bpf_get_current_pid_tgid() & 0xffffffff;
+    event.tgid = bpf_get_current_pid_tgid() >> 32;
+    event.retval = PT_REGS_RC(ctx);
+    events.perf_submit(ctx, &event, sizeof(event));
+
+    return 0;
+}
+
+int kprobe__sys_umount(struct pt_regs *ctx, char __user *target, int flags)
+{
+    struct data_t event = {};
+    struct task_struct *task;
+    struct nsproxy *nsproxy;
+    struct mnt_namespace *mnt_ns;
+
+    event.pid = bpf_get_current_pid_tgid() & 0xffffffff;
+    event.tgid = bpf_get_current_pid_tgid() >> 32;
+
+    event.type = EVENT_UMOUNT;
+    bpf_get_current_comm(event.enter.comm, sizeof(event.enter.comm));
+    event.enter.flags = flags;
+    task = (struct task_struct *)bpf_get_current_task();
+    bpf_probe_read(&nsproxy, sizeof(nsproxy), &task->nsproxy);
+    bpf_probe_read(&mnt_ns, sizeof(mnt_ns), &nsproxy->mnt_ns);
+    bpf_probe_read(&event.enter.mnt_ns, sizeof(event.enter.mnt_ns),
+               &mnt_ns->ns.inum);
+    events.perf_submit(ctx, &event, sizeof(event));
+
+    event.type = EVENT_UMOUNT_TARGET;
+    memset(event.str, 0, sizeof(event.str));
+    bpf_probe_read(event.str, sizeof(event.str), target);
+    events.perf_submit(ctx, &event, sizeof(event));
+
+    return 0;
+}
+
+int kretprobe__sys_umount(struct pt_regs *ctx)
+{
+    struct data_t event = {};
+
+    event.type = EVENT_UMOUNT_RET;
+    event.pid = bpf_get_current_pid_tgid() & 0xffffffff;
+    event.tgid = bpf_get_current_pid_tgid() >> 32;
+    event.retval = PT_REGS_RC(ctx);
+    events.perf_submit(ctx, &event, sizeof(event));
+
+    return 0;
+}
+"""
+
+# sys/mount.h
+MS_MGC_VAL = 0xc0ed0000
+MS_MGC_MSK = 0xffff0000
+MOUNT_FLAGS = [
+    ('MS_RDONLY', 1),
+    ('MS_NOSUID', 2),
+    ('MS_NODEV', 4),
+    ('MS_NOEXEC', 8),
+    ('MS_SYNCHRONOUS', 16),
+    ('MS_REMOUNT', 32),
+    ('MS_MANDLOCK', 64),
+    ('MS_DIRSYNC', 128),
+    ('MS_NOATIME', 1024),
+    ('MS_NODIRATIME', 2048),
+    ('MS_BIND', 4096),
+    ('MS_MOVE', 8192),
+    ('MS_REC', 16384),
+    ('MS_SILENT', 32768),
+    ('MS_POSIXACL', 1 << 16),
+    ('MS_UNBINDABLE', 1 << 17),
+    ('MS_PRIVATE', 1 << 18),
+    ('MS_SLAVE', 1 << 19),
+    ('MS_SHARED', 1 << 20),
+    ('MS_RELATIME', 1 << 21),
+    ('MS_KERNMOUNT', 1 << 22),
+    ('MS_I_VERSION', 1 << 23),
+    ('MS_STRICTATIME', 1 << 24),
+    ('MS_LAZYTIME', 1 << 25),
+    ('MS_ACTIVE', 1 << 30),
+    ('MS_NOUSER', 1 << 31),
+]
+UMOUNT_FLAGS = [
+    ('MNT_FORCE', 1),
+    ('MNT_DETACH', 2),
+    ('MNT_EXPIRE', 4),
+    ('UMOUNT_NOFOLLOW', 8),
+]
+
+
+TASK_COMM_LEN = 16  # linux/sched.h
+MAX_STR_LEN = 412
+
+
+class EventType(object):
+    EVENT_MOUNT = 0
+    EVENT_MOUNT_SOURCE = 1
+    EVENT_MOUNT_TARGET = 2
+    EVENT_MOUNT_TYPE = 3
+    EVENT_MOUNT_DATA = 4
+    EVENT_MOUNT_RET = 5
+    EVENT_UMOUNT = 6
+    EVENT_UMOUNT_TARGET = 7
+    EVENT_UMOUNT_RET = 8
+
+
+class EnterData(ctypes.Structure):
+    _fields_ = [
+        ('mnt_ns', ctypes.c_uint),
+        ('comm', ctypes.c_char * TASK_COMM_LEN),
+        ('flags', ctypes.c_ulong),
+    ]
+
+
+class DataUnion(ctypes.Union):
+    _fields_ = [
+        ('enter', EnterData),
+        ('str', ctypes.c_char * MAX_STR_LEN),
+        ('retval', ctypes.c_int),
+    ]
+
+
+class Event(ctypes.Structure):
+    _fields_ = [
+        ('type', ctypes.c_uint),
+        ('pid', ctypes.c_uint),
+        ('tgid', ctypes.c_uint),
+        ('union', DataUnion),
+    ]
+
+
+def _decode_flags(flags, flag_list):
+    str_flags = []
+    for flag, bit in flag_list:
+        if flags & bit:
+            str_flags.append(flag)
+        flags &= ~bit
+    if flags or not str_flags:
+        str_flags.append('0x{:x}'.format(flags))
+    return str_flags
+
+
+def decode_flags(flags, flag_list):
+    return '|'.join(_decode_flags(flags, flag_list))
+
+
+def decode_mount_flags(flags):
+    str_flags = []
+    if flags & MS_MGC_MSK == MS_MGC_VAL:
+        flags &= ~MS_MGC_MSK
+        str_flags.append('MS_MGC_VAL')
+    str_flags.extend(_decode_flags(flags, MOUNT_FLAGS))
+    return '|'.join(str_flags)
+
+
+def decode_umount_flags(flags):
+    return decode_flags(flags, UMOUNT_FLAGS)
+
+
+def decode_errno(retval):
+    try:
+        return '-' + errno.errorcode[-retval]
+    except KeyError:
+        return str(retval)
+
+
+_escape_chars = {
+    ord('\a'): '\\a',
+    ord('\b'): '\\b',
+    ord('\t'): '\\t',
+    ord('\n'): '\\n',
+    ord('\v'): '\\v',
+    ord('\f'): '\\f',
+    ord('\r'): '\\r',
+    ord('"'): '\\"',
+    ord('\\'): '\\\\',
+}
+
+
+def escape_character(c):
+    try:
+        return _escape_chars[c]
+    except KeyError:
+        if 0x20 <= c <= 0x7e:
+            return chr(c)
+        else:
+            return '\\x{:02x}'.format(c)
+
+
+if sys.version_info.major < 3:
+    def decode_mount_string(s):
+        return '"{}"'.format(''.join(escape_character(ord(c)) for c in s))
+else:
+    def decode_mount_string(s):
+        return '"{}"'.format(''.join(escape_character(c) for c in s))
+
+
+def print_event(mounts, umounts, cpu, data, size):
+    event = ctypes.cast(data, ctypes.POINTER(Event)).contents
+
+    try:
+        if event.type == EventType.EVENT_MOUNT:
+            mounts[event.pid] = {
+                'pid': event.pid,
+                'tgid': event.tgid,
+                'mnt_ns': event.union.enter.mnt_ns,
+                'comm': event.union.enter.comm,
+                'flags': event.union.enter.flags,
+            }
+        elif event.type == EventType.EVENT_MOUNT_SOURCE:
+            mounts[event.pid]['source'] = event.union.str
+        elif event.type == EventType.EVENT_MOUNT_TARGET:
+            mounts[event.pid]['target'] = event.union.str
+        elif event.type == EventType.EVENT_MOUNT_TYPE:
+            mounts[event.pid]['type'] = event.union.str
+        elif event.type == EventType.EVENT_MOUNT_DATA:
+            # XXX: data is not always a NUL-terminated string
+            mounts[event.pid]['data'] = event.union.str
+        elif event.type == EventType.EVENT_UMOUNT:
+            umounts[event.pid] = {
+                'pid': event.pid,
+                'tgid': event.tgid,
+                'mnt_ns': event.union.enter.mnt_ns,
+                'comm': event.union.enter.comm,
+                'flags': event.union.enter.flags,
+            }
+        elif event.type == EventType.EVENT_UMOUNT_TARGET:
+            umounts[event.pid]['target'] = event.union.str
+        elif (event.type == EventType.EVENT_MOUNT_RET or
+              event.type == EventType.EVENT_UMOUNT_RET):
+            if event.type == EventType.EVENT_MOUNT_RET:
+                syscall = mounts.pop(event.pid)
+                call = 'mount({source}, {target}, {type}, {flags}, {data}) = {retval}'.format(
+                    source=decode_mount_string(syscall['source']),
+                    target=decode_mount_string(syscall['target']),
+                    type=decode_mount_string(syscall['type']),
+                    flags=decode_mount_flags(syscall['flags']),
+                    data=decode_mount_string(syscall['data']),
+                    retval=decode_errno(event.union.retval))
+            else:
+                syscall = umounts.pop(event.pid)
+                call = 'umount({target}, {flags}) = {retval}'.format(
+                    target=decode_mount_string(syscall['target']),
+                    flags=decode_umount_flags(syscall['flags']),
+                    retval=decode_errno(event.union.retval))
+            print('{:16} {:<7} {:<7} {:<11} {}'.format(
+                syscall['comm'].decode(), syscall['tgid'], syscall['pid'],
+                syscall['mnt_ns'], call))
+    except KeyError:
+        # This might happen if we lost an event.
+        pass
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='trace mount() and umount() syscalls'
+    )
+    args = parser.parse_args()
+
+    mounts = {}
+    umounts = {}
+    b = bcc.BPF(text=bpf_text)
+    b['events'].open_perf_buffer(
+        functools.partial(print_event, mounts, umounts))
+    print('{:16} {:<7} {:<7} {:<11} {}'.format(
+        'COMM', 'PID', 'TID', 'MNT_NS', 'CALL'))
+    while True:
+        b.kprobe_poll()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/mountsnoop_example.txt b/tools/mountsnoop_example.txt
new file mode 100644
index 0000000..1c5144e
--- /dev/null
+++ b/tools/mountsnoop_example.txt
@@ -0,0 +1,28 @@
+Demonstrations of mountsnoop.
+
+mountsnoop traces the mount() and umount() syscalls system-wide. For example,
+running the following series of commands produces this output:
+
+# mount --bind /mnt /mnt
+# umount /mnt
+# unshare -m
+# mount --bind /mnt /mnt
+# umount /mnt
+
+# ./mountsnoop.py
+COMM             PID     TID     MNT_NS      CALL
+mount            710     710     4026531840  mount("/mnt", "/mnt", "", MS_MGC_VAL|MS_BIND, "") = 0
+umount           714     714     4026531840  umount("/mnt", 0x0) = 0
+unshare          717     717     4026532160  mount("none", "/", "", MS_REC|MS_PRIVATE, "") = 0
+mount            725     725     4026532160  mount("/mnt", "/mnt", "", MS_MGC_VAL|MS_BIND, "") = 0
+umount           728     728     4026532160  umount("/mnt", 0x0) = 0
+
+The output shows the calling command, its process ID and thread ID, the mount
+namespace the call was made in, and the call itself.
+
+The mount namespace number is an inode number that uniquely identifies the
+namespace in the running system. This can also be obtained from readlink
+/proc/$PID/ns/mnt.
+
+Note that because of restrictions in BPF, the string arguments to either
+syscall may be truncated.
diff --git a/tools/opensnoop.py b/tools/opensnoop.py
index f2bac4b..0c2b9b5 100755
--- a/tools/opensnoop.py
+++ b/tools/opensnoop.py
@@ -4,13 +4,14 @@
 # opensnoop Trace open() syscalls.
 #           For Linux, uses BCC, eBPF. Embedded C.
 #
-# USAGE: opensnoop [-h] [-t] [-x] [-p PID]
+# USAGE: opensnoop [-h] [-T] [-x] [-p PID] [-t TID] [-n NAME]
 #
 # Copyright (c) 2015 Brendan Gregg.
 # Licensed under the Apache License, Version 2.0 (the "License")
 #
 # 17-Sep-2015   Brendan Gregg   Created this.
-# 29-Apr-2016   Allan McAleavy updated for BPF_PERF_OUTPUT
+# 29-Apr-2016   Allan McAleavy  Updated for BPF_PERF_OUTPUT.
+# 08-Oct-2016   Dina Goldshtein Support filtering by PID and TID.
 
 from __future__ import print_function
 from bcc import BPF
@@ -20,21 +21,24 @@
 # arguments
 examples = """examples:
     ./opensnoop           # trace all open() syscalls
-    ./opensnoop -t        # include timestamps
+    ./opensnoop -T        # include timestamps
     ./opensnoop -x        # only show failed opens
     ./opensnoop -p 181    # only trace PID 181
+    ./opensnoop -t 123    # only trace TID 123
     ./opensnoop -n main   # only print process names containing "main"
 """
 parser = argparse.ArgumentParser(
     description="Trace open() syscalls",
     formatter_class=argparse.RawDescriptionHelpFormatter,
     epilog=examples)
-parser.add_argument("-t", "--timestamp", action="store_true",
+parser.add_argument("-T", "--timestamp", action="store_true",
     help="include timestamp on output")
 parser.add_argument("-x", "--failed", action="store_true",
     help="only show failed opens")
 parser.add_argument("-p", "--pid",
     help="trace this PID only")
+parser.add_argument("-t", "--tid",
+    help="trace this TID only")
 parser.add_argument("-n", "--name",
     help="only print process names containing this name")
 args = parser.parse_args()
@@ -47,35 +51,36 @@
 #include <linux/sched.h>
 
 struct val_t {
-    u32 pid;
+    u64 id;
     u64 ts;
     char comm[TASK_COMM_LEN];
     const char *fname;
 };
 
 struct data_t {
-    u32 pid;
+    u64 id;
     u64 ts;
     int ret;
     char comm[TASK_COMM_LEN];
     char fname[NAME_MAX];
 };
 
-BPF_HASH(args_filename, u32, const char *);
-BPF_HASH(infotmp, u32, struct val_t);
+BPF_HASH(infotmp, u64, struct val_t);
 BPF_PERF_OUTPUT(events);
 
 int trace_entry(struct pt_regs *ctx, const char __user *filename)
 {
     struct val_t val = {};
-    u32 pid = bpf_get_current_pid_tgid();
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+    u32 tid = id;       // Cast and get the lower part
 
     FILTER
     if (bpf_get_current_comm(&val.comm, sizeof(val.comm)) == 0) {
-        val.pid = bpf_get_current_pid_tgid();
+        val.id = id;
         val.ts = bpf_ktime_get_ns();
         val.fname = filename;
-        infotmp.update(&pid, &val);
+        infotmp.update(&id, &val);
     }
 
     return 0;
@@ -83,31 +88,33 @@
 
 int trace_return(struct pt_regs *ctx)
 {
-    u32 pid = bpf_get_current_pid_tgid();
+    u64 id = bpf_get_current_pid_tgid();
     struct val_t *valp;
     struct data_t data = {};
 
     u64 tsp = bpf_ktime_get_ns();
 
-    valp = infotmp.lookup(&pid);
+    valp = infotmp.lookup(&id);
     if (valp == 0) {
         // missed entry
         return 0;
     }
     bpf_probe_read(&data.comm, sizeof(data.comm), valp->comm);
     bpf_probe_read(&data.fname, sizeof(data.fname), (void *)valp->fname);
-    data.pid = valp->pid;
+    data.id = valp->id;
     data.ts = tsp / 1000;
     data.ret = PT_REGS_RC(ctx);
 
     events.perf_submit(ctx, &data, sizeof(data));
-    infotmp.delete(&pid);
-    args_filename.delete(&pid);
+    infotmp.delete(&id);
 
     return 0;
 }
 """
-if args.pid:
+if args.tid:  # TID trumps PID
+    bpf_text = bpf_text.replace('FILTER',
+        'if (tid != %s) { return 0; }' % args.tid)
+elif args.pid:
     bpf_text = bpf_text.replace('FILTER',
         'if (pid != %s) { return 0; }' % args.pid)
 else:
@@ -125,7 +132,7 @@
 
 class Data(ct.Structure):
     _fields_ = [
-        ("pid", ct.c_ulonglong),
+        ("id", ct.c_ulonglong),
         ("ts", ct.c_ulonglong),
         ("ret", ct.c_int),
         ("comm", ct.c_char * TASK_COMM_LEN),
@@ -137,7 +144,8 @@
 # header
 if args.timestamp:
     print("%-14s" % ("TIME(s)"), end="")
-print("%-6s %-16s %4s %3s %s" % ("PID", "COMM", "FD", "ERR", "PATH"))
+print("%-6s %-16s %4s %3s %s" %
+      ("TID" if args.tid else "PID", "COMM", "FD", "ERR", "PATH"))
 
 # process event
 def print_event(cpu, data, size):
@@ -165,8 +173,9 @@
         delta = event.ts - initial_ts
         print("%-14.9f" % (float(delta) / 1000000), end="")
 
-    print("%-6d %-16s %4d %3d %s" % (event.pid, event.comm,
-          fd_s, err, event.fname))
+    print("%-6d %-16s %4d %3d %s" %
+          (event.id & 0xffffffff if args.tid else event.id >> 32,
+           event.comm, fd_s, err, event.fname))
 
 # loop with callback to print_event
 b["events"].open_perf_buffer(print_event)
diff --git a/tools/opensnoop_example.txt b/tools/opensnoop_example.txt
index b297e33..fc92001 100644
--- a/tools/opensnoop_example.txt
+++ b/tools/opensnoop_example.txt
@@ -48,9 +48,9 @@
 
 
 The -p option can be used to filter on a PID, which is filtered in-kernel. Here
-I've used it with -t to print timestamps:
+I've used it with -T to print timestamps:
 
- ./opensnoop -tp 1956
+ ./opensnoop -Tp 1956
 TIME(s)       PID    COMM               FD ERR PATH
 0.000000000   1956   supervise           9   0 supervise/status.new
 0.000289999   1956   supervise           9   0 supervise/status.new
@@ -123,18 +123,22 @@
 USAGE message:
 
 # ./opensnoop -h
-usage: opensnoop [-h] [-t] [-x] [-p PID]
+usage: opensnoop [-h] [-T] [-x] [-p PID] [-t TID] [-n NAME]
 
 Trace open() syscalls
 
 optional arguments:
-  -h, --help         show this help message and exit
-  -t, --timestamp    include timestamp on output
-  -x, --failed       only show failed opens
-  -p PID, --pid PID  trace this PID only
+  -h, --help            show this help message and exit
+  -T, --timestamp       include timestamp on output
+  -x, --failed          only show failed opens
+  -p PID, --pid PID     trace this PID only
+  -t TID, --tid TID     trace this TID only
+  -n NAME, --name NAME  only print process names containing this name
 
 examples:
     ./opensnoop           # trace all open() syscalls
-    ./opensnoop -t        # include timestamps
+    ./opensnoop -T        # include timestamps
     ./opensnoop -x        # only show failed opens
     ./opensnoop -p 181    # only trace PID 181
+    ./opensnoop -t 123    # only trace TID 123
+    ./opensnoop -n main   # only print process names containing "main"
diff --git a/tools/stackcount.py b/tools/stackcount.py
index d9cdb3e..d26dfc7 100755
--- a/tools/stackcount.py
+++ b/tools/stackcount.py
@@ -1,139 +1,290 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
-# stackcount    Count kernel function calls and their stack traces.
+# stackcount    Count events and their stack traces.
 #               For Linux, uses BCC, eBPF.
 #
-# USAGE: stackcount [-h] [-p PID] [-i INTERVAL] [-T] [-r] pattern
+# USAGE: stackcount [-h] [-p PID] [-i INTERVAL] [-T] [-r] [-s]
+#                   [-P] [-v] pattern
 #
 # The pattern is a string with optional '*' wildcards, similar to file
 # globbing. If you'd prefer to use regular expressions, use the -r option.
 #
-# The current implementation uses an unrolled loop for x86_64, and was written
-# as a proof of concept. This implementation should be replaced in the future
-# with an appropriate bpf_ call, when available.
-#
-# Currently limited to a stack trace depth of 11 (maxdepth + 1).
-#
 # Copyright 2016 Netflix, Inc.
 # Licensed under the Apache License, Version 2.0 (the "License")
 #
-# 12-Jan-2016	Brendan Gregg	Created this.
+# 12-Jan-2016	Brendan Gregg	    Created this.
+# 09-Jul-2016   Sasha Goldshtein    Generalized for uprobes and tracepoints.
 
 from __future__ import print_function
-from bcc import BPF
+from bcc import BPF, USDT
 from time import sleep, strftime
 import argparse
+import re
 import signal
+import sys
+import traceback
 
-# arguments
-examples = """examples:
-    ./stackcount submit_bio       # count kernel stack traces for submit_bio
-    ./stackcount ip_output        # count kernel stack traces for ip_output
-    ./stackcount -s ip_output     # show symbol offsets
-    ./stackcount -sv ip_output    # show offsets and raw addresses (verbose)
-    ./stackcount 'tcp_send*'      # count stacks for funcs matching tcp_send*
-    ./stackcount -r '^tcp_send.*' # same as above, using regular expressions
-    ./stackcount -Ti 5 ip_output  # output every 5 seconds, with timestamps
-    ./stackcount -p 185 ip_output # count ip_output stacks for PID 185 only
-"""
-parser = argparse.ArgumentParser(
-    description="Count kernel function calls and their stack traces",
-    formatter_class=argparse.RawDescriptionHelpFormatter,
-    epilog=examples)
-parser.add_argument("-p", "--pid",
-    help="trace this PID only")
-parser.add_argument("-i", "--interval", default=99999999,
-    help="summary interval, seconds")
-parser.add_argument("-T", "--timestamp", action="store_true",
-    help="include timestamp on output")
-parser.add_argument("-r", "--regexp", action="store_true",
-    help="use regular expressions. Default is \"*\" wildcards only.")
-parser.add_argument("-s", "--offset", action="store_true",
-    help="show address offsets")
-parser.add_argument("-v", "--verbose", action="store_true",
-    help="show raw addresses")
-parser.add_argument("pattern",
-    help="search expression for kernel functions")
-args = parser.parse_args()
-pattern = args.pattern
-if not args.regexp:
-    pattern = pattern.replace('*', '.*')
-    pattern = '^' + pattern + '$'
-offset = args.offset
-verbose = args.verbose
-debug = 0
-maxdepth = 10    # and MAXDEPTH
+debug = False
 
-# signal handler
-def signal_ignore(signal, frame):
-    print()
+class Probe(object):
+    def __init__(self, pattern, use_regex=False, pid=None, per_pid=False):
+        """Init a new probe.
 
-# load BPF program
-bpf_text = """
-#include <uapi/linux/ptrace.h>
+        Init the probe from the pattern provided by the user. The supported
+        patterns mimic the 'trace' and 'argdist' tools, but are simpler because
+        we don't have to distinguish between probes and retprobes.
 
-BPF_HASH(counts, int);
-BPF_STACK_TRACE(stack_traces, 1024);
+            func            -- probe a kernel function
+            lib:func        -- probe a user-space function in the library 'lib'
+            p::func         -- same thing as 'func'
+            p:lib:func      -- same thing as 'lib:func'
+            t:cat:event     -- probe a kernel tracepoint
+            u:lib:probe     -- probe a USDT tracepoint
+        """
+        parts = pattern.split(':')
+        if len(parts) == 1:
+            parts = ["p", "", parts[0]]
+        elif len(parts) == 2:
+            parts = ["p", parts[0], parts[1]]
+        elif len(parts) == 3:
+            if parts[0] == "t":
+                parts = ["t", "", "%s:%s" % tuple(parts[1:])]
+            if parts[0] not in ["p", "t", "u"]:
+                raise Exception("Type must be 'p', 't', or 'u', but got %s" %
+                                parts[0])
+        else:
+            raise Exception("Too many ':'-separated components in pattern %s" %
+                            pattern)
 
-int trace_count(struct pt_regs *ctx) {
+        (self.type, self.library, self.pattern) = parts
+        if not use_regex:
+            self.pattern = self.pattern.replace('*', '.*')
+            self.pattern = '^' + self.pattern + '$'
+
+        if (self.type == "p" and self.library) or self.type == "u":
+            libpath = BPF.find_library(self.library)
+            if libpath is None:
+                # This might be an executable (e.g. 'bash')
+                libpath = BPF.find_exe(self.library)
+            if libpath is None or len(libpath) == 0:
+                raise Exception("unable to find library %s" % self.library)
+            self.library = libpath
+
+        self.pid = pid
+        self.per_pid = per_pid
+        self.matched = 0
+
+    def is_kernel_probe(self):
+        return self.type == "t" or (self.type == "p" and self.library == "")
+
+    def attach(self):
+        if self.type == "p":
+            if self.library:
+                self.bpf.attach_uprobe(name=self.library,
+                                       sym_re=self.pattern,
+                                       fn_name="trace_count",
+                                       pid=self.pid or -1)
+                self.matched = self.bpf.num_open_uprobes()
+            else:
+                self.bpf.attach_kprobe(event_re=self.pattern,
+                                       fn_name="trace_count",
+                                       pid=self.pid or -1)
+                self.matched = self.bpf.num_open_kprobes()
+        elif self.type == "t":
+            self.bpf.attach_tracepoint(tp_re=self.pattern,
+                                       fn_name="trace_count",
+                                       pid=self.pid or -1)
+            self.matched = self.bpf.num_open_tracepoints()
+        elif self.type == "u":
+            pass # Nothing to do -- attach already happened in `load`
+
+        if self.matched == 0:
+            raise Exception("No functions matched by pattern %s" % self.pattern)
+
+    def load(self):
+        trace_count_text = """
+int trace_count(void *ctx) {
     FILTER
-    int key = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID);
+    struct key_t key = {};
+    key.pid = GET_PID;
+    key.stackid = stack_traces.get_stackid(ctx, STACK_FLAGS);
     u64 zero = 0;
     u64 *val = counts.lookup_or_init(&key, &zero);
     (*val)++;
     return 0;
 }
-"""
-if args.pid:
-    bpf_text = bpf_text.replace('FILTER',
-        ('u32 pid; pid = bpf_get_current_pid_tgid(); ' +
-        'if (pid != %s) { return 0; }') % (args.pid))
-else:
-    bpf_text = bpf_text.replace('FILTER', '')
-if debug:
-    print(bpf_text)
-b = BPF(text=bpf_text)
-b.attach_kprobe(event_re=pattern, fn_name="trace_count")
-matched = b.num_open_kprobes()
-if matched == 0:
-    print("0 functions matched by \"%s\". Exiting." % args.pattern)
-    exit()
+        """
+        bpf_text = """#include <uapi/linux/ptrace.h>
 
-# header
-print("Tracing %d functions for \"%s\"... Hit Ctrl-C to end." %
-    (matched, args.pattern))
+struct key_t {
+    u32 pid;
+    int stackid;
+};
 
-def print_frame(addr):
-    print("  ", end="")
-    if verbose:
-        print("%-16x " % addr, end="")
-    if offset:
-        print("%s" % b.ksymaddr(addr))
-    else:
-        print("%s" % b.ksym(addr))
+BPF_HASH(counts, struct key_t);
+BPF_STACK_TRACE(stack_traces, 1024);
 
-# output
-exiting = 0 if args.interval else 1
-while (1):
+        """
+
+        # We really mean the tgid from the kernel's perspective, which is in
+        # the top 32 bits of bpf_get_current_pid_tgid().
+        if self.is_kernel_probe() and self.pid:
+            trace_count_text = trace_count_text.replace('FILTER',
+                ('u32 pid; pid = bpf_get_current_pid_tgid() >> 32; ' +
+                'if (pid != %d) { return 0; }') % (self.pid))
+        else:
+            trace_count_text = trace_count_text.replace('FILTER', '')
+
+        # We need per-pid statistics when tracing a user-space process, because
+        # the meaning of the symbols depends on the pid. We also need them if
+        # per-pid statistics were requested with -P.
+        if self.per_pid or not self.is_kernel_probe():
+            trace_count_text = trace_count_text.replace('GET_PID',
+                                        'bpf_get_current_pid_tgid() >> 32')
+        else:
+            trace_count_text = trace_count_text.replace('GET_PID', '0xffffffff')
+
+        stack_flags = 'BPF_F_REUSE_STACKID'
+        if not self.is_kernel_probe():
+            stack_flags += '| BPF_F_USER_STACK' # can't do both U *and* K
+        trace_count_text = trace_count_text.replace('STACK_FLAGS', stack_flags)
+
+        self.usdt = None
+        if self.type == "u":
+            self.usdt = USDT(path=self.library, pid=self.pid)
+            for probe in self.usdt.enumerate_probes():
+                if not self.pid and (probe.bin_path != self.library):
+                    continue
+                if re.match(self.pattern, probe.name):
+                    # This hack is required because the bpf_usdt_readarg
+                    # functions generated need different function names for
+                    # each attached probe. If we just stick to trace_count,
+                    # we'd get multiple bpf_usdt_readarg helpers with the same
+                    # name when enabling more than one USDT probe.
+                    new_func = "trace_count_%d" % self.matched
+                    bpf_text += trace_count_text.replace(
+                                            "trace_count", new_func)
+                    self.usdt.enable_probe(probe.name, new_func)
+                    self.matched += 1
+            if debug:
+                print(self.usdt.get_text())
+        else:
+            bpf_text += trace_count_text
+
+        if debug:
+            print(bpf_text)
+        self.bpf = BPF(text=bpf_text, usdt_contexts=
+                      [self.usdt] if self.usdt else [])
+
+class Tool(object):
+    def __init__(self):
+        examples = """examples:
+    ./stackcount submit_bio          # count kernel stack traces for submit_bio
+    ./stackcount -s ip_output        # show symbol offsets
+    ./stackcount -sv ip_output       # show offsets and raw addresses (verbose)
+    ./stackcount 'tcp_send*'         # count stacks for funcs matching tcp_send*
+    ./stackcount -r '^tcp_send.*'    # same as above, using regular expressions
+    ./stackcount -Ti 5 ip_output     # output every 5 seconds, with timestamps
+    ./stackcount -p 185 ip_output    # count ip_output stacks for PID 185 only
+    ./stackcount -p 185 c:malloc     # count stacks for malloc in PID 185
+    ./stackcount t:sched:sched_fork  # count stacks for the sched_fork tracepoint
+    ./stackcount -p 185 u:node:*     # count stacks for all USDT probes in node
+        """
+        parser = argparse.ArgumentParser(
+            description="Count events and their stack traces",
+            formatter_class=argparse.RawDescriptionHelpFormatter,
+            epilog=examples)
+        parser.add_argument("-p", "--pid", type=int,
+            help="trace this PID only")
+        parser.add_argument("-i", "--interval", default=99999999,
+            help="summary interval, seconds")
+        parser.add_argument("-T", "--timestamp", action="store_true",
+            help="include timestamp on output")
+        parser.add_argument("-r", "--regexp", action="store_true",
+            help="use regular expressions. Default is \"*\" wildcards only.")
+        parser.add_argument("-s", "--offset", action="store_true",
+            help="show address offsets")
+        parser.add_argument("-P", "--perpid", action="store_true",
+            help="display stacks separately for each process")
+        parser.add_argument("-v", "--verbose", action="store_true",
+            help="show raw addresses")
+        parser.add_argument("-d", "--debug", action="store_true",
+            help="print BPF program before starting (for debugging purposes)")
+        parser.add_argument("pattern",
+            help="search expression for events")
+        self.args = parser.parse_args()
+        global debug
+        debug = self.args.debug
+        self.probe = Probe(self.args.pattern, self.args.regexp,
+                           self.args.pid, self.args.perpid)
+
+    def _print_frame(self, addr, pid):
+        print("  ", end="")
+        if self.args.verbose:
+            print("%-16x " % addr, end="")
+        if self.args.offset:
+            print("%s" % self.probe.bpf.symaddr(addr, pid))
+        else:
+            print("%s" % self.probe.bpf.sym(addr, pid))
+
+    @staticmethod
+    def _signal_ignore(signal, frame):
+        print()
+
+    def _comm_for_pid(self, pid):
+        if pid in self.comm_cache:
+            return self.comm_cache[pid]
+
+        try:
+            comm = "    %s [%d]" % (
+                    open("/proc/%d/comm" % pid).read().strip(),
+                    pid)
+            self.comm_cache[pid] = comm
+            return comm
+        except:
+            return "    unknown process [%d]" % pid
+
+    def run(self):
+        self.probe.load()
+        self.probe.attach()
+        print("Tracing %d functions for \"%s\"... Hit Ctrl-C to end." %
+              (self.probe.matched, self.args.pattern))
+        exiting = 0 if self.args.interval else 1
+        while True:
+            try:
+                sleep(int(self.args.interval))
+            except KeyboardInterrupt:
+                exiting = 1
+                # as cleanup can take many seconds, trap Ctrl-C:
+                signal.signal(signal.SIGINT, Tool._signal_ignore)
+
+            print()
+            if self.args.timestamp:
+                print("%-8s\n" % strftime("%H:%M:%S"), end="")
+
+            counts = self.probe.bpf["counts"]
+            stack_traces = self.probe.bpf["stack_traces"]
+            self.comm_cache = {}
+            for k, v in sorted(counts.items(),
+                               key=lambda counts: counts[1].value):
+                for addr in stack_traces.walk(k.stackid):
+                    pid = -1 if self.probe.is_kernel_probe() else k.pid
+                    self._print_frame(addr, pid)
+                if not self.args.pid and k.pid != 0xffffffff:
+                    print(self._comm_for_pid(k.pid))
+                print("    %d\n" % v.value)
+            counts.clear()
+
+            if exiting:
+                print("Detaching...")
+                exit()
+
+if __name__ == "__main__":
     try:
-        sleep(int(args.interval))
-    except KeyboardInterrupt:
-        exiting = 1
-        # as cleanup can take many seconds, trap Ctrl-C:
-        signal.signal(signal.SIGINT, signal_ignore)
+        Tool().run()
+    except Exception:
+        if debug:
+            traceback.print_exc()
+        elif sys.exc_info()[0] is not SystemExit:
+            print(sys.exc_info()[1])
 
-    print()
-    if args.timestamp:
-        print("%-8s\n" % strftime("%H:%M:%S"), end="")
-
-    counts = b["counts"]
-    stack_traces = b["stack_traces"]
-    for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):
-        for addr in stack_traces.walk(k.value):
-            print_frame(addr)
-        print("    %d\n" % v.value)
-    counts.clear()
-
-    if exiting:
-        print("Detaching...")
-        exit()
diff --git a/tools/stackcount_example.txt b/tools/stackcount_example.txt
index 01b5b8a..d2d6448 100644
--- a/tools/stackcount_example.txt
+++ b/tools/stackcount_example.txt
@@ -1,8 +1,8 @@
 Demonstrations of stackcount, the Linux eBPF/bcc version.
 
 
-This program traces kernel functions and frequency counts them with their entire
-kernel stack trace, summarized in-kernel for efficiency. For example, counting
+This program traces functions and frequency counts them with their entire
+stack trace, summarized in-kernel for efficiency. For example, counting
 stack traces that led to submit_bio(), which creates block device I/O:
 
 # ./stackcount submit_bio
@@ -268,6 +268,76 @@
 flow.
 
 
+User-space functions can also be traced if a library name is provided. For
+example, to quickly identify code locations that allocate heap memory:
+
+# ./stackcount -p 4902 c:malloc
+Tracing 1 functions for "malloc"... Hit Ctrl-C to end.
+^C
+  malloc
+  rbtree_new
+  main
+  [unknown]
+    12
+
+  malloc
+  _rbtree_node_new_internal
+  _rbtree_node_insert
+  rbtree_insert
+  main
+  [unknown]
+    1189
+
+Detaching...
+
+Note that user-space uses of stackcount can be somewhat more limited because
+a lot of user-space libraries and binaries are compiled without debuginfo, or
+with frame-pointer omission (-fomit-frame-pointer), which makes it impossible
+to reliably obtain the stack trace.
+
+
+In addition to kernel and user-space functions, kernel tracepoints and USDT
+tracepoints are also supported. 
+
+For example, to determine where threads are being created in a particular 
+process, use the pthread_create USDT tracepoint:
+
+# ./stackcount -p $(pidof parprimes) u:pthread:pthread_create
+Tracing 1 functions for "u:pthread:pthread_create"... Hit Ctrl-C to end.
+^C
+
+    parprimes [11923]
+  pthread_create@@GLIBC_2.2.5
+  main
+  __libc_start_main
+  [unknown]
+    7
+
+Similarly, to determine where context switching is happening in the kernel, 
+use the sched:sched_switch kernel tracepoint:
+
+# ./stackcount t:sched:sched_switch
+... (omitted for brevity)
+
+  __schedule
+  schedule
+  schedule_hrtimeout_range_clock
+  schedule_hrtimeout_range
+  poll_schedule_timeout
+  do_select
+  core_sys_select
+  SyS_select
+  entry_SYSCALL_64_fastpath
+    40
+
+  __schedule
+  schedule
+  schedule_preempt_disabled
+  cpu_startup_entry
+  start_secondary
+    85
+
+
 A -i option can be used to set an output interval, and -T to include a
 timestamp. For example:
 
@@ -434,12 +504,13 @@
 USAGE message:
 
 # ./stackcount -h
-usage: stackcount [-h] [-p PID] [-i INTERVAL] [-T] [-r] [-s] [-v] pattern
+usage: stackcount [-h] [-p PID] [-i INTERVAL] [-T] [-r] [-s]
+                  [-P] [-v] [-d] pattern
 
-Count kernel function calls and their stack traces
+Count function calls and their stack traces
 
 positional arguments:
-  pattern               search expression for kernel functions
+  pattern               search expression for functions
 
 optional arguments:
   -h, --help            show this help message and exit
@@ -450,14 +521,19 @@
   -r, --regexp          use regular expressions. Default is "*" wildcards
                         only.
   -s, --offset          show address offsets
+  -P, --perpid          display stacks separately for each process
   -v, --verbose         show raw addresses
+  -d, --debug           print BPF program before starting (for debugging purposes)
 
 examples:
-    ./stackcount submit_bio       # count kernel stack traces for submit_bio
-    ./stackcount ip_output        # count kernel stack traces for ip_output
-    ./stackcount -s ip_output     # show symbol offsets
-    ./stackcount -sv ip_output    # show offsets and raw addresses (verbose)
-    ./stackcount 'tcp_send*'      # count stacks for funcs matching tcp_send*
-    ./stackcount -r '^tcp_send.*' # same as above, using regular expressions
-    ./stackcount -Ti 5 ip_output  # output every 5 seconds, with timestamps
-    ./stackcount -p 185 ip_output # count ip_output stacks for PID 185 only
+    ./stackcount submit_bio          # count kernel stack traces for submit_bio
+    ./stackcount ip_output           # count kernel stack traces for ip_output
+    ./stackcount -s ip_output        # show symbol offsets
+    ./stackcount -sv ip_output       # show offsets and raw addresses (verbose)
+    ./stackcount 'tcp_send*'         # count stacks for funcs matching tcp_send*
+    ./stackcount -r '^tcp_send.*'    # same as above, using regular expressions
+    ./stackcount -Ti 5 ip_output     # output every 5 seconds, with timestamps
+    ./stackcount -p 185 ip_output    # count ip_output stacks for PID 185 only
+    ./stackcount -p 185 c:malloc     # count stacks for malloc in PID 185
+    ./stackcount t:sched:sched_fork  # count stacks for the sched_fork tracepoint
+    ./stackcount -p 185 u:node:*     # count stacks for all USDT probes in node
diff --git a/tools/tplist.py b/tools/tplist.py
index 2572041..c063b00 100755
--- a/tools/tplist.py
+++ b/tools/tplist.py
@@ -25,8 +25,8 @@
                 "List USDT probes in the specified process")
 parser.add_argument("-l", "--lib", default="", help=
                 "List USDT probes in the specified library or executable")
-parser.add_argument("-v", dest="variables", action="store_true", help=
-                "Print the format (available variables)")
+parser.add_argument("-v", dest="verbosity", action="count", default=0, help=
+                "Increase verbosity level (print variables, arguments, etc.)")
 parser.add_argument(dest="filter", nargs="?", help=
                 "A filter that specifies which probes/tracepoints to print")
 args = parser.parse_args()
@@ -51,7 +51,7 @@
         tpoint = "%s:%s" % (category, event)
         if not args.filter or fnmatch.fnmatch(tpoint, args.filter):
                 print(tpoint)
-                if args.variables:
+                if args.verbosity > 0:
                         print_tpoint_format(category, event)
 
 def print_tracepoints():
@@ -64,6 +64,25 @@
                         if os.path.isdir(evt_dir):
                                 print_tpoint(category, event)
 
+def print_usdt_argument_details(location):
+        for idx in xrange(0, location.num_arguments):
+                arg = location.get_argument(idx)
+                print("    argument #%d %s" % (idx, arg))
+
+def print_usdt_details(probe):
+        if args.verbosity > 0:
+                print(probe)
+                if args.verbosity > 1:
+                        for idx in xrange(0, probe.num_locations):
+                                loc = probe.get_location(idx)
+                                print("  location #%d %s" % (idx, loc))
+                                print_usdt_argument_details(loc)
+                else:
+                        print("  %d location(s)" % probe.num_locations)
+                        print("  %d argument(s)" % probe.num_arguments)
+        else:
+                print("%s %s:%s" % (probe.bin_path, probe.provider, probe.name))
+
 def print_usdt(pid, lib):
         reader = USDT(path=lib, pid=pid)
         probes_seen = []
@@ -73,11 +92,7 @@
                         if probe_name in probes_seen:
                                 continue
                         probes_seen.append(probe_name)
-                        if args.variables:
-                                print(probe)
-                        else:
-                                print("%s %s:%s" % (probe.bin_path,
-                                                    probe.provider, probe.name))
+                        print_usdt_details(probe)
 
 if __name__ == "__main__":
         try:
diff --git a/tools/tplist_example.txt b/tools/tplist_example.txt
index 7beb9b2..9bcbc35 100644
--- a/tools/tplist_example.txt
+++ b/tools/tplist_example.txt
@@ -88,6 +88,31 @@
 you specify with argdist or trace.
 
 
+For debugging USDT probes, it is sometimes useful to see the exact locations
+and arguments of the probes, including the registers or global variables from
+which their values are coming from. In super-verbose mode, tplist will print
+this information (note the -vv):
+
+$ tplist -vv -l c *alloc*
+/lib64/libc.so.6 libc:memory_malloc_retry [sema 0x0]
+  location #0 0x835c0
+    argument #0 8 unsigned bytes @ bp
+  location #1 0x83778
+    argument #0 8 unsigned bytes @ bp
+  location #2 0x85a50
+    argument #0 8 unsigned bytes @ bp
+/lib64/libc.so.6 libc:memory_realloc_retry [sema 0x0]
+  location #0 0x84b90
+    argument #0 8 unsigned bytes @ r13
+    argument #1 8 unsigned bytes @ bp
+  location #1 0x85cf0
+    argument #0 8 unsigned bytes @ r13
+    argument #1 8 unsigned bytes @ bp
+/lib64/libc.so.6 libc:memory_calloc_retry [sema 0x0]
+  location #0 0x850f0
+    argument #0 8 unsigned bytes @ bp
+
+
 USAGE message:
 
 $ tplist -h
@@ -102,5 +127,5 @@
   -h, --help         show this help message and exit
   -p PID, --pid PID  List USDT probes in the specified process
   -l LIB, --lib LIB  List USDT probes in the specified library or executable
-  -v                 Print the format (available variables)
+  -v                 Increase verbosity level (print variables, arguments, etc.)
 
diff --git a/tools/trace.py b/tools/trace.py
index 6722bea..6915fc0 100755
--- a/tools/trace.py
+++ b/tools/trace.py
@@ -163,10 +163,11 @@
 
         def _parse_types(self, fmt):
                 for match in re.finditer(
-                                r'[^%]%(s|u|d|llu|lld|hu|hd|x|llx|c)', fmt):
+                                r'[^%]%(s|u|d|llu|lld|hu|hd|x|llx|c|K|U)', fmt):
                         self.types.append(match.group(1))
                 fmt = re.sub(r'([^%]%)(u|d|llu|lld|hu|hd)', r'\1d', fmt)
                 fmt = re.sub(r'([^%]%)(x|llx)', r'\1x', fmt)
+                fmt = re.sub('%K|%U', '%s', fmt)
                 self.python_format = fmt.strip('"')
 
         def _parse_action(self, action):
@@ -216,8 +217,8 @@
         p_type = { "u": ct.c_uint, "d": ct.c_int,
                    "llu": ct.c_ulonglong, "lld": ct.c_longlong,
                    "hu": ct.c_ushort, "hd": ct.c_short,
-                   "x": ct.c_uint, "llx": ct.c_ulonglong,
-                   "c": ct.c_ubyte }
+                   "x": ct.c_uint, "llx": ct.c_ulonglong, "c": ct.c_ubyte,
+                   "K": ct.c_ulonglong, "U": ct.c_ulonglong }
 
         def _generate_python_field_decl(self, idx, fields):
                 field_type = self.types[idx]
@@ -248,7 +249,8 @@
                    "llu": "unsigned long long", "lld": "long long",
                    "hu": "unsigned short", "hd": "short",
                    "x": "unsigned int", "llx": "unsigned long long",
-                   "c": "char" }
+                   "c": "char", "K": "unsigned long long",
+                   "U": "unsigned long long" }
         fmt_types = c_type.keys()
 
         def _generate_field_decl(self, idx):
@@ -417,12 +419,24 @@
 
         def print_stack(self, bpf, stack_id, pid):
             if stack_id < 0:
-                print("        %d" % stack_id)
-                return
+                    print("        %d" % stack_id)
+                    return
 
             stack = list(bpf.get_table(self.stacks_name).walk(stack_id))
             for addr in stack:
-                print("        %016x %s" % (addr, bpf.sym(addr, pid)))
+                    print("        %016x %s" % (addr, bpf.sym(addr, pid)))
+
+        def _format_message(self, bpf, pid, values):
+                # Replace each %K with kernel sym and %U with user sym in pid
+                kernel_placeholders = [i for i in xrange(0, len(self.types))
+                                       if self.types[i] == 'K']
+                user_placeholders   = [i for i in xrange(0, len(self.types))
+                                       if self.types[i] == 'U']
+                for kp in kernel_placeholders:
+                        values[kp] = bpf.ksymaddr(values[kp])
+                for up in user_placeholders:
+                        values[up] = bpf.symaddr(values[up], pid)
+                return self.python_format % tuple(values)
 
         def print_event(self, bpf, cpu, data, size):
                 # Cast as the generated structure type and display
@@ -430,7 +444,7 @@
                 event = ct.cast(data, ct.POINTER(self.python_struct)).contents
                 values = map(lambda i: getattr(event, "v%d" % i),
                              range(0, len(self.values)))
-                msg = self.python_format % tuple(values)
+                msg = self._format_message(bpf, event.pid, values)
                 time = strftime("%H:%M:%S") if Probe.use_localtime else \
                        Probe._time_off_str(event.timestamp_ns)
                 print("%-8s %-6d %-12s %-16s %s" % \
@@ -438,13 +452,13 @@
                      self._display_function(), msg))
 
                 if self.user_stack:
-                    print("    User Stack Trace:")
-                    self.print_stack(bpf, event.user_stack_id, event.pid)
+                        print("    User Stack Trace:")
+                        self.print_stack(bpf, event.user_stack_id, event.pid)
                 if self.kernel_stack:
-                    print("    Kernel Stack Trace:")
-                    self.print_stack(bpf, event.kernel_stack_id, -1)
+                        print("    Kernel Stack Trace:")
+                        self.print_stack(bpf, event.kernel_stack_id, -1)
                 if self.user_stack or self.kernel_stack:
-                    print("")
+                        print("")
 
                 Probe.event_count += 1
                 if Probe.max_events is not None and \
diff --git a/tools/trace_example.txt b/tools/trace_example.txt
index dbb8cf1..20d61c5 100644
--- a/tools/trace_example.txt
+++ b/tools/trace_example.txt
@@ -105,6 +105,37 @@
 This output tells you that you can use "args->dev", "args->sector", etc. in your
 predicate and trace arguments.
 
+
+More and more high-level libraries are instrumented with USDT probe support.
+These probes can be traced by trace just like kernel tracepoints. For example,
+trace new threads being created and their function name:
+
+# trace 'u:pthread:pthread_create "%U", arg3'
+TIME     PID    COMM         FUNC             -
+02:07:29 4051   contentions  pthread_create   primes_thread+0x0
+02:07:29 4051   contentions  pthread_create   primes_thread+0x0
+02:07:29 4051   contentions  pthread_create   primes_thread+0x0
+02:07:29 4051   contentions  pthread_create   primes_thread+0x0
+^C
+
+The "%U" format specifier tells trace to resolve arg3 as a user-space symbol,
+if possible. Similarly, use "%K" for kernel symbols.
+
+Ruby, Node, and OpenJDK are also instrumented with USDT. For example, let's
+trace Ruby methods being called (this requires a version of Ruby built with 
+the --enable-dtrace configure flag):
+
+# trace 'u:ruby:method__entry "%s.%s", arg1, arg2' -p $(pidof irb)
+TIME     PID    COMM         FUNC             -
+12:08:43 18420  irb          method__entry    IRB::Context.verbose?
+12:08:43 18420  irb          method__entry    RubyLex.ungetc
+12:08:43 18420  irb          method__entry    RubyLex.debug?
+^C
+
+In the previous invocation, arg1 and arg2 are the class name and method name
+for the Ruby method being invoked.
+
+
 As a final example, let's trace open syscalls for a specific process. By 
 default, tracing is system-wide, but the -p switch overrides this:
 
diff --git a/tools/ttysnoop.py b/tools/ttysnoop.py
new file mode 100755
index 0000000..846882c
--- /dev/null
+++ b/tools/ttysnoop.py
@@ -0,0 +1,120 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# ttysnoop   Watch live output from a tty or pts device.
+#            For Linux, uses BCC, eBPF. Embedded C.
+#
+# Due to a limited buffer size (see BUFSIZE), some commands (eg, a vim
+# session) are likely to be printed a little messed up.
+#
+# Copyright (c) 2016 Brendan Gregg.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Idea: from ttywatcher.
+#
+# 15-Oct-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF
+import ctypes as ct
+from subprocess import call
+import argparse
+from sys import argv
+import sys
+from os import stat
+
+def usage():
+    print("USAGE: %s [-Ch] {PTS | /dev/ttydev}  # try -h for help" % argv[0])
+    exit()
+
+# arguments
+examples = """examples:
+    ./ttysnoop /dev/pts/2    # snoop output from /dev/pts/2
+    ./ttysnoop 2             # snoop output from /dev/pts/2 (shortcut)
+    ./ttysnoop /dev/console  # snoop output from the system console
+    ./ttysnoop /dev/tty0     # snoop output from /dev/tty0
+"""
+parser = argparse.ArgumentParser(
+    description="Snoop output from a pts or tty device, eg, a shell",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-C", "--noclear", action="store_true",
+    help="don't clear the screen")
+parser.add_argument("device", default="-1",
+    help="path to a tty device (eg, /dev/tty0) or pts number")
+args = parser.parse_args()
+debug = 0
+
+if args.device == "-1":
+    usage()
+
+path = args.device
+if path.find('/') != 0:
+    path = "/dev/pts/" + path
+try:
+    pi = stat(path)
+except:
+    print("Unable to read device %s. Exiting." % path)
+    exit()
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <linux/fs.h>
+
+#define BUFSIZE 256
+struct data_t {
+    int count;
+    char buf[BUFSIZE];
+};
+
+BPF_PERF_OUTPUT(events);
+
+int kprobe__tty_write(struct pt_regs *ctx, struct file *file,
+    const char __user *buf, size_t count)
+{
+    if (file->f_inode->i_ino != PTS)
+        return 0;
+
+    // bpf_probe_read() can only use a fixed size, so truncate to count
+    // in user space:
+    struct data_t data = {};
+    bpf_probe_read(&data.buf, BUFSIZE, (void *)buf);
+    if (count > BUFSIZE)
+        data.count = BUFSIZE;
+    else
+        data.count = count;
+    events.perf_submit(ctx, &data, sizeof(data));
+
+    return 0;
+};
+"""
+
+bpf_text = bpf_text.replace('PTS', str(pi.st_ino))
+if debug:
+    print(bpf_text)
+
+# initialize BPF
+b = BPF(text=bpf_text)
+
+BUFSIZE = 256
+
+class Data(ct.Structure):
+    _fields_ = [
+        ("count", ct.c_int),
+        ("buf", ct.c_char * BUFSIZE)
+    ]
+
+if not args.noclear:
+    call("clear")
+
+# process event
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data)).contents
+    print("%s" % event.buf[0:event.count], end="")
+    sys.stdout.flush()
+
+# loop with callback to print_event
+b["events"].open_perf_buffer(print_event)
+while 1:
+    b.kprobe_poll()
diff --git a/tools/ttysnoop_example.txt b/tools/ttysnoop_example.txt
new file mode 100644
index 0000000..1c29961
--- /dev/null
+++ b/tools/ttysnoop_example.txt
@@ -0,0 +1,83 @@
+Demonstrations of ttysnoop, the Linux eBPF/bcc version.
+
+
+ttysnoop watches a tty or pts device, and prints the same output that is
+appearing on that device. It can be used to mirror the output from a shell
+session, or the system console.
+
+Let's snoop /dev/pts/2:
+
+# ./ttysnoop 2
+<screen clears>
+date
+Sun Oct 16 01:28:47 UTC 2016
+# uname -a
+Linux bgregg-xenial-bpf-i-xxx 4.8.0-rc4-virtual #1 SMP Wed Aug 31 22:54:37 UTC 2016 x86_64 x86_64 x86_64 GNU/Linux
+# df -h
+Filesystem      Size  Used Avail Use% Mounted on
+udev            7.4G     0  7.4G   0% /dev
+tmpfs           1.5G   89M  1.4G   6% /run
+/dev/xvda1      7.8G  4.5G  3.3G  59% /
+tmpfs           7.4G     0  7.4G   0% /dev/shm
+tmpfs           5.0M     0  5.0M   0% /run/lock
+tmpfs           7.4G     0  7.4G   0% /sys/fs/cgroup
+tmpfs           250M     0  250M   0% /run/shm
+/dev/md0        160G   20G  141G  13% /mnt
+tmpfs           1.5G     0  1.5G   0% /run/user/0
+# ^C
+
+What we're seeing is another shell session. The first line was "date" without
+the shell prompt ("#") because we began tracing after the prompt was printed.
+The other commands appeared, keystroke by keystroke, as the user was typing
+them. Spooky!
+
+Remember to Ctrl-C to exit ttysnoop.
+
+
+To figure out which pts device number to use, you can check your own with "ps"
+and other's with "w". For example:
+
+# ps -p $$
+  PID TTY          TIME CMD
+ 9605 pts/1    00:00:00 bash
+# w
+ 01:26:37 up 9 days, 35 min,  2 users,  load average: 0.22, 0.22, 0.15
+USER     TTY      FROM             LOGIN@   IDLE   JCPU   PCPU WHAT
+root     pts/1    100.127.65.241   00:39    2.00s  0.33s  0.33s -bash
+root     pts/2    100.127.65.241   00:40   16.00s  1.06s  1.06s -bash
+
+So I'm pts/1, and there's another session that's pts/2.
+
+
+This can also snoop tty devices using their full path. Eg, snooping the system
+console:
+
+# ./ttysnoop /dev/console
+Oct 16 01:32:06 bgregg-xenial-bpf-i-xxx kernel: [780087.407428] bash (9888): drop_caches: 1
+Oct 16 01:32:38 bgregg-xenial-bpf-i-xxx snmpd[2708]: Cannot statfs /sys/kernel/debug/tracing: Permission denied
+Oct 16 01:33:32 bgregg-xenial-bpf-i-xxx snmpd[2708]: Cannot statfs /sys/kernel/debug/tracing: Permission denied
+Oct 16 01:34:26 bgregg-xenial-bpf-i-xxx snmpd[2708]: Cannot statfs /sys/kernel/debug/tracing: Permission denied
+^C
+
+Neat!
+
+
+USAGE:
+
+# ./ttysnoop.py -h
+usage: ttysnoop.py [-h] [-C] device
+
+Snoop output from a pts or tty device, eg, a shell
+
+positional arguments:
+  device         path to a tty device (eg, /dev/tty0) or pts number
+
+optional arguments:
+  -h, --help     show this help message and exit
+  -C, --noclear  don't clear the screen
+
+examples:
+    ./ttysnoop /dev/pts/2    # snoop output from /dev/pts/2
+    ./ttysnoop 2             # snoop output from /dev/pts/2 (shortcut)
+    ./ttysnoop /dev/console  # snoop output from the system console
+    ./ttysnoop /dev/tty0     # snoop output from /dev/tty0
diff --git a/tools/xfsslower.py b/tools/xfsslower.py
index 2e9c5b2..80d9878 100755
--- a/tools/xfsslower.py
+++ b/tools/xfsslower.py
@@ -22,6 +22,7 @@
 # Licensed under the Apache License, Version 2.0 (the "License")
 #
 # 11-Feb-2016   Brendan Gregg   Created this.
+# 16-Oct-2016   Dina Goldshtein -p to filter by process ID.
 
 from __future__ import print_function
 from bcc import BPF
@@ -84,7 +85,7 @@
     char file[DNAME_INLINE_LEN];
 };
 
-BPF_HASH(entryinfo, pid_t, struct val_t);
+BPF_HASH(entryinfo, u64, struct val_t);
 BPF_PERF_OUTPUT(events);
 
 //
@@ -94,18 +95,19 @@
 // xfs_file_read_iter(), xfs_file_write_iter():
 int trace_rw_entry(struct pt_regs *ctx, struct kiocb *iocb)
 {
-    u32 pid;
-    pid = bpf_get_current_pid_tgid();
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
     if (FILTER_PID)
         return 0;
 
-    // store filep and timestamp by pid
+    // store filep and timestamp by id
     struct val_t val = {};
     val.ts = bpf_ktime_get_ns();
     val.fp = iocb->ki_filp;
     val.offset = iocb->ki_pos;
     if (val.fp)
-        entryinfo.update(&pid, &val);
+        entryinfo.update(&id, &val);
 
     return 0;
 }
@@ -114,18 +116,19 @@
 int trace_open_entry(struct pt_regs *ctx, struct inode *inode,
     struct file *file)
 {
-    u32 pid;
-    pid = bpf_get_current_pid_tgid();
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
     if (FILTER_PID)
         return 0;
 
-    // store filep and timestamp by pid
+    // store filep and timestamp by id
     struct val_t val = {};
     val.ts = bpf_ktime_get_ns();
     val.fp = file;
     val.offset = 0;
     if (val.fp)
-        entryinfo.update(&pid, &val);
+        entryinfo.update(&id, &val);
 
     return 0;
 }
@@ -133,18 +136,19 @@
 // xfs_file_fsync():
 int trace_fsync_entry(struct pt_regs *ctx, struct file *file)
 {
-    u32 pid;
-    pid = bpf_get_current_pid_tgid();
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
     if (FILTER_PID)
         return 0;
 
-    // store filep and timestamp by pid
+    // store filep and timestamp by id
     struct val_t val = {};
     val.ts = bpf_ktime_get_ns();
     val.fp = file;
     val.offset = 0;
     if (val.fp)
-        entryinfo.update(&pid, &val);
+        entryinfo.update(&id, &val);
 
     return 0;
 }
@@ -156,9 +160,10 @@
 static int trace_return(struct pt_regs *ctx, int type)
 {
     struct val_t *valp;
-    u32 pid = bpf_get_current_pid_tgid();
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
 
-    valp = entryinfo.lookup(&pid);
+    valp = entryinfo.lookup(&id);
     if (valp == 0) {
         // missed tracing issue or filtered
         return 0;
@@ -167,7 +172,7 @@
     // calculate delta
     u64 ts = bpf_ktime_get_ns();
     u64 delta_us = (ts - valp->ts) / 1000;
-    entryinfo.delete(&pid);
+    entryinfo.delete(&id);
     if (FILTER_US)
         return 0;
 
diff --git a/tools/zfsslower.py b/tools/zfsslower.py
index 4250c59..8456f21 100755
--- a/tools/zfsslower.py
+++ b/tools/zfsslower.py
@@ -25,6 +25,7 @@
 # Licensed under the Apache License, Version 2.0 (the "License")
 #
 # 14-Feb-2016   Brendan Gregg   Created this.
+# 16-Oct-2016   Dina Goldshtein -p to filter by process ID.
 
 from __future__ import print_function
 from bcc import BPF
@@ -87,7 +88,7 @@
     char file[DNAME_INLINE_LEN];
 };
 
-BPF_HASH(entryinfo, pid_t, struct val_t);
+BPF_HASH(entryinfo, u64, struct val_t);
 BPF_PERF_OUTPUT(events);
 
 //
@@ -98,18 +99,19 @@
 int trace_rw_entry(struct pt_regs *ctx, struct file *filp, char __user *buf,
     size_t len, loff_t *ppos)
 {
-    u32 pid;
-    pid = bpf_get_current_pid_tgid();
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
     if (FILTER_PID)
         return 0;
 
-    // store filep and timestamp by pid
+    // store filep and timestamp by id
     struct val_t val = {};
     val.ts = bpf_ktime_get_ns();
     val.fp = filp;
     val.offset = *ppos;
     if (val.fp)
-        entryinfo.update(&pid, &val);
+        entryinfo.update(&id, &val);
 
     return 0;
 }
@@ -118,18 +120,19 @@
 int trace_open_entry(struct pt_regs *ctx, struct inode *inode,
     struct file *filp)
 {
-    u32 pid;
-    pid = bpf_get_current_pid_tgid();
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
     if (FILTER_PID)
         return 0;
 
-    // store filep and timestamp by pid
+    // store filep and timestamp by id
     struct val_t val = {};
     val.ts = bpf_ktime_get_ns();
     val.fp = filp;
     val.offset = 0;
     if (val.fp)
-        entryinfo.update(&pid, &val);
+        entryinfo.update(&id, &val);
 
     return 0;
 }
@@ -137,18 +140,19 @@
 // zpl_fsync():
 int trace_fsync_entry(struct pt_regs *ctx, struct file *filp)
 {
-    u32 pid;
-    pid = bpf_get_current_pid_tgid();
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
+
     if (FILTER_PID)
         return 0;
 
-    // store filp and timestamp by pid
+    // store filp and timestamp by id
     struct val_t val = {};
     val.ts = bpf_ktime_get_ns();
     val.fp = filp;
     val.offset = 0;
     if (val.fp)
-        entryinfo.update(&pid, &val);
+        entryinfo.update(&id, &val);
 
     return 0;
 }
@@ -160,9 +164,10 @@
 static int trace_return(struct pt_regs *ctx, int type)
 {
     struct val_t *valp;
-    u32 pid = bpf_get_current_pid_tgid();
+    u64 id = bpf_get_current_pid_tgid();
+    u32 pid = id >> 32; // PID is higher part
 
-    valp = entryinfo.lookup(&pid);
+    valp = entryinfo.lookup(&id);
     if (valp == 0) {
         // missed tracing issue or filtered
         return 0;
@@ -171,7 +176,7 @@
     // calculate delta
     u64 ts = bpf_ktime_get_ns();
     u64 delta_us = (ts - valp->ts) / 1000;
-    entryinfo.delete(&pid);
+    entryinfo.delete(&id);
     if (FILTER_US)
         return 0;