Make perf ring buffer size configurable
As discussed in #966, this PR makes the size of the ring buffer used to send
data to userspace configurable. It changes the Python, Lua and C++ APIs to
expose this knob.
It also raises the default buffer size to 64 pages per CPU (an 8x increase)
for several tools that produce a lot of output, and makes the size
configurable in `trace` via a `-b` flag.
diff --git a/tools/biosnoop.lua b/tools/biosnoop.lua
index ac08897..fac7f3b 100644
--- a/tools/biosnoop.lua
+++ b/tools/biosnoop.lua
@@ -175,9 +175,9 @@
uint64_t sector;
uint64_t len;
uint64_t ts;
- char disk_name[%d];
- char name[%d];
+ char disk_name[$];
+ char name[$];
}
- ]] % {DISK_NAME_LEN, TASK_COMM_LEN})
+ ]], {DISK_NAME_LEN, TASK_COMM_LEN}, 64)
bpf:kprobe_poll_loop()
end
diff --git a/tools/biosnoop.py b/tools/biosnoop.py
index aa8a077..3d77e52 100755
--- a/tools/biosnoop.py
+++ b/tools/biosnoop.py
@@ -182,6 +182,6 @@
start_ts = 1
# loop with callback to print_event
-b["events"].open_perf_buffer(print_event)
+b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
b.kprobe_poll()
diff --git a/tools/btrfsslower.py b/tools/btrfsslower.py
index fcc155e..8b34900 100755
--- a/tools/btrfsslower.py
+++ b/tools/btrfsslower.py
@@ -343,6 +343,6 @@
"BYTES", "OFF_KB", "LAT(ms)", "FILENAME"))
# read events
-b["events"].open_perf_buffer(print_event)
+b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
b.kprobe_poll()
diff --git a/tools/cpuunclaimed.py b/tools/cpuunclaimed.py
index 9624b50..3998f9f 100755
--- a/tools/cpuunclaimed.py
+++ b/tools/cpuunclaimed.py
@@ -205,7 +205,7 @@
trigger = int(0.8 * (1000000000 / frequency))
# read events
-b["events"].open_perf_buffer(print_event)
+b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
# allow some buffering by calling sleep(), to reduce the context switch
# rate and lower overhead.
diff --git a/tools/dbslower.py b/tools/dbslower.py
index 70e0503..6ddec41 100755
--- a/tools/dbslower.py
+++ b/tools/dbslower.py
@@ -131,7 +131,7 @@
(', '.join(map(str, args.pids)), args.threshold))
print("%-14s %-6s %8s %s" % ("TIME(s)", "PID", "MS", "QUERY"))
-bpf["events"].open_perf_buffer(print_event)
+bpf["events"].open_perf_buffer(print_event, page_cnt=64)
while True:
bpf.kprobe_poll()
diff --git a/tools/dcsnoop.py b/tools/dcsnoop.py
index d162a66..a72ba41 100755
--- a/tools/dcsnoop.py
+++ b/tools/dcsnoop.py
@@ -153,6 +153,6 @@
# header
print("%-11s %-6s %-16s %1s %s" % ("TIME(s)", "PID", "COMM", "T", "FILE"))
-b["events"].open_perf_buffer(print_event)
+b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
b.kprobe_poll()
diff --git a/tools/ext4slower.py b/tools/ext4slower.py
index 20865a5..4950325 100755
--- a/tools/ext4slower.py
+++ b/tools/ext4slower.py
@@ -337,6 +337,6 @@
"BYTES", "OFF_KB", "LAT(ms)", "FILENAME"))
# read events
-b["events"].open_perf_buffer(print_event)
+b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
b.kprobe_poll()
diff --git a/tools/fileslower.py b/tools/fileslower.py
index 2ae4756..ab29990 100755
--- a/tools/fileslower.py
+++ b/tools/fileslower.py
@@ -243,6 +243,6 @@
time.time() - start_ts, event.comm, event.pid, mode_s[event.mode],
event.sz, ms, name))
-b["events"].open_perf_buffer(print_event)
+b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
b.kprobe_poll()
diff --git a/tools/mysqld_qslower.py b/tools/mysqld_qslower.py
index 94906a8..3ed18ec 100755
--- a/tools/mysqld_qslower.py
+++ b/tools/mysqld_qslower.py
@@ -128,6 +128,6 @@
event.pid, float(event.delta) / 1000000, event.query))
# loop with callback to print_event
-b["events"].open_perf_buffer(print_event)
+b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
b.kprobe_poll()
diff --git a/tools/opensnoop.py b/tools/opensnoop.py
index 0c2b9b5..dae4ff4 100755
--- a/tools/opensnoop.py
+++ b/tools/opensnoop.py
@@ -178,6 +178,6 @@
event.comm, fd_s, err, event.fname))
# loop with callback to print_event
-b["events"].open_perf_buffer(print_event)
+b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
b.kprobe_poll()
diff --git a/tools/stacksnoop.lua b/tools/stacksnoop.lua
index 8f5f5b4..7dfaf3d 100755
--- a/tools/stacksnoop.lua
+++ b/tools/stacksnoop.lua
@@ -102,6 +102,6 @@
bpf:get_table("events"):open_perf_buffer(print_event,
"struct { uint64_t stack_id; uint32_t pid; char comm[$]; }",
- TASK_COMM_LEN)
+ {TASK_COMM_LEN})
bpf:kprobe_poll_loop()
end
diff --git a/tools/statsnoop.py b/tools/statsnoop.py
index 2fc2164..d9164b6 100755
--- a/tools/statsnoop.py
+++ b/tools/statsnoop.py
@@ -159,6 +159,6 @@
fd_s, err, event.fname))
# loop with callback to print_event
-b["events"].open_perf_buffer(print_event)
+b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
b.kprobe_poll()
diff --git a/tools/tcplife.py b/tools/tcplife.py
index 1125f9c..69ba174 100755
--- a/tools/tcplife.py
+++ b/tools/tcplife.py
@@ -354,7 +354,7 @@
start_ts = 0
# read events
-b["ipv4_events"].open_perf_buffer(print_ipv4_event)
-b["ipv6_events"].open_perf_buffer(print_ipv6_event)
+b["ipv4_events"].open_perf_buffer(print_ipv4_event, page_cnt=64)
+b["ipv6_events"].open_perf_buffer(print_ipv6_event, page_cnt=64)
while 1:
b.kprobe_poll()
diff --git a/tools/trace.py b/tools/trace.py
index 46bc97e..029194c 100755
--- a/tools/trace.py
+++ b/tools/trace.py
@@ -29,6 +29,7 @@
use_localtime = True
tgid = -1
pid = -1
+ page_cnt = None
@classmethod
def configure(cls, args):
@@ -38,6 +39,7 @@
cls.first_ts = BPF.monotonic_time()
cls.tgid = args.tgid or -1
cls.pid = args.pid or -1
+ cls.page_cnt = args.buffer_pages
def __init__(self, probe, string_size, kernel_stack, user_stack):
self.usdt = None
@@ -510,7 +512,8 @@
self._attach_u(bpf)
self.python_struct = self._generate_python_data_decl()
callback = partial(self.print_event, bpf)
- bpf[self.events_name].open_perf_buffer(callback)
+ bpf[self.events_name].open_perf_buffer(callback,
+ page_cnt=self.page_cnt)
def _attach_k(self, bpf):
if self.probe_type == "r":
@@ -543,6 +546,7 @@
pid=Probe.tgid)
class Tool(object):
+ DEFAULT_PERF_BUFFER_PAGES = 64
examples = """
EXAMPLES:
@@ -577,6 +581,10 @@
"functions and print trace messages.",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=Tool.examples)
+ parser.add_argument("-b", "--buffer-pages", type=int,
+ default=Tool.DEFAULT_PERF_BUFFER_PAGES,
+ help="number of pages to use for perf_events ring buffer "
+ "(default: %(default)d)")
# we'll refer to the userspace concepts of "pid" and "tid" by
# their kernel names -- tgid and pid -- inside the script
parser.add_argument("-p", "--pid", type=int, metavar="PID",
diff --git a/tools/trace_example.txt b/tools/trace_example.txt
index 504030c..eb72e5e 100644
--- a/tools/trace_example.txt
+++ b/tools/trace_example.txt
@@ -201,11 +201,28 @@
libraries and then accessing the /home/vagrant directory listing.
+Lastly, if a high-frequency event is traced you may overflow the perf ring
+buffer. This shows as "Lost N samples":
+
+# trace sys_open
+5087 5087 pgrep sys_open
+5087 5087 pgrep sys_open
+5087 5087 pgrep sys_open
+5087 5087 pgrep sys_open
+5087 5087 pgrep sys_open
+Lost 764896 samples
+Lost 764896 samples
+Lost 764896 samples
+
+The perf ring buffer size can be changed with -b. The size applies to each
+per-CPU buffer and is measured in pages. The value must be a power of two and
+defaults to 64 pages.
+
+
USAGE message:
-# trace -h
-usage: trace [-h] [-p PID] [-L TID] [-v] [-Z STRING_SIZE] [-S]
- [-M MAX_EVENTS] [-t] [-T] [-K] [-U] [-I header]
+usage: trace [-h] [-b BUFFER_PAGES] [-p PID] [-L TID] [-v] [-Z STRING_SIZE]
+ [-S] [-M MAX_EVENTS] [-t] [-T] [-K] [-U] [-I header]
probe [probe ...]
Attach to functions and print trace messages.
@@ -215,6 +232,9 @@
optional arguments:
-h, --help show this help message and exit
+ -b BUFFER_PAGES, --buffer-pages BUFFER_PAGES
+ number of pages to use for perf_events ring buffer
+ (default: 64)
-p PID, --pid PID id of the process to trace (optional)
-L TID, --tid TID id of the thread to trace (optional)
-v, --verbose print resulting BPF program code before executing
@@ -224,7 +244,7 @@
-M MAX_EVENTS, --max-events MAX_EVENTS
number of events to print before quitting
-t, --timestamp print timestamp column (offset from trace start)
- -T, --time print time column
+ -T, --time print time column
-K, --kernel-stack output kernel stack trace
-U, --user-stack output user stack trace
-I header, --include header
@@ -247,9 +267,9 @@
Trace malloc calls and print the size being allocated
trace 'p:c:write (arg1 == 1) "writing %d bytes to STDOUT", arg3'
Trace the write() call from libc to monitor writes to STDOUT
-trace 'r::__kmalloc (retval == 0) "kmalloc failed!"
+trace 'r::__kmalloc (retval == 0) "kmalloc failed!"'
Trace returns from __kmalloc which returned a null pointer
-trace 'r:c:malloc (retval) "allocated = %x", retval
+trace 'r:c:malloc (retval) "allocated = %x", retval'
Trace returns from malloc and print non-NULL allocated buffers
trace 't:block:block_rq_complete "sectors=%d", args->nr_sector'
Trace the block_rq_complete kernel tracepoint and print # of tx sectors
diff --git a/tools/xfsslower.py b/tools/xfsslower.py
index 25c5a20..3fbc96d 100755
--- a/tools/xfsslower.py
+++ b/tools/xfsslower.py
@@ -293,6 +293,6 @@
"BYTES", "OFF_KB", "LAT(ms)", "FILENAME"))
# read events
-b["events"].open_perf_buffer(print_event)
+b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
b.kprobe_poll()
diff --git a/tools/zfsslower.py b/tools/zfsslower.py
index e2be684..f5e8cbb 100755
--- a/tools/zfsslower.py
+++ b/tools/zfsslower.py
@@ -297,6 +297,6 @@
"BYTES", "OFF_KB", "LAT(ms)", "FILENAME"))
# read events
-b["events"].open_perf_buffer(print_event)
+b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
b.kprobe_poll()