Make perf ring buffer size configurable

As discussed in #966, this PR makes the size of the ring buffer used to send
data to userspace configurable. It changes the Python, Lua and C++ APIs to
expose this knob.

It also sets a larger buffer size (64 pages per CPU, an 8x increase) in
several tools which produce a lot of output, and makes the size configurable
in `trace` via a new `-b` flag.
diff --git a/docs/reference_guide.md b/docs/reference_guide.md
index f976443..9e10a99 100644
--- a/docs/reference_guide.md
+++ b/docs/reference_guide.md
@@ -865,9 +865,9 @@
 
 ### 2. open_perf_buffer()
 
-Syntax: ```table.open_perf_buffers(callback)```
+Syntax: ```table.open_perf_buffer(callback, page_cnt=N)```
 
-This operates on a table as defined in BPF as BPF_PERF_OUTPUT(), and associates the callback Python function ```callback``` to be called when data is available in the perf ring buffer. This is part of the recommended mechanism for transferring per-event data from kernel to user space.
+This operates on a table as defined in BPF as BPF_PERF_OUTPUT(), and associates the callback Python function ```callback``` to be called when data is available in the perf ring buffer. This is part of the recommended mechanism for transferring per-event data from kernel to user space. The size of each per-CPU perf ring buffer can be specified via the ```page_cnt``` parameter, which is a number of pages, must be a power of two, and defaults to 8.
 
 Example:
 
diff --git a/examples/lua/bashreadline.lua b/examples/lua/bashreadline.lua
index ebb4c35..045fbc0 100755
--- a/examples/lua/bashreadline.lua
+++ b/examples/lua/bashreadline.lua
@@ -24,7 +24,7 @@
     print("%-9s %-6d %s" % {os.date("%H:%M:%S"), tonumber(event.pid), ffi.string(event.str)})
   end
 
-  b:get_table("events"):open_perf_buffer(print_readline, "struct { uint64_t pid; char str[80]; }")
+  b:get_table("events"):open_perf_buffer(print_readline, "struct { uint64_t pid; char str[80]; }", nil)
 
   print("%-9s %-6s %s" % {"TIME", "PID", "COMMAND"})
   b:kprobe_poll_loop()
diff --git a/src/cc/BPF.cc b/src/cc/BPF.cc
index 9e1c23a..809bfdb 100644
--- a/src/cc/BPF.cc
+++ b/src/cc/BPF.cc
@@ -392,11 +392,14 @@
 }
 
 StatusTuple BPF::open_perf_buffer(const std::string& name,
-                                  perf_reader_raw_cb cb, void* cb_cookie) {
+                                  perf_reader_raw_cb cb, void* cb_cookie,
+                                  int page_cnt) {
   if (perf_buffers_.find(name) == perf_buffers_.end())
     perf_buffers_[name] = new BPFPerfBuffer(bpf_module_.get(), name);
+  if ((page_cnt & (page_cnt - 1)) != 0)
+    return StatusTuple(-1, "open_perf_buffer page_cnt must be a power of two");
   auto table = perf_buffers_[name];
-  TRY2(table->open_all_cpu(cb, cb_cookie));
+  TRY2(table->open_all_cpu(cb, cb_cookie, page_cnt));
   return StatusTuple(0);
 }
 
diff --git a/src/cc/BPF.h b/src/cc/BPF.h
index ba2c15b..a4a8817 100644
--- a/src/cc/BPF.h
+++ b/src/cc/BPF.h
@@ -27,6 +27,8 @@
 #include "compat/linux/bpf.h"
 #include "libbpf.h"
 
+static const int DEFAULT_PERF_BUFFER_PAGE_CNT = 8;
+
 namespace ebpf {
 
 struct open_probe_t {
@@ -96,7 +98,8 @@
   }
 
   StatusTuple open_perf_buffer(const std::string& name, perf_reader_raw_cb cb,
-                               void* cb_cookie = nullptr);
+                               void* cb_cookie = nullptr,
+                               int page_cnt = DEFAULT_PERF_BUFFER_PAGE_CNT);
   StatusTuple close_perf_buffer(const std::string& name);
   void poll_perf_buffer(const std::string& name, int timeout = -1);
 
diff --git a/src/cc/BPFTable.cc b/src/cc/BPFTable.cc
index 837d5bd..d0f2b99 100644
--- a/src/cc/BPFTable.cc
+++ b/src/cc/BPFTable.cc
@@ -67,11 +67,11 @@
 }
 
 StatusTuple BPFPerfBuffer::open_on_cpu(perf_reader_raw_cb cb, int cpu,
-                                       void* cb_cookie) {
+                                       void* cb_cookie, int page_cnt) {
   if (cpu_readers_.find(cpu) != cpu_readers_.end())
     return StatusTuple(-1, "Perf buffer already open on CPU %d", cpu);
   auto reader =
-      static_cast<perf_reader*>(bpf_open_perf_buffer(cb, cb_cookie, -1, cpu));
+      static_cast<perf_reader*>(bpf_open_perf_buffer(cb, cb_cookie, -1, cpu, page_cnt));
   if (reader == nullptr)
     return StatusTuple(-1, "Unable to construct perf reader");
   int reader_fd = perf_reader_fd(reader);
@@ -86,12 +86,12 @@
 }
 
 StatusTuple BPFPerfBuffer::open_all_cpu(perf_reader_raw_cb cb,
-                                        void* cb_cookie) {
+                                        void* cb_cookie, int page_cnt) {
   if (cpu_readers_.size() != 0 || readers_.size() != 0)
     return StatusTuple(-1, "Previously opened perf buffer not cleaned");
 
   for (int i: get_online_cpus()) {
-    auto res = open_on_cpu(cb, i, cb_cookie);
+    auto res = open_on_cpu(cb, i, cb_cookie, page_cnt);
     if (res.code() != 0) {
       TRY2(close_all_cpu());
       return res;
diff --git a/src/cc/BPFTable.h b/src/cc/BPFTable.h
index c5f805d..98424f1 100644
--- a/src/cc/BPFTable.h
+++ b/src/cc/BPFTable.h
@@ -126,12 +126,14 @@
       : BPFTableBase<int, int>(bpf_module, name) {}
   ~BPFPerfBuffer();
 
-  StatusTuple open_all_cpu(perf_reader_raw_cb cb, void* cb_cookie);
+  StatusTuple open_all_cpu(perf_reader_raw_cb cb, void* cb_cookie,
+                           int page_cnt);
   StatusTuple close_all_cpu();
   void poll(int timeout);
 
 private:
-  StatusTuple open_on_cpu(perf_reader_raw_cb cb, int cpu, void* cb_cookie);
+  StatusTuple open_on_cpu(perf_reader_raw_cb cb, int cpu, void* cb_cookie,
+                          int page_cnt);
   StatusTuple close_on_cpu(int cpu);
 
   std::map<int, perf_reader*> cpu_readers_;
diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c
index f3d2881..864b89e 100644
--- a/src/cc/libbpf.c
+++ b/src/cc/libbpf.c
@@ -65,6 +65,8 @@
 #define PERF_FLAG_FD_CLOEXEC (1UL << 3)
 #endif
 
+static int probe_perf_reader_page_cnt = 8;
+
 static __u64 ptr_to_u64(void *ptr)
 {
   return (__u64) (unsigned long) ptr;
@@ -351,7 +353,7 @@
   int n;
 
   snprintf(new_name, sizeof(new_name), "%s_bcc_%d", ev_name, getpid());
-  reader = perf_reader_new(cb, NULL, cb_cookie);
+  reader = perf_reader_new(cb, NULL, cb_cookie, probe_perf_reader_page_cnt);
   if (!reader)
     goto error;
 
@@ -411,7 +413,7 @@
   int n;
 
   snprintf(new_name, sizeof(new_name), "%s_bcc_%d", ev_name, getpid());
-  reader = perf_reader_new(cb, NULL, cb_cookie);
+  reader = perf_reader_new(cb, NULL, cb_cookie, probe_perf_reader_page_cnt);
   if (!reader)
     goto error;
 
@@ -493,7 +495,7 @@
   char buf[256];
   struct perf_reader *reader = NULL;
 
-  reader = perf_reader_new(cb, NULL, cb_cookie);
+  reader = perf_reader_new(cb, NULL, cb_cookie, probe_perf_reader_page_cnt);
   if (!reader)
     goto error;
 
@@ -515,12 +517,13 @@
   return 0;
 }
 
-void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid, int cpu) {
+void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid,
+    int cpu, int page_cnt) {
   int pfd;
   struct perf_event_attr attr = {};
   struct perf_reader *reader = NULL;
 
-  reader = perf_reader_new(NULL, raw_cb, cb_cookie);
+  reader = perf_reader_new(NULL, raw_cb, cb_cookie, page_cnt);
   if (!reader)
     goto error;
 
diff --git a/src/cc/libbpf.h b/src/cc/libbpf.h
index 07afaa9..0e9fda2 100644
--- a/src/cc/libbpf.h
+++ b/src/cc/libbpf.h
@@ -68,7 +68,8 @@
                              int group_fd, perf_reader_cb cb, void *cb_cookie);
 int bpf_detach_tracepoint(const char *tp_category, const char *tp_name);
 
-void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid, int cpu);
+void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid,
+    int cpu, int page_cnt);
 
 /* attached a prog expressed by progfd to the device specified in dev_name */
 int bpf_attach_xdp(const char *dev_name, int progfd);
diff --git a/src/cc/perf_reader.c b/src/cc/perf_reader.c
index 793a069..a9ed18e 100644
--- a/src/cc/perf_reader.c
+++ b/src/cc/perf_reader.c
@@ -26,8 +26,6 @@
 #include "libbpf.h"
 #include "perf_reader.h"
 
-int perf_reader_page_cnt = 8;
-
 struct perf_reader {
   perf_reader_cb cb;
   perf_reader_raw_cb raw_cb;
@@ -42,7 +40,8 @@
   uint64_t sample_type;
 };
 
-struct perf_reader * perf_reader_new(perf_reader_cb cb, perf_reader_raw_cb raw_cb, void *cb_cookie) {
+struct perf_reader * perf_reader_new(perf_reader_cb cb,
+    perf_reader_raw_cb raw_cb, void *cb_cookie, int page_cnt) {
   struct perf_reader *reader = calloc(1, sizeof(struct perf_reader));
   if (!reader)
     return NULL;
@@ -51,7 +50,7 @@
   reader->cb_cookie = cb_cookie;
   reader->fd = -1;
   reader->page_size = getpagesize();
-  reader->page_cnt = perf_reader_page_cnt;
+  reader->page_cnt = page_cnt;
   return reader;
 }
 
diff --git a/src/cc/perf_reader.h b/src/cc/perf_reader.h
index 6376c47..4bbb1e3 100644
--- a/src/cc/perf_reader.h
+++ b/src/cc/perf_reader.h
@@ -25,7 +25,8 @@
 
 struct perf_reader;
 
-struct perf_reader * perf_reader_new(perf_reader_cb cb, perf_reader_raw_cb raw_cb, void *cb_cookie);
+struct perf_reader * perf_reader_new(perf_reader_cb cb,
+    perf_reader_raw_cb raw_cb, void *cb_cookie, int page_cnt);
 void perf_reader_free(void *ptr);
 int perf_reader_mmap(struct perf_reader *reader, unsigned type, unsigned long sample_type);
 int perf_reader_poll(int num_readers, struct perf_reader **readers, int timeout);
diff --git a/src/lua/bcc/libbcc.lua b/src/lua/bcc/libbcc.lua
index 762db9c..fa28e21 100644
--- a/src/lua/bcc/libbcc.lua
+++ b/src/lua/bcc/libbcc.lua
@@ -54,7 +54,7 @@
 
 int bpf_detach_uprobe(const char *ev_name);
 
-void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid, int cpu);
+void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid, int cpu, int page_cnt);
 ]]
 
 ffi.cdef[[
diff --git a/src/lua/bcc/table.lua b/src/lua/bcc/table.lua
index c01f006..3144f22 100644
--- a/src/lua/bcc/table.lua
+++ b/src/lua/bcc/table.lua
@@ -243,13 +243,14 @@
   return string.format("bcc:perf_event_array:%d:%d", tonumber(id), cpu or 0)
 end
 
-function PerfEventArray:_open_perf_buffer(cpu, callback, ctype)
+function PerfEventArray:_open_perf_buffer(cpu, callback, ctype, page_cnt)
   local _cb = ffi.cast("perf_reader_raw_cb",
     function (cookie, data, size)
       callback(cpu, ctype(data)[0])
     end)
 
-  local reader = libbcc.bpf_open_perf_buffer(_cb, nil, -1, cpu)
+  -- default to 8 pages per buffer
+  local reader = libbcc.bpf_open_perf_buffer(_cb, nil, -1, cpu, page_cnt or 8)
   assert(reader, "failed to open perf buffer")
 
   local fd = libbcc.perf_reader_fd(reader)
@@ -258,11 +259,11 @@
   self._callbacks[cpu] = _cb
 end
 
-function PerfEventArray:open_perf_buffer(callback, data_type, ...)
+function PerfEventArray:open_perf_buffer(callback, data_type, data_params, page_cnt)
   assert(data_type, "a data type is needed for callback conversion")
-  local ctype = ffi.typeof(data_type.."*", ...)
+  local ctype = ffi.typeof(data_type.."*", unpack(data_params or {}))
   for i = 0, Posix.cpu_count() - 1 do
-    self:_open_perf_buffer(i, callback, ctype)
+    self:_open_perf_buffer(i, callback, ctype, page_cnt)
   end
 end
 
diff --git a/src/python/bcc/libbcc.py b/src/python/bcc/libbcc.py
index 9db1550..099af8f 100644
--- a/src/python/bcc/libbcc.py
+++ b/src/python/bcc/libbcc.py
@@ -102,7 +102,7 @@
 lib.bpf_detach_tracepoint.restype = ct.c_int
 lib.bpf_detach_tracepoint.argtypes = [ct.c_char_p, ct.c_char_p]
 lib.bpf_open_perf_buffer.restype = ct.c_void_p
-lib.bpf_open_perf_buffer.argtypes = [_RAW_CB_TYPE, ct.py_object, ct.c_int, ct.c_int]
+lib.bpf_open_perf_buffer.argtypes = [_RAW_CB_TYPE, ct.py_object, ct.c_int, ct.c_int, ct.c_int]
 lib.bpf_open_perf_event.restype = ct.c_int
 lib.bpf_open_perf_event.argtypes = [ct.c_uint, ct.c_ulonglong, ct.c_int, ct.c_int]
 lib.perf_reader_poll.restype = ct.c_int
diff --git a/src/python/bcc/table.py b/src/python/bcc/table.py
index bb4cd41..18851f2 100644
--- a/src/python/bcc/table.py
+++ b/src/python/bcc/table.py
@@ -507,20 +507,25 @@
         super(PerfEventArray, self).__delitem__(key)
         self.close_perf_buffer(key)
 
-    def open_perf_buffer(self, callback):
+    def open_perf_buffer(self, callback, page_cnt=8):
         """open_perf_buffers(callback)
 
         Opens a set of per-cpu ring buffer to receive custom perf event
         data from the bpf program. The callback will be invoked for each
-        event submitted from the kernel, up to millions per second.
+        event submitted from the kernel, up to millions per second. Use
+        page_cnt to change the size of the per-cpu ring buffer. The value
+        must be a power of two and defaults to 8.
         """
 
-        for i in get_online_cpus():
-            self._open_perf_buffer(i, callback)
+        if page_cnt & (page_cnt - 1) != 0:
+            raise Exception("Perf buffer page_cnt must be a power of two")
 
-    def _open_perf_buffer(self, cpu, callback):
+        for i in get_online_cpus():
+            self._open_perf_buffer(i, callback, page_cnt)
+
+    def _open_perf_buffer(self, cpu, callback, page_cnt):
         fn = _RAW_CB_TYPE(lambda _, data, size: callback(cpu, data, size))
-        reader = lib.bpf_open_perf_buffer(fn, None, -1, cpu)
+        reader = lib.bpf_open_perf_buffer(fn, None, -1, cpu, page_cnt)
         if not reader:
             raise Exception("Could not open perf buffer")
         fd = lib.perf_reader_fd(reader)
diff --git a/tools/biosnoop.lua b/tools/biosnoop.lua
index ac08897..fac7f3b 100644
--- a/tools/biosnoop.lua
+++ b/tools/biosnoop.lua
@@ -175,9 +175,9 @@
       uint64_t sector;
       uint64_t len;
       uint64_t ts;
-      char disk_name[%d];
-      char name[%d];
+      char disk_name[$];
+      char name[$];
     }
-  ]] % {DISK_NAME_LEN, TASK_COMM_LEN})
+  ]], {DISK_NAME_LEN, TASK_COMM_LEN}, 64)
   bpf:kprobe_poll_loop()
 end
diff --git a/tools/biosnoop.py b/tools/biosnoop.py
index aa8a077..3d77e52 100755
--- a/tools/biosnoop.py
+++ b/tools/biosnoop.py
@@ -182,6 +182,6 @@
     start_ts = 1
 
 # loop with callback to print_event
-b["events"].open_perf_buffer(print_event)
+b["events"].open_perf_buffer(print_event, page_cnt=64)
 while 1:
     b.kprobe_poll()
diff --git a/tools/btrfsslower.py b/tools/btrfsslower.py
index fcc155e..8b34900 100755
--- a/tools/btrfsslower.py
+++ b/tools/btrfsslower.py
@@ -343,6 +343,6 @@
         "BYTES", "OFF_KB", "LAT(ms)", "FILENAME"))
 
 # read events
-b["events"].open_perf_buffer(print_event)
+b["events"].open_perf_buffer(print_event, page_cnt=64)
 while 1:
     b.kprobe_poll()
diff --git a/tools/cpuunclaimed.py b/tools/cpuunclaimed.py
index 9624b50..3998f9f 100755
--- a/tools/cpuunclaimed.py
+++ b/tools/cpuunclaimed.py
@@ -205,7 +205,7 @@
 trigger = int(0.8 * (1000000000 / frequency))
 
 # read events
-b["events"].open_perf_buffer(print_event)
+b["events"].open_perf_buffer(print_event, page_cnt=64)
 while 1:
     # allow some buffering by calling sleep(), to reduce the context switch
     # rate and lower overhead.
diff --git a/tools/dbslower.py b/tools/dbslower.py
index 70e0503..6ddec41 100755
--- a/tools/dbslower.py
+++ b/tools/dbslower.py
@@ -131,7 +131,7 @@
       (', '.join(map(str, args.pids)), args.threshold))
 print("%-14s %-6s %8s %s" % ("TIME(s)", "PID", "MS", "QUERY"))
 
-bpf["events"].open_perf_buffer(print_event)
+bpf["events"].open_perf_buffer(print_event, page_cnt=64)
 while True:
     bpf.kprobe_poll()
 
diff --git a/tools/dcsnoop.py b/tools/dcsnoop.py
index d162a66..a72ba41 100755
--- a/tools/dcsnoop.py
+++ b/tools/dcsnoop.py
@@ -153,6 +153,6 @@
 # header
 print("%-11s %-6s %-16s %1s %s" % ("TIME(s)", "PID", "COMM", "T", "FILE"))
 
-b["events"].open_perf_buffer(print_event)
+b["events"].open_perf_buffer(print_event, page_cnt=64)
 while 1:
     b.kprobe_poll()
diff --git a/tools/ext4slower.py b/tools/ext4slower.py
index 20865a5..4950325 100755
--- a/tools/ext4slower.py
+++ b/tools/ext4slower.py
@@ -337,6 +337,6 @@
         "BYTES", "OFF_KB", "LAT(ms)", "FILENAME"))
 
 # read events
-b["events"].open_perf_buffer(print_event)
+b["events"].open_perf_buffer(print_event, page_cnt=64)
 while 1:
     b.kprobe_poll()
diff --git a/tools/fileslower.py b/tools/fileslower.py
index 2ae4756..ab29990 100755
--- a/tools/fileslower.py
+++ b/tools/fileslower.py
@@ -243,6 +243,6 @@
         time.time() - start_ts, event.comm, event.pid, mode_s[event.mode],
         event.sz, ms, name))
 
-b["events"].open_perf_buffer(print_event)
+b["events"].open_perf_buffer(print_event, page_cnt=64)
 while 1:
     b.kprobe_poll()
diff --git a/tools/mysqld_qslower.py b/tools/mysqld_qslower.py
index 94906a8..3ed18ec 100755
--- a/tools/mysqld_qslower.py
+++ b/tools/mysqld_qslower.py
@@ -128,6 +128,6 @@
         event.pid, float(event.delta) / 1000000, event.query))
 
 # loop with callback to print_event
-b["events"].open_perf_buffer(print_event)
+b["events"].open_perf_buffer(print_event, page_cnt=64)
 while 1:
     b.kprobe_poll()
diff --git a/tools/opensnoop.py b/tools/opensnoop.py
index 0c2b9b5..dae4ff4 100755
--- a/tools/opensnoop.py
+++ b/tools/opensnoop.py
@@ -178,6 +178,6 @@
            event.comm, fd_s, err, event.fname))
 
 # loop with callback to print_event
-b["events"].open_perf_buffer(print_event)
+b["events"].open_perf_buffer(print_event, page_cnt=64)
 while 1:
     b.kprobe_poll()
diff --git a/tools/stacksnoop.lua b/tools/stacksnoop.lua
index 8f5f5b4..7dfaf3d 100755
--- a/tools/stacksnoop.lua
+++ b/tools/stacksnoop.lua
@@ -102,6 +102,6 @@
 
   bpf:get_table("events"):open_perf_buffer(print_event,
     "struct { uint64_t stack_id; uint32_t pid; char comm[$]; }",
-    TASK_COMM_LEN)
+    {TASK_COMM_LEN})
   bpf:kprobe_poll_loop()
 end
diff --git a/tools/statsnoop.py b/tools/statsnoop.py
index 2fc2164..d9164b6 100755
--- a/tools/statsnoop.py
+++ b/tools/statsnoop.py
@@ -159,6 +159,6 @@
         fd_s, err, event.fname))
 
 # loop with callback to print_event
-b["events"].open_perf_buffer(print_event)
+b["events"].open_perf_buffer(print_event, page_cnt=64)
 while 1:
     b.kprobe_poll()
diff --git a/tools/tcplife.py b/tools/tcplife.py
index 1125f9c..69ba174 100755
--- a/tools/tcplife.py
+++ b/tools/tcplife.py
@@ -354,7 +354,7 @@
 start_ts = 0
 
 # read events
-b["ipv4_events"].open_perf_buffer(print_ipv4_event)
-b["ipv6_events"].open_perf_buffer(print_ipv6_event)
+b["ipv4_events"].open_perf_buffer(print_ipv4_event, page_cnt=64)
+b["ipv6_events"].open_perf_buffer(print_ipv6_event, page_cnt=64)
 while 1:
     b.kprobe_poll()
diff --git a/tools/trace.py b/tools/trace.py
index 46bc97e..029194c 100755
--- a/tools/trace.py
+++ b/tools/trace.py
@@ -29,6 +29,7 @@
         use_localtime = True
         tgid = -1
         pid = -1
+        page_cnt = None
 
         @classmethod
         def configure(cls, args):
@@ -38,6 +39,7 @@
                 cls.first_ts = BPF.monotonic_time()
                 cls.tgid = args.tgid or -1
                 cls.pid = args.pid or -1
+                cls.page_cnt = args.buffer_pages
 
         def __init__(self, probe, string_size, kernel_stack, user_stack):
                 self.usdt = None
@@ -510,7 +512,8 @@
                         self._attach_u(bpf)
                 self.python_struct = self._generate_python_data_decl()
                 callback = partial(self.print_event, bpf)
-                bpf[self.events_name].open_perf_buffer(callback)
+                bpf[self.events_name].open_perf_buffer(callback,
+                        page_cnt=self.page_cnt)
 
         def _attach_k(self, bpf):
                 if self.probe_type == "r":
@@ -543,6 +546,7 @@
                                           pid=Probe.tgid)
 
 class Tool(object):
+        DEFAULT_PERF_BUFFER_PAGES = 64
         examples = """
 EXAMPLES:
 
@@ -577,6 +581,10 @@
                   "functions and print trace messages.",
                   formatter_class=argparse.RawDescriptionHelpFormatter,
                   epilog=Tool.examples)
+                parser.add_argument("-b", "--buffer-pages", type=int,
+                  default=Tool.DEFAULT_PERF_BUFFER_PAGES,
+                  help="number of pages to use for perf_events ring buffer "
+                       "(default: %(default)d)")
                 # we'll refer to the userspace concepts of "pid" and "tid" by
                 # their kernel names -- tgid and pid -- inside the script
                 parser.add_argument("-p", "--pid", type=int, metavar="PID",
diff --git a/tools/trace_example.txt b/tools/trace_example.txt
index 504030c..eb72e5e 100644
--- a/tools/trace_example.txt
+++ b/tools/trace_example.txt
@@ -201,11 +201,28 @@
 libraries and then accessing the /home/vagrant directory listing.
 
 
+Lastly, if a high-frequency event is traced you may overflow the perf ring
+buffer. This shows as "Lost N samples":
+
+# trace sys_open
+5087   5087   pgrep        sys_open
+5087   5087   pgrep        sys_open
+5087   5087   pgrep        sys_open
+5087   5087   pgrep        sys_open
+5087   5087   pgrep        sys_open
+Lost 764896 samples
+Lost 764896 samples
+Lost 764896 samples
+
+The perf ring buffer size can be changed with -b. The value is the size of each
+per-CPU buffer, measured in pages. It must be a power of two and defaults to
+64 pages.
+
+
 USAGE message:
 
-# trace -h
-usage: trace [-h] [-p PID] [-L TID] [-v] [-Z STRING_SIZE] [-S]
-             [-M MAX_EVENTS] [-t] [-T] [-K] [-U] [-I header]
+usage: trace [-h] [-b BUFFER_PAGES] [-p PID] [-L TID] [-v] [-Z STRING_SIZE]
+             [-S] [-M MAX_EVENTS] [-t] [-T] [-K] [-U] [-I header]
              probe [probe ...]
 
 Attach to functions and print trace messages.
@@ -215,6 +232,9 @@
 
 optional arguments:
   -h, --help            show this help message and exit
+  -b BUFFER_PAGES, --buffer-pages BUFFER_PAGES
+                        number of pages to use for perf_events ring buffer
+                        (default: 64)
   -p PID, --pid PID     id of the process to trace (optional)
   -L TID, --tid TID     id of the thread to trace (optional)
   -v, --verbose         print resulting BPF program code before executing
@@ -224,7 +244,7 @@
   -M MAX_EVENTS, --max-events MAX_EVENTS
                         number of events to print before quitting
   -t, --timestamp       print timestamp column (offset from trace start)
-  -T, --time		print time column
+  -T, --time            print time column
   -K, --kernel-stack    output kernel stack trace
   -U, --user-stack      output user stack trace
   -I header, --include header
@@ -247,9 +267,9 @@
         Trace malloc calls and print the size being allocated
 trace 'p:c:write (arg1 == 1) "writing %d bytes to STDOUT", arg3'
         Trace the write() call from libc to monitor writes to STDOUT
-trace 'r::__kmalloc (retval == 0) "kmalloc failed!"
+trace 'r::__kmalloc (retval == 0) "kmalloc failed!"'
         Trace returns from __kmalloc which returned a null pointer
-trace 'r:c:malloc (retval) "allocated = %x", retval
+trace 'r:c:malloc (retval) "allocated = %x", retval'
         Trace returns from malloc and print non-NULL allocated buffers
 trace 't:block:block_rq_complete "sectors=%d", args->nr_sector'
         Trace the block_rq_complete kernel tracepoint and print # of tx sectors
diff --git a/tools/xfsslower.py b/tools/xfsslower.py
index 25c5a20..3fbc96d 100755
--- a/tools/xfsslower.py
+++ b/tools/xfsslower.py
@@ -293,6 +293,6 @@
         "BYTES", "OFF_KB", "LAT(ms)", "FILENAME"))
 
 # read events
-b["events"].open_perf_buffer(print_event)
+b["events"].open_perf_buffer(print_event, page_cnt=64)
 while 1:
     b.kprobe_poll()
diff --git a/tools/zfsslower.py b/tools/zfsslower.py
index e2be684..f5e8cbb 100755
--- a/tools/zfsslower.py
+++ b/tools/zfsslower.py
@@ -297,6 +297,6 @@
         "BYTES", "OFF_KB", "LAT(ms)", "FILENAME"))
 
 # read events
-b["events"].open_perf_buffer(print_event)
+b["events"].open_perf_buffer(print_event, page_cnt=64)
 while 1:
     b.kprobe_poll()