memleak: expand allocator coverage (#1214)
* memleak: handle libc allocation functions other than malloc
* memleak: use tracepoints to track kernel allocations
* memleak: add combined-only mode
With a large number of outstanding allocations, the amount of data passed
from the kernel becomes large, which slows everything down.
This patch calculates allocation statistics inside the kernel, allowing the
user-space part to pull only the combined statistics, thus significantly
reducing the amount of data transferred.
* memleak: increase hashtable capacities
A lot of allocations happen in the kernel. The default values are not
enough to keep up.
* test: add a test for the memleak tool
diff --git a/tools/memleak.py b/tools/memleak.py
index 80a1dc3..484c386 100755
--- a/tools/memleak.py
+++ b/tools/memleak.py
@@ -4,8 +4,8 @@
# memory leaks in user-mode processes and the kernel.
#
# USAGE: memleak [-h] [-p PID] [-t] [-a] [-o OLDER] [-c COMMAND]
-# [-s SAMPLE_RATE] [-d STACK_DEPTH] [-T TOP] [-z MIN_SIZE]
-# [-Z MAX_SIZE]
+# [--combined-only] [-s SAMPLE_RATE] [-T TOP] [-z MIN_SIZE]
+# [-Z MAX_SIZE] [-O OBJ]
# [interval] [count]
#
# Licensed under the Apache License, Version 2.0 (the "License")
@@ -44,7 +44,7 @@
Trace allocations and display a summary of "leaked" (outstanding)
allocations every 5 seconds
./memleak -p $(pidof allocs) -t
- Trace allocations and display each individual call to malloc/free
+ Trace allocations and display each individual allocator function call
./memleak -ap $(pidof allocs) 10
Trace allocations and display allocated addresses, sizes, and stacks
every 10 seconds for outstanding allocations
@@ -62,8 +62,9 @@
description = """
Trace outstanding memory allocations that weren't freed.
-Supports both user-mode allocations made with malloc/free and kernel-mode
-allocations made with kmalloc/kfree.
+Supports both user-mode allocations made with libc functions and kernel-mode
+allocations made with kmalloc/kmem_cache_alloc/get_free_pages and corresponding
+memory release functions.
"""
parser = argparse.ArgumentParser(description=description,
@@ -83,6 +84,8 @@
help="prune allocations younger than this age in milliseconds")
parser.add_argument("-c", "--command",
help="execute and trace the specified command")
+parser.add_argument("--combined-only", default=False, action="store_true",
+ help="show combined allocation statistics only")
parser.add_argument("-s", "--sample-rate", default=1, type=int,
help="sample every N-th allocation to decrease the overhead")
parser.add_argument("-T", "--top", type=int, default=10,
@@ -92,7 +95,7 @@
parser.add_argument("-Z", "--max-size", type=int,
help="capture only allocations smaller than this size")
parser.add_argument("-O", "--obj", type=str, default="c",
- help="attach to malloc & free in the specified object")
+ help="attach to allocator functions in the specified object")
args = parser.parse_args()
@@ -126,12 +129,51 @@
int stack_id;
};
-BPF_HASH(sizes, u64);
-BPF_HASH(allocs, u64, struct alloc_info_t);
-BPF_STACK_TRACE(stack_traces, 1024)
+struct combined_alloc_info_t {
+ u64 total_size;
+ u64 number_of_allocs;
+};
-int alloc_enter(struct pt_regs *ctx, size_t size)
-{
+BPF_HASH(sizes, u64);
+BPF_TABLE("hash", u64, struct alloc_info_t, allocs, 1000000);
+BPF_HASH(memptrs, u64, u64);
+BPF_STACK_TRACE(stack_traces, 10240)
+BPF_TABLE("hash", u64, struct combined_alloc_info_t, combined_allocs, 10240);
+
+static inline void update_statistics_add(u64 stack_id, u64 sz) {
+ struct combined_alloc_info_t *existing_cinfo;
+ struct combined_alloc_info_t cinfo = {0};
+
+ existing_cinfo = combined_allocs.lookup(&stack_id);
+ if (existing_cinfo != 0)
+ cinfo = *existing_cinfo;
+
+ cinfo.total_size += sz;
+ cinfo.number_of_allocs += 1;
+
+ combined_allocs.update(&stack_id, &cinfo);
+}
+
+static inline void update_statistics_del(u64 stack_id, u64 sz) {
+ struct combined_alloc_info_t *existing_cinfo;
+ struct combined_alloc_info_t cinfo = {0};
+
+ existing_cinfo = combined_allocs.lookup(&stack_id);
+ if (existing_cinfo != 0)
+ cinfo = *existing_cinfo;
+
+ if (sz >= cinfo.total_size)
+ cinfo.total_size = 0;
+ else
+ cinfo.total_size -= sz;
+
+ if (cinfo.number_of_allocs > 0)
+ cinfo.number_of_allocs -= 1;
+
+ combined_allocs.update(&stack_id, &cinfo);
+}
+
+static inline int gen_alloc_enter(struct pt_regs *ctx, size_t size) {
SIZE_FILTER
if (SAMPLE_EVERY_N > 1) {
u64 ts = bpf_ktime_get_ns();
@@ -148,9 +190,7 @@
return 0;
}
-int alloc_exit(struct pt_regs *ctx)
-{
- u64 address = PT_REGS_RC(ctx);
+static inline int gen_alloc_exit2(struct pt_regs *ctx, u64 address) {
u64 pid = bpf_get_current_pid_tgid();
u64* size64 = sizes.lookup(&pid);
struct alloc_info_t info = {0};
@@ -164,6 +204,7 @@
info.timestamp_ns = bpf_ktime_get_ns();
info.stack_id = stack_traces.get_stackid(ctx, STACK_FLAGS);
allocs.update(&address, &info);
+ update_statistics_add(info.stack_id, info.size);
if (SHOULD_PRINT) {
bpf_trace_printk("alloc exited, size = %lu, result = %lx\\n",
@@ -172,14 +213,18 @@
return 0;
}
-int free_enter(struct pt_regs *ctx, void *address)
-{
+static inline int gen_alloc_exit(struct pt_regs *ctx) {
+ return gen_alloc_exit2(ctx, PT_REGS_RC(ctx));
+}
+
+static inline int gen_free_enter(struct pt_regs *ctx, void *address) {
u64 addr = (u64)address;
struct alloc_info_t *info = allocs.lookup(&addr);
if (info == 0)
return 0;
allocs.delete(&addr);
+ update_statistics_del(info->stack_id, info->size);
if (SHOULD_PRINT) {
bpf_trace_printk("free entered, address = %lx, size = %lu\\n",
@@ -187,7 +232,138 @@
}
return 0;
}
+
+int malloc_enter(struct pt_regs *ctx, size_t size) {
+ return gen_alloc_enter(ctx, size);
+}
+
+int malloc_exit(struct pt_regs *ctx) {
+ return gen_alloc_exit(ctx);
+}
+
+int free_enter(struct pt_regs *ctx, void *address) {
+ return gen_free_enter(ctx, address);
+}
+
+int calloc_enter(struct pt_regs *ctx, size_t nmemb, size_t size) {
+ return gen_alloc_enter(ctx, nmemb * size);
+}
+
+int calloc_exit(struct pt_regs *ctx) {
+ return gen_alloc_exit(ctx);
+}
+
+int realloc_enter(struct pt_regs *ctx, void *ptr, size_t size) {
+ gen_free_enter(ctx, ptr);
+ return gen_alloc_enter(ctx, size);
+}
+
+int realloc_exit(struct pt_regs *ctx) {
+ return gen_alloc_exit(ctx);
+}
+
+int posix_memalign_enter(struct pt_regs *ctx, void **memptr, size_t alignment,
+ size_t size) {
+ u64 memptr64 = (u64)(size_t)memptr;
+ u64 pid = bpf_get_current_pid_tgid();
+
+ memptrs.update(&pid, &memptr64);
+ return gen_alloc_enter(ctx, size);
+}
+
+int posix_memalign_exit(struct pt_regs *ctx) {
+ u64 pid = bpf_get_current_pid_tgid();
+ u64 *memptr64 = memptrs.lookup(&pid);
+ void *addr;
+
+ if (memptr64 == 0)
+ return 0;
+
+ memptrs.delete(&pid);
+
+ if (bpf_probe_read(&addr, sizeof(void*), (void*)(size_t)*memptr64) != 0)
+ return 0;
+
+ u64 addr64 = (u64)(size_t)addr;
+ return gen_alloc_exit2(ctx, addr64);
+}
+
+int aligned_alloc_enter(struct pt_regs *ctx, size_t alignment, size_t size) {
+ return gen_alloc_enter(ctx, size);
+}
+
+int aligned_alloc_exit(struct pt_regs *ctx) {
+ return gen_alloc_exit(ctx);
+}
+
+int valloc_enter(struct pt_regs *ctx, size_t size) {
+ return gen_alloc_enter(ctx, size);
+}
+
+int valloc_exit(struct pt_regs *ctx) {
+ return gen_alloc_exit(ctx);
+}
+
+int memalign_enter(struct pt_regs *ctx, size_t alignment, size_t size) {
+ return gen_alloc_enter(ctx, size);
+}
+
+int memalign_exit(struct pt_regs *ctx) {
+ return gen_alloc_exit(ctx);
+}
+
+int pvalloc_enter(struct pt_regs *ctx, size_t size) {
+ return gen_alloc_enter(ctx, size);
+}
+
+int pvalloc_exit(struct pt_regs *ctx) {
+ return gen_alloc_exit(ctx);
+}
"""
+
+bpf_source_kernel = """
+
+TRACEPOINT_PROBE(kmem, kmalloc) {
+ gen_alloc_enter((struct pt_regs *)args, args->bytes_alloc);
+ return gen_alloc_exit2((struct pt_regs *)args, (size_t)args->ptr);
+}
+
+TRACEPOINT_PROBE(kmem, kmalloc_node) {
+ gen_alloc_enter((struct pt_regs *)args, args->bytes_alloc);
+ return gen_alloc_exit2((struct pt_regs *)args, (size_t)args->ptr);
+}
+
+TRACEPOINT_PROBE(kmem, kfree) {
+ return gen_free_enter((struct pt_regs *)args, (void *)args->ptr);
+}
+
+TRACEPOINT_PROBE(kmem, kmem_cache_alloc) {
+ gen_alloc_enter((struct pt_regs *)args, args->bytes_alloc);
+ return gen_alloc_exit2((struct pt_regs *)args, (size_t)args->ptr);
+}
+
+TRACEPOINT_PROBE(kmem, kmem_cache_alloc_node) {
+ gen_alloc_enter((struct pt_regs *)args, args->bytes_alloc);
+ return gen_alloc_exit2((struct pt_regs *)args, (size_t)args->ptr);
+}
+
+TRACEPOINT_PROBE(kmem, kmem_cache_free) {
+ return gen_free_enter((struct pt_regs *)args, (void *)args->ptr);
+}
+
+TRACEPOINT_PROBE(kmem, mm_page_alloc) {
+ gen_alloc_enter((struct pt_regs *)args, PAGE_SIZE << args->order);
+ return gen_alloc_exit2((struct pt_regs *)args, args->pfn);
+}
+
+TRACEPOINT_PROBE(kmem, mm_page_free) {
+ return gen_free_enter((struct pt_regs *)args, (void *)args->pfn);
+}
+"""
+
+if kernel_trace:
+ bpf_source += bpf_source_kernel
+
bpf_source = bpf_source.replace("SHOULD_PRINT", "1" if trace_all else "0")
bpf_source = bpf_source.replace("SAMPLE_EVERY_N", str(sample_every_n))
@@ -209,18 +385,54 @@
bpf_program = BPF(text=bpf_source)
if not kernel_trace:
- print("Attaching to malloc and free in pid %d, Ctrl+C to quit." % pid)
- bpf_program.attach_uprobe(name=obj, sym="malloc",
- fn_name="alloc_enter", pid=pid)
- bpf_program.attach_uretprobe(name=obj, sym="malloc",
- fn_name="alloc_exit", pid=pid)
- bpf_program.attach_uprobe(name=obj, sym="free",
- fn_name="free_enter", pid=pid)
+ print("Attaching to pid %d, Ctrl+C to quit." % pid)
+
+ def attach_probes(sym, fn_prefix=None, can_fail=False):
+ if fn_prefix is None:
+ fn_prefix = sym
+
+ try:
+ bpf_program.attach_uprobe(name=obj, sym=sym,
+ fn_name=fn_prefix+"_enter",
+ pid=pid)
+ bpf_program.attach_uretprobe(name=obj, sym=sym,
+ fn_name=fn_prefix+"_exit",
+ pid=pid)
+ except Exception:
+ if can_fail:
+ return
+ else:
+ raise
+
+ attach_probes("malloc")
+ attach_probes("calloc")
+ attach_probes("realloc")
+ attach_probes("posix_memalign")
+ attach_probes("valloc")
+ attach_probes("memalign")
+ attach_probes("pvalloc")
+ attach_probes("aligned_alloc", can_fail=True) # added in C11
+ bpf_program.attach_uprobe(name=obj, sym="free", fn_name="free_enter",
+ pid=pid)
+
else:
- print("Attaching to kmalloc and kfree, Ctrl+C to quit.")
- bpf_program.attach_kprobe(event="__kmalloc", fn_name="alloc_enter")
- bpf_program.attach_kretprobe(event="__kmalloc", fn_name="alloc_exit")
- bpf_program.attach_kprobe(event="kfree", fn_name="free_enter")
+ print("Attaching to kernel allocators, Ctrl+C to quit.")
+
+ # No probe attaching here. Allocations are counted by attaching to
+ # tracepoints.
+ #
+    # Memory allocations in the Linux kernel are not limited to malloc/free
+    # equivalents. It's also common to allocate a memory page or multiple
+    # pages. The page allocator has two interfaces, one working with page
+    # frame numbers (PFN), the other with page addresses. It's possible to
+    # allocate pages with one kind of function and free them with another.
+    # Kernel code can easily convert PFNs to addresses and back, but doing
+    # the same in an eBPF kprobe is hard without fragile hacks.
+ #
+ # Fortunately, Linux exposes tracepoints for memory allocations, which
+    # can be instrumented by eBPF programs. The page allocation tracepoint
+    # gives access to PFNs for both allocator interfaces, so there is no
+ # need to guess which allocation corresponds to which free.
def print_outstanding():
print("[%s] Top %d stacks with outstanding allocations:" %
@@ -252,6 +464,37 @@
print("\t%d bytes in %d allocations from stack\n\t\t%s" %
(alloc.size, alloc.count, "\n\t\t".join(alloc.stack)))
+def print_outstanding_combined():
+ stack_traces = bpf_program["stack_traces"]
+ stacks = sorted(bpf_program["combined_allocs"].items(),
+ key=lambda a: -a[1].total_size)
+ cnt = 1
+ entries = []
+ for stack_id, info in stacks:
+ try:
+ trace = []
+ for addr in stack_traces.walk(stack_id.value):
+ sym = bpf_program.sym(addr, pid,
+ show_module=True,
+ show_offset=True)
+ trace.append(sym)
+ trace = "\n\t\t".join(trace)
+ except KeyError:
+ trace = "stack information lost"
+
+ entry = ("\t%d bytes in %d allocations from stack\n\t\t%s" %
+ (info.total_size, info.number_of_allocs, trace))
+ entries.append(entry)
+
+ cnt += 1
+ if cnt > top_stacks:
+ break
+
+ print("[%s] Top %d stacks with outstanding allocations:" %
+ (datetime.now().strftime("%H:%M:%S"), top_stacks))
+
+ print('\n'.join(reversed(entries)))
+
count_so_far = 0
while True:
if trace_all:
@@ -261,7 +504,10 @@
sleep(interval)
except KeyboardInterrupt:
exit()
- print_outstanding()
+ if args.combined_only:
+ print_outstanding_combined()
+ else:
+ print_outstanding()
count_so_far += 1
if num_prints is not None and count_so_far >= num_prints:
exit()