| #!/usr/bin/env python |
| # |
| # memleak.py Trace and display outstanding allocations to detect |
| # memory leaks in user-mode processes and the kernel. |
| # |
| # USAGE: memleak.py [-h] [-p PID] [-t] [-a] [-o OLDER] [-c COMMAND] |
| # [-s SAMPLE_RATE] [-d STACK_DEPTH] [-T TOP] [-z MIN_SIZE] |
| # [-Z MAX_SIZE] |
| # [interval] [count] |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License") |
| # Copyright (C) 2016 Sasha Goldshtein. |
| |
| from bcc import BPF |
| from time import sleep |
| from datetime import datetime |
| import argparse |
| import subprocess |
| import ctypes |
| import os |
| |
class Time(object):
    """Read the kernel's monotonic clock from Python.

    BPF timestamps come from the monotonic clock. To be able to filter
    and compare them from Python, we need to invoke clock_gettime on the
    same clock. Adapted from http://stackoverflow.com/a/1205762
    """

    # Clock ids, see <linux/time.h>.
    # CLOCK_MONOTONIC is the clock that matches the BPF timestamps; the
    # RAW variant is not NTP-slewed and slowly drifts away from it, so
    # comparing the two would skew allocation ages.
    CLOCK_MONOTONIC = 1
    CLOCK_MONOTONIC_RAW = 4     # kept for backward compatibility, unused

    class timespec(ctypes.Structure):
        _fields_ = [
            ('tv_sec', ctypes.c_long),
            ('tv_nsec', ctypes.c_long),
        ]

    librt = ctypes.CDLL('librt.so.1', use_errno=True)
    clock_gettime = librt.clock_gettime
    clock_gettime.argtypes = [ctypes.c_int, ctypes.POINTER(timespec)]

    @staticmethod
    def monotonic_time():
        """Return the current CLOCK_MONOTONIC time in nanoseconds (float).

        Raises:
            OSError: if clock_gettime reports a failure.
        """
        t = Time.timespec()
        if Time.clock_gettime(Time.CLOCK_MONOTONIC, ctypes.byref(t)) != 0:
            errno_ = ctypes.get_errno()
            raise OSError(errno_, os.strerror(errno_))
        return t.tv_sec * 1e9 + t.tv_nsec
| |
class StackDecoder(object):
    """Turn captured call-stack addresses into printable strings.

    Kernel addresses are resolved through the BPF object's ksym().
    User-mode addresses are resolved by matching them against the
    executable mappings listed in /proc/PID/maps and then against the
    function symbols reported by ``objdump -t`` for the matching binary.
    """

    def __init__(self, pid, bpf):
        """Remember the target and take a first snapshot of its mappings.

        pid -- process whose user-mode stacks are decoded; -1 means no
               process, in which case /proc is never read
        bpf -- object providing ksym(addr) for kernel symbol lookup
        """
        self.pid = pid
        self.bpf = bpf
        self.ranges_cache = {}      # binary path -> {symbol: (start, len)}
        self.code_ranges = {}       # binary path -> (start, end)
        self.refresh_code_ranges()

    def refresh_code_ranges(self):
        """Re-read the executable mappings of the traced process."""
        if self.pid == -1:
            return
        self.code_ranges = self._get_code_ranges()

    @staticmethod
    def _is_binary_segment(parts):
        """Tell whether a split maps line is an executable, file-backed
        mapping (six fields, 'x' permission, not a [pseudo] name)."""
        return len(parts) == 6 and \
            parts[5][0] != '[' and 'x' in parts[1]

    @staticmethod
    def _is_shared_object(binary):
        """Tell whether a path names a shared library, versioned or not
        (libfoo.so as well as libfoo.so.6)."""
        name = os.path.basename(binary)
        return name.endswith(".so") or ".so." in name

    def _get_code_ranges(self):
        """Return {binary path: (start, end)} for the process's
        executable mappings. If a binary has several executable
        mappings, the last one listed wins."""
        ranges = {}
        # A typical line from /proc/PID/maps looks like this:
        # 7f21b6635000-7f21b67eb000 r-xp ... /usr/lib64/libc-2.21.so
        # We are looking for executable segments that have a .so file
        # or the main executable. The first two numbers are the range
        # of that memory segment, which we index by binary name.
        with open("/proc/%d/maps" % self.pid) as maps:
            for raw_range in maps:
                parts = raw_range.split()
                if not StackDecoder._is_binary_segment(parts):
                    continue
                low, high = parts[0].split('-')
                ranges[parts[5]] = (int(low, 16), int(high, 16))
        return ranges

    @staticmethod
    def _is_function_symbol(parts):
        """Tell whether a split objdump line is a function in .text."""
        return len(parts) == 6 and parts[3] == ".text" \
            and parts[2] == "F"

    def _get_sym_ranges(self, binary):
        """Return {symbol name: (start, length)} for the functions of
        `binary`, running objdump only once per binary."""
        if binary in self.ranges_cache:
            return self.ranges_cache[binary]
        sym_ranges = {}
        raw_symbols = run_command_get_output("objdump -t %s" % binary)
        for raw_symbol in raw_symbols:
            # The pipe yields bytes on Python 3; compare as text.
            if not isinstance(raw_symbol, str):
                raw_symbol = raw_symbol.decode("utf-8", "replace")
            # A typical line from objdump -t looks like this:
            # 00000000004007f5 g F .text 000000000000010e main
            # We only care about functions in the .text segment.
            # The first number is the start address, and the second
            # number is the length.
            parts = raw_symbol.split()
            if not StackDecoder._is_function_symbol(parts):
                continue
            sym_ranges[parts[5]] = (int(parts[0], 16), int(parts[4], 16))
        self.ranges_cache[binary] = sym_ranges
        return sym_ranges

    def _decode_sym(self, binary, offset):
        """Return "name+0xoff" for the symbol containing `offset`, or the
        bare hex offset when no symbol contains it."""
        sym_ranges = self._get_sym_ranges(binary)
        for name, (start, length) in sym_ranges.items():
            # The range is half-open: start + length already belongs to
            # whatever follows. Zero-sized symbols still match their own
            # start address.
            if start <= offset < start + max(length, 1):
                return "%s+0x%x" % (name, offset - start)
        return "%x" % offset

    def _decode_addr(self, addr):
        """Return "symbol [binary]" for a user-mode address, or the bare
        hex address when no known mapping contains it."""
        # Use the snapshot taken by refresh_code_ranges() instead of
        # re-reading /proc/PID/maps for every single frame.
        # For .so files, look at the relative address; for the main
        # executable, look at the absolute address.
        for binary, (start, end) in self.code_ranges.items():
            if start <= addr < end:     # the end of a mapping is exclusive
                if StackDecoder._is_shared_object(binary):
                    offset = addr - start
                else:
                    offset = addr
                return "%s [%s]" % (self._decode_sym(binary, offset),
                                    binary)
        return "%x" % addr

    def decode_stack(self, info, is_kernel_trace):
        """Format the first info.num_frames entries of info.callstack.

        Each frame is rendered as " <symbol> (<hex addr>) ;". Returns
        "???" when no frames were captured.
        """
        if info.num_frames <= 0:
            return "???"
        frames = []
        for i in range(info.num_frames):
            addr = info.callstack[i]
            if is_kernel_trace:
                frames.append(" %s [kernel] (%x) ;" %
                              (self.bpf.ksym(addr), addr))
            else:
                # At some point, we hope to have native BPF
                # user-mode symbol decoding, but for now we
                # have to use our own.
                frames.append(" %s (%x) ;" %
                              (self._decode_addr(addr), addr))
        return "".join(frames)
| |
def run_command_get_output(command):
    """Run `command` and return an iterator over its output lines.

    The command string is split on whitespace (no shell is involved).
    stderr is merged into stdout. Lines are yielded exactly as read from
    the pipe, trailing newline included. Once the output is exhausted
    (or the iterator is discarded) the pipe is closed and the child is
    reaped, so repeated calls do not pile up zombies or descriptors.
    """
    p = subprocess.Popen(command.split(),
                         stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

    def read_lines():
        try:
            for line in iter(p.stdout.readline, b''):
                yield line
        finally:
            p.stdout.close()
            p.wait()

    return read_lines()
| |
def run_command_get_pid(command):
    """Start `command` in the background and return the child's PID.

    The command line is split with shell-like quoting rules, so quoted
    arguments containing spaces stay in one piece. No shell is involved
    and the child is not waited for.
    """
    import shlex
    p = subprocess.Popen(shlex.split(command))
    return p.pid
| |
# Usage examples; handed to argparse as the epilog of the --help output.
# This is runtime text printed verbatim, so keep its wording and layout.
examples = """
EXAMPLES:

./memleak.py -p $(pidof allocs)
Trace allocations and display a summary of "leaked" (outstanding)
allocations every 5 seconds
./memleak.py -p $(pidof allocs) -t
Trace allocations and display each individual call to malloc/free
./memleak.py -ap $(pidof allocs) 10
Trace allocations and display allocated addresses, sizes, and stacks
every 10 seconds for outstanding allocations
./memleak.py -c "./allocs"
Run the specified command and trace its allocations
./memleak.py
Trace allocations in kernel mode and display a summary of outstanding
allocations every 5 seconds
./memleak.py -o 60000
Trace allocations in kernel mode and display a summary of outstanding
allocations that are at least one minute (60 seconds) old
./memleak.py -s 5
Trace roughly every 5th allocation, to reduce overhead
"""

# One-paragraph summary; handed to argparse as the program description.
description = """
Trace outstanding memory allocations that weren't freed.
Supports both user-mode allocations made with malloc/free and kernel-mode
allocations made with kmalloc/kfree.
"""
| |
# Command-line interface. The raw formatter keeps the line breaks of the
# description and of the examples epilog.
parser = argparse.ArgumentParser(description=description,
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-p", "--pid", type=int, default=-1,
    help="the PID to trace; if not specified, trace kernel allocs")
parser.add_argument("-t", "--trace", action="store_true",
    help="print trace messages for each alloc/free call")
parser.add_argument("interval", nargs="?", default=5, type=int,
    help="interval in seconds to print outstanding allocations")
parser.add_argument("count", nargs="?", type=int,
    help="number of times to print the report before exiting")
parser.add_argument("-a", "--show-allocs", default=False, action="store_true",
    help="show allocation addresses and sizes as well as call stacks")
parser.add_argument("-o", "--older", default=500, type=int,
    help="prune allocations younger than this age in milliseconds")
parser.add_argument("-c", "--command",
    help="execute and trace the specified command")
parser.add_argument("-s", "--sample-rate", default=1, type=int,
    help="sample every N-th allocation to decrease the overhead")
parser.add_argument("-d", "--stack-depth", default=10, type=int,
    help="maximum stack depth to capture")
parser.add_argument("-T", "--top", type=int, default=10,
    help="display only this many top allocating stacks (by size)")
parser.add_argument("-z", "--min-size", type=int,
    help="capture only allocations larger than this size")
parser.add_argument("-Z", "--max-size", type=int,
    help="capture only allocations smaller than this size")

args = parser.parse_args()

# Unpack the options into the module-level settings used by the rest of
# the script.
pid = args.pid
command = args.command
# Without a PID and without a command there is no process to follow, so
# kernel allocations are traced instead.
kernel_trace = (pid == -1 and command is None)
trace_all = args.trace
interval = args.interval
min_age_ns = 1e6 * args.older           # milliseconds -> nanoseconds
sample_every_n = args.sample_rate
num_prints = args.count
# Two extra slots: the report subtracts 2 from the number of captured
# frames, so capture that many more than the user asked to see.
max_stack_size = args.stack_depth + 2
top_stacks = args.top
min_size = args.min_size
max_size = args.max_size

# Reject impossible option values here, with a readable message, rather
# than letting them surface later (a negative depth can turn into a
# negative C array size and fail as a compiler error).
# SystemExit is raised directly: the exit() helper is injected by the
# site module for interactive use and is not meant for programs.
if min_size is not None and max_size is not None and min_size > max_size:
    print("min_size (-z) can't be greater than max_size (-Z)")
    raise SystemExit(1)

if args.stack_depth < 0:
    print("stack depth (-d) can't be negative")
    raise SystemExit(1)

if command is not None:
    print("Executing '%s' and tracing the resulting process." % command)
    # The PID of the launched process replaces any -p value.
    pid = run_command_get_pid(command)
| |
# C source of the BPF program. The upper-case placeholders in it are not
# C macros: they are filled in textually by the str.replace() calls that
# follow the literal, so do not mention them inside the C comments.
bpf_source = """
#include <uapi/linux/ptrace.h>

struct alloc_info_t {
        u64 size;
        u64 timestamp_ns;
        int num_frames;
        u64 callstack[MAX_STACK_SIZE];
};

BPF_HASH(sizes, u64);
BPF_HASH(allocs, u64, struct alloc_info_t);

// Adapted from https://github.com/iovisor/bcc/tools/offcputime.py
static u64 get_frame(u64 *bp) {
        if (*bp) {
                // The following stack walker is x86_64 specific
                u64 ret = 0;
                if (bpf_probe_read(&ret, sizeof(ret), (void *)(*bp+8)))
                        return 0;
                if (bpf_probe_read(bp, sizeof(*bp), (void *)*bp))
                        *bp = 0;
                return ret;
        }
        return 0;
}
static int grab_stack(struct pt_regs *ctx, struct alloc_info_t *info)
{
        int depth = 0;
        u64 bp = ctx->bp;
        GRAB_ONE_FRAME
        return depth;
}

int alloc_enter(struct pt_regs *ctx, size_t size)
{
        SIZE_FILTER
        if (SAMPLE_EVERY_N > 1) {
                u64 ts = bpf_ktime_get_ns();
                if (ts % SAMPLE_EVERY_N != 0)
                        return 0;
        }

        u64 pid = bpf_get_current_pid_tgid();
        u64 size64 = size;
        sizes.update(&pid, &size64);

        if (SHOULD_PRINT)
                bpf_trace_printk("alloc entered, size = %lu\\n", size);
        return 0;
}

int alloc_exit(struct pt_regs *ctx)
{
        u64 address = ctx->ax;
        u64 pid = bpf_get_current_pid_tgid();
        u64* size64 = sizes.lookup(&pid);
        struct alloc_info_t info = {0};

        if (size64 == 0)
                return 0; // missed alloc entry

        info.size = *size64;
        sizes.delete(&pid);

        // A null result is a failed allocation: there is nothing to free
        // later, so it must not be reported as outstanding.
        if (address != 0) {
                info.timestamp_ns = bpf_ktime_get_ns();
                info.num_frames = grab_stack(ctx, &info) - 2;
                allocs.update(&address, &info);
        }

        if (SHOULD_PRINT) {
                bpf_trace_printk("alloc exited, size = %lu, result = %lx, frames = %d\\n",
                                 info.size, address, info.num_frames);
        }
        return 0;
}

int free_enter(struct pt_regs *ctx, void *address)
{
        u64 addr = (u64)address;
        struct alloc_info_t *info = allocs.lookup(&addr);
        if (info == 0)
                return 0;

        if (SHOULD_PRINT) {
                bpf_trace_printk("free entered, address = %lx, size = %lu\\n",
                                 address, info->size);
        }

        // Delete last, so the message above never reads a removed entry.
        allocs.delete(&addr);
        return 0;
}
"""
bpf_source = bpf_source.replace("SHOULD_PRINT", "1" if trace_all else "0")
bpf_source = bpf_source.replace("SAMPLE_EVERY_N", str(sample_every_n))
# The stack walk is unrolled: one copy of the statement per frame slot.
bpf_source = bpf_source.replace("GRAB_ONE_FRAME", max_stack_size *
    "\tif (!(info->callstack[depth++] = get_frame(&bp))) return depth;\n")
bpf_source = bpf_source.replace("MAX_STACK_SIZE", str(max_stack_size))

# Build the optional early-return on allocation size from whichever
# bounds were given; with no bounds the placeholder becomes empty.
size_checks = []
if min_size is not None:
    size_checks.append("size < %d" % min_size)
if max_size is not None:
    size_checks.append("size > %d" % max_size)
size_filter = ""
if size_checks:
    size_filter = "if (%s) return 0;" % " || ".join(size_checks)
bpf_source = bpf_source.replace("SIZE_FILTER", size_filter)
| |
# Compile and load the generated BPF program.
bpf_program = BPF(text=bpf_source)

# Hook the allocator entry, allocator return and free entry: the kernel
# allocator when tracing the kernel, otherwise malloc/free of library
# "c" in the target process.
if kernel_trace:
    print("Attaching to kmalloc and kfree, Ctrl+C to quit.")
    bpf_program.attach_kprobe(
        event="__kmalloc", fn_name="alloc_enter")
    bpf_program.attach_kretprobe(
        event="__kmalloc", fn_name="alloc_exit")
    bpf_program.attach_kprobe(
        event="kfree", fn_name="free_enter")
else:
    print("Attaching to malloc and free in pid %d, Ctrl+C to quit." % pid)
    bpf_program.attach_uprobe(
        name="c", sym="malloc", fn_name="alloc_enter", pid=pid)
    bpf_program.attach_uretprobe(
        name="c", sym="malloc", fn_name="alloc_exit", pid=pid)
    bpf_program.attach_uprobe(
        name="c", sym="free", fn_name="free_enter", pid=pid)

# Symbolizer used when printing the captured call stacks.
decoder = StackDecoder(pid, bpf_program)
| |
def print_outstanding():
    """Print a report of the allocations that are still outstanding.

    Reads the "allocs" BPF table, skips entries younger than min_age_ns,
    groups the rest by decoded call stack and prints the top_stacks
    groups with the most outstanding bytes, largest last. With
    --show-allocs every counted allocation is listed individually first.
    """
    print("[%s] Top %d stacks with outstanding allocations:" %
          (datetime.now().strftime("%H:%M:%S"), top_stacks))
    # Read the clock once so every allocation in this report is judged
    # against the same cutoff.
    newest_allowed_ns = Time.monotonic_time() - min_age_ns
    stacks = {}     # decoded stack -> (allocation count, total bytes)
    allocs = bpf_program.get_table("allocs")
    for address, info in sorted(allocs.items(), key=lambda a: a[1].size):
        if info.timestamp_ns > newest_allowed_ns:
            continue    # too young to be called outstanding yet
        stack = decoder.decode_stack(info, kernel_trace)
        count, total = stacks.get(stack, (0, 0))
        stacks[stack] = (count + 1, total + info.size)
        if args.show_allocs:
            print("\taddr = %x size = %s" %
                  (address.value, info.size))
    ranked = sorted(stacks.items(), key=lambda s: s[1][1])
    # ranked[-0:] would be the whole list, so a non-positive limit has to
    # be handled explicitly.
    to_show = ranked[-top_stacks:] if top_stacks > 0 else []
    for stack, (count, size) in to_show:
        print("\t%d bytes in %d allocations from stack\n\t\t%s" %
              (size, count, stack.replace(";", "\n\t\t")))
| |
# Main loop. In trace mode, echo the BPF trace messages as they arrive;
# otherwise print a report every `interval` seconds, stopping after
# `num_prints` reports when a count was given.
count_so_far = 0
while True:
    try:
        if trace_all:
            # Function-call form so the script also parses on Python 3;
            # with a single argument the Python 2 output is unchanged.
            print(bpf_program.trace_fields())
        else:
            sleep(interval)
            decoder.refresh_code_ranges()
            print_outstanding()
            count_so_far += 1
            if num_prints is not None and count_so_far >= num_prints:
                raise SystemExit
    except KeyboardInterrupt:
        # Ctrl+C is the documented way to quit: leave quietly wherever
        # it arrives, not only while sleeping.
        raise SystemExit