#!/usr/bin/env python
#
# memleak.py Trace and display outstanding allocations to detect
# memory leaks in user-mode processes and the kernel.
#
# USAGE: memleak.py [-h] [-p PID] [-t] [-a] [-o OLDER] [-c COMMAND]
# [-s SAMPLE_RATE] [-d STACK_DEPTH] [-T TOP] [-z MIN_SIZE]
# [-Z MAX_SIZE]
# [interval] [count]
#
# Licensed under the Apache License, Version 2.0 (the "License")
# Copyright (C) 2016 Sasha Goldshtein.
from bcc import BPF
from time import sleep
from datetime import datetime
import argparse
import subprocess
import ctypes
import os
class Time(object):
# BPF timestamps come from the monotonic clock. To be able to filter
# and compare them from Python, we need to invoke clock_gettime.
# Adapted from http://stackoverflow.com/a/1205762
CLOCK_MONOTONIC_RAW = 4 # see <linux/time.h>
class timespec(ctypes.Structure):
_fields_ = [
('tv_sec', ctypes.c_long),
('tv_nsec', ctypes.c_long)
]
librt = ctypes.CDLL('librt.so.1', use_errno=True)
clock_gettime = librt.clock_gettime
clock_gettime.argtypes = [ctypes.c_int, ctypes.POINTER(timespec)]
@staticmethod
def monotonic_time():
t = Time.timespec()
if Time.clock_gettime(
Time.CLOCK_MONOTONIC_RAW, ctypes.pointer(t)) != 0:
errno_ = ctypes.get_errno()
raise OSError(errno_, os.strerror(errno_))
return t.tv_sec * 1e9 + t.tv_nsec
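# Note: monotonic_time() returns nanoseconds, intended to be comparable to
# the bpf_ktime_get_ns() timestamps recorded by the BPF program; the
# difference is used below to compute an allocation's age.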
class StackDecoder(object):
def __init__(self, pid, bpf):
self.pid = pid
self.bpf = bpf
self.ranges_cache = {}
self.refresh_code_ranges()
def refresh_code_ranges(self):
if self.pid == -1:
return
self.code_ranges = self._get_code_ranges()
@staticmethod
def _is_binary_segment(parts):
return len(parts) == 6 and \
parts[5][0] != '[' and 'x' in parts[1]
def _get_code_ranges(self):
ranges = {}
raw_ranges = open("/proc/%d/maps" % self.pid).readlines()
# A typical line from /proc/PID/maps looks like this:
# 7f21b6635000-7f21b67eb000 r-xp ... /usr/lib64/libc-2.21.so
# We are looking for executable segments backed by a .so file or by
# the main executable. The first field is the address range of that
# memory segment, which we index by binary name.
for raw_range in raw_ranges:
parts = raw_range.split()
if not StackDecoder._is_binary_segment(parts):
continue
binary = parts[5]
range_parts = parts[0].split('-')
addr_range = (int(range_parts[0], 16),
int(range_parts[1], 16))
ranges[binary] = addr_range
return ranges
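# Illustrative result (hypothetical addresses): the returned dict maps a
# binary's path to its executable segment's address range, e.g.
#   { "/usr/lib64/libc-2.21.so": (0x7f21b6635000, 0x7f21b67eb000),
#     "/home/user/allocs": (0x400000, 0x401000) }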
@staticmethod
def _is_function_symbol(parts):
return len(parts) == 6 and parts[3] == ".text" \
and parts[2] == "F"
def _get_sym_ranges(self, binary):
if binary in self.ranges_cache:
return self.ranges_cache[binary]
sym_ranges = {}
raw_symbols = run_command_get_output("objdump -t %s" % binary)
for raw_symbol in raw_symbols:
# A typical line from objdump -t looks like this:
# 00000000004007f5 g F .text 000000000000010e main
# We only care about functions in the .text segment.
# The first number is the start address, and the second
# number is the length.
parts = raw_symbol.split()
if not StackDecoder._is_function_symbol(parts):
continue
sym_start = int(parts[0], 16)
sym_len = int(parts[4], 16)
sym_name = parts[5]
sym_ranges[sym_name] = (sym_start, sym_len)
self.ranges_cache[binary] = sym_ranges
return sym_ranges
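# Illustrative result (hypothetical symbols): each .text function is mapped
# to a (start address, length) pair, e.g.
#   { "main": (0x4007f5, 0x10e), "do_work": (0x400910, 0x45) }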
def _decode_sym(self, binary, offset):
sym_ranges = self._get_sym_ranges(binary)
# Find the symbol that contains the specified offset.
# There might not be one.
for name, (start, length) in sym_ranges.items():
if offset >= start and offset <= (start + length):
return "%s+0x%x" % (name, offset - start)
return "%x" % offset
def _decode_addr(self, addr):
code_ranges = self._get_code_ranges()
# Find the binary that contains the specified address.
# For .so files, look at the relative address; for the main
# executable, look at the absolute address.
for binary, (start, end) in code_ranges.items():
if addr >= start and addr <= end:
offset = addr - start \
if binary.endswith(".so") else addr
return "%s [%s]" % (self._decode_sym(binary,
offset), binary)
return "%x" % addr
def decode_stack(self, info, is_kernel_trace):
stack = ""
if info.num_frames <= 0:
return "???"
for i in range(0, info.num_frames):
addr = info.callstack[i]
if is_kernel_trace:
stack += " %s [kernel] (%x) ;" % \
(self.bpf.ksym(addr), addr)
else:
# At some point, we hope to have native BPF
# user-mode symbol decoding, but for now we
# have to use our own.
stack += " %s (%x) ;" % \
(self._decode_addr(addr), addr)
return stack
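# Each frame is appended as " symbol (addr) ;", so a full user-mode stack is
# a single ";"-separated string, e.g. (hypothetical):
#   " do_work+0x25 [/home/user/allocs] (400935) ; main+0xb [/home/user/allocs] (40080b) ;"
# print_outstanding() later replaces ";" with newlines for display.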
def run_command_get_output(command):
p = subprocess.Popen(command.split(),
stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
return iter(p.stdout.readline, b'')
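# Note: this yields the command's raw output lines lazily as the caller
# iterates over the pipe; _get_sym_ranges() uses it to parse "objdump -t".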
def run_command_get_pid(command):
p = subprocess.Popen(command.split())
return p.pid
examples = """
EXAMPLES:
./memleak.py -p $(pidof allocs)
Trace allocations and display a summary of "leaked" (outstanding)
allocations every 5 seconds
./memleak.py -p $(pidof allocs) -t
Trace allocations and display each individual call to malloc/free
./memleak.py -ap $(pidof allocs) 10
Trace allocations and display allocated addresses, sizes, and stacks
every 10 seconds for outstanding allocations
./memleak.py -c "./allocs"
Run the specified command and trace its allocations
./memleak.py
Trace allocations in kernel mode and display a summary of outstanding
allocations every 5 seconds
./memleak.py -o 60000
Trace allocations in kernel mode and display a summary of outstanding
allocations that are at least one minute (60 seconds) old
./memleak.py -s 5
Trace roughly every 5th allocation, to reduce overhead
"""
description = """
Trace outstanding memory allocations that weren't freed.
Supports both user-mode allocations made with malloc/free and kernel-mode
allocations made with kmalloc/kfree.
"""
parser = argparse.ArgumentParser(description=description,
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples)
parser.add_argument("-p", "--pid", type=int, default=-1,
help="the PID to trace; if not specified, trace kernel allocs")
parser.add_argument("-t", "--trace", action="store_true",
help="print trace messages for each alloc/free call")
parser.add_argument("interval", nargs="?", default=5, type=int,
help="interval in seconds to print outstanding allocations")
parser.add_argument("count", nargs="?", type=int,
help="number of times to print the report before exiting")
parser.add_argument("-a", "--show-allocs", default=False, action="store_true",
help="show allocation addresses and sizes as well as call stacks")
parser.add_argument("-o", "--older", default=500, type=int,
help="prune allocations younger than this age in milliseconds")
parser.add_argument("-c", "--command",
help="execute and trace the specified command")
parser.add_argument("-s", "--sample-rate", default=1, type=int,
help="sample every N-th allocation to decrease the overhead")
parser.add_argument("-d", "--stack-depth", default=10, type=int,
help="maximum stack depth to capture")
parser.add_argument("-T", "--top", type=int, default=10,
help="display only this many top allocating stacks (by size)")
parser.add_argument("-z", "--min-size", type=int,
help="capture only allocations larger than this size")
parser.add_argument("-Z", "--max-size", type=int,
help="capture only allocations smaller than this size")
args = parser.parse_args()
pid = args.pid
command = args.command
kernel_trace = (pid == -1 and command is None)
trace_all = args.trace
interval = args.interval
min_age_ns = 1e6 * args.older
sample_every_n = args.sample_rate
num_prints = args.count
max_stack_size = args.stack_depth + 2
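# Capture two frames beyond the requested depth; the two innermost frames
# are assumed to be probe/allocator-internal and are dropped again in
# alloc_exit (info.num_frames = grab_stack(ctx, &info) - 2).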
top_stacks = args.top
min_size = args.min_size
max_size = args.max_size
if min_size is not None and max_size is not None and min_size > max_size:
print("min_size (-z) can't be greater than max_size (-Z)")
exit(1)
if command is not None:
print("Executing '%s' and tracing the resulting process." % command)
pid = run_command_get_pid(command)
bpf_source = """
#include <uapi/linux/ptrace.h>
struct alloc_info_t {
u64 size;
u64 timestamp_ns;
int num_frames;
u64 callstack[MAX_STACK_SIZE];
};
BPF_HASH(sizes, u64);
BPF_HASH(allocs, u64, struct alloc_info_t);
// Adapted from https://github.com/iovisor/bcc/tools/offcputime.py
static u64 get_frame(u64 *bp) {
if (*bp) {
// The following stack walker is x86_64 specific
u64 ret = 0;
if (bpf_probe_read(&ret, sizeof(ret), (void *)(*bp+8)))
return 0;
if (bpf_probe_read(bp, sizeof(*bp), (void *)*bp))
*bp = 0;
return ret;
}
return 0;
}
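// Frame layout assumed above (x86_64 with frame pointers): *bp is the saved
// caller rbp and *(bp + 8) is the return address, so each get_frame() call
// yields one return address and advances bp one frame up the stack.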
static int grab_stack(struct pt_regs *ctx, struct alloc_info_t *info)
{
int depth = 0;
u64 bp = ctx->bp;
GRAB_ONE_FRAME
return depth;
}
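// GRAB_ONE_FRAME is text-substituted from Python with MAX_STACK_SIZE copies
// of a single "grab one frame or bail out" statement, unrolling what would
// otherwise be a loop (the BPF verifier does not allow loops).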
int alloc_enter(struct pt_regs *ctx, size_t size)
{
SIZE_FILTER
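// Sampling: treat the low bits of the timestamp as a cheap pseudo-random
// value so that only roughly one in SAMPLE_EVERY_N allocations is tracked.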
if (SAMPLE_EVERY_N > 1) {
u64 ts = bpf_ktime_get_ns();
if (ts % SAMPLE_EVERY_N != 0)
return 0;
}
u64 pid = bpf_get_current_pid_tgid();
u64 size64 = size;
sizes.update(&pid, &size64);
if (SHOULD_PRINT)
bpf_trace_printk("alloc entered, size = %u\\n", size);
return 0;
}
int alloc_exit(struct pt_regs *ctx)
{
u64 address = ctx->ax;
u64 pid = bpf_get_current_pid_tgid();
u64* size64 = sizes.lookup(&pid);
struct alloc_info_t info = {0};
if (size64 == 0)
return 0; // missed alloc entry
info.size = *size64;
sizes.delete(&pid);
info.timestamp_ns = bpf_ktime_get_ns();
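// Drop the two innermost frames, assumed to be inside the allocator/probe
// machinery; max_stack_size on the Python side is padded by 2 to match.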
info.num_frames = grab_stack(ctx, &info) - 2;
allocs.update(&address, &info);
if (SHOULD_PRINT) {
bpf_trace_printk("alloc exited, size = %lu, result = %lx, frames = %d\\n",
info.size, address, info.num_frames);
}
return 0;
}
int free_enter(struct pt_regs *ctx, void *address)
{
u64 addr = (u64)address;
struct alloc_info_t *info = allocs.lookup(&addr);
if (info == 0)
return 0;
allocs.delete(&addr);
if (SHOULD_PRINT) {
bpf_trace_printk("free entered, address = %lx, size = %lu\\n",
address, info->size);
}
return 0;
}
"""
bpf_source = bpf_source.replace("SHOULD_PRINT", "1" if trace_all else "0")
bpf_source = bpf_source.replace("SAMPLE_EVERY_N", str(sample_every_n))
bpf_source = bpf_source.replace("GRAB_ONE_FRAME", max_stack_size *
"\tif (!(info->callstack[depth++] = get_frame(&bp))) return depth;\n")
bpf_source = bpf_source.replace("MAX_STACK_SIZE", str(max_stack_size))
size_filter = ""
if min_size is not None and max_size is not None:
size_filter = "if (size < %d || size > %d) return 0;" % \
(min_size, max_size)
elif min_size is not None:
size_filter = "if (size < %d) return 0;" % min_size
elif max_size is not None:
size_filter = "if (size > %d) return 0;" % max_size
bpf_source = bpf_source.replace("SIZE_FILTER", size_filter)
bpf_program = BPF(text=bpf_source)
if not kernel_trace:
print("Attaching to malloc and free in pid %d, Ctrl+C to quit." % pid)
bpf_program.attach_uprobe(name="c", sym="malloc",
fn_name="alloc_enter", pid=pid)
bpf_program.attach_uretprobe(name="c", sym="malloc",
fn_name="alloc_exit", pid=pid)
bpf_program.attach_uprobe(name="c", sym="free",
fn_name="free_enter", pid=pid)
else:
print("Attaching to kmalloc and kfree, Ctrl+C to quit.")
bpf_program.attach_kprobe(event="__kmalloc", fn_name="alloc_enter")
bpf_program.attach_kretprobe(event="__kmalloc", fn_name="alloc_exit")
bpf_program.attach_kprobe(event="kfree", fn_name="free_enter")
decoder = StackDecoder(pid, bpf_program)
def print_outstanding():
stacks = {}
print("[%s] Top %d stacks with outstanding allocations:" %
(datetime.now().strftime("%H:%M:%S"), top_stacks))
allocs = bpf_program.get_table("allocs")
for address, info in sorted(allocs.items(), key=lambda a: a[1].size):
if Time.monotonic_time() - min_age_ns < info.timestamp_ns:
continue
stack = decoder.decode_stack(info, kernel_trace)
if stack in stacks:
stacks[stack] = (stacks[stack][0] + 1,
stacks[stack][1] + info.size)
else:
stacks[stack] = (1, info.size)
if args.show_allocs:
print("\taddr = %x size = %s" %
(address.value, info.size))
to_show = sorted(stacks.items(), key=lambda s: s[1][1])[-top_stacks:]
for stack, (count, size) in to_show:
print("\t%d bytes in %d allocations from stack\n\t\t%s" %
(size, count, stack.replace(";", "\n\t\t")))
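# Example report (hypothetical values):
#   [11:42:01] Top 10 stacks with outstanding allocations:
#           1024 bytes in 4 allocations from stack
#                   main+0xb [/home/user/allocs] (40080b)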
count_so_far = 0
while True:
if trace_all:
print(bpf_program.trace_fields())
else:
try:
sleep(interval)
except KeyboardInterrupt:
exit()
decoder.refresh_code_ranges()
print_outstanding()
count_so_far += 1
if num_prints is not None and count_so_far >= num_prints:
exit()