blob: e0344d12e6addd95203f008d16a55ac5caa27d61 [file] [log] [blame]
#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# drsnoop Trace direct reclaim and print details including issuing PID.
# For Linux, uses BCC, eBPF.
#
# This uses in-kernel eBPF maps to cache process details (PID and comm) by
# direct reclaim begin, as well as a starting timestamp for calculating
# latency.
#
# Copyright (c) 2019 Wenbo Zhang
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 20-Feb-2019 Wenbo Zhang Created this.
# 09-Mar-2019 Wenbo Zhang Updated for show sys mem info.
from __future__ import print_function
from bcc import ArgString, BPF
import argparse
from datetime import datetime, timedelta
import os
import math
import sys
# symbols
kallsyms = "/proc/kallsyms"
# arguments
examples = """examples:
./drsnoop # trace all direct reclaim
./drsnoop -T # include timestamps
./drsnoop -U # include UID
./drsnoop -P 181 # only trace PID 181
./drsnoop -t 123 # only trace TID 123
./drsnoop -u 1000 # only trace UID 1000
./drsnoop -d 10 # trace for 10 seconds only
./drsnoop -n main # only print process names containing "main"
"""
parser = argparse.ArgumentParser(
description="Trace direct reclaim",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples)
parser.add_argument("-T", "--timestamp", action="store_true",
help="include timestamp on output")
parser.add_argument("-U", "--print-uid", action="store_true",
help="print UID column")
parser.add_argument("-p", "--pid",
help="trace this PID only")
parser.add_argument("-t", "--tid",
help="trace this TID only")
parser.add_argument("-u", "--uid",
help="trace this UID only")
parser.add_argument("-d", "--duration",
help="total duration of trace in seconds")
parser.add_argument("-n", "--name",
type=ArgString,
help="only print process names containing this name")
parser.add_argument("-v", "--verbose", action="store_true",
help="show system memory state")
parser.add_argument("--ebpf", action="store_true",
help=argparse.SUPPRESS)
args = parser.parse_args()
debug = 0
if args.duration:
args.duration = timedelta(seconds=int(args.duration))
# vm_stat
vm_stat_addr = ''
with open(kallsyms) as syms:
for line in syms:
(addr, size, name) = line.rstrip().split(" ", 2)
name = name.split("\t")[0]
if name == "vm_stat":
vm_stat_addr = "0x" + addr
break
if name == "vm_zone_stat":
vm_stat_addr = "0x" + addr
break
if vm_stat_addr == '':
print("ERROR: no vm_stat or vm_zone_stat in /proc/kallsyms. Exiting.")
print("HINT: the kernel should be built with CONFIG_KALLSYMS_ALL.")
exit()
NR_FREE_PAGES = 0
PAGE_SIZE = os.sysconf("SC_PAGE_SIZE")
PAGE_SHIFT = int(math.log(PAGE_SIZE) / math.log(2))
def K(x):
return x << (PAGE_SHIFT - 10)
# load BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
struct val_t {
u64 id;
u64 ts; // start time
char name[TASK_COMM_LEN];
u64 vm_stat[NR_VM_ZONE_STAT_ITEMS];
};
struct data_t {
u64 id;
u32 uid;
u64 nr_reclaimed;
u64 delta;
u64 ts; // end time
char name[TASK_COMM_LEN];
u64 vm_stat[NR_VM_ZONE_STAT_ITEMS];
};
BPF_HASH(start, u64, struct val_t);
BPF_PERF_OUTPUT(events);
TRACEPOINT_PROBE(vmscan, mm_vmscan_direct_reclaim_begin) {
struct val_t val = {};
u64 id = bpf_get_current_pid_tgid();
u32 pid = id >> 32; // PID is higher part
u32 tid = id; // Cast and get the lower part
u32 uid = bpf_get_current_uid_gid();
u64 ts;
PID_TID_FILTER
UID_FILTER
if (bpf_get_current_comm(&val.name, sizeof(val.name)) == 0) {
val.id = id;
val.ts = bpf_ktime_get_ns();
bpf_probe_read_kernel(&val.vm_stat, sizeof(val.vm_stat), (const void *)%s);
start.update(&id, &val);
}
return 0;
}
TRACEPOINT_PROBE(vmscan, mm_vmscan_direct_reclaim_end) {
u64 id = bpf_get_current_pid_tgid();
struct val_t *valp;
struct data_t data = {};
u64 ts = bpf_ktime_get_ns();
valp = start.lookup(&id);
if (valp == NULL) {
// missed entry
return 0;
}
data.delta = ts - valp->ts;
data.ts = ts / 1000;
data.id = valp->id;
data.uid = bpf_get_current_uid_gid();
bpf_probe_read_kernel(&data.name, sizeof(data.name), valp->name);
bpf_probe_read_kernel(&data.vm_stat, sizeof(data.vm_stat), valp->vm_stat);
data.nr_reclaimed = args->nr_reclaimed;
events.perf_submit(args, &data, sizeof(data));
start.delete(&id);
return 0;
}
""" % vm_stat_addr
if args.tid: # TID trumps PID
bpf_text = bpf_text.replace('PID_TID_FILTER',
'if (tid != %s) { return 0; }' % args.tid)
elif args.pid:
bpf_text = bpf_text.replace('PID_TID_FILTER',
'if (pid != %s) { return 0; }' % args.pid)
else:
bpf_text = bpf_text.replace('PID_TID_FILTER', '')
if args.uid:
bpf_text = bpf_text.replace('UID_FILTER',
'if (uid != %s) { return 0; }' % args.uid)
else:
bpf_text = bpf_text.replace('UID_FILTER', '')
if debug or args.ebpf:
print(bpf_text)
if args.ebpf:
exit()
# initialize BPF
b = BPF(text=bpf_text)
initial_ts = 0
# header
if args.timestamp:
print("%-14s" % ("TIME(s)"), end="")
if args.print_uid:
print("%-6s" % ("UID"), end="")
print("%-14s %-6s %8s %5s" %
("COMM", "TID" if args.tid else "PID", "LAT(ms)", "PAGES"), end="")
if args.verbose:
print("%10s" % ("FREE(KB)"))
else:
print("")
# process event
def print_event(cpu, data, size):
event = b["events"].event(data)
global initial_ts
if not initial_ts:
initial_ts = event.ts
if args.name and bytes(args.name) not in event.name:
return
if args.timestamp:
delta = event.ts - initial_ts
print("%-14.9f" % (float(delta) / 1000000), end="")
if args.print_uid:
print("%-6d" % event.uid, end="")
print("%-14.14s %-6s %8.2f %5d" %
(event.name.decode('utf-8', 'replace'),
event.id & 0xffffffff if args.tid else event.id >> 32,
float(event.delta) / 1000000, event.nr_reclaimed), end="")
if args.verbose:
print("%10d" % K(event.vm_stat[NR_FREE_PAGES]))
else:
print("")
sys.stdout.flush()
# loop with callback to print_event
b["events"].open_perf_buffer(print_event, page_cnt=64)
start_time = datetime.now()
while not args.duration or datetime.now() - start_time < args.duration:
try:
b.perf_buffer_poll()
except KeyboardInterrupt:
exit()