Zwb | 6d147e6 | 2019-03-10 01:59:53 +0800 | [diff] [blame] | 1 | #!/usr/bin/python |
| 2 | # @lint-avoid-python-3-compatibility-imports |
| 3 | # |
| 4 | # drsnoop Trace direct reclaim and print details including issuing PID. |
| 5 | # For Linux, uses BCC, eBPF. |
| 6 | # |
| 7 | # This uses in-kernel eBPF maps to cache process details (PID and comm) by |
| 8 | # direct reclaim begin, as well as a starting timestamp for calculating |
| 9 | # latency. |
| 10 | # |
Wenbo Zhang | c421970 | 2020-03-04 04:31:27 +0000 | [diff] [blame] | 11 | # Copyright (c) 2019 Wenbo Zhang |
Zwb | 6d147e6 | 2019-03-10 01:59:53 +0800 | [diff] [blame] | 12 | # Licensed under the Apache License, Version 2.0 (the "License") |
| 13 | # |
Wenbo Zhang | c421970 | 2020-03-04 04:31:27 +0000 | [diff] [blame] | 14 | # 20-Feb-2019 Wenbo Zhang Created this. |
| 15 | # 09-Mar-2019 Wenbo Zhang Updated for show sys mem info. |
Zwb | 6d147e6 | 2019-03-10 01:59:53 +0800 | [diff] [blame] | 16 | |
| 17 | from __future__ import print_function |
| 18 | from bcc import ArgString, BPF |
| 19 | import argparse |
| 20 | from datetime import datetime, timedelta |
| 21 | import os |
| 22 | import math |
xingfeng2510 | 7b2c114 | 2022-03-24 10:51:26 +0800 | [diff] [blame] | 23 | import sys |
Zwb | 6d147e6 | 2019-03-10 01:59:53 +0800 | [diff] [blame] | 24 | |
| 25 | # symbols |
| 26 | kallsyms = "/proc/kallsyms" |
| 27 | |
| 28 | # arguments |
| 29 | examples = """examples: |
| 30 | ./drsnoop # trace all direct reclaim |
| 31 | ./drsnoop -T # include timestamps |
| 32 | ./drsnoop -U # include UID |
| 33 | ./drsnoop -P 181 # only trace PID 181 |
| 34 | ./drsnoop -t 123 # only trace TID 123 |
| 35 | ./drsnoop -u 1000 # only trace UID 1000 |
| 36 | ./drsnoop -d 10 # trace for 10 seconds only |
| 37 | ./drsnoop -n main # only print process names containing "main" |
| 38 | """ |
| 39 | parser = argparse.ArgumentParser( |
| 40 | description="Trace direct reclaim", |
| 41 | formatter_class=argparse.RawDescriptionHelpFormatter, |
| 42 | epilog=examples) |
| 43 | parser.add_argument("-T", "--timestamp", action="store_true", |
| 44 | help="include timestamp on output") |
| 45 | parser.add_argument("-U", "--print-uid", action="store_true", |
| 46 | help="print UID column") |
| 47 | parser.add_argument("-p", "--pid", |
| 48 | help="trace this PID only") |
| 49 | parser.add_argument("-t", "--tid", |
| 50 | help="trace this TID only") |
| 51 | parser.add_argument("-u", "--uid", |
| 52 | help="trace this UID only") |
| 53 | parser.add_argument("-d", "--duration", |
| 54 | help="total duration of trace in seconds") |
| 55 | parser.add_argument("-n", "--name", |
| 56 | type=ArgString, |
| 57 | help="only print process names containing this name") |
| 58 | parser.add_argument("-v", "--verbose", action="store_true", |
| 59 | help="show system memory state") |
| 60 | parser.add_argument("--ebpf", action="store_true", |
| 61 | help=argparse.SUPPRESS) |
| 62 | args = parser.parse_args() |
| 63 | debug = 0 |
| 64 | if args.duration: |
| 65 | args.duration = timedelta(seconds=int(args.duration)) |
| 66 | |
| 67 | |
| 68 | # vm_stat |
| 69 | vm_stat_addr = '' |
| 70 | with open(kallsyms) as syms: |
| 71 | for line in syms: |
| 72 | (addr, size, name) = line.rstrip().split(" ", 2) |
| 73 | name = name.split("\t")[0] |
| 74 | if name == "vm_stat": |
| 75 | vm_stat_addr = "0x" + addr |
| 76 | break |
| 77 | if name == "vm_zone_stat": |
| 78 | vm_stat_addr = "0x" + addr |
| 79 | break |
| 80 | if vm_stat_addr == '': |
| 81 | print("ERROR: no vm_stat or vm_zone_stat in /proc/kallsyms. Exiting.") |
| 82 | print("HINT: the kernel should be built with CONFIG_KALLSYMS_ALL.") |
| 83 | exit() |
| 84 | |
| 85 | NR_FREE_PAGES = 0 |
| 86 | |
| 87 | PAGE_SIZE = os.sysconf("SC_PAGE_SIZE") |
| 88 | PAGE_SHIFT = int(math.log(PAGE_SIZE) / math.log(2)) |
| 89 | |
| 90 | def K(x): |
| 91 | return x << (PAGE_SHIFT - 10) |
| 92 | |
| 93 | # load BPF program |
| 94 | bpf_text = """ |
| 95 | #include <uapi/linux/ptrace.h> |
| 96 | #include <linux/sched.h> |
| 97 | #include <linux/mmzone.h> |
| 98 | |
| 99 | struct val_t { |
| 100 | u64 id; |
| 101 | u64 ts; // start time |
| 102 | char name[TASK_COMM_LEN]; |
| 103 | u64 vm_stat[NR_VM_ZONE_STAT_ITEMS]; |
| 104 | }; |
| 105 | |
| 106 | struct data_t { |
| 107 | u64 id; |
| 108 | u32 uid; |
| 109 | u64 nr_reclaimed; |
| 110 | u64 delta; |
| 111 | u64 ts; // end time |
| 112 | char name[TASK_COMM_LEN]; |
| 113 | u64 vm_stat[NR_VM_ZONE_STAT_ITEMS]; |
| 114 | }; |
| 115 | |
| 116 | BPF_HASH(start, u64, struct val_t); |
| 117 | BPF_PERF_OUTPUT(events); |
| 118 | |
| 119 | TRACEPOINT_PROBE(vmscan, mm_vmscan_direct_reclaim_begin) { |
| 120 | struct val_t val = {}; |
| 121 | u64 id = bpf_get_current_pid_tgid(); |
| 122 | u32 pid = id >> 32; // PID is higher part |
| 123 | u32 tid = id; // Cast and get the lower part |
| 124 | u32 uid = bpf_get_current_uid_gid(); |
| 125 | u64 ts; |
| 126 | |
| 127 | PID_TID_FILTER |
| 128 | UID_FILTER |
| 129 | if (bpf_get_current_comm(&val.name, sizeof(val.name)) == 0) { |
| 130 | val.id = id; |
| 131 | val.ts = bpf_ktime_get_ns(); |
Sumanth Korikkar | 7f6066d | 2020-05-20 10:49:56 -0500 | [diff] [blame] | 132 | bpf_probe_read_kernel(&val.vm_stat, sizeof(val.vm_stat), (const void *)%s); |
Zwb | 6d147e6 | 2019-03-10 01:59:53 +0800 | [diff] [blame] | 133 | start.update(&id, &val); |
| 134 | } |
| 135 | return 0; |
| 136 | } |
| 137 | |
| 138 | TRACEPOINT_PROBE(vmscan, mm_vmscan_direct_reclaim_end) { |
| 139 | u64 id = bpf_get_current_pid_tgid(); |
| 140 | struct val_t *valp; |
| 141 | struct data_t data = {}; |
| 142 | u64 ts = bpf_ktime_get_ns(); |
| 143 | |
| 144 | valp = start.lookup(&id); |
| 145 | if (valp == NULL) { |
| 146 | // missed entry |
| 147 | return 0; |
| 148 | } |
| 149 | |
| 150 | data.delta = ts - valp->ts; |
| 151 | data.ts = ts / 1000; |
| 152 | data.id = valp->id; |
| 153 | data.uid = bpf_get_current_uid_gid(); |
Sumanth Korikkar | 7f6066d | 2020-05-20 10:49:56 -0500 | [diff] [blame] | 154 | bpf_probe_read_kernel(&data.name, sizeof(data.name), valp->name); |
| 155 | bpf_probe_read_kernel(&data.vm_stat, sizeof(data.vm_stat), valp->vm_stat); |
Zwb | 6d147e6 | 2019-03-10 01:59:53 +0800 | [diff] [blame] | 156 | data.nr_reclaimed = args->nr_reclaimed; |
| 157 | |
| 158 | events.perf_submit(args, &data, sizeof(data)); |
| 159 | start.delete(&id); |
| 160 | |
| 161 | return 0; |
| 162 | } |
| 163 | """ % vm_stat_addr |
| 164 | |
| 165 | if args.tid: # TID trumps PID |
| 166 | bpf_text = bpf_text.replace('PID_TID_FILTER', |
| 167 | 'if (tid != %s) { return 0; }' % args.tid) |
| 168 | elif args.pid: |
| 169 | bpf_text = bpf_text.replace('PID_TID_FILTER', |
| 170 | 'if (pid != %s) { return 0; }' % args.pid) |
| 171 | else: |
| 172 | bpf_text = bpf_text.replace('PID_TID_FILTER', '') |
| 173 | if args.uid: |
| 174 | bpf_text = bpf_text.replace('UID_FILTER', |
| 175 | 'if (uid != %s) { return 0; }' % args.uid) |
| 176 | else: |
| 177 | bpf_text = bpf_text.replace('UID_FILTER', '') |
| 178 | if debug or args.ebpf: |
| 179 | print(bpf_text) |
| 180 | if args.ebpf: |
| 181 | exit() |
| 182 | |
| 183 | # initialize BPF |
| 184 | b = BPF(text=bpf_text) |
| 185 | |
| 186 | initial_ts = 0 |
| 187 | |
| 188 | # header |
| 189 | if args.timestamp: |
| 190 | print("%-14s" % ("TIME(s)"), end="") |
| 191 | if args.print_uid: |
| 192 | print("%-6s" % ("UID"), end="") |
| 193 | print("%-14s %-6s %8s %5s" % |
| 194 | ("COMM", "TID" if args.tid else "PID", "LAT(ms)", "PAGES"), end="") |
| 195 | if args.verbose: |
| 196 | print("%10s" % ("FREE(KB)")) |
| 197 | else: |
| 198 | print("") |
| 199 | |
| 200 | # process event |
| 201 | def print_event(cpu, data, size): |
| 202 | event = b["events"].event(data) |
| 203 | |
| 204 | global initial_ts |
| 205 | |
| 206 | if not initial_ts: |
| 207 | initial_ts = event.ts |
| 208 | |
| 209 | if args.name and bytes(args.name) not in event.name: |
| 210 | return |
| 211 | |
| 212 | if args.timestamp: |
| 213 | delta = event.ts - initial_ts |
| 214 | print("%-14.9f" % (float(delta) / 1000000), end="") |
| 215 | |
| 216 | if args.print_uid: |
| 217 | print("%-6d" % event.uid, end="") |
| 218 | |
| 219 | print("%-14.14s %-6s %8.2f %5d" % |
| 220 | (event.name.decode('utf-8', 'replace'), |
| 221 | event.id & 0xffffffff if args.tid else event.id >> 32, |
| 222 | float(event.delta) / 1000000, event.nr_reclaimed), end="") |
| 223 | if args.verbose: |
| 224 | print("%10d" % K(event.vm_stat[NR_FREE_PAGES])) |
| 225 | else: |
| 226 | print("") |
| 227 | |
xingfeng2510 | 7b2c114 | 2022-03-24 10:51:26 +0800 | [diff] [blame] | 228 | sys.stdout.flush() |
| 229 | |
Zwb | 6d147e6 | 2019-03-10 01:59:53 +0800 | [diff] [blame] | 230 | |
| 231 | # loop with callback to print_event |
| 232 | b["events"].open_perf_buffer(print_event, page_cnt=64) |
| 233 | start_time = datetime.now() |
| 234 | while not args.duration or datetime.now() - start_time < args.duration: |
| 235 | try: |
| 236 | b.perf_buffer_poll() |
| 237 | except KeyboardInterrupt: |
| 238 | exit() |