blob: 13c8c98ca7579eea59412db6464cfbdccf984a8d [file] [log] [blame]
Brendan Greggaf2b46a2016-01-30 11:02:29 -08001#!/usr/bin/python
2#
3# offwaketime Summarize blocked time by kernel off-CPU stack + waker stack
4# For Linux, uses BCC, eBPF.
5#
6# USAGE: offwaketime [-h] [-u] [-p PID] [-T] [duration]
7#
8# The current implementation uses an unrolled loop for x86_64, and was written
9# as a proof of concept. This implementation should be replaced in the future
10# with an appropriate bpf_ call, when available.
11#
12# The Off-CPU stack is currently limited to a stack trace depth of 20
13# (maxtdepth), and the waker stack limited to 10 (maxwdepth). This is also
14# limited to kernel stacks, and x86_64 only. Check for future versions, where
15# these limitations should be removed.
16#
17# Copyright 2016 Netflix, Inc.
18# Licensed under the Apache License, Version 2.0 (the "License")
19#
Alexei Starovoitov7583a4e2016-02-03 21:25:43 -080020# 20-Jan-2016 Brendan Gregg Created this.
Brendan Greggaf2b46a2016-01-30 11:02:29 -080021
22from __future__ import print_function
23from bcc import BPF
Alexei Starovoitov7583a4e2016-02-03 21:25:43 -080024from time import sleep
Brendan Greggaf2b46a2016-01-30 11:02:29 -080025import argparse
26import signal
27
28# arguments
29examples = """examples:
30 ./offwaketime # trace off-CPU + waker stack time until Ctrl-C
31 ./offwaketime 5 # trace for 5 seconds only
32 ./offwaketime -f 5 # 5 seconds, and output in folded format
33 ./offwaketime -u # don't include kernel threads (user only)
34 ./offwaketime -p 185 # trace fo PID 185 only
35"""
36parser = argparse.ArgumentParser(
37 description="Summarize blocked time by kernel stack trace + waker stack",
38 formatter_class=argparse.RawDescriptionHelpFormatter,
39 epilog=examples)
40parser.add_argument("-u", "--useronly", action="store_true",
41 help="user threads only (no kernel threads)")
42parser.add_argument("-p", "--pid",
43 help="trace this PID only")
44parser.add_argument("-v", "--verbose", action="store_true",
45 help="show raw addresses")
46parser.add_argument("-f", "--folded", action="store_true",
47 help="output folded format")
48parser.add_argument("duration", nargs="?", default=99999999,
49 help="duration of trace, in seconds")
50args = parser.parse_args()
51folded = args.folded
52duration = int(args.duration)
53debug = 0
54maxwdepth = 10 # and MAXWDEPTH
55maxtdepth = 20 # and MAXTDEPTH
56if args.pid and args.useronly:
57 print("ERROR: use either -p or -u.")
58 exit()
59
60# signal handler
61def signal_ignore(signal, frame):
62 print()
63
64# define BPF program
65bpf_text = """
66#include <uapi/linux/ptrace.h>
67#include <linux/sched.h>
68
69#define MAXWDEPTH 10
70#define MAXTDEPTH 20
71#define MINBLOCK_US 1
72
73struct key_t {
74 char waker[TASK_COMM_LEN];
75 char target[TASK_COMM_LEN];
76 u64 wret[MAXWDEPTH];
77 u64 tret[MAXTDEPTH];
78};
79BPF_HASH(counts, struct key_t);
80BPF_HASH(start, u32);
81struct wokeby_t {
82 char name[TASK_COMM_LEN];
83 u64 ret[MAXWDEPTH];
84};
85BPF_HASH(wokeby, u32, struct wokeby_t);
86
87static u64 get_frame(u64 *bp) {
88 if (*bp) {
89 // The following stack walker is x86_64 specific
90 u64 ret = 0;
91 if (bpf_probe_read(&ret, sizeof(ret), (void *)(*bp+8)))
92 return 0;
93 if (bpf_probe_read(bp, sizeof(*bp), (void *)*bp))
94 *bp = 0;
95 if (ret < __START_KERNEL_map)
96 return 0;
97 return ret;
98 }
99 return 0;
100}
101
102int waker(struct pt_regs *ctx, struct task_struct *p) {
103 u32 pid = p->pid;
104
105 if (!(FILTER))
106 return 0;
107
108 u64 bp = 0;
109 struct wokeby_t woke = {};
110 int depth = 0;
111 bpf_get_current_comm(&woke.name, sizeof(woke.name));
112 bp = ctx->bp;
113
114 // unrolled loop (MAXWDEPTH):
115 if (!(woke.ret[depth++] = get_frame(&bp))) goto out;
116 if (!(woke.ret[depth++] = get_frame(&bp))) goto out;
117 if (!(woke.ret[depth++] = get_frame(&bp))) goto out;
118 if (!(woke.ret[depth++] = get_frame(&bp))) goto out;
119 if (!(woke.ret[depth++] = get_frame(&bp))) goto out;
120 if (!(woke.ret[depth++] = get_frame(&bp))) goto out;
121 if (!(woke.ret[depth++] = get_frame(&bp))) goto out;
122 if (!(woke.ret[depth++] = get_frame(&bp))) goto out;
123 if (!(woke.ret[depth++] = get_frame(&bp))) goto out;
124 woke.ret[depth] = get_frame(&bp);
125
126out:
127 wokeby.update(&pid, &woke);
128 return 0;
129}
130
131int oncpu(struct pt_regs *ctx, struct task_struct *p) {
132 u32 pid;
133 u64 ts, *tsp;
134
135 // record previous thread sleep time
136 if (FILTER) {
137 pid = p->pid;
138 ts = bpf_ktime_get_ns();
139 start.update(&pid, &ts);
140 }
141
142 // calculate current thread's delta time
143 pid = bpf_get_current_pid_tgid();
144 tsp = start.lookup(&pid);
145 if (tsp == 0)
146 return 0; // missed start or filtered
147 u64 delta = bpf_ktime_get_ns() - *tsp;
148 start.delete(&pid);
149 delta = delta / 1000;
150 if (delta < MINBLOCK_US)
151 return 0;
152
153 // create map key
154 u64 zero = 0, *val, bp = 0;
155 int depth = 0;
156 struct key_t key = {};
157 struct wokeby_t *woke;
158 bpf_get_current_comm(&key.target, sizeof(key.target));
159 bp = ctx->bp;
160
161 // unrolled loop (MAXTDEPTH):
162 if (!(key.tret[depth++] = get_frame(&bp))) goto out;
163 if (!(key.tret[depth++] = get_frame(&bp))) goto out;
164 if (!(key.tret[depth++] = get_frame(&bp))) goto out;
165 if (!(key.tret[depth++] = get_frame(&bp))) goto out;
166 if (!(key.tret[depth++] = get_frame(&bp))) goto out;
167 if (!(key.tret[depth++] = get_frame(&bp))) goto out;
168 if (!(key.tret[depth++] = get_frame(&bp))) goto out;
169 if (!(key.tret[depth++] = get_frame(&bp))) goto out;
170 if (!(key.tret[depth++] = get_frame(&bp))) goto out;
171 if (!(key.tret[depth++] = get_frame(&bp))) goto out;
172
173 if (!(key.tret[depth++] = get_frame(&bp))) goto out;
174 if (!(key.tret[depth++] = get_frame(&bp))) goto out;
175 if (!(key.tret[depth++] = get_frame(&bp))) goto out;
176 if (!(key.tret[depth++] = get_frame(&bp))) goto out;
177 if (!(key.tret[depth++] = get_frame(&bp))) goto out;
178 if (!(key.tret[depth++] = get_frame(&bp))) goto out;
179 if (!(key.tret[depth++] = get_frame(&bp))) goto out;
180 if (!(key.tret[depth++] = get_frame(&bp))) goto out;
181 if (!(key.tret[depth++] = get_frame(&bp))) goto out;
182 key.tret[depth] = get_frame(&bp);
183
184out:
185 woke = wokeby.lookup(&pid);
186 if (woke) {
Alexei Starovoitov7583a4e2016-02-03 21:25:43 -0800187 __builtin_memcpy(&key.wret, woke->ret, sizeof(key.wret));
188 __builtin_memcpy(&key.waker, woke->name, TASK_COMM_LEN);
Brendan Greggaf2b46a2016-01-30 11:02:29 -0800189 wokeby.delete(&pid);
190 }
191
192 val = counts.lookup_or_init(&key, &zero);
193 (*val) += delta;
194 return 0;
195}
196"""
197if args.pid:
198 filter = 'pid == %s' % args.pid
199elif args.useronly:
200 filter = '!(p->flags & PF_KTHREAD)'
201else:
202 filter = '1'
203bpf_text = bpf_text.replace('FILTER', filter)
204if debug:
205 print(bpf_text)
206
207# initialize BPF
208b = BPF(text=bpf_text)
209b.attach_kprobe(event="finish_task_switch", fn_name="oncpu")
210b.attach_kprobe(event="try_to_wake_up", fn_name="waker")
211matched = b.num_open_kprobes()
212if matched == 0:
213 print("0 functions traced. Exiting.")
214 exit()
215
216# header
217if not folded:
218 print("Tracing blocked time (us) by kernel off-CPU and waker stack",
219 end="")
220 if duration < 99999999:
221 print(" for %d secs." % duration)
222 else:
223 print("... Hit Ctrl-C to end.")
224
225# output
226while (1):
227 try:
228 sleep(duration)
229 except KeyboardInterrupt:
230 # as cleanup can take many seconds, trap Ctrl-C:
231 signal.signal(signal.SIGINT, signal_ignore)
232
233 if not folded:
234 print()
235 counts = b.get_table("counts")
236 for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):
237 if folded:
238 # fold target stack
239 line = k.target + ";"
240 for i in reversed(range(0, maxtdepth)):
241 if k.tret[i] == 0:
242 continue
243 line = line + b.ksym(k.tret[i])
244 if i != 0:
245 line = line + ";"
246
247 # add delimiter
248 line = line + ";-"
249
250 # fold waker stack
251 for i in range(0, maxwdepth):
252 line = line + ";"
253 if k.wret[i] == 0:
254 break
255 line = line + b.ksym(k.wret[i])
256 if i != 0:
257 line = line + ";" + k.waker
258
259 # print as a line
260 print("%s %d" % (line, v.value))
261 else:
262 # print wakeup name then stack in reverse order
263 print(" %-16s %s" % ("waker:", k.waker))
264 for i in reversed(range(0, maxwdepth)):
265 if k.wret[i] == 0:
266 continue
267 print(" %-16x %s" % (k.wret[i],
268 b.ksym(k.wret[i])))
269
270 # print delimiter
271 print(" %-16s %s" % ("-", "-"))
272
273 # print default multi-line stack output
274 for i in range(0, maxtdepth):
275 if k.tret[i] == 0:
276 break
277 print(" %-16x %s" % (k.tret[i],
278 b.ksym(k.tret[i])))
279 print(" %-16s %s" % ("target:", k.target))
280 print(" %d\n" % v.value)
281 counts.clear()
282
283 if not folded:
284 print("Detaching...")
285 exit()