blob: 4b4661068443aed1847b2a9deafd428b02c8582c [file] [log] [blame]
Jiri Olsaac00ac52019-11-15 12:45:59 +01001#!/usr/bin/python
2#
Yonghong Song71f9c2a2019-12-04 19:45:27 -08003# klockstat traces lock events and display locks statistics.
Jiri Olsaac00ac52019-11-15 12:45:59 +01004#
Yonghong Song71f9c2a2019-12-04 19:45:27 -08005# USAGE: klockstat
Jiri Olsaac00ac52019-11-15 12:45:59 +01006#
7
8from __future__ import print_function
9from bcc import BPF, USDT
10import argparse
11import subprocess
12import ctypes as ct
13from time import sleep, strftime
14from datetime import datetime, timedelta
15import errno
16from sys import stderr
17
18examples = """
Yonghong Song71f9c2a2019-12-04 19:45:27 -080019 klockstat # trace system wide
20 klockstat -d 5 # trace for 5 seconds only
21 klockstat -i 5 # display stats every 5 seconds
22 klockstat -p 123 # trace locks for PID 123
23 klockstat -t 321 # trace locks for PID 321
24 klockstat -c pipe_ # display stats only for lock callers with 'pipe_' substring
25 klockstat -S acq_count # sort lock acquired results on acquired count
26 klockstat -S hld_total # sort lock held results on total held time
27 klockstat -S acq_count,hld_total # combination of above
28 klockstat -n 3 # display 3 locks
29 klockstat -s 3 # display 3 levels of stack
Jiri Olsaac00ac52019-11-15 12:45:59 +010030"""
31
32# arg validation
33def positive_int(val):
34 try:
35 ival = int(val)
36 except ValueError:
37 raise argparse.ArgumentTypeError("must be an integer")
38
39 if ival < 0:
40 raise argparse.ArgumentTypeError("must be positive")
41 return ival
42
43def positive_nonzero_int(val):
44 ival = positive_int(val)
45 if ival == 0:
46 raise argparse.ArgumentTypeError("must be nonzero")
47 return ival
48
49def stack_id_err(stack_id):
Michael Prokopc14d02a2020-01-09 02:29:18 +010050 # -EFAULT in get_stackid normally means the stack-trace is not available,
Jiri Olsaac00ac52019-11-15 12:45:59 +010051 # Such as getting kernel stack trace in userspace code
52 return (stack_id < 0) and (stack_id != -errno.EFAULT)
53
54parser = argparse.ArgumentParser(
55 description="",
56 formatter_class=argparse.RawDescriptionHelpFormatter,
57 epilog=examples)
58
59time_group = parser.add_mutually_exclusive_group()
60time_group.add_argument("-d", "--duration", type=int,
61 help="total duration of trace in seconds")
62time_group.add_argument("-i", "--interval", type=int,
63 help="print summary at this interval (seconds)")
64parser.add_argument("-n", "--locks", type=int, default=999999999,
65 help="print given number of locks")
66parser.add_argument("-s", "--stacks", type=int, default=1,
67 help="print given number of stack entries")
68parser.add_argument("-c", "--caller",
69 help="print locks taken by given caller")
70parser.add_argument("-S", "--sort",
71 help="sort data on <aq_field,hd_field>, fields: acq_[max|total|count] hld_[max|total|count]")
72parser.add_argument("-p", "--pid",
73 help="trace this PID only")
74parser.add_argument("-t", "--tid",
75 help="trace this TID only")
76parser.add_argument("--stack-storage-size", default=16384,
77 type=positive_nonzero_int,
78 help="the number of unique stack traces that can be stored and "
79 "displayed (default 16384)")
80
81args = parser.parse_args()
82
83program = """
84#include <uapi/linux/ptrace.h>
85
86struct depth_id {
87 u64 id;
88 u64 depth;
89};
90
91BPF_ARRAY(enabled, u64, 1);
92BPF_HASH(track, u64, u64);
93BPF_HASH(time_aq, u64, u64);
94BPF_HASH(lock_depth, u64, u64);
95BPF_HASH(time_held, struct depth_id, u64);
96BPF_HASH(stack, struct depth_id, int);
97
98BPF_HASH(aq_report_count, int, u64);
99BPF_HASH(aq_report_max, int, u64);
100BPF_HASH(aq_report_total, int, u64);
101
102BPF_HASH(hl_report_count, int, u64);
103BPF_HASH(hl_report_max, int, u64);
104BPF_HASH(hl_report_total, int, u64);
105
106BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE);
107
108static bool is_enabled(void)
109{
110 int key = 0;
111 u64 *ret;
112
113 ret = enabled.lookup(&key);
114 return ret && *ret == 1;
115}
116
117static bool allow_pid(u64 id)
118{
119 u32 pid = id >> 32; // PID is higher part
120 u32 tid = id; // Cast and get the lower part
121
122 FILTER
123
124 return 1;
125}
126
127int mutex_lock_enter(struct pt_regs *ctx)
128{
129 if (!is_enabled())
130 return 0;
131
132 u64 id = bpf_get_current_pid_tgid();
133
134 if (!allow_pid(id))
135 return 0;
136
137 u64 one = 1, zero = 0;
138
139 track.update(&id, &one);
140
141 u64 *depth = lock_depth.lookup(&id);
142
143 if (!depth) {
144 lock_depth.update(&id, &zero);
145
146 depth = lock_depth.lookup(&id);
147 /* something is wrong.. */
148 if (!depth)
149 return 0;
150 }
151
152 int stackid = stack_traces.get_stackid(ctx, 0);
153 struct depth_id did = {
154 .id = id,
155 .depth = *depth,
156 };
157 stack.update(&did, &stackid);
158
159 u64 ts = bpf_ktime_get_ns();
160 time_aq.update(&id, &ts);
161
162 *depth += 1;
163 return 0;
164}
165
166static void update_aq_report_count(int *stackid)
167{
168 u64 *count, one = 1;
169
170 count = aq_report_count.lookup(stackid);
171 if (!count) {
172 aq_report_count.update(stackid, &one);
173 } else {
174 *count += 1;
175 }
176}
177
178static void update_hl_report_count(int *stackid)
179{
180 u64 *count, one = 1;
181
182 count = hl_report_count.lookup(stackid);
183 if (!count) {
184 hl_report_count.update(stackid, &one);
185 } else {
186 *count += 1;
187 }
188}
189
190static void update_aq_report_max(int *stackid, u64 time)
191{
192 u64 *max;
193
194 max = aq_report_max.lookup(stackid);
195 if (!max || *max < time)
196 aq_report_max.update(stackid, &time);
197}
198
199static void update_hl_report_max(int *stackid, u64 time)
200{
201 u64 *max;
202
203 max = hl_report_max.lookup(stackid);
204 if (!max || *max < time)
205 hl_report_max.update(stackid, &time);
206}
207
208static void update_aq_report_total(int *stackid, u64 delta)
209{
210 u64 *count, *time;
211
212 count = aq_report_count.lookup(stackid);
213 if (!count)
214 return;
215
216 time = aq_report_total.lookup(stackid);
217 if (!time) {
218 aq_report_total.update(stackid, &delta);
219 } else {
220 *time = *time + delta;
221 }
222}
223
224static void update_hl_report_total(int *stackid, u64 delta)
225{
226 u64 *count, *time;
227
228 count = hl_report_count.lookup(stackid);
229 if (!count)
230 return;
231
232 time = hl_report_total.lookup(stackid);
233 if (!time) {
234 hl_report_total.update(stackid, &delta);
235 } else {
236 *time = *time + delta;
237 }
238}
239
240int mutex_lock_return(struct pt_regs *ctx)
241{
242 if (!is_enabled())
243 return 0;
244
245 u64 id = bpf_get_current_pid_tgid();
246
247 if (!allow_pid(id))
248 return 0;
249
250 u64 *one = track.lookup(&id);
251
252 if (!one)
253 return 0;
254
255 track.delete(&id);
256
257 u64 *depth = lock_depth.lookup(&id);
258 if (!depth)
259 return 0;
260
261 struct depth_id did = {
262 .id = id,
263 .depth = *depth - 1,
264 };
265
266 u64 *aq = time_aq.lookup(&id);
267 if (!aq)
268 return 0;
269
270 int *stackid = stack.lookup(&did);
271 if (!stackid)
272 return 0;
273
uppinggoerea8bbbf2019-11-29 15:33:44 +0800274 int stackid_ = *stackid;
Jiri Olsaac00ac52019-11-15 12:45:59 +0100275 u64 cur = bpf_ktime_get_ns();
276
277 if (cur > *aq) {
278 int val = cur - *aq;
uppinggoerea8bbbf2019-11-29 15:33:44 +0800279 update_aq_report_count(&stackid_);
280 update_aq_report_max(&stackid_, val);
281 update_aq_report_total(&stackid_, val);
Jiri Olsaac00ac52019-11-15 12:45:59 +0100282 }
283
284 time_held.update(&did, &cur);
285 return 0;
286}
287
288int mutex_unlock_enter(struct pt_regs *ctx)
289{
290 if (!is_enabled())
291 return 0;
292
293 u64 id = bpf_get_current_pid_tgid();
294
295 if (!allow_pid(id))
296 return 0;
297
298 u64 *depth = lock_depth.lookup(&id);
299
300 if (!depth || *depth == 0)
301 return 0;
302
303 *depth -= 1;
304
305 struct depth_id did = {
306 .id = id,
307 .depth = *depth,
308 };
309
310 u64 *held = time_held.lookup(&did);
311 if (!held)
312 return 0;
313
314 int *stackid = stack.lookup(&did);
315 if (!stackid)
316 return 0;
317
uppinggoerea8bbbf2019-11-29 15:33:44 +0800318
319 int stackid_ = *stackid;
Jiri Olsaac00ac52019-11-15 12:45:59 +0100320 u64 cur = bpf_ktime_get_ns();
321
322 if (cur > *held) {
323 u64 val = cur - *held;
uppinggoerea8bbbf2019-11-29 15:33:44 +0800324 update_hl_report_count(&stackid_);
325 update_hl_report_max(&stackid_, val);
326 update_hl_report_total(&stackid_, val);
Jiri Olsaac00ac52019-11-15 12:45:59 +0100327 }
328
329 stack.delete(&did);
330 time_held.delete(&did);
331 return 0;
332}
333
334"""
335
336def sort_list(maxs, totals, counts):
337 if (not args.sort):
338 return maxs;
339
340 for field in args.sort.split(','):
341 if (field == "acq_max" or field == "hld_max"):
342 return maxs
343 if (field == "acq_total" or field == "hld_total"):
344 return totals
345 if (field == "acq_count" or field == "hld_count"):
346 return counts
347
348 print("Wrong sort argument: %s", args.sort)
349 exit(-1)
350
351def display(sort, maxs, totals, counts):
352 global missing_stacks
353 global has_enomem
354
355 for k, v in sorted(sort.items(), key=lambda sort: sort[1].value, reverse=True)[:args.locks]:
356 missing_stacks += int(stack_id_err(k.value))
357 has_enomem = has_enomem or (k.value == -errno.ENOMEM)
358
359 caller = "[Missed Kernel Stack]"
360 stack = []
361
362 if (k.value >= 0):
363 stack = list(stack_traces.walk(k.value))
364 caller = b.ksym(stack[1], show_offset=True)
365
Marko Myllynenc71394a2019-12-16 11:35:25 +0200366 if (args.caller and caller.find(args.caller.encode())):
Jiri Olsaac00ac52019-11-15 12:45:59 +0100367 continue
368
369 avg = totals[k].value / counts[k].value
370
371 print("%40s %10lu %6lu %10lu %10lu" % (caller, avg, counts[k].value, maxs[k].value, totals[k].value))
372
373 for addr in stack[1:args.stacks]:
374 print("%40s" % b.ksym(addr, show_offset=True))
375
376
377if args.tid: # TID trumps PID
378 program = program.replace('FILTER',
379 'if (tid != %s) { return 0; }' % args.tid)
380elif args.pid:
381 program = program.replace('FILTER',
382 'if (pid != %s) { return 0; }' % args.pid)
383else:
384 program = program.replace('FILTER', '')
385
386program = program.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size))
387
388b = BPF(text=program)
389
390b.attach_kprobe(event="mutex_unlock", fn_name="mutex_unlock_enter")
391b.attach_kretprobe(event="mutex_lock", fn_name="mutex_lock_return")
392b.attach_kprobe(event="mutex_lock", fn_name="mutex_lock_enter")
393
394enabled = b.get_table("enabled");
395
396stack_traces = b.get_table("stack_traces")
397aq_counts = b.get_table("aq_report_count")
398aq_maxs = b.get_table("aq_report_max")
399aq_totals = b.get_table("aq_report_total")
400
401hl_counts = b.get_table("hl_report_count")
402hl_maxs = b.get_table("hl_report_max")
403hl_totals = b.get_table("hl_report_total")
404
405aq_sort = sort_list(aq_maxs, aq_totals, aq_counts)
406hl_sort = sort_list(hl_maxs, hl_totals, hl_counts)
407
408print("Tracing lock events... Hit Ctrl-C to end.")
409
410# duration and interval are mutualy exclusive
411exiting = 0 if args.interval else 1
412exiting = 1 if args.duration else 0
413
414seconds = 999999999
415if args.interval:
416 seconds = args.interval
417if args.duration:
418 seconds = args.duration
419
420missing_stacks = 0
421has_enomem = False
422
423while (1):
424 enabled[ct.c_int(0)] = ct.c_int(1)
425
426 try:
427 sleep(seconds)
428 except KeyboardInterrupt:
429 exiting = 1
430
431 enabled[ct.c_int(0)] = ct.c_int(0)
432
433 print("\n%40s %10s %6s %10s %10s" % ("Caller", "Avg Spin", "Count", "Max spin", "Total spin"))
434 display(aq_sort, aq_maxs, aq_totals, aq_counts)
435
436
437 print("\n%40s %10s %6s %10s %10s" % ("Caller", "Avg Hold", "Count", "Max hold", "Total hold"))
438 display(hl_sort, hl_maxs, hl_totals, hl_counts)
439
440 if exiting:
441 break;
442
443 stack_traces.clear()
444 aq_counts.clear()
445 aq_maxs.clear()
446 aq_totals.clear()
447 hl_counts.clear()
448 hl_maxs.clear()
449 hl_totals.clear()
450
451if missing_stacks > 0:
452 enomem_str = " Consider increasing --stack-storage-size."
453 print("WARNING: %d stack traces lost and could not be displayed.%s" %
454 (missing_stacks, (enomem_str if has_enomem else "")),
455 file=stderr)