#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# criticalstat  Trace long critical sections (IRQs or preemption disabled)
#               For Linux, uses BCC, eBPF. Requires kernel built with
#               CONFIG_DEBUG_PREEMPT and CONFIG_PREEMPTIRQ_EVENTS
#
# USAGE: criticalstat [-h] [-p] [-i] [-d DURATION]
#
# Copyright (c) 2018, Google, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# By Joel Fernandes <joel@joelfernandes.org>

from __future__ import print_function
from bcc import BPF
import argparse
import ctypes as ct
import sys
import subprocess
import os.path

examples = """
examples:
    ./criticalstat          # run with default options: irqoff for more than 100us
    ./criticalstat -p       # find sections with preemption disabled for more than 100us
    ./criticalstat -i       # find sections with IRQs disabled for more than 100us
    ./criticalstat -d 500   # find sections with IRQs disabled for more than 500us
"""

parser = argparse.ArgumentParser(
    description="Trace long critical sections",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)

parser.add_argument("-p", "--preemptoff", action="store_true",
                    help="Find long sections where preemption was off")

parser.add_argument("-i", "--irqoff", action="store_true",
                    help="Find long sections where IRQs were off")

parser.add_argument("-d", "--duration", default=100,
                    help="Duration in us (microseconds) below which sections are filtered out")

args = parser.parse_args()

preemptoff = False
irqoff = False

if args.irqoff:
    preemptoff = False
    irqoff = True
elif args.preemptoff:
    preemptoff = True
    irqoff = False

debugfs_path = subprocess.Popen("cat /proc/mounts | grep -w debugfs" +
                                " | awk '{print $2}'",
                                shell=True,
                                stdout=subprocess.PIPE).stdout.read().split(b"\n")[0]

if debugfs_path == b"":
    print("ERROR: Unable to find debugfs mount point")
    sys.exit(1)

trace_path = debugfs_path + b"/tracing/events/preemptirq/"

if (not os.path.exists(trace_path + b"irq_disable") or
        not os.path.exists(trace_path + b"irq_enable") or
        not os.path.exists(trace_path + b"preempt_disable") or
        not os.path.exists(trace_path + b"preempt_enable")):
    print("ERROR: required tracing events are not available\n" +
          "Make sure the kernel is built with CONFIG_DEBUG_PREEMPT " +
          "and CONFIG_PREEMPTIRQ_EVENTS enabled. Also please disable " +
          "CONFIG_PROVE_LOCKING and CONFIG_LOCKDEP on older kernels.")
    sys.exit(1)

bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>

enum addr_offs {
    START_CALLER_OFF,
    START_PARENT_OFF,
    END_CALLER_OFF,
    END_PARENT_OFF
};

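/*
 * Per-CPU scratch state for the critical section currently open on
 * this CPU: the entry call-site offsets and timestamp, whether a
 * section is active, and whether it overlapped idle and so must be
 * skipped.
 */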
struct start_data {
    u32 addr_offs[2];
    u64 ts;
    int idle_skip;
    int active;
};

struct data_t {
    u64 time;
    s64 stack_id;
    u32 cpu;
    u64 id;
    u32 addrs[4];   /* indexed by addr_offs */
    char comm[TASK_COMM_LEN];
};

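/*
 * stack_traces: kernel stacks captured at critical-section exit.
 * sts: per-CPU slot holding the state of the section in progress.
 * isidle: per-CPU flag, nonzero while the CPU is in the idle loop.
 * events: perf ring buffer for pushing samples to user space.
 */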
BPF_STACK_TRACE(stack_traces, 16384);
BPF_PERCPU_ARRAY(sts, struct start_data, 1);
BPF_PERCPU_ARRAY(isidle, u64, 1);
BPF_PERF_OUTPUT(events);

/*
 * In the below code we install tracepoint probes on preempt or
 * IRQ disable/enable critical sections and idle events; the cases
 * are combinations of 4 different states.
 * The states are defined as:
 * CSenter: A critical section has been entered, either due to
 *          preempt disable or irq disable.
 * CSexit: A critical section has been exited, either due to
 *         preempt enable or irq enable.
 * Ienter: The CPU has entered an idle state.
 * Iexit: The CPU has exited an idle state.
 *
 * The scenario we are trying to detect is whether a critical section
 * overlaps with idle entry/exit. Any such section is not recorded,
 * since it is not worthwhile to report and just adds noise.
 */
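/*
 * For example, in the sequence CSenter, Ienter, Iexit, CSexit:
 *
 *   CSenter ........ Ienter ........ Iexit ........ CSexit
 *   |------------------ critical section ---------------|
 *                    |----- CPU idle -----|
 *
 * the section is discarded, because most of the measured time was
 * spent idle rather than running with preemption/IRQs disabled.
 */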
TRACEPOINT_PROBE(power, cpu_idle)
{
    int idx = 0;
    u64 val;
    struct start_data *stdp, std;

    // Mark any in-progress critical section so that it gets skipped.

    // Handle the case CSenter, Ienter, CSexit, Iexit
    // Handle the case CSenter, Ienter, Iexit, CSexit
    stdp = sts.lookup(&idx);
    if (stdp && stdp->active) {
        /*
         * Due to verifier issues, we have to copy contents
         * of stdp onto the stack before the update.
         * Fix it to directly update once kernel patch d71962f
         * becomes more widespread.
         */
        std = *stdp;
        std.idle_skip = 1;
        sts.update(&idx, &std);
    }

    // Mark whether the CPU is inside the idle loop. On idle entry,
    // state is a small C-state index; on idle exit the tracepoint
    // reports PWR_EVENT_EXIT ((u32)-1), hence the "< 100" test.
    if (args->state < 100) {
        val = 1;
        isidle.update(&idx, &val);
    } else {
        val = 0;
        isidle.update(&idx, &val);
    }
    return 0;
}

static int in_idle(void)
{
    u64 *idlep;
    int idx = 0;

    // Skip the event if we're in the idle loop
    idlep = isidle.lookup(&idx);
    if (idlep && *idlep)
        return 1;
    return 0;
}

static void reset_state(void)
{
    int idx = 0;
    struct start_data s = {};

    sts.update(&idx, &s);
}

TRACEPOINT_PROBE(preemptirq, TYPE_disable)
{
    int idx = 0;
    struct start_data s;

    // Handle the case Ienter, CSenter, CSexit, Iexit
    // Handle the case Ienter, CSenter, Iexit, CSexit
    if (in_idle()) {
        reset_state();
        return 0;
    }

    u64 ts = bpf_ktime_get_ns();

    s.idle_skip = 0;
    s.addr_offs[START_CALLER_OFF] = args->caller_offs;
    s.addr_offs[START_PARENT_OFF] = args->parent_offs;
    s.ts = ts;
    s.active = 1;

    sts.update(&idx, &s);
    return 0;
}

TRACEPOINT_PROBE(preemptirq, TYPE_enable)
{
    int idx = 0;
    u64 start_ts, end_ts, diff;
    struct start_data *stdp;

    // Handle the case CSenter, Ienter, CSexit, Iexit
    // Handle the case Ienter, CSenter, CSexit, Iexit
    if (in_idle()) {
        reset_state();
        return 0;
    }

    stdp = sts.lookup(&idx);
    if (!stdp) {
        reset_state();
        return 0;
    }

    // Handle the case Ienter, CSenter, Iexit, CSexit
    if (!stdp->active) {
        reset_state();
        return 0;
    }

    // Handle the case CSenter, Ienter, Iexit, CSexit
    if (stdp->idle_skip) {
        reset_state();
        return 0;
    }

    end_ts = bpf_ktime_get_ns();
    start_ts = stdp->ts;

    if (start_ts > end_ts) {
        reset_state();
        return 0;
    }

    diff = end_ts - start_ts;

    if (diff < DURATION) {
        reset_state();
        return 0;
    }

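    // The section was long enough to report: record the task, both
    // entry and exit call sites, the kernel stack and the duration,
    // and push the sample to user space through the perf buffer.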
    u64 id = bpf_get_current_pid_tgid();
    struct data_t data = {};

    if (bpf_get_current_comm(&data.comm, sizeof(data.comm)) == 0) {
        data.addrs[START_CALLER_OFF] = stdp->addr_offs[START_CALLER_OFF];
        data.addrs[START_PARENT_OFF] = stdp->addr_offs[START_PARENT_OFF];
        data.addrs[END_CALLER_OFF] = args->caller_offs;
        data.addrs[END_PARENT_OFF] = args->parent_offs;

        data.id = id;
        data.stack_id = stack_traces.get_stackid(args, 0);
        data.time = diff;
        data.cpu = bpf_get_smp_processor_id();
        events.perf_submit(args, &data, sizeof(data));
    }

    reset_state();
    return 0;
}
"""
bpf_text = bpf_text.replace('DURATION', '{}'.format(int(args.duration) * 1000))

if preemptoff:
    bpf_text = bpf_text.replace('TYPE', 'preempt')
else:
    bpf_text = bpf_text.replace('TYPE', 'irq')

b = BPF(text=bpf_text)

TASK_COMM_LEN = 16  # linux/sched.h

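# Userspace mirror of struct data_t above; field order and sizes must
# match for ctypes to decode the perf samples correctly.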
class Data(ct.Structure):
    _fields_ = [
        ("time", ct.c_ulonglong),
        ("stack_id", ct.c_longlong),
        ("cpu", ct.c_int),
        ("id", ct.c_ulonglong),
        ("addrs", ct.c_int * 4),
        ("comm", ct.c_char * TASK_COMM_LEN),
    ]

def get_syms(kstack):
    syms = []

    for addr in kstack:
        s = b.ksym(addr, show_offset=True)
        syms.append(s)

    return syms

# process event
def print_event(cpu, data, size):
    try:
        event = ct.cast(data, ct.POINTER(Data)).contents
        stack_traces = b['stack_traces']
        stext = b.ksymname('_stext')

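        # The tracepoints record caller/parent addresses as offsets
        # from _stext, so re-base them before symbolization below.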
        print("===================================")
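        # event.id packs the tgid in the upper 32 bits and the thread
        # id in the lower 32 bits (from bpf_get_current_pid_tgid()).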
        print("TASK: %s (pid %5d tid %5d) Total Time: %-9.3fus\n\n" %
              (event.comm, (event.id >> 32), (event.id & 0xffffffff),
               float(event.time) / 1000), end="")
        print("Section start: {} -> {}".format(
            b.ksym(stext + event.addrs[0]), b.ksym(stext + event.addrs[1])))
        print("Section end:   {} -> {}".format(
            b.ksym(stext + event.addrs[2]), b.ksym(stext + event.addrs[3])))

        if event.stack_id >= 0:
            kstack = stack_traces.walk(event.stack_id)
            syms = get_syms(kstack)
            if not syms:
                return

            for s in syms:
                print(" %s" % s)
        else:
            print("NO STACK FOUND DUE TO COLLISION")
        print("===================================")
        print("")
    except KeyboardInterrupt:
        sys.exit(0)

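# A large perf buffer (256 pages per CPU) reduces the chance of
# dropped samples when many long critical sections fire in a burst.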
325b["events"].open_perf_buffer(print_event, page_cnt=256)
326
327print("Finding critical section with {} disabled for > {}us".format(
328 ('preempt' if preemptoff else 'IRQ'), args.duration))
329
330while 1:
331 b.perf_buffer_poll();