blob: 6c94d6c6b6bcfb1d71dc05402d8db66e8dc5f03d [file] [log] [blame]
Alexey Ivanovcc01a9c2019-01-16 09:50:46 -08001#!/usr/bin/python
Ivan Babrou5c48a3f2018-05-08 17:24:19 -07002# @lint-avoid-python-3-compatibility-imports
3#
4# runqslower Trace long process scheduling delays.
5# For Linux, uses BCC, eBPF.
6#
7# This script traces high scheduling delays between tasks being
8# ready to run and them running on CPU after that.
9#
zhenwei pi508d9692021-08-12 18:04:17 +080010# USAGE: runqslower [-p PID] [-t TID] [-P] [min_us]
Ivan Babrou5c48a3f2018-05-08 17:24:19 -070011#
12# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support).
13#
14# This measures the time a task spends waiting on a run queue for a turn
15# on-CPU, and shows this time as a individual events. This time should be small,
16# but a task may need to wait its turn due to CPU load.
17#
18# This measures two types of run queue latency:
19# 1. The time from a task being enqueued on a run queue to its context switch
20# and execution. This traces ttwu_do_wakeup(), wake_up_new_task() ->
21# finish_task_switch() with either raw tracepoints (if supported) or kprobes
22# and instruments the run queue latency after a voluntary context switch.
23# 2. The time from when a task was involuntary context switched and still
24# in the runnable state, to when it next executed. This is instrumented
25# from finish_task_switch() alone.
26#
27# Copyright 2016 Cloudflare, Inc.
28# Licensed under the Apache License, Version 2.0 (the "License")
29#
30# 02-May-2018 Ivan Babrou Created this.
Sumanth Korikkar7f6066d2020-05-20 10:49:56 -050031# 18-Nov-2019 Gergely Bod BUG fix: Use bpf_probe_read_kernel_str() to extract the
Greg Bod65885f32019-11-18 19:35:31 +000032# process name from 'task_struct* next' in raw tp code.
33# bpf_get_current_comm() operates on the current task
34# which might already be different than 'next'.
Ivan Babrou5c48a3f2018-05-08 17:24:19 -070035
36from __future__ import print_function
37from bcc import BPF
38import argparse
39from time import strftime
Ivan Babrou5c48a3f2018-05-08 17:24:19 -070040
41# arguments
42examples = """examples:
43 ./runqslower # trace run queue latency higher than 10000 us (default)
44 ./runqslower 1000 # trace run queue latency higher than 1000 us
bodgergelydb826442020-01-03 05:37:24 +000045 ./runqslower -p 123 # trace pid 123
46 ./runqslower -t 123 # trace tid 123 (use for threads only)
zhenwei pi508d9692021-08-12 18:04:17 +080047 ./runqslower -P # also show previous task comm and TID
Ivan Babrou5c48a3f2018-05-08 17:24:19 -070048"""
49parser = argparse.ArgumentParser(
50 description="Trace high run queue latency",
51 formatter_class=argparse.RawDescriptionHelpFormatter,
52 epilog=examples)
Ivan Babrou5c48a3f2018-05-08 17:24:19 -070053parser.add_argument("min_us", nargs="?", default='10000',
Aditya Mandaleeka78b0f072020-06-04 13:23:14 -070054 help="minimum run queue latency to trace, in us (default 10000)")
Ivan Babrou5c48a3f2018-05-08 17:24:19 -070055parser.add_argument("--ebpf", action="store_true",
56 help=argparse.SUPPRESS)
bodgergelydb826442020-01-03 05:37:24 +000057
58thread_group = parser.add_mutually_exclusive_group()
59thread_group.add_argument("-p", "--pid", metavar="PID", dest="pid",
60 help="trace this PID only", type=int)
61thread_group.add_argument("-t", "--tid", metavar="TID", dest="tid",
62 help="trace this TID only", type=int)
zhenwei pi508d9692021-08-12 18:04:17 +080063thread_group.add_argument("-P", "--previous", action="store_true",
64 help="also show previous task name and TID")
Ivan Babrou5c48a3f2018-05-08 17:24:19 -070065args = parser.parse_args()
bodgergelydb826442020-01-03 05:37:24 +000066
Ivan Babrou5c48a3f2018-05-08 17:24:19 -070067min_us = int(args.min_us)
68debug = 0
69
70# define BPF program
71bpf_text = """
72#include <uapi/linux/ptrace.h>
73#include <linux/sched.h>
74#include <linux/nsproxy.h>
75#include <linux/pid_namespace.h>
76
77BPF_HASH(start, u32);
78
79struct rq;
80
81struct data_t {
82 u32 pid;
zhenwei pi508d9692021-08-12 18:04:17 +080083 u32 prev_pid;
Ivan Babrou5c48a3f2018-05-08 17:24:19 -070084 char task[TASK_COMM_LEN];
zhenwei pi508d9692021-08-12 18:04:17 +080085 char prev_task[TASK_COMM_LEN];
Ivan Babrou5c48a3f2018-05-08 17:24:19 -070086 u64 delta_us;
87};
88
89BPF_PERF_OUTPUT(events);
90
91// record enqueue timestamp
92static int trace_enqueue(u32 tgid, u32 pid)
93{
bodgergelydb826442020-01-03 05:37:24 +000094 if (FILTER_PID || FILTER_TGID || pid == 0)
Ivan Babrou5c48a3f2018-05-08 17:24:19 -070095 return 0;
96 u64 ts = bpf_ktime_get_ns();
97 start.update(&pid, &ts);
98 return 0;
99}
100"""
101
102bpf_text_kprobe = """
103int trace_wake_up_new_task(struct pt_regs *ctx, struct task_struct *p)
104{
105 return trace_enqueue(p->tgid, p->pid);
106}
107
108int trace_ttwu_do_wakeup(struct pt_regs *ctx, struct rq *rq, struct task_struct *p,
109 int wake_flags)
110{
111 return trace_enqueue(p->tgid, p->pid);
112}
113
114// calculate latency
115int trace_run(struct pt_regs *ctx, struct task_struct *prev)
116{
zhenwei pi508d9692021-08-12 18:04:17 +0800117 u32 pid, tgid, prev_pid;
Ivan Babrou5c48a3f2018-05-08 17:24:19 -0700118
119 // ivcsw: treat like an enqueue event and store timestamp
zhenwei pi508d9692021-08-12 18:04:17 +0800120 prev_pid = prev->pid;
Hengqi Chen08765a92021-10-31 23:20:10 +0800121 if (prev->STATE_FIELD == TASK_RUNNING) {
Ivan Babrou5c48a3f2018-05-08 17:24:19 -0700122 tgid = prev->tgid;
bodgergelydb826442020-01-03 05:37:24 +0000123 u64 ts = bpf_ktime_get_ns();
zhenwei pi508d9692021-08-12 18:04:17 +0800124 if (prev_pid != 0) {
bodgergelydb826442020-01-03 05:37:24 +0000125 if (!(FILTER_PID) && !(FILTER_TGID)) {
zhenwei pi508d9692021-08-12 18:04:17 +0800126 start.update(&prev_pid, &ts);
bodgergelydb826442020-01-03 05:37:24 +0000127 }
Ivan Babrou5c48a3f2018-05-08 17:24:19 -0700128 }
129 }
130
Ivan Babrou5c48a3f2018-05-08 17:24:19 -0700131 pid = bpf_get_current_pid_tgid();
132
133 u64 *tsp, delta_us;
134
135 // fetch timestamp and calculate delta
136 tsp = start.lookup(&pid);
137 if (tsp == 0) {
138 return 0; // missed enqueue
139 }
140 delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
141
142 if (FILTER_US)
143 return 0;
144
145 struct data_t data = {};
146 data.pid = pid;
zhenwei pi508d9692021-08-12 18:04:17 +0800147 data.prev_pid = prev_pid;
Ivan Babrou5c48a3f2018-05-08 17:24:19 -0700148 data.delta_us = delta_us;
149 bpf_get_current_comm(&data.task, sizeof(data.task));
zhenwei pi508d9692021-08-12 18:04:17 +0800150 bpf_probe_read_kernel_str(&data.prev_task, sizeof(data.prev_task), prev->comm);
Ivan Babrou5c48a3f2018-05-08 17:24:19 -0700151
152 // output
153 events.perf_submit(ctx, &data, sizeof(data));
154
155 start.delete(&pid);
156 return 0;
157}
158"""
159
160bpf_text_raw_tp = """
161RAW_TRACEPOINT_PROBE(sched_wakeup)
162{
163 // TP_PROTO(struct task_struct *p)
164 struct task_struct *p = (struct task_struct *)ctx->args[0];
Paul Chaignon8d78edd2018-06-29 07:47:44 +0200165 return trace_enqueue(p->tgid, p->pid);
Ivan Babrou5c48a3f2018-05-08 17:24:19 -0700166}
167
168RAW_TRACEPOINT_PROBE(sched_wakeup_new)
169{
170 // TP_PROTO(struct task_struct *p)
171 struct task_struct *p = (struct task_struct *)ctx->args[0];
172 u32 tgid, pid;
173
Sumanth Korikkar7f6066d2020-05-20 10:49:56 -0500174 bpf_probe_read_kernel(&tgid, sizeof(tgid), &p->tgid);
175 bpf_probe_read_kernel(&pid, sizeof(pid), &p->pid);
Ivan Babrou5c48a3f2018-05-08 17:24:19 -0700176 return trace_enqueue(tgid, pid);
177}
178
179RAW_TRACEPOINT_PROBE(sched_switch)
180{
181 // TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next)
182 struct task_struct *prev = (struct task_struct *)ctx->args[1];
183 struct task_struct *next= (struct task_struct *)ctx->args[2];
zhenwei pi508d9692021-08-12 18:04:17 +0800184 u32 tgid, pid, prev_pid;
Ivan Babrou5c48a3f2018-05-08 17:24:19 -0700185 long state;
186
187 // ivcsw: treat like an enqueue event and store timestamp
Hengqi Chen08765a92021-10-31 23:20:10 +0800188 bpf_probe_read_kernel(&state, sizeof(long), (const void *)&prev->STATE_FIELD);
zhenwei pi508d9692021-08-12 18:04:17 +0800189 bpf_probe_read_kernel(&prev_pid, sizeof(prev->pid), &prev->pid);
Ivan Babrou5c48a3f2018-05-08 17:24:19 -0700190 if (state == TASK_RUNNING) {
Sumanth Korikkar7f6066d2020-05-20 10:49:56 -0500191 bpf_probe_read_kernel(&tgid, sizeof(prev->tgid), &prev->tgid);
bodgergelydb826442020-01-03 05:37:24 +0000192 u64 ts = bpf_ktime_get_ns();
zhenwei pi508d9692021-08-12 18:04:17 +0800193 if (prev_pid != 0) {
bodgergelydb826442020-01-03 05:37:24 +0000194 if (!(FILTER_PID) && !(FILTER_TGID)) {
zhenwei pi508d9692021-08-12 18:04:17 +0800195 start.update(&prev_pid, &ts);
bodgergelydb826442020-01-03 05:37:24 +0000196 }
Ivan Babrou5c48a3f2018-05-08 17:24:19 -0700197 }
bodgergelydb826442020-01-03 05:37:24 +0000198
Ivan Babrou5c48a3f2018-05-08 17:24:19 -0700199 }
200
Sumanth Korikkar7f6066d2020-05-20 10:49:56 -0500201 bpf_probe_read_kernel(&pid, sizeof(next->pid), &next->pid);
Ivan Babrou5c48a3f2018-05-08 17:24:19 -0700202
203 u64 *tsp, delta_us;
204
205 // fetch timestamp and calculate delta
206 tsp = start.lookup(&pid);
207 if (tsp == 0) {
208 return 0; // missed enqueue
209 }
210 delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
211
212 if (FILTER_US)
213 return 0;
214
215 struct data_t data = {};
216 data.pid = pid;
zhenwei pi508d9692021-08-12 18:04:17 +0800217 data.prev_pid = prev_pid;
Ivan Babrou5c48a3f2018-05-08 17:24:19 -0700218 data.delta_us = delta_us;
Sumanth Korikkar7f6066d2020-05-20 10:49:56 -0500219 bpf_probe_read_kernel_str(&data.task, sizeof(data.task), next->comm);
zhenwei pi508d9692021-08-12 18:04:17 +0800220 bpf_probe_read_kernel_str(&data.prev_task, sizeof(data.prev_task), prev->comm);
Ivan Babrou5c48a3f2018-05-08 17:24:19 -0700221
222 // output
223 events.perf_submit(ctx, &data, sizeof(data));
224
225 start.delete(&pid);
226 return 0;
227}
228"""
229
230is_support_raw_tp = BPF.support_raw_tracepoint()
231if is_support_raw_tp:
232 bpf_text += bpf_text_raw_tp
233else:
234 bpf_text += bpf_text_kprobe
235
236# code substitutions
Hengqi Chen08765a92021-10-31 23:20:10 +0800237if BPF.kernel_struct_has_field(b'task_struct', b'__state') == 1:
238 bpf_text = bpf_text.replace('STATE_FIELD', '__state')
239else:
240 bpf_text = bpf_text.replace('STATE_FIELD', 'state')
Ivan Babrou5c48a3f2018-05-08 17:24:19 -0700241if min_us == 0:
242 bpf_text = bpf_text.replace('FILTER_US', '0')
243else:
244 bpf_text = bpf_text.replace('FILTER_US', 'delta_us <= %s' % str(min_us))
bodgergelydb826442020-01-03 05:37:24 +0000245
246if args.tid:
247 bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % args.tid)
Ivan Babrou5c48a3f2018-05-08 17:24:19 -0700248else:
249 bpf_text = bpf_text.replace('FILTER_PID', '0')
bodgergelydb826442020-01-03 05:37:24 +0000250
251if args.pid:
252 bpf_text = bpf_text.replace('FILTER_TGID', 'tgid != %s' % args.pid)
253else:
254 bpf_text = bpf_text.replace('FILTER_TGID', '0')
255
Ivan Babrou5c48a3f2018-05-08 17:24:19 -0700256if debug or args.ebpf:
257 print(bpf_text)
258 if args.ebpf:
259 exit()
260
Ivan Babrou5c48a3f2018-05-08 17:24:19 -0700261# process event
262def print_event(cpu, data, size):
Xiaozhou Liu51d62d32019-02-15 13:03:05 +0800263 event = b["events"].event(data)
zhenwei pi508d9692021-08-12 18:04:17 +0800264 if args.previous:
265 print("%-8s %-16s %-6s %14s %-16s %-6s" % (strftime("%H:%M:%S"), event.task, event.pid, event.delta_us, event.prev_task, event.prev_pid))
266 else:
267 print("%-8s %-16s %-6s %14s" % (strftime("%H:%M:%S"), event.task, event.pid, event.delta_us))
Ivan Babrou5c48a3f2018-05-08 17:24:19 -0700268
269# load BPF program
270b = BPF(text=bpf_text)
271if not is_support_raw_tp:
272 b.attach_kprobe(event="ttwu_do_wakeup", fn_name="trace_ttwu_do_wakeup")
273 b.attach_kprobe(event="wake_up_new_task", fn_name="trace_wake_up_new_task")
Guodong Xu00b72fd2021-03-13 02:23:47 +0000274 b.attach_kprobe(event_re="^finish_task_switch$|^finish_task_switch\.isra\.\d$",
275 fn_name="trace_run")
Ivan Babrou5c48a3f2018-05-08 17:24:19 -0700276
277print("Tracing run queue latency higher than %d us" % min_us)
zhenwei pi508d9692021-08-12 18:04:17 +0800278if args.previous:
279 print("%-8s %-16s %-6s %14s %-16s %-6s" % ("TIME", "COMM", "TID", "LAT(us)", "PREV COMM", "PREV TID"))
280else:
281 print("%-8s %-16s %-6s %14s" % ("TIME", "COMM", "TID", "LAT(us)"))
Ivan Babrou5c48a3f2018-05-08 17:24:19 -0700282
283# read events
284b["events"].open_perf_buffer(print_event, page_cnt=64)
285while 1:
Jerome Marchand51671272018-12-19 01:57:24 +0100286 try:
287 b.perf_buffer_poll()
288 except KeyboardInterrupt:
289 exit()