#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# runqslower    Trace long process scheduling delays.
#               For Linux, uses BCC, eBPF.
#
# This script traces high scheduling delays between a task becoming
# ready to run and it actually running on CPU.
#
# USAGE: runqslower [-p PID] [-t TID] [min_us]
#
# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support).
#
# This measures the time a task spends waiting on a run queue for a turn
# on-CPU, and shows this time as individual events. This time should be small,
# but a task may need to wait its turn due to CPU load.
#
# This measures two types of run queue latency:
# 1. The time from a task being enqueued on a run queue to its context switch
#    and execution. This traces ttwu_do_wakeup(), wake_up_new_task() ->
#    finish_task_switch() with either raw tracepoints (if supported) or kprobes
#    and instruments the run queue latency after a voluntary context switch.
# 2. The time from when a task was involuntarily context switched and still
#    in the runnable state, to when it next executed. This is instrumented
#    from finish_task_switch() alone.
#
# Copyright 2016 Cloudflare, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 02-May-2018   Ivan Babrou   Created this.
# 18-Nov-2019   Gergely Bod   BUG fix: Use bpf_probe_read_kernel_str() to extract the
#                             process name from 'task_struct* next' in raw tp code.
#                             bpf_get_current_comm() operates on the current task
#                             which might already be different from 'next'.

from __future__ import print_function
from bcc import BPF
import argparse
from time import strftime

# arguments
examples = """examples:
    ./runqslower         # trace run queue latency higher than 10000 us (default)
    ./runqslower 1000    # trace run queue latency higher than 1000 us
    ./runqslower -p 123  # trace pid 123
    ./runqslower -t 123  # trace tid 123 (use for threads only)
"""
parser = argparse.ArgumentParser(
    description="Trace high run queue latency",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("min_us", nargs="?", default='10000',
    help="minimum run queue latency to trace, in us (default 10000)")
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)

thread_group = parser.add_mutually_exclusive_group()
thread_group.add_argument("-p", "--pid", metavar="PID", dest="pid",
    help="trace this PID only", type=int)
thread_group.add_argument("-t", "--tid", metavar="TID", dest="tid",
    help="trace this TID only", type=int)
args = parser.parse_args()

min_us = int(args.min_us)
debug = 0

# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>

BPF_HASH(start, u32);
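// start maps a thread id to its enqueue timestamp (ns); entries are written
// on wakeup/preemption and consumed when the thread next runs on-CPU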

struct rq;

struct data_t {
    u32 pid;
    char task[TASK_COMM_LEN];
    u64 delta_us;
};

BPF_PERF_OUTPUT(events);

// record enqueue timestamp
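// FILTER_PID, FILTER_TGID and FILTER_US are placeholder tokens, rewritten
// into real predicates (or '0') by the Python code below before compiling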
static int trace_enqueue(u32 tgid, u32 pid)
{
    if (FILTER_PID || FILTER_TGID || pid == 0)
        return 0;
    u64 ts = bpf_ktime_get_ns();
    start.update(&pid, &ts);
    return 0;
}
"""

bpf_text_kprobe = """
int trace_wake_up_new_task(struct pt_regs *ctx, struct task_struct *p)
{
    return trace_enqueue(p->tgid, p->pid);
}

int trace_ttwu_do_wakeup(struct pt_regs *ctx, struct rq *rq, struct task_struct *p,
    int wake_flags)
{
    return trace_enqueue(p->tgid, p->pid);
}

// calculate latency
int trace_run(struct pt_regs *ctx, struct task_struct *prev)
{
    u32 pid, tgid;

    // ivcsw: treat like an enqueue event and store timestamp
    if (prev->state == TASK_RUNNING) {
        tgid = prev->tgid;
        pid = prev->pid;
        u64 ts = bpf_ktime_get_ns();
        if (pid != 0) {
            if (!(FILTER_PID) && !(FILTER_TGID)) {
                start.update(&pid, &ts);
            }
        }
    }

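    // bpf_get_current_pid_tgid() returns tgid << 32 | pid; assigning to a
    // u32 keeps just the thread id of the task now on-CPU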
    pid = bpf_get_current_pid_tgid();

    u64 *tsp, delta_us;

    // fetch timestamp and calculate delta
    tsp = start.lookup(&pid);
    if (tsp == 0) {
        return 0;   // missed enqueue
    }
    delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;

    if (FILTER_US)
        return 0;

    struct data_t data = {};
    data.pid = pid;
    data.delta_us = delta_us;
    bpf_get_current_comm(&data.task, sizeof(data.task));

    // output
    events.perf_submit(ctx, &data, sizeof(data));

    start.delete(&pid);
    return 0;
}
"""

bpf_text_raw_tp = """
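// Raw tracepoint handlers receive the tracepoint's arguments as an untyped
// array (ctx->args[]); the TP_PROTO comment in each probe documents the types.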
RAW_TRACEPOINT_PROBE(sched_wakeup)
{
    // TP_PROTO(struct task_struct *p)
    struct task_struct *p = (struct task_struct *)ctx->args[0];
    return trace_enqueue(p->tgid, p->pid);
}

RAW_TRACEPOINT_PROBE(sched_wakeup_new)
{
    // TP_PROTO(struct task_struct *p)
    struct task_struct *p = (struct task_struct *)ctx->args[0];
    u32 tgid, pid;

    bpf_probe_read_kernel(&tgid, sizeof(tgid), &p->tgid);
    bpf_probe_read_kernel(&pid, sizeof(pid), &p->pid);
    return trace_enqueue(tgid, pid);
}

RAW_TRACEPOINT_PROBE(sched_switch)
{
    // TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next)
    struct task_struct *prev = (struct task_struct *)ctx->args[1];
    struct task_struct *next = (struct task_struct *)ctx->args[2];
    u32 tgid, pid;
    long state;

    // ivcsw: treat like an enqueue event and store timestamp
    bpf_probe_read_kernel(&state, sizeof(long), (const void *)&prev->state);
    if (state == TASK_RUNNING) {
        bpf_probe_read_kernel(&tgid, sizeof(prev->tgid), &prev->tgid);
        bpf_probe_read_kernel(&pid, sizeof(prev->pid), &prev->pid);
        u64 ts = bpf_ktime_get_ns();
        if (pid != 0) {
            if (!(FILTER_PID) && !(FILTER_TGID)) {
                start.update(&pid, &ts);
            }
        }
    }

    bpf_probe_read_kernel(&pid, sizeof(next->pid), &next->pid);

    u64 *tsp, delta_us;

    // fetch timestamp and calculate delta
    tsp = start.lookup(&pid);
    if (tsp == 0) {
        return 0;   // missed enqueue
    }
    delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;

    if (FILTER_US)
        return 0;

    struct data_t data = {};
    data.pid = pid;
    data.delta_us = delta_us;
    bpf_probe_read_kernel_str(&data.task, sizeof(data.task), next->comm);

    // output
    events.perf_submit(ctx, &data, sizeof(data));

    start.delete(&pid);
    return 0;
}
"""

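# raw tracepoints (Linux 4.17+) attach with lower overhead than kprobes,
# so prefer them whenever the kernel supports them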
is_support_raw_tp = BPF.support_raw_tracepoint()
if is_support_raw_tp:
    bpf_text += bpf_text_raw_tp
else:
    bpf_text += bpf_text_kprobe

# code substitutions
if min_us == 0:
    bpf_text = bpf_text.replace('FILTER_US', '0')
else:
    bpf_text = bpf_text.replace('FILTER_US', 'delta_us <= %s' % str(min_us))

if args.tid:
    bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % args.tid)
else:
    bpf_text = bpf_text.replace('FILTER_PID', '0')

if args.pid:
    bpf_text = bpf_text.replace('FILTER_TGID', 'tgid != %s' % args.pid)
else:
    bpf_text = bpf_text.replace('FILTER_TGID', '0')
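# e.g. "runqslower -p 123 1000" turns the guard in trace_enqueue() into:
#   if (0 || tgid != 123 || pid == 0) return 0;
# and the latency filter into: if (delta_us <= 1000) return 0;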

if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()

# process event
def print_event(cpu, data, size):
    event = b["events"].event(data)
    print("%-8s %-16s %-6s %14s" % (strftime("%H:%M:%S"), event.task, event.pid, event.delta_us))

# load BPF program
b = BPF(text=bpf_text)
if not is_support_raw_tp:
    b.attach_kprobe(event="ttwu_do_wakeup", fn_name="trace_ttwu_do_wakeup")
    b.attach_kprobe(event="wake_up_new_task", fn_name="trace_wake_up_new_task")
    b.attach_kprobe(event="finish_task_switch", fn_name="trace_run")

print("Tracing run queue latency higher than %d us" % min_us)
print("%-8s %-16s %-6s %14s" % ("TIME", "COMM", "TID", "LAT(us)"))

# read events
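# page_cnt=64 sizes each per-CPU perf ring buffer (in pages) to reduce
# lost events on busy systems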
264b["events"].open_perf_buffer(print_event, page_cnt=64)
265while 1:
Jerome Marchand51671272018-12-19 01:57:24 +0100266 try:
267 b.perf_buffer_poll()
268 except KeyboardInterrupt:
269 exit()