#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# cpuunclaimed   Sample CPU run queues and calculate unclaimed idle CPU.
#                For Linux, uses BCC, eBPF.
#
# This samples the length of the run queues and determines when there are idle
# CPUs, yet queued threads waiting their turn. It reports the amount of idle
# (yet unclaimed by waiting threads) CPU as a system-wide percentage.
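#
# For example (illustrative numbers, not from a real run): on an 8-CPU system,
# if one sample group finds 6 CPUs running, 2 CPUs idle, and 2 extra threads
# queued, then min(idle, queued) = 2 of the 8 CPU samples could have run work,
# and that group contributes 2/8 = 25% unclaimed idle.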
#
# This situation can happen for a number of reasons:
#
# - An application has been bound to some, but not all, CPUs, and has runnable
#   threads that cannot migrate to other CPUs due to this configuration.
# - CPU affinity: an optimization that leaves threads on CPUs where the CPU
#   caches are warm, even if this means short periods of waiting while other
#   CPUs are idle. The wait period is tunable (see sysctl, kernel.sched*).
# - Scheduler bugs.
#
# An unclaimed idle of < 1% is likely to be CPU affinity, and not usually a
# cause for concern. By leaving the CPU idle, overall throughput of the system
# may be improved. This tool is best for identifying larger issues, > 2%, due
# to the coarseness of its 99 Hertz samples.
#
# This is an experimental tool that currently works by use of sampling to
# keep overheads low. Tool assumptions:
#
# - CPU samples consistently fire around the same offset. There will sometimes
#   be a lag as a sample is delayed by higher-priority interrupts, but it is
#   assumed the subsequent samples will catch up to the expected offsets (as
#   is seen in practice). You can use -J to inspect sample offsets. Some
#   systems can power down CPUs when idle, and when they wake up again they
#   may begin firing at a skewed offset: this tool will detect the skew, print
#   an error, and exit.
# - All CPUs are online (see ncpu).
#
# If this identifies unclaimed CPU, you can double check it by dumping raw
# samples (-j), as well as using other tracing tools to instrument scheduler
# events (although this latter approach has much higher overhead).
#
# This tool passes all sampled events to user space for post processing.
# I originally wrote this to do the calculations entirely in kernel context,
# and only pass a summary. That involves a number of challenges, and the
# overhead savings may not outweigh the caveats. You can see my WIP here:
# https://gist.github.com/brendangregg/731cf2ce54bf1f9a19d4ccd397625ad9
#
# USAGE: cpuunclaimed [-h] [-j] [-J] [-T] [interval] [count]
#
# If you see "Lost 1881 samples" warnings, try increasing wakeup_hz.
#
# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is
# a version of this tool that may work on Linux 4.6 - 4.8.
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 20-Dec-2016   Brendan Gregg   Created this.

from __future__ import print_function
from bcc import BPF, PerfType, PerfSWConfig
from time import sleep, strftime
from ctypes import c_int
import argparse
import multiprocessing
from os import getpid, system
import ctypes as ct

# arguments
examples = """examples:
    ./cpuunclaimed            # sample and calculate unclaimed idle CPUs,
                              # output every 1 second (default)
    ./cpuunclaimed 5 10       # print 5 second summaries, 10 times
    ./cpuunclaimed -T 1       # 1s summaries and timestamps
    ./cpuunclaimed -j         # raw dump of all samples (verbose), CSV
"""
parser = argparse.ArgumentParser(
    description="Sample CPU run queues and calculate unclaimed idle CPU",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-j", "--csv", action="store_true",
    help="print sample summaries (verbose) as comma-separated values")
parser.add_argument("-J", "--fullcsv", action="store_true",
    help="print sample summaries with extra fields: CPU sample offsets")
parser.add_argument("-T", "--timestamp", action="store_true",
    help="include timestamp on output")
parser.add_argument("interval", nargs="?", default=-1,
    help="output interval, in seconds")
parser.add_argument("count", nargs="?", default=99999999,
    help="number of outputs")
args = parser.parse_args()
countdown = int(args.count)
frequency = 99
dobind = 1
wakeup_hz = 10                      # frequency to read buffers
wakeup_s = float(1) / wakeup_hz
ncpu = multiprocessing.cpu_count()  # assume all are online
debug = 0

# process arguments
if args.fullcsv:
    args.csv = True
if args.interval != -1 and (args.fullcsv or args.csv):
    print("ERROR: cannot use interval with either -j or -J. Exiting.")
    exit()
if args.interval == -1:
    args.interval = "1"
interval = float(args.interval)
if args.csv:
    interval = 0.2

# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <uapi/linux/bpf_perf_event.h>
#include <linux/sched.h>

struct data_t {
    u64 ts;
    u64 cpu;
    u64 len;
};

BPF_PERF_OUTPUT(events);

// Declare enough of cfs_rq to find nr_running, since we can't #import the
// header. This will need maintenance. It is from kernel/sched/sched.h:
struct cfs_rq_partial {
    struct load_weight load;
    unsigned int nr_running, h_nr_running;
};

int do_perf_event(struct bpf_perf_event_data *ctx)
{
    int cpu = bpf_get_smp_processor_id();
    u64 now = bpf_ktime_get_ns();

    /*
     * Fetch the run queue length from task->se.cfs_rq->nr_running. This is an
     * unstable interface and may need maintenance. Perhaps a future version
     * of BPF will support task_rq(p) or something similar as a more reliable
     * interface.
     */
    unsigned int len = 0;
    struct task_struct *task = NULL;
    struct cfs_rq_partial *my_q = NULL;
    task = (struct task_struct *)bpf_get_current_task();
    bpf_probe_read(&my_q, sizeof(my_q), &task->se.cfs_rq);
    bpf_probe_read(&len, sizeof(len), &my_q->nr_running);

    struct data_t data = {.ts = now, .cpu = cpu, .len = len};
    events.perf_submit(ctx, &data, sizeof(data));

    return 0;
}
"""

# code substitutions
if debug:
    print(bpf_text)

# initialize BPF & perf_events
b = BPF(text=bpf_text)
# TODO: check for HW counters first and use if more accurate
b.attach_perf_event(ev_type=PerfType.SOFTWARE,
    ev_config=PerfSWConfig.TASK_CLOCK, fn_name="do_perf_event",
    sample_period=0, sample_freq=frequency)
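# (Note: with sample_period=0 and sample_freq=frequency, this asks perf to
# fire do_perf_event() about 99 times a second on each CPU, which is the
# 99 Hertz sampling described in the header comment.)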

if args.csv:
    if args.timestamp:
        print("TIME", end=",")
    print("TIMESTAMP_ns", end=",")
    print(",".join("CPU" + str(c) for c in xrange(ncpu)), end="")
    if args.fullcsv:
        print(",", end="")
        print(",".join("OFFSET_ns_CPU" + str(c) for c in xrange(ncpu)), end="")
    print()
else:
    print(("Sampling run queues... Output every %s seconds. " +
        "Hit Ctrl-C to end.") % args.interval)

# event data, as read from the perf buffer (mirrors struct data_t above)
class Data(ct.Structure):
    _fields_ = [
        ("ts", ct.c_ulonglong),
        ("cpu", ct.c_ulonglong),
        ("len", ct.c_ulonglong)
    ]

samples = {}
group = {}
last = 0

# process event
def print_event(cpu, data, size):
    event = ct.cast(data, ct.POINTER(Data)).contents
    samples[event.ts] = {}
    samples[event.ts]['cpu'] = event.cpu
    samples[event.ts]['len'] = event.len

exiting = 0 if args.interval else 1
slept = float(0)

# Choose the elapsed time from one sample group to the next that identifies a
# new sample group (a group being a set of samples from all CPUs). The
# earliest timestamp is compared in each group. This trigger is also used
# for a sanity test: if the samples within a group span more than half this
# value, the tool prints an error and exits.
trigger = int(0.8 * (1000000000 / frequency))
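# (With frequency = 99, the expected gap between samples on a CPU is about
# 1e9 / 99 ns, or 10.1 ms, so trigger is about 8.08 ms and the sanity limit
# of trigger / 2 is about 4.04 ms.)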

# read events
b["events"].open_perf_buffer(print_event)
while 1:
    # allow some buffering by calling sleep(), to reduce the context switch
    # rate and lower overhead.
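    # (With the defaults this wakes every 1/wakeup_hz = 100 ms and drains
    # roughly frequency * ncpu / wakeup_hz, or about 10 * ncpu, samples per
    # poll; "Lost samples" warnings suggest the buffers are filling faster
    # than that, hence the wakeup_hz hint in the header comment.)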
    try:
        if not exiting:
            sleep(wakeup_s)
    except KeyboardInterrupt:
        exiting = 1
    b.kprobe_poll()
    slept += wakeup_s

    if slept < 0.999 * interval:   # floating point workaround
        continue
    slept = 0

    positive = 0  # number of samples where an idle CPU could have run work
    running = 0
    idle = 0
    if debug >= 2:
        print("DEBUG: begin samples loop, count %d" % len(samples))
    for e in sorted(samples):
        if debug >= 2:
            print("DEBUG: ts %d cpu %d len %d delta %d trig %d" % (e,
                samples[e]['cpu'], samples[e]['len'], e - last,
                e - last > trigger))

        # look for time jumps to identify a new sample group
        if e - last > trigger:

            # find the first and last group timestamps (the range is sanity
            # tested later)
            g_time = 0
            g_max = 0
            for ge in sorted(group):
                if g_time == 0:
                    g_time = ge
                g_max = ge

            # process previous sample group
            if args.csv:
                lens = [0] * ncpu
                offs = [0] * ncpu
                for ge in sorted(group):
                    lens[samples[ge]['cpu']] = samples[ge]['len']
                    if args.fullcsv:
                        offs[samples[ge]['cpu']] = ge - g_time
                if g_time > 0:      # else first sample
                    if args.timestamp:
                        print("%-8s" % strftime("%H:%M:%S"), end=",")
                    print("%d" % g_time, end=",")
                    print(",".join(str(lens[c]) for c in xrange(ncpu)), end="")
                    if args.fullcsv:
                        print(",", end="")
                        print(",".join(str(offs[c]) for c in xrange(ncpu)))
                    else:
                        print()
            else:
                # calculate stats
                g_running = 0
                g_queued = 0
                for ge in group:
                    if samples[ge]['len'] > 0:
                        g_running += 1
                    if samples[ge]['len'] > 1:
                        g_queued += samples[ge]['len'] - 1
                g_idle = ncpu - g_running

                # calculate the number of threads that could have run as the
                # minimum of idle and queued
                if g_idle > 0 and g_queued > 0:
                    if g_queued > g_idle:
                        i = g_idle
                    else:
                        i = g_queued
                    positive += i
                running += g_running
                idle += g_idle

            # now sanity test, after -J output
            g_range = g_max - g_time
            if g_range > trigger / 2:
                # If the samples within a group span more than half the
                # trigger, we can no longer draw conclusions about some CPUs
                # being idle while others have queued work. Error and exit.
                # This can happen when CPUs power down, then start again on
                # different offsets.
                # TODO: Since this is a sampling tool, an error margin should
                # be anticipated, so an improvement may be to bump a counter
                # instead of exiting, and only exit if this counter shows
                # a skewed sample rate of over, say, 1%. Such an approach
                # would allow a small rate of outliers (sampling error),
                # and we could tighten the trigger to be, say, trigger / 5.
                # In the case of a power down, if it's detectable, perhaps
                # the tool could reinitialize the timers (although exiting
                # is simple and works).
                print(("ERROR: CPU samples arrived at skewed offsets " +
                    "(CPUs may have powered down when idle), " +
                    "spanning %d ns (expected < %d ns). Debug with -J, " +
                    "and see the man page. As output may begin to be " +
                    "unreliable, exiting.") % (g_range, trigger / 2))
                exit()

            # these are done, remove
            for ge in sorted(group):
                del samples[ge]

            # begin next group
            group = {}
            last = e

        # stash this timestamp in a sample group dict
        group[e] = 1

    if not args.csv:
        total = running + idle
        unclaimed = util = 0

        if debug:
            print("DEBUG: hit %d running %d idle %d total %d buffered %d" % (
                positive, running, idle, total, len(samples)))

        if args.timestamp:
            print("%-8s " % strftime("%H:%M:%S"), end="")

        # output
        if total:
            unclaimed = float(positive) / total
            util = float(running) / total
        print("%%CPU %6.2f%%, unclaimed idle %0.2f%%" % (100 * util,
            100 * unclaimed))
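        # (Continuing the header example: one 8-CPU group with 6 running,
        # 2 idle, and 2 queued threads would print %CPU 75.00%, unclaimed
        # idle 25.00%.)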

    countdown -= 1
    if exiting or countdown == 0:
        exit()