blob: 75ee9324ee2edd88255a3cad13713bd647b9d5d0 [file] [log] [blame]
Alexey Ivanovcc01a9c2019-01-16 09:50:46 -08001#!/usr/bin/python
Brendan Gregg3f3acd82016-12-21 15:34:09 -08002# @lint-avoid-python-3-compatibility-imports
3#
4# cpuunclaimed Sample CPU run queues and calculate unclaimed idle CPU.
5# For Linux, uses BCC, eBPF.
6#
7# This samples the length of the run queues and determine when there are idle
8# CPUs, yet queued threads waiting their turn. Report the amount of idle
9# (yet unclaimed by waiting threads) CPU as a system-wide percentage.
10#
11# This situation can happen for a number of reasons:
12#
13# - An application has been bound to some, but not all, CPUs, and has runnable
14# threads that cannot migrate to other CPUs due to this configuration.
15# - CPU affinity: an optimization that leaves threads on CPUs where the CPU
16# caches are warm, even if this means short periods of waiting while other
17# CPUs are idle. The wait period is tunale (see sysctl, kernel.sched*).
18# - Scheduler bugs.
19#
20# An unclaimed idle of < 1% is likely to be CPU affinity, and not usually a
21# cause for concern. By leaving the CPU idle, overall throughput of the system
22# may be improved. This tool is best for identifying larger issues, > 2%, due
23# to the coarseness of its 99 Hertz samples.
24#
25# This is an experimental tool that currently works by use of sampling to
26# keep overheads low. Tool assumptions:
27#
28# - CPU samples consistently fire around the same offset. There will sometimes
29# be a lag as a sample is delayed by higher-priority interrupts, but it is
30# assumed the subsequent samples will catch up to the expected offsets (as
31# is seen in practice). You can use -J to inspect sample offsets. Some
32# systems can power down CPUs when idle, and when they wake up again they
33# may begin firing at a skewed offset: this tool will detect the skew, print
34# an error, and exit.
35# - All CPUs are online (see ncpu).
36#
37# If this identifies unclaimed CPU, you can double check it by dumping raw
38# samples (-j), as well as using other tracing tools to instrument scheduler
39# events (although this latter approach has much higher overhead).
40#
41# This tool passes all sampled events to user space for post processing.
# I originally wrote this to do the calculations entirely in kernel context,
43# and only pass a summary. That involves a number of challenges, and the
44# overhead savings may not outweigh the caveats. You can see my WIP here:
45# https://gist.github.com/brendangregg/731cf2ce54bf1f9a19d4ccd397625ad9
46#
47# USAGE: cpuunclaimed [-h] [-j] [-J] [-T] [interval] [count]
48#
49# If you see "Lost 1881 samples" warnings, try increasing wakeup_hz.
50#
51# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is
52# a version of this tool that may work on Linux 4.6 - 4.8.
53#
54# Copyright 2016 Netflix, Inc.
55# Licensed under the Apache License, Version 2.0 (the "License")
56#
57# 20-Dec-2016 Brendan Gregg Created this.
58
59from __future__ import print_function
60from bcc import BPF, PerfType, PerfSWConfig
61from time import sleep, strftime
62from ctypes import c_int
63import argparse
64import multiprocessing
yonghong-song77f4f662019-01-24 12:48:25 -080065from os import getpid, system, open, close, dup, unlink, O_WRONLY
Brendan Gregg3f3acd82016-12-21 15:34:09 -080066import ctypes as ct
yonghong-song77f4f662019-01-24 12:48:25 -080067from tempfile import NamedTemporaryFile
Brendan Gregg3f3acd82016-12-21 15:34:09 -080068
# arguments
examples = """examples:
    ./cpuunclaimed            # sample and calculate unclaimed idle CPUs,
                              # output every 1 second (default)
    ./cpuunclaimed 5 10       # print 5 second summaries, 10 times
    ./cpuunclaimed -T 1       # 1s summaries and timestamps
    ./cpuunclaimed -j         # raw dump of all samples (verbose), CSV
"""
parser = argparse.ArgumentParser(
    description="Sample CPU run queues and calculate unclaimed idle CPU",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-j", "--csv", action="store_true",
    help="print sample summaries (verbose) as comma-separated values")
parser.add_argument("-J", "--fullcsv", action="store_true",
    help="print sample summaries with extra fields: CPU sample offsets")
parser.add_argument("-T", "--timestamp", action="store_true",
    help="include timestamp on output")
parser.add_argument("interval", nargs="?", default=-1,
    help="output interval, in seconds")
parser.add_argument("count", nargs="?", default=99999999,
    help="number of outputs")
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()
countdown = int(args.count)         # number of interval outputs remaining
frequency = 99                      # sampling rate, Hertz (see header comment)
dobind = 1                          # NOTE(review): unused in this file as shown
wakeup_hz = 10                      # frequency to read buffers
wakeup_s = float(1) / wakeup_hz     # sleep duration between buffer reads
ncpu = multiprocessing.cpu_count()  # assume all are online
debug = 0                           # 0=off, 1=summary debug, 2=per-sample debug
101
yonghong-song77f4f662019-01-24 12:48:25 -0800102# Linux 4.15 introduced a new field runnable_weight
103# in linux_src:kernel/sched/sched.h as
104# struct cfs_rq {
105# struct load_weight load;
106# unsigned long runnable_weight;
107# unsigned int nr_running, h_nr_running;
108# ......
109# }
# and this tool needs to access nr_running to get
# run queue length information.
112#
113# The commit which introduces cfs_rq->runnable_weight
114# field also introduces the field sched_entity->runnable_weight
115# where sched_entity is defined in linux_src:include/linux/sched.h.
116#
117# To cope with pre-4.15 and 4.15/post-4.15 releases,
118# we run a simple BPF program to detect whether
119# field sched_entity->runnable_weight exists. The existence of
120# this field should infer the existence of cfs_rq->runnable_weight.
121#
122# This will need maintenance as the relationship between these
123# two fields may change in the future.
124#
def check_runnable_weight_field():
    """Return True if sched_entity has a runnable_weight field.

    Linux 4.15 added cfs_rq->runnable_weight together with
    sched_entity->runnable_weight. We probe for the sched_entity field by
    compiling a trivial BPF program that reads it; success implies the
    cfs_rq field also exists, which changes the struct layout that the
    main BPF program below must declare.
    """
    # Minimal program that compiles only when the field exists.
    bpf_check_text = """
#include <linux/sched.h>
unsigned long dummy(struct sched_entity *entity)
{
    return entity->runnable_weight;
}
"""

    # Get a temporary file name to absorb the compiler's error output
    tmp_file = NamedTemporaryFile(delete=False)
    tmp_file.close()

    # Duplicate and close stderr (fd = 2)
    old_stderr = dup(2)
    close(2)

    # Open a new file: open() returns the lowest free descriptor, which is
    # now 2, so llvm errors are redirected away from the screen
    fd = open(tmp_file.name, O_WRONLY)
    try:
        t = BPF(text=bpf_check_text)
        success_compile = True
    except Exception:
        # Compilation failed: the kernel headers lack runnable_weight
        success_compile = False
    finally:
        # Release fd 2, then the next dup restores the saved stderr onto it,
        # even if something unexpected went wrong above
        close(fd)
        dup(old_stderr)
        close(old_stderr)

    # remove the temporary file and return
    unlink(tmp_file.name)
    return success_compile
160
161
# process arguments
if args.fullcsv:
    # -J implies -j
    args.csv = True
if args.csv:
    interval = 0.2  # NOTE(review): dead assignment — unconditionally
                    # overwritten by interval = float(args.interval) below,
                    # so csv mode runs at the 1s default. Confirm intent.
if args.interval != -1 and (args.fullcsv or args.csv):
    print("ERROR: cannot use interval with either -j or -J. Exiting.")
    exit()
if args.interval == -1:
    # default output interval: 1 second
    args.interval = "1"
interval = float(args.interval)
173
# define BPF program: sample every CPU via a perf event, read the CFS run
# queue length of the current CPU, and emit (timestamp, cpu, len) events.
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <uapi/linux/bpf_perf_event.h>
#include <linux/sched.h>

struct data_t {
    u64 ts;
    u64 cpu;
    u64 len;
};

BPF_PERF_OUTPUT(events);

// Declare enough of cfs_rq to find nr_running, since we can't #import the
// header. This will need maintenance. It is from kernel/sched/sched.h:
struct cfs_rq_partial {
    struct load_weight load;
    RUNNABLE_WEIGHT_FIELD
    unsigned int nr_running, h_nr_running;
};

int do_perf_event(struct bpf_perf_event_data *ctx)
{
    int cpu = bpf_get_smp_processor_id();
    u64 now = bpf_ktime_get_ns();

    /*
     * Fetch the run queue length from task->se.cfs_rq->nr_running. This is an
     * unstable interface and may need maintenance. Perhaps a future version
     * of BPF will support task_rq(p) or something similar as a more reliable
     * interface.
     */
    unsigned int len = 0;
    struct task_struct *task = NULL;
    struct cfs_rq_partial *my_q = NULL;
    task = (struct task_struct *)bpf_get_current_task();
    my_q = (struct cfs_rq_partial *)task->se.cfs_rq;
    len = my_q->nr_running;

    struct data_t data = {.ts = now, .cpu = cpu, .len = len};
    events.perf_submit(ctx, &data, sizeof(data));

    return 0;
}
"""

# Insert the runnable_weight member when the running kernel (4.15+) has it,
# so cfs_rq_partial matches the kernel layout and nr_running is read from
# the correct offset.
if check_runnable_weight_field():
    bpf_text = bpf_text.replace('RUNNABLE_WEIGHT_FIELD', 'unsigned long runnable_weight;')
else:
    bpf_text = bpf_text.replace('RUNNABLE_WEIGHT_FIELD', '')
225
# code substitutions: dump the final BPF program when debugging or --ebpf
if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()

# initialize BPF & perf_events: fire do_perf_event on every CPU at
# `frequency` Hertz using the software task-clock event
b = BPF(text=bpf_text)
# TODO: check for HW counters first and use if more accurate
b.attach_perf_event(ev_type=PerfType.SOFTWARE,
    ev_config=PerfSWConfig.TASK_CLOCK, fn_name="do_perf_event",
    sample_period=0, sample_freq=frequency)
238
# print the CSV header row (one column per CPU), or the interactive preamble
if args.csv:
    if args.timestamp:
        print("TIME", end=",")
    print("TIMESTAMP_ns", end=",")
    print(",".join("CPU" + str(c) for c in range(ncpu)), end="")
    if args.fullcsv:
        # -J adds per-CPU sample offset columns
        print(",", end="")
        print(",".join("OFFSET_ns_CPU" + str(c) for c in range(ncpu)), end="")
    print()
else:
    print(("Sampling run queues... Output every %s seconds. " +
        "Hit Ctrl-C to end.") % args.interval)
class Data(ct.Structure):
    """ctypes mirror of struct data_t emitted by the BPF program."""
    _fields_ = [
        ("ts", ct.c_ulonglong),   # sample timestamp, nanoseconds
        ("cpu", ct.c_ulonglong),  # CPU id the sample was taken on
        ("len", ct.c_ulonglong),  # CFS run queue length at sample time
    ]
257
# state shared with the perf-buffer callback
samples = {}  # ts -> {'cpu': cpu id, 'len': run queue length}
group = {}    # timestamps belonging to the sample group being assembled
last = 0      # timestamp of the previously processed sample
261
# process event
def print_event(cpu, data, size):
    """Perf buffer callback: buffer one sample, keyed by its timestamp."""
    event = ct.cast(data, ct.POINTER(Data)).contents
    samples[event.ts] = {"cpu": event.cpu, "len": event.len}
268
# with no interval argument (csv modes), do a single pass then exit
exiting = 0 if args.interval else 1
slept = float(0)  # accumulated sleep time within the current interval

# Choose the elapsed time from one sample group to the next that identifies a
# new sample group (a group being a set of samples from all CPUs). The
# earliest timestamp is compared in each group. This trigger is also used
# for sanity testing, if a group's samples exceed half this value.
trigger = int(0.8 * (1000000000 / frequency))
277
# read events
b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
    # allow some buffering by calling sleep(), to reduce the context switch
    # rate and lower overhead.
    try:
        if not exiting:
            sleep(wakeup_s)
    except KeyboardInterrupt:
        exiting = 1
    b.perf_buffer_poll()
    slept += wakeup_s

    # keep buffering samples until a full interval has elapsed
    if slept < 0.999 * interval:  # floating point workaround
        continue
    slept = 0

    positive = 0  # number of samples where an idle CPU could have run work
    running = 0
    idle = 0
    if debug >= 2:
        print("DEBUG: begin samples loop, count %d" % len(samples))
    for e in sorted(samples):
        if debug >= 2:
            print("DEBUG: ts %d cpu %d len %d delta %d trig %d" % (e,
                samples[e]['cpu'], samples[e]['len'], e - last,
                e - last > trigger))

        # look for time jumps to identify a new sample group
        if e - last > trigger:

            # find first group timestamp, and sanity test
            g_time = 0
            g_max = 0
            for ge in sorted(group):
                if g_time == 0:
                    g_time = ge
                g_max = ge

            # process previous sample group
            if args.csv:
                # emit one CSV row: per-CPU run queue lengths (and, with -J,
                # per-CPU sample offsets from the group's first timestamp)
                lens = [0] * ncpu
                offs = [0] * ncpu
                for ge in sorted(group):
                    lens[samples[ge]['cpu']] = samples[ge]['len']
                    if args.fullcsv:
                        offs[samples[ge]['cpu']] = ge - g_time
                if g_time > 0:  # else first sample
                    if args.timestamp:
                        print("%-8s" % strftime("%H:%M:%S"), end=",")
                    print("%d" % g_time, end=",")
                    print(",".join(str(lens[c]) for c in range(ncpu)), end="")
                    if args.fullcsv:
                        print(",", end="")
                        print(",".join(str(offs[c]) for c in range(ncpu)))
                    else:
                        print()
            else:
                # calculate stats: CPUs seen running, and threads queued
                # beyond the one already on-CPU
                g_running = 0
                g_queued = 0
                for ge in group:
                    if samples[ge]['len'] > 0:
                        g_running += 1
                    if samples[ge]['len'] > 1:
                        g_queued += samples[ge]['len'] - 1
                g_idle = ncpu - g_running

                # calculate the number of threads that could have run as the
                # minimum of idle and queued
                if g_idle > 0 and g_queued > 0:
                    if g_queued > g_idle:
                        i = g_idle
                    else:
                        i = g_queued
                    positive += i
                running += g_running
                idle += g_idle

            # now sanity test, after -J output
            g_range = g_max - g_time
            if g_range > trigger / 2:
                # if a sample group exceeds half the interval, we can no
                # longer draw conclusions about some CPUs idle while others
                # have queued work. Error and exit. This can happen when
                # CPUs power down, then start again on different offsets.
                # TODO: Since this is a sampling tool, an error margin should
                # be anticipated, so an improvement may be to bump a counter
                # instead of exiting, and only exit if this counter shows
                # a skewed sample rate of over, say, 1%. Such an approach
                # would allow a small rate of outliers (sampling error),
                # and, we could tighten the trigger to be, say, trigger / 5.
                # In the case of a power down, if it's detectable, perhaps
                # the tool could reinitialize the timers (although exiting
                # is simple and works).
                print(("ERROR: CPU samples arrived at skewed offsets " +
                    "(CPUs may have powered down when idle), " +
                    "spanning %d ns (expected < %d ns). Debug with -J, " +
                    "and see the man page. As output may begin to be " +
                    "unreliable, exiting.") % (g_range, trigger / 2))
                exit()

            # these are done, remove
            for ge in sorted(group):
                del samples[ge]

            # begin next group
            group = {}
            last = e

        # stash this timestamp in a sample group dict
        group[e] = 1

    if not args.csv:
        total = running + idle
        unclaimed = util = 0

        if debug:
            print("DEBUG: hit %d running %d idle %d total %d buffered %d" % (
                positive, running, idle, total, len(samples)))

        if args.timestamp:
            print("%-8s " % strftime("%H:%M:%S"), end="")

        # output: overall %CPU, and the unclaimed-idle percentage
        if total:
            unclaimed = float(positive) / total
            util = float(running) / total
        print("%%CPU %6.2f%%, unclaimed idle %0.2f%%" % (100 * util,
            100 * unclaimed))

    countdown -= 1
    if exiting or countdown == 0:
        exit()