Alexey Ivanov | cc01a9c | 2019-01-16 09:50:46 -0800 | [diff] [blame] | 1 | #!/usr/bin/python |
Brendan Gregg | 3f3acd8 | 2016-12-21 15:34:09 -0800 | [diff] [blame] | 2 | # @lint-avoid-python-3-compatibility-imports |
| 3 | # |
| 4 | # cpuunclaimed Sample CPU run queues and calculate unclaimed idle CPU. |
| 5 | # For Linux, uses BCC, eBPF. |
| 6 | # |
| 7 | # This samples the length of the run queues and determines when there are idle |
| 8 | # CPUs, yet queued threads waiting their turn. Report the amount of idle |
| 9 | # (yet unclaimed by waiting threads) CPU as a system-wide percentage. |
| 10 | # |
| 11 | # This situation can happen for a number of reasons: |
| 12 | # |
| 13 | # - An application has been bound to some, but not all, CPUs, and has runnable |
| 14 | # threads that cannot migrate to other CPUs due to this configuration. |
| 15 | # - CPU affinity: an optimization that leaves threads on CPUs where the CPU |
| 16 | # caches are warm, even if this means short periods of waiting while other |
| 17 | # CPUs are idle. The wait period is tunable (see sysctl, kernel.sched*). |
| 18 | # - Scheduler bugs. |
| 19 | # |
| 20 | # An unclaimed idle of < 1% is likely to be CPU affinity, and not usually a |
| 21 | # cause for concern. By leaving the CPU idle, overall throughput of the system |
| 22 | # may be improved. This tool is best for identifying larger issues, > 2%, due |
| 23 | # to the coarseness of its 99 Hertz samples. |
| 24 | # |
| 25 | # This is an experimental tool that currently works by use of sampling to |
| 26 | # keep overheads low. Tool assumptions: |
| 27 | # |
| 28 | # - CPU samples consistently fire around the same offset. There will sometimes |
| 29 | # be a lag as a sample is delayed by higher-priority interrupts, but it is |
| 30 | # assumed the subsequent samples will catch up to the expected offsets (as |
| 31 | # is seen in practice). You can use -J to inspect sample offsets. Some |
| 32 | # systems can power down CPUs when idle, and when they wake up again they |
| 33 | # may begin firing at a skewed offset: this tool will detect the skew, print |
| 34 | # an error, and exit. |
| 35 | # - All CPUs are online (see ncpu). |
| 36 | # |
| 37 | # If this identifies unclaimed CPU, you can double check it by dumping raw |
| 38 | # samples (-j), as well as using other tracing tools to instrument scheduler |
| 39 | # events (although this latter approach has much higher overhead). |
| 40 | # |
| 41 | # This tool passes all sampled events to user space for post processing. |
| 42 | # I originally wrote this to do the calculations entirely in kernel context, |
| 43 | # and only pass a summary. That involves a number of challenges, and the |
| 44 | # overhead savings may not outweigh the caveats. You can see my WIP here: |
| 45 | # https://gist.github.com/brendangregg/731cf2ce54bf1f9a19d4ccd397625ad9 |
| 46 | # |
| 47 | # USAGE: cpuunclaimed [-h] [-j] [-J] [-T] [interval] [count] |
| 48 | # |
| 49 | # If you see "Lost 1881 samples" warnings, try increasing wakeup_hz. |
| 50 | # |
| 51 | # REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is |
| 52 | # a version of this tool that may work on Linux 4.6 - 4.8. |
| 53 | # |
| 54 | # Copyright 2016 Netflix, Inc. |
| 55 | # Licensed under the Apache License, Version 2.0 (the "License") |
| 56 | # |
| 57 | # 20-Dec-2016 Brendan Gregg Created this. |
| 58 | |
| 59 | from __future__ import print_function |
| 60 | from bcc import BPF, PerfType, PerfSWConfig |
| 61 | from time import sleep, strftime |
Brendan Gregg | 3f3acd8 | 2016-12-21 15:34:09 -0800 | [diff] [blame] | 62 | import argparse |
| 63 | import multiprocessing |
yonghong-song | 77f4f66 | 2019-01-24 12:48:25 -0800 | [diff] [blame] | 64 | from os import getpid, system, open, close, dup, unlink, O_WRONLY |
yonghong-song | 77f4f66 | 2019-01-24 12:48:25 -0800 | [diff] [blame] | 65 | from tempfile import NamedTemporaryFile |
Brendan Gregg | 3f3acd8 | 2016-12-21 15:34:09 -0800 | [diff] [blame] | 66 | |
# arguments
examples = """examples:
    ./cpuunclaimed            # sample and calculate unclaimed idle CPUs,
                              # output every 1 second (default)
    ./cpuunclaimed 5 10       # print 5 second summaries, 10 times
    ./cpuunclaimed -T 1       # 1s summaries and timestamps
    ./cpuunclaimed -j         # raw dump of all samples (verbose), CSV
"""
parser = argparse.ArgumentParser(
    description="Sample CPU run queues and calculate unclaimed idle CPU",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-j", "--csv", action="store_true",
    help="print sample summaries (verbose) as comma-separated values")
parser.add_argument("-J", "--fullcsv", action="store_true",
    help="print sample summaries with extra fields: CPU sample offsets")
parser.add_argument("-T", "--timestamp", action="store_true",
    help="include timestamp on output")
# -1 is a sentinel meaning "interval not supplied"; it is resolved to "1"
# later, and is also used to reject an explicit interval combined with -j/-J.
parser.add_argument("interval", nargs="?", default=-1,
    help="output interval, in seconds")
parser.add_argument("count", nargs="?", default=99999999,
    help="number of outputs")
# hidden option: dump the generated BPF program text and exit
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()
countdown = int(args.count)          # outputs remaining before exit
frequency = 99                       # sample rate, in Hertz (see header comment)
dobind = 1                           # NOTE(review): appears unused in this file
wakeup_hz = 10  # frequency to read buffers
wakeup_s = float(1) / wakeup_hz      # sleep duration between buffer reads
ncpu = multiprocessing.cpu_count()  # assume all are online
debug = 0                            # 0/1/2: increasing debug verbosity
| 99 | |
yonghong-song | 77f4f66 | 2019-01-24 12:48:25 -0800 | [diff] [blame] | 100 | # Linux 4.15 introduced a new field runnable_weight |
| 101 | # in linux_src:kernel/sched/sched.h as |
| 102 | # struct cfs_rq { |
| 103 | # struct load_weight load; |
| 104 | # unsigned long runnable_weight; |
| 105 | # unsigned int nr_running, h_nr_running; |
| 106 | # ...... |
| 107 | # } |
| 108 | # and this tool needs access to nr_running to get |
| 109 | # runqueue len information. |
| 110 | # |
| 111 | # The commit which introduces cfs_rq->runnable_weight |
| 112 | # field also introduces the field sched_entity->runnable_weight |
| 113 | # where sched_entity is defined in linux_src:include/linux/sched.h. |
| 114 | # |
| 115 | # To cope with pre-4.15 and 4.15/post-4.15 releases, |
| 116 | # we run a simple BPF program to detect whether |
| 117 | # field sched_entity->runnable_weight exists. The existence of |
| 118 | # this field should infer the existence of cfs_rq->runnable_weight. |
| 119 | # |
| 120 | # This will need maintenance as the relationship between these |
| 121 | # two fields may change in the future. |
| 122 | # |
def check_runnable_weight_field():
    """Return True if sched_entity has a runnable_weight member (Linux 4.15+).

    Compiles a one-line BPF probe that dereferences
    sched_entity->runnable_weight; a successful compile implies the running
    kernel also has cfs_rq->runnable_weight (introduced by the same commit),
    which changes the struct layout this tool must mirror.

    While probing, fd 2 (stderr) is temporarily redirected to a throwaway
    temp file so LLVM compile errors are not printed to the screen.
    """
    # Define the bpf program for checking purpose
    bpf_check_text = """
#include <linux/sched.h>
unsigned long dummy(struct sched_entity *entity)
{
    return entity->runnable_weight;
}
"""

    # Get a temporary file name (closed immediately; we only need the path)
    tmp_file = NamedTemporaryFile(delete=False)
    tmp_file.close()

    # Duplicate and close stderr (fd = 2)
    old_stderr = dup(2)
    close(2)

    # Open a new file: as the lowest free descriptor it becomes fd 2,
    # silently absorbing llvm error output during the trial compile.
    # Note: "open" here is os.open (file-descriptor API), not builtins.open.
    fd = open(tmp_file.name, O_WRONLY)
    try:
        try:
            BPF(text=bpf_check_text)
            success_compile = True
        except Exception:
            # Compile failure (or missing BPF support) means the field does
            # not exist; any error here answers the probe with "no".
            success_compile = False
    finally:
        # Always restore the original stderr and remove the temp file, even
        # if something above raised; otherwise stderr stays redirected.
        close(fd)
        dup(old_stderr)
        close(old_stderr)
        unlink(tmp_file.name)

    return success_compile
| 158 | |
| 159 | |
# process arguments
if args.fullcsv:
    args.csv = True
if args.interval != -1 and (args.fullcsv or args.csv):
    print("ERROR: cannot use interval with either -j or -J. Exiting.")
    exit()
if args.interval == -1:
    args.interval = "1"
# csv modes (-j/-J) poll on a fixed fast cadence of 0.2s so each sample group
# is emitted promptly; otherwise honor the user-supplied interval.
# BUG FIX: the original assigned interval = 0.2 for csv mode and then
# unconditionally overwrote it with float(args.interval) (always 1.0 in csv
# mode, since an explicit interval is rejected above), so the intended 0.2s
# cadence never took effect.
interval = 0.2 if args.csv else float(args.interval)
| 171 | |
# define BPF program: runs on every perf sample (task-clock, 99 Hz per CPU),
# reads the current CPU's cfs run queue length via the current task, and
# pushes a (timestamp, cpu, length) record to user space for post-processing.
# RUNNABLE_WEIGHT_FIELD is a placeholder substituted below depending on the
# running kernel's struct cfs_rq layout.
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <uapi/linux/bpf_perf_event.h>
#include <linux/sched.h>

struct data_t {
    u64 ts;
    u64 cpu;
    u64 len;
};

BPF_PERF_OUTPUT(events);

// Declare enough of cfs_rq to find nr_running, since we can't #import the
// header. This will need maintenance. It is from kernel/sched/sched.h:
struct cfs_rq_partial {
    struct load_weight load;
    RUNNABLE_WEIGHT_FIELD
    unsigned int nr_running, h_nr_running;
};

int do_perf_event(struct bpf_perf_event_data *ctx)
{
    int cpu = bpf_get_smp_processor_id();
    u64 now = bpf_ktime_get_ns();

    /*
     * Fetch the run queue length from task->se.cfs_rq->nr_running. This is an
     * unstable interface and may need maintenance. Perhaps a future version
     * of BPF will support task_rq(p) or something similar as a more reliable
     * interface.
     */
    unsigned int len = 0;
    struct task_struct *task = NULL;
    struct cfs_rq_partial *my_q = NULL;
    task = (struct task_struct *)bpf_get_current_task();
    my_q = (struct cfs_rq_partial *)task->se.cfs_rq;
    len = my_q->nr_running;

    struct data_t data = {.ts = now, .cpu = cpu, .len = len};
    events.perf_submit(ctx, &data, sizeof(data));

    return 0;
}
"""
| 218 | |
# Splice the runnable_weight member into cfs_rq_partial only when the running
# kernel (4.15+) actually has it, so the struct layout matches the kernel's.
runnable_weight_decl = (
    'unsigned long runnable_weight;' if check_runnable_weight_field() else '')
bpf_text = bpf_text.replace('RUNNABLE_WEIGHT_FIELD', runnable_weight_decl)
| 223 | |
# code substitutions: optionally dump the final program text for inspection
if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()

# initialize BPF & perf_events: sample every CPU at `frequency` Hz on the
# software task-clock event, invoking do_perf_event() per sample.
# TODO: check for HW counters first and use if more accurate
b = BPF(text=bpf_text)
b.attach_perf_event(
    ev_type=PerfType.SOFTWARE,
    ev_config=PerfSWConfig.TASK_CLOCK,
    fn_name="do_perf_event",
    sample_period=0,
    sample_freq=frequency)
| 236 | |
# print the CSV header row, or an interactive banner
if args.csv:
    columns = []
    if args.timestamp:
        columns.append("TIME")
    columns.append("TIMESTAMP_ns")
    columns.extend("CPU" + str(c) for c in range(ncpu))
    if args.fullcsv:
        columns.extend("OFFSET_ns_CPU" + str(c) for c in range(ncpu))
    print(",".join(columns))
else:
    print(("Sampling run queues... Output every %s seconds. " +
        "Hit Ctrl-C to end.") % args.interval)

samples = {}   # sample timestamp (ns) -> {'cpu':, 'len':}
group = {}     # timestamps belonging to the sample group being assembled
last = 0       # previous sample timestamp processed
| 253 | |
# process event: buffer one sampled record, keyed by its kernel timestamp,
# for the grouping pass in the main loop
def print_event(cpu, data, size):
    sample = b["events"].event(data)
    samples[sample.ts] = {'cpu': sample.cpu, 'len': sample.len}
| 260 | |
# main-loop state
if args.interval:
    exiting = 0
else:
    exiting = 1
slept = 0.0

# Choose the elapsed time from one sample group to the next that identifies a
# new sample group (a group being a set of samples from all CPUs). The
# earliest timestamp is compared in each group. This trigger is also used
# for sanity testing, if a group's samples exceed half this value.
trigger = int(0.8 * (1000000000 / frequency))
| 269 | |
# read events
b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
    # allow some buffering by calling sleep(), to reduce the context switch
    # rate and lower overhead.
    try:
        if not exiting:
            sleep(wakeup_s)
    except KeyboardInterrupt:
        exiting = 1
    b.perf_buffer_poll()
    slept += wakeup_s

    # only process and print once a full output interval has elapsed
    if slept < 0.999 * interval:   # floating point workaround
        continue
    slept = 0

    positive = 0  # number of samples where an idle CPU could have run work
    running = 0   # cumulative count of CPU samples with run queue len > 0
    idle = 0      # cumulative count of CPU samples with run queue len == 0
    if debug >= 2:
        print("DEBUG: begin samples loop, count %d" % len(samples))
    for e in sorted(samples):
        if debug >= 2:
            print("DEBUG: ts %d cpu %d len %d delta %d trig %d" % (e,
                  samples[e]['cpu'], samples[e]['len'], e - last,
                  e - last > trigger))

        # look for time jumps to identify a new sample group
        if e - last > trigger:

            # find the first (g_time) and last (g_max) group timestamps,
            # used for offsets and the skew sanity test below
            g_time = 0
            g_max = 0
            for ge in sorted(group):
                if g_time == 0:
                    g_time = ge
                g_max = ge

            # process previous sample group
            if args.csv:
                # emit one CSV row: run queue length per CPU and, with -J,
                # each CPU's sample offset (ns) from the group start
                lens = [0] * ncpu
                offs = [0] * ncpu
                for ge in sorted(group):
                    lens[samples[ge]['cpu']] = samples[ge]['len']
                    if args.fullcsv:
                        offs[samples[ge]['cpu']] = ge - g_time
                if g_time > 0:  # else first sample
                    if args.timestamp:
                        print("%-8s" % strftime("%H:%M:%S"), end=",")
                    print("%d" % g_time, end=",")
                    print(",".join(str(lens[c]) for c in range(ncpu)), end="")
                    if args.fullcsv:
                        print(",", end="")
                        print(",".join(str(offs[c]) for c in range(ncpu)))
                    else:
                        print()
            else:
                # calculate stats: CPUs running work, and threads queued
                # beyond the one on-CPU
                g_running = 0
                g_queued = 0
                for ge in group:
                    if samples[ge]['len'] > 0:
                        g_running += 1
                    if samples[ge]['len'] > 1:
                        g_queued += samples[ge]['len'] - 1
                g_idle = ncpu - g_running

                # calculate the number of threads that could have run as the
                # minimum of idle and queued
                if g_idle > 0 and g_queued > 0:
                    if g_queued > g_idle:
                        i = g_idle
                    else:
                        i = g_queued
                    positive += i
                running += g_running
                idle += g_idle

            # now sanity test, after -J output
            g_range = g_max - g_time
            if g_range > trigger / 2:
                # if a sample group exceeds half the interval, we can no
                # longer draw conclusions about some CPUs idle while others
                # have queued work. Error and exit. This can happen when
                # CPUs power down, then start again on different offsets.
                # TODO: Since this is a sampling tool, an error margin should
                # be anticipated, so an improvement may be to bump a counter
                # instead of exiting, and only exit if this counter shows
                # a skewed sample rate of over, say, 1%. Such an approach
                # would allow a small rate of outliers (sampling error),
                # and, we could tighten the trigger to be, say, trigger / 5.
                # In the case of a power down, if it's detectable, perhaps
                # the tool could reinitialize the timers (although exiting
                # is simple and works).
                print(("ERROR: CPU samples arrived at skewed offsets " +
                    "(CPUs may have powered down when idle), " +
                    "spanning %d ns (expected < %d ns). Debug with -J, " +
                    "and see the man page. As output may begin to be " +
                    "unreliable, exiting.") % (g_range, trigger / 2))
                exit()

            # these are done, remove
            for ge in sorted(group):
                del samples[ge]

            # begin next group
            group = {}
            last = e

        # stash this timestamp in a sample group dict
        group[e] = 1

    if not args.csv:
        total = running + idle
        unclaimed = util = 0

        if debug:
            print("DEBUG: hit %d running %d idle %d total %d buffered %d" % (
                positive, running, idle, total, len(samples)))

        if args.timestamp:
            print("%-8s " % strftime("%H:%M:%S"), end="")

        # output: overall %CPU utilization, and unclaimed idle (the fraction
        # of samples where queued threads could have run on idle CPUs)
        if total:
            unclaimed = float(positive) / total
            util = float(running) / total
        print("%%CPU %6.2f%%, unclaimed idle %0.2f%%" % (100 * util,
            100 * unclaimed))

    countdown -= 1
    if exiting or countdown == 0:
        exit()