add cpuunclaimed
diff --git a/tools/cpuunclaimed.py b/tools/cpuunclaimed.py
new file mode 100755
index 0000000..9624b50
--- /dev/null
+++ b/tools/cpuunclaimed.py
@@ -0,0 +1,340 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# cpuunclaimed Sample CPU run queues and calculate unclaimed idle CPU.
+# For Linux, uses BCC, eBPF.
+#
+# This samples the length of the run queues and determine when there are idle
+# CPUs, yet queued threads waiting their turn. Report the amount of idle
+# (yet unclaimed by waiting threads) CPU as a system-wide percentage.
+#
+# This situation can happen for a number of reasons:
+#
+# - An application has been bound to some, but not all, CPUs, and has runnable
+# threads that cannot migrate to other CPUs due to this configuration.
+# - CPU affinity: an optimization that leaves threads on CPUs where the CPU
+# caches are warm, even if this means short periods of waiting while other
+# CPUs are idle. The wait period is tunale (see sysctl, kernel.sched*).
+# - Scheduler bugs.
+#
+# An unclaimed idle of < 1% is likely to be CPU affinity, and not usually a
+# cause for concern. By leaving the CPU idle, overall throughput of the system
+# may be improved. This tool is best for identifying larger issues, > 2%, due
+# to the coarseness of its 99 Hertz samples.
+#
+# This is an experimental tool that currently works by use of sampling to
+# keep overheads low. Tool assumptions:
+#
+# - CPU samples consistently fire around the same offset. There will sometimes
+# be a lag as a sample is delayed by higher-priority interrupts, but it is
+# assumed the subsequent samples will catch up to the expected offsets (as
+# is seen in practice). You can use -J to inspect sample offsets. Some
+# systems can power down CPUs when idle, and when they wake up again they
+# may begin firing at a skewed offset: this tool will detect the skew, print
+# an error, and exit.
+# - All CPUs are online (see ncpu).
+#
+# If this identifies unclaimed CPU, you can double check it by dumping raw
+# samples (-j), as well as using other tracing tools to instrument scheduler
+# events (although this latter approach has much higher overhead).
+#
+# This tool passes all sampled events to user space for post processing.
+# I originally wrote this to do the calculations entirerly in kernel context,
+# and only pass a summary. That involves a number of challenges, and the
+# overhead savings may not outweigh the caveats. You can see my WIP here:
+# https://gist.github.com/brendangregg/731cf2ce54bf1f9a19d4ccd397625ad9
+#
+# USAGE: cpuunclaimed [-h] [-j] [-J] [-T] [interval] [count]
+#
+# If you see "Lost 1881 samples" warnings, try increasing wakeup_hz.
+#
+# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is
+# a version of this tool that may work on Linux 4.6 - 4.8.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 20-Dec-2016 Brendan Gregg Created this.
+
+from __future__ import print_function
+from bcc import BPF, PerfType, PerfSWConfig
+from time import sleep, strftime
+from ctypes import c_int
+import argparse
+import multiprocessing
+from os import getpid, system
+import ctypes as ct
+
+# arguments
+examples = """examples:
+ ./cpuunclaimed # sample and calculate unclaimed idle CPUs,
+ # output every 1 second (default)
+ ./cpuunclaimed 5 10 # print 5 second summaries, 10 times
+ ./cpuunclaimed -T 1 # 1s summaries and timestamps
+ ./cpuunclaimed -j # raw dump of all samples (verbose), CSV
+"""
+parser = argparse.ArgumentParser(
+ description="Sample CPU run queues and calculate unclaimed idle CPU",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog=examples)
+parser.add_argument("-j", "--csv", action="store_true",
+ help="print sample summaries (verbose) as comma-separated values")
+parser.add_argument("-J", "--fullcsv", action="store_true",
+ help="print sample summaries with extra fields: CPU sample offsets")
+parser.add_argument("-T", "--timestamp", action="store_true",
+ help="include timestamp on output")
+parser.add_argument("interval", nargs="?", default=-1,
+ help="output interval, in seconds")
+parser.add_argument("count", nargs="?", default=99999999,
+ help="number of outputs")
+args = parser.parse_args()
+countdown = int(args.count)
+frequency = 99
+dobind = 1
+wakeup_hz = 10 # frequency to read buffers
+wakeup_s = float(1) / wakeup_hz
+ncpu = multiprocessing.cpu_count() # assume all are online
+debug = 0
+
+# process arguments
+if args.fullcsv:
+ args.csv = True
+if args.csv:
+ interval = 0.2
+if args.interval != -1 and (args.fullcsv or args.csv):
+ print("ERROR: cannot use interval with either -j or -J. Exiting.")
+ exit()
+if args.interval == -1:
+ args.interval = "1"
+interval = float(args.interval)
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <uapi/linux/bpf_perf_event.h>
+#include <linux/sched.h>
+
+struct data_t {
+ u64 ts;
+ u64 cpu;
+ u64 len;
+};
+
+BPF_PERF_OUTPUT(events);
+
+// Declare enough of cfs_rq to find nr_running, since we can't #import the
+// header. This will need maintenance. It is from kernel/sched/sched.h:
+struct cfs_rq_partial {
+ struct load_weight load;
+ unsigned int nr_running, h_nr_running;
+};
+
+int do_perf_event(struct bpf_perf_event_data *ctx)
+{
+ int cpu = bpf_get_smp_processor_id();
+ u64 now = bpf_ktime_get_ns();
+
+ /*
+ * Fetch the run queue length from task->se.cfs_rq->nr_running. This is an
+ * unstable interface and may need maintenance. Perhaps a future version
+ * of BPF will support task_rq(p) or something similar as a more reliable
+ * interface.
+ */
+ unsigned int len = 0;
+ struct task_struct *task = NULL;
+ struct cfs_rq_partial *my_q = NULL;
+ task = (struct task_struct *)bpf_get_current_task();
+ bpf_probe_read(&my_q, sizeof(my_q), &task->se.cfs_rq);
+ bpf_probe_read(&len, sizeof(len), &my_q->nr_running);
+
+ struct data_t data = {.ts = now, .cpu = cpu, .len = len};
+ events.perf_submit(ctx, &data, sizeof(data));
+
+ return 0;
+}
+"""
+
+# code substitutions
+if debug:
+ print(bpf_text)
+
+# initialize BPF & perf_events
+b = BPF(text=bpf_text)
+# TODO: check for HW counters first and use if more accurate
+b.attach_perf_event(ev_type=PerfType.SOFTWARE,
+ ev_config=PerfSWConfig.TASK_CLOCK, fn_name="do_perf_event",
+ sample_period=0, sample_freq=frequency)
+
+if args.csv:
+ if args.timestamp:
+ print("TIME", end=",")
+ print("TIMESTAMP_ns", end=",")
+ print(",".join("CPU" + str(c) for c in xrange(ncpu)), end="")
+ if args.fullcsv:
+ print(",", end="")
+ print(",".join("OFFSET_ns_CPU" + str(c) for c in xrange(ncpu)), end="")
+ print()
+else:
+ print(("Sampling run queues... Output every %s seconds. " +
+ "Hit Ctrl-C to end.") % args.interval)
+class Data(ct.Structure):
+ _fields_ = [
+ ("ts", ct.c_ulonglong),
+ ("cpu", ct.c_ulonglong),
+ ("len", ct.c_ulonglong)
+ ]
+
+samples = {}
+group = {}
+last = 0
+
+# process event
+def print_event(cpu, data, size):
+ event = ct.cast(data, ct.POINTER(Data)).contents
+ samples[event.ts] = {}
+ samples[event.ts]['cpu'] = event.cpu
+ samples[event.ts]['len'] = event.len
+
+exiting = 0 if args.interval else 1
+slept = float(0)
+
+# Choose the elapsed time from one sample group to the next that identifies a
+# new sample group (a group being a set of samples from all CPUs). The
+# earliest timestamp is compared in each group. This trigger is also used
+# for sanity testing, if a group's samples exceed half this value.
+trigger = int(0.8 * (1000000000 / frequency))
+
+# read events
+b["events"].open_perf_buffer(print_event)
+while 1:
+ # allow some buffering by calling sleep(), to reduce the context switch
+ # rate and lower overhead.
+ try:
+ if not exiting:
+ sleep(wakeup_s)
+ except KeyboardInterrupt:
+ exiting = 1
+ b.kprobe_poll()
+ slept += wakeup_s
+
+ if slept < 0.999 * interval: # floating point workaround
+ continue
+ slept = 0
+
+ positive = 0 # number of samples where an idle CPU could have run work
+ running = 0
+ idle = 0
+ if debug >= 2:
+ print("DEBUG: begin samples loop, count %d" % len(samples))
+ for e in sorted(samples):
+ if debug >= 2:
+ print("DEBUG: ts %d cpu %d len %d delta %d trig %d" % (e,
+ samples[e]['cpu'], samples[e]['len'], e - last,
+ e - last > trigger))
+
+ # look for time jumps to identify a new sample group
+ if e - last > trigger:
+
+ # first first group timestamp, and sanity test
+ g_time = 0
+ g_max = 0
+ for ge in sorted(group):
+ if g_time == 0:
+ g_time = ge
+ g_max = ge
+
+ # process previous sample group
+ if args.csv:
+ lens = [0] * ncpu
+ offs = [0] * ncpu
+ for ge in sorted(group):
+ lens[samples[ge]['cpu']] = samples[ge]['len']
+ if args.fullcsv:
+ offs[samples[ge]['cpu']] = ge - g_time
+ if g_time > 0: # else first sample
+ if args.timestamp:
+ print("%-8s" % strftime("%H:%M:%S"), end=",")
+ print("%d" % g_time, end=",")
+ print(",".join(str(lens[c]) for c in xrange(ncpu)), end="")
+ if args.fullcsv:
+ print(",", end="")
+ print(",".join(str(offs[c]) for c in xrange(ncpu)))
+ else:
+ print()
+ else:
+ # calculate stats
+ g_running = 0
+ g_queued = 0
+ for ge in group:
+ if samples[ge]['len'] > 0:
+ g_running += 1
+ if samples[ge]['len'] > 1:
+ g_queued += samples[ge]['len'] - 1
+ g_idle = ncpu - g_running
+
+ # calculate the number of threads that could have run as the
+ # minimum of idle and queued
+ if g_idle > 0 and g_queued > 0:
+ if g_queued > g_idle:
+ i = g_idle
+ else:
+ i = g_queued
+ positive += i
+ running += g_running
+ idle += g_idle
+
+ # now sanity test, after -J output
+ g_range = g_max - g_time
+ if g_range > trigger / 2:
+ # if a sample group exceeds half the interval, we can no
+ # longer draw conclusions about some CPUs idle while others
+ # have queued work. Error and exit. This can happen when
+ # CPUs power down, then start again on different offsets.
+ # TODO: Since this is a sampling tool, an error margin should
+ # be anticipated, so an improvement may be to bump a counter
+ # instead of exiting, and only exit if this counter shows
+ # a skewed sample rate of over, say, 1%. Such an approach
+ # would allow a small rate of outliers (sampling error),
+ # and, we could tighten the trigger to be, say, trigger / 5.
+ # In the case of a power down, if it's detectable, perhaps
+ # the tool could reinitialize the timers (although exiting
+ # is simple and works).
+ print(("ERROR: CPU samples arrived at skewed offsets " +
+ "(CPUs may have powered down when idle), " +
+ "spanning %d ns (expected < %d ns). Debug with -J, " +
+ "and see the man page. As output may begin to be " +
+ "unreliable, exiting.") % (g_range, trigger / 2))
+ exit()
+
+ # these are done, remove
+ for ge in sorted(group):
+ del samples[ge]
+
+ # begin next group
+ group = {}
+ last = e
+
+ # stash this timestamp in a sample group dict
+ group[e] = 1
+
+ if not args.csv:
+ total = running + idle
+ unclaimed = util = 0
+
+ if debug:
+ print("DEBUG: hit %d running %d idle %d total %d buffered %d" % (
+ positive, running, idle, total, len(samples)))
+
+ if args.timestamp:
+ print("%-8s " % strftime("%H:%M:%S"), end="")
+
+ # output
+ if total:
+ unclaimed = float(positive) / total
+ util = float(running) / total
+ print("%%CPU %6.2f%%, unclaimed idle %0.2f%%" % (100 * util,
+ 100 * unclaimed))
+
+ countdown -= 1
+ if exiting or countdown == 0:
+ exit()