add cpuunclaimed
diff --git a/tools/cpuunclaimed.py b/tools/cpuunclaimed.py
new file mode 100755
index 0000000..9624b50
--- /dev/null
+++ b/tools/cpuunclaimed.py
@@ -0,0 +1,340 @@
+#!/usr/bin/python
+# @lint-avoid-python-3-compatibility-imports
+#
+# cpuunclaimed   Sample CPU run queues and calculate unclaimed idle CPU.
+#                For Linux, uses BCC, eBPF.
+#
+# This samples the length of the run queues and determine when there are idle
+# CPUs, yet queued threads waiting their turn. Report the amount of idle
+# (yet unclaimed by waiting threads) CPU as a system-wide percentage.
+#
+# This situation can happen for a number of reasons:
+#
+# - An application has been bound to some, but not all, CPUs, and has runnable
+#   threads that cannot migrate to other CPUs due to this configuration.
+# - CPU affinity: an optimization that leaves threads on CPUs where the CPU
+#   caches are warm, even if this means short periods of waiting while other
+#   CPUs are idle. The wait period is tunale (see sysctl, kernel.sched*).
+# - Scheduler bugs.
+#
+# An unclaimed idle of < 1% is likely to be CPU affinity, and not usually a
+# cause for concern. By leaving the CPU idle, overall throughput of the system
+# may be improved. This tool is best for identifying larger issues, > 2%, due
+# to the coarseness of its 99 Hertz samples.
+#
+# This is an experimental tool that currently works by use of sampling to
+# keep overheads low. Tool assumptions:
+#
+# - CPU samples consistently fire around the same offset. There will sometimes
+#   be a lag as a sample is delayed by higher-priority interrupts, but it is
+#   assumed the subsequent samples will catch up to the expected offsets (as
+#   is seen in practice). You can use -J to inspect sample offsets. Some
+#   systems can power down CPUs when idle, and when they wake up again they
+#   may begin firing at a skewed offset: this tool will detect the skew, print
+#   an error, and exit.
+# - All CPUs are online (see ncpu).
+#
+# If this identifies unclaimed CPU, you can double check it by dumping raw
+# samples (-j), as well as using other tracing tools to instrument scheduler
+# events (although this latter approach has much higher overhead).
+#
+# This tool passes all sampled events to user space for post processing.
+# I originally wrote this to do the calculations entirerly in kernel context,
+# and only pass a summary. That involves a number of challenges, and the
+# overhead savings may not outweigh the caveats. You can see my WIP here:
+# https://gist.github.com/brendangregg/731cf2ce54bf1f9a19d4ccd397625ad9
+#
+# USAGE: cpuunclaimed [-h] [-j] [-J] [-T] [interval] [count]
+#
+# If you see "Lost 1881 samples" warnings, try increasing wakeup_hz.
+#
+# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is
+# a version of this tool that may work on Linux 4.6 - 4.8.
+#
+# Copyright 2016 Netflix, Inc.
+# Licensed under the Apache License, Version 2.0 (the "License")
+#
+# 20-Dec-2016   Brendan Gregg   Created this.
+
+from __future__ import print_function
+from bcc import BPF, PerfType, PerfSWConfig
+from time import sleep, strftime
+from ctypes import c_int
+import argparse
+import multiprocessing
+from os import getpid, system
+import ctypes as ct
+
+# arguments
+examples = """examples:
+    ./cpuunclaimed            # sample and calculate unclaimed idle CPUs,
+                              # output every 1 second (default)
+    ./cpuunclaimed 5 10       # print 5 second summaries, 10 times
+    ./cpuunclaimed -T 1       # 1s summaries and timestamps
+    ./cpuunclaimed -j         # raw dump of all samples (verbose), CSV
+"""
+parser = argparse.ArgumentParser(
+    description="Sample CPU run queues and calculate unclaimed idle CPU",
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    epilog=examples)
+parser.add_argument("-j", "--csv", action="store_true",
+    help="print sample summaries (verbose) as comma-separated values")
+parser.add_argument("-J", "--fullcsv", action="store_true",
+    help="print sample summaries with extra fields: CPU sample offsets")
+parser.add_argument("-T", "--timestamp", action="store_true",
+    help="include timestamp on output")
+parser.add_argument("interval", nargs="?", default=-1,
+    help="output interval, in seconds")
+parser.add_argument("count", nargs="?", default=99999999,
+    help="number of outputs")
+args = parser.parse_args()
+countdown = int(args.count)
+frequency = 99
+dobind = 1
+wakeup_hz = 10                      # frequency to read buffers
+wakeup_s = float(1) / wakeup_hz
+ncpu = multiprocessing.cpu_count()  # assume all are online
+debug = 0
+
+# process arguments
+if args.fullcsv:
+    args.csv = True
+if args.csv:
+    interval = 0.2
+if args.interval != -1 and (args.fullcsv or args.csv):
+    print("ERROR: cannot use interval with either -j or -J. Exiting.")
+    exit()
+if args.interval == -1:
+    args.interval = "1"
+interval = float(args.interval)
+
+# define BPF program
+bpf_text = """
+#include <uapi/linux/ptrace.h>
+#include <uapi/linux/bpf_perf_event.h>
+#include <linux/sched.h>
+
+struct data_t {
+    u64 ts;
+    u64 cpu;
+    u64 len;
+};
+
+BPF_PERF_OUTPUT(events);
+
+// Declare enough of cfs_rq to find nr_running, since we can't #import the
+// header. This will need maintenance. It is from kernel/sched/sched.h:
+struct cfs_rq_partial {
+    struct load_weight load;
+    unsigned int nr_running, h_nr_running;
+};
+
+int do_perf_event(struct bpf_perf_event_data *ctx)
+{
+    int cpu = bpf_get_smp_processor_id();
+    u64 now = bpf_ktime_get_ns();
+
+    /*
+     * Fetch the run queue length from task->se.cfs_rq->nr_running. This is an
+     * unstable interface and may need maintenance. Perhaps a future version
+     * of BPF will support task_rq(p) or something similar as a more reliable
+     * interface.
+     */
+    unsigned int len = 0;
+    struct task_struct *task = NULL;
+    struct cfs_rq_partial *my_q = NULL;
+    task = (struct task_struct *)bpf_get_current_task();
+    bpf_probe_read(&my_q, sizeof(my_q), &task->se.cfs_rq);
+    bpf_probe_read(&len, sizeof(len), &my_q->nr_running);
+
+    struct data_t data = {.ts = now, .cpu = cpu, .len = len};
+    events.perf_submit(ctx, &data, sizeof(data));
+
+    return 0;
+}
+"""
+
+# code substitutions
+if debug:
+    print(bpf_text)
+
+# initialize BPF & perf_events
+b = BPF(text=bpf_text)
+# TODO: check for HW counters first and use if more accurate
+b.attach_perf_event(ev_type=PerfType.SOFTWARE,
+    ev_config=PerfSWConfig.TASK_CLOCK, fn_name="do_perf_event",
+    sample_period=0, sample_freq=frequency)
+
+if args.csv:
+    if args.timestamp:
+        print("TIME", end=",")
+    print("TIMESTAMP_ns", end=",")
+    print(",".join("CPU" + str(c) for c in xrange(ncpu)), end="")
+    if args.fullcsv:
+        print(",", end="")
+        print(",".join("OFFSET_ns_CPU" + str(c) for c in xrange(ncpu)), end="")
+    print()
+else:
+    print(("Sampling run queues... Output every %s seconds. " +
+          "Hit Ctrl-C to end.") % args.interval)
+class Data(ct.Structure):
+    _fields_ = [
+        ("ts", ct.c_ulonglong),
+        ("cpu", ct.c_ulonglong),
+        ("len", ct.c_ulonglong)
+    ]
+
+samples = {}
+group = {}
+last = 0
+
+# process event
+def print_event(cpu, data, size):
+    event = ct.cast(data, ct.POINTER(Data)).contents
+    samples[event.ts] = {}
+    samples[event.ts]['cpu'] = event.cpu
+    samples[event.ts]['len'] = event.len
+
+exiting = 0 if args.interval else 1
+slept = float(0)
+
+# Choose the elapsed time from one sample group to the next that identifies a
+# new sample group (a group being a set of samples from all CPUs). The
+# earliest timestamp is compared in each group. This trigger is also used
+# for sanity testing, if a group's samples exceed half this value.
+trigger = int(0.8 * (1000000000 / frequency))
+
+# read events
+b["events"].open_perf_buffer(print_event)
+while 1:
+    # allow some buffering by calling sleep(), to reduce the context switch
+    # rate and lower overhead.
+    try:
+        if not exiting:
+            sleep(wakeup_s)
+    except KeyboardInterrupt:
+        exiting = 1
+    b.kprobe_poll()
+    slept += wakeup_s
+
+    if slept < 0.999 * interval:   # floating point workaround
+        continue
+    slept = 0
+
+    positive = 0  # number of samples where an idle CPU could have run work
+    running = 0
+    idle = 0
+    if debug >= 2:
+        print("DEBUG: begin samples loop, count %d" % len(samples))
+    for e in sorted(samples):
+        if debug >= 2:
+            print("DEBUG: ts %d cpu %d len %d delta %d trig %d" % (e,
+                  samples[e]['cpu'], samples[e]['len'], e - last,
+                  e - last > trigger))
+
+        # look for time jumps to identify a new sample group
+        if e - last > trigger:
+
+            # first first group timestamp, and sanity test
+            g_time = 0
+            g_max = 0
+            for ge in sorted(group):
+                if g_time == 0:
+                    g_time = ge
+                g_max = ge
+
+            # process previous sample group
+            if args.csv:
+                lens = [0] * ncpu
+                offs = [0] * ncpu
+                for ge in sorted(group):
+                    lens[samples[ge]['cpu']] = samples[ge]['len']
+                    if args.fullcsv:
+                        offs[samples[ge]['cpu']] = ge - g_time
+                if g_time > 0:      # else first sample
+                    if args.timestamp:
+                        print("%-8s" % strftime("%H:%M:%S"), end=",")
+                    print("%d" % g_time, end=",")
+                    print(",".join(str(lens[c]) for c in xrange(ncpu)), end="")
+                    if args.fullcsv:
+                        print(",", end="")
+                        print(",".join(str(offs[c]) for c in xrange(ncpu)))
+                    else:
+                        print()
+            else:
+                # calculate stats
+                g_running = 0
+                g_queued = 0
+                for ge in group:
+                    if samples[ge]['len'] > 0:
+                        g_running += 1
+                    if samples[ge]['len'] > 1:
+                        g_queued += samples[ge]['len'] - 1
+                g_idle = ncpu - g_running
+
+                # calculate the number of threads that could have run as the
+                # minimum of idle and queued
+                if g_idle > 0 and g_queued > 0:
+                    if g_queued > g_idle:
+                        i = g_idle
+                    else:
+                        i = g_queued
+                    positive += i
+                running += g_running
+                idle += g_idle
+
+            # now sanity test, after -J output
+            g_range = g_max - g_time
+            if g_range > trigger / 2:
+                # if a sample group exceeds half the interval, we can no
+                # longer draw conclusions about some CPUs idle while others
+                # have queued work. Error and exit. This can happen when
+                # CPUs power down, then start again on different offsets.
+                # TODO: Since this is a sampling tool, an error margin should
+                # be anticipated, so an improvement may be to bump a counter
+                # instead of exiting, and only exit if this counter shows
+                # a skewed sample rate of over, say, 1%. Such an approach
+                # would allow a small rate of outliers (sampling error),
+                # and, we could tighten the trigger to be, say, trigger / 5.
+                # In the case of a power down, if it's detectable, perhaps
+                # the tool could reinitialize the timers (although exiting
+                # is simple and works).
+                print(("ERROR: CPU samples arrived at skewed offsets " +
+                      "(CPUs may have powered down when idle), " +
+                      "spanning %d ns (expected < %d ns). Debug with -J, " +
+                      "and see the man page. As output may begin to be " +
+                      "unreliable, exiting.") % (g_range, trigger / 2))
+                exit()
+
+            # these are done, remove
+            for ge in sorted(group):
+                del samples[ge]
+
+            # begin next group
+            group = {}
+            last = e
+
+        # stash this timestamp in a sample group dict
+        group[e] = 1
+
+    if not args.csv:
+        total = running + idle
+        unclaimed = util = 0
+
+        if debug:
+            print("DEBUG: hit %d running %d idle %d total %d buffered %d" % (
+                  positive, running, idle, total, len(samples)))
+
+        if args.timestamp:
+            print("%-8s " % strftime("%H:%M:%S"), end="")
+
+        # output
+        if total:
+            unclaimed = float(positive) / total
+            util = float(running) / total
+        print("%%CPU %6.2f%%, unclaimed idle %0.2f%%" % (100 * util,
+              100 * unclaimed))
+
+    countdown -= 1
+    if exiting or countdown == 0:
+        exit()