Alexey Ivanov | cc01a9c | 2019-01-16 09:50:46 -0800 | [diff] [blame] | 1 | #!/usr/bin/python |
Brendan Gregg | 3f3acd8 | 2016-12-21 15:34:09 -0800 | [diff] [blame] | 2 | # @lint-avoid-python-3-compatibility-imports |
| 3 | # |
| 4 | # cpuunclaimed Sample CPU run queues and calculate unclaimed idle CPU. |
| 5 | # For Linux, uses BCC, eBPF. |
| 6 | # |
| 7 | # This samples the length of the run queues and determines when there are idle |
| 8 | # CPUs, yet queued threads waiting their turn. Report the amount of idle |
| 9 | # (yet unclaimed by waiting threads) CPU as a system-wide percentage. |
| 10 | # |
| 11 | # This situation can happen for a number of reasons: |
| 12 | # |
| 13 | # - An application has been bound to some, but not all, CPUs, and has runnable |
| 14 | # threads that cannot migrate to other CPUs due to this configuration. |
| 15 | # - CPU affinity: an optimization that leaves threads on CPUs where the CPU |
| 16 | # caches are warm, even if this means short periods of waiting while other |
| 17 | # CPUs are idle. The wait period is tunable (see sysctl, kernel.sched*). |
| 18 | # - Scheduler bugs. |
| 19 | # |
| 20 | # An unclaimed idle of < 1% is likely to be CPU affinity, and not usually a |
| 21 | # cause for concern. By leaving the CPU idle, overall throughput of the system |
| 22 | # may be improved. This tool is best for identifying larger issues, > 2%, due |
| 23 | # to the coarseness of its 99 Hertz samples. |
| 24 | # |
| 25 | # This is an experimental tool that currently works by use of sampling to |
| 26 | # keep overheads low. Tool assumptions: |
| 27 | # |
| 28 | # - CPU samples consistently fire around the same offset. There will sometimes |
| 29 | # be a lag as a sample is delayed by higher-priority interrupts, but it is |
| 30 | # assumed the subsequent samples will catch up to the expected offsets (as |
| 31 | # is seen in practice). You can use -J to inspect sample offsets. Some |
| 32 | # systems can power down CPUs when idle, and when they wake up again they |
| 33 | # may begin firing at a skewed offset: this tool will detect the skew, print |
| 34 | # an error, and exit. |
| 35 | # - All CPUs are online (see ncpu). |
| 36 | # |
| 37 | # If this identifies unclaimed CPU, you can double check it by dumping raw |
| 38 | # samples (-j), as well as using other tracing tools to instrument scheduler |
| 39 | # events (although this latter approach has much higher overhead). |
| 40 | # |
| 41 | # This tool passes all sampled events to user space for post processing. |
| 42 | # I originally wrote this to do the calculations entirely in kernel context, |
| 43 | # and only pass a summary. That involves a number of challenges, and the |
| 44 | # overhead savings may not outweigh the caveats. You can see my WIP here: |
| 45 | # https://gist.github.com/brendangregg/731cf2ce54bf1f9a19d4ccd397625ad9 |
| 46 | # |
| 47 | # USAGE: cpuunclaimed [-h] [-j] [-J] [-T] [interval] [count] |
| 48 | # |
| 49 | # If you see "Lost 1881 samples" warnings, try increasing wakeup_hz. |
| 50 | # |
| 51 | # REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is |
| 52 | # a version of this tool that may work on Linux 4.6 - 4.8. |
| 53 | # |
| 54 | # Copyright 2016 Netflix, Inc. |
| 55 | # Licensed under the Apache License, Version 2.0 (the "License") |
| 56 | # |
| 57 | # 20-Dec-2016 Brendan Gregg Created this. |
| 58 | |
| 59 | from __future__ import print_function |
| 60 | from bcc import BPF, PerfType, PerfSWConfig |
| 61 | from time import sleep, strftime |
Brendan Gregg | 3f3acd8 | 2016-12-21 15:34:09 -0800 | [diff] [blame] | 62 | import argparse |
| 63 | import multiprocessing |
yonghong-song | 77f4f66 | 2019-01-24 12:48:25 -0800 | [diff] [blame] | 64 | from os import getpid, system, open, close, dup, unlink, O_WRONLY |
yonghong-song | 77f4f66 | 2019-01-24 12:48:25 -0800 | [diff] [blame] | 65 | from tempfile import NamedTemporaryFile |
Brendan Gregg | 3f3acd8 | 2016-12-21 15:34:09 -0800 | [diff] [blame] | 66 | |
# arguments
examples = """examples:
    ./cpuunclaimed            # sample and calculate unclaimed idle CPUs,
                              # output every 1 second (default)
    ./cpuunclaimed 5 10       # print 5 second summaries, 10 times
    ./cpuunclaimed -T 1       # 1s summaries and timestamps
    ./cpuunclaimed -j         # raw dump of all samples (verbose), CSV
"""
parser = argparse.ArgumentParser(
    description="Sample CPU run queues and calculate unclaimed idle CPU",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-j", "--csv", action="store_true",
    help="print sample summaries (verbose) as comma-separated values")
parser.add_argument("-J", "--fullcsv", action="store_true",
    help="print sample summaries with extra fields: CPU sample offsets")
parser.add_argument("-T", "--timestamp", action="store_true",
    help="include timestamp on output")
# -1 is a sentinel meaning "interval not supplied"; it is resolved to "1"
# later, and is also used to reject an explicit interval combined with -j/-J.
parser.add_argument("interval", nargs="?", default=-1,
    help="output interval, in seconds")
parser.add_argument("count", nargs="?", default=99999999,
    help="number of outputs")
# hidden option: dump the generated BPF program text and exit
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()
countdown = int(args.count)          # outputs remaining before exit
frequency = 99                       # sample rate, in Hertz (see header comment)
dobind = 1                           # NOTE(review): appears unused in this file
wakeup_hz = 10  # frequency to read buffers
wakeup_s = float(1) / wakeup_hz      # sleep duration between buffer reads
ncpu = multiprocessing.cpu_count()  # assume all are online
debug = 0                            # 0/1/2: increasing debug verbosity
| 99 | |
yonghong-song | 77f4f66 | 2019-01-24 12:48:25 -0800 | [diff] [blame] | 100 | # Linux 4.15 introduced a new field runnable_weight |
| 101 | # in linux_src:kernel/sched/sched.h as |
| 102 | # struct cfs_rq { |
| 103 | # struct load_weight load; |
| 104 | # unsigned long runnable_weight; |
| 105 | # unsigned int nr_running, h_nr_running; |
| 106 | # ...... |
| 107 | # } |
| 108 | # and this tool needs access to nr_running to get |
| 109 | # runqueue len information. |
| 110 | # |
| 111 | # The commit which introduces cfs_rq->runnable_weight |
| 112 | # field also introduces the field sched_entity->runnable_weight |
| 113 | # where sched_entity is defined in linux_src:include/linux/sched.h. |
| 114 | # |
| 115 | # To cope with pre-4.15 and 4.15/post-4.15 releases, |
| 116 | # we run a simple BPF program to detect whether |
| 117 | # field sched_entity->runnable_weight exists. The existence of |
| 118 | # this field should infer the existence of cfs_rq->runnable_weight. |
| 119 | # |
| 120 | # This will need maintenance as the relationship between these |
| 121 | # two fields may change in the future. |
| 122 | # |
def check_runnable_weight_field():
    """Return True if sched_entity has a runnable_weight member (Linux 4.15+).

    Compiles a one-line BPF probe that dereferences
    sched_entity->runnable_weight; a successful compile implies the running
    kernel also has cfs_rq->runnable_weight (introduced by the same commit),
    which changes the struct layout this tool must mirror.

    While probing, fd 2 (stderr) is temporarily redirected to a throwaway
    temp file so LLVM compile errors are not printed to the screen.
    """
    # Define the bpf program for checking purpose
    bpf_check_text = """
#include <linux/sched.h>
unsigned long dummy(struct sched_entity *entity)
{
    return entity->runnable_weight;
}
"""

    # Get a temporary file name (closed immediately; we only need the path)
    tmp_file = NamedTemporaryFile(delete=False)
    tmp_file.close()

    # Duplicate and close stderr (fd = 2)
    old_stderr = dup(2)
    close(2)

    # Open a new file: as the lowest free descriptor it becomes fd 2,
    # silently absorbing llvm error output during the trial compile.
    # Note: "open" here is os.open (file-descriptor API), not builtins.open.
    fd = open(tmp_file.name, O_WRONLY)
    try:
        try:
            BPF(text=bpf_check_text)
            success_compile = True
        except Exception:
            # Compile failure (or missing BPF support) means the field does
            # not exist; any error here answers the probe with "no".
            success_compile = False
    finally:
        # Always restore the original stderr and remove the temp file, even
        # if something above raised; otherwise stderr stays redirected.
        close(fd)
        dup(old_stderr)
        close(old_stderr)
        unlink(tmp_file.name)

    return success_compile
| 158 | |
| 159 | |
# process arguments
if args.fullcsv:
    args.csv = True
if args.interval != -1 and (args.fullcsv or args.csv):
    print("ERROR: cannot use interval with either -j or -J. Exiting.")
    exit()
if args.interval == -1:
    args.interval = "1"
# csv modes (-j/-J) poll on a fixed fast cadence of 0.2s so each sample group
# is emitted promptly; otherwise honor the user-supplied interval.
# BUG FIX: the original assigned interval = 0.2 for csv mode and then
# unconditionally overwrote it with float(args.interval) (always 1.0 in csv
# mode, since an explicit interval is rejected above), so the intended 0.2s
# cadence never took effect.
interval = 0.2 if args.csv else float(args.interval)
| 171 | |
# define BPF program: runs on every perf sample (task-clock, 99 Hz per CPU),
# reads the current CPU's cfs run queue length via the current task, and
# pushes a (timestamp, cpu, length) record to user space for post-processing.
# RUNNABLE_WEIGHT_FIELD is a placeholder substituted below depending on the
# running kernel's struct cfs_rq layout.
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <uapi/linux/bpf_perf_event.h>
#include <linux/sched.h>

struct data_t {
    u64 ts;
    u64 cpu;
    u64 len;
};

BPF_PERF_OUTPUT(events);

// Declare enough of cfs_rq to find nr_running, since we can't #import the
// header. This will need maintenance. It is from kernel/sched/sched.h:
struct cfs_rq_partial {
    struct load_weight load;
    RUNNABLE_WEIGHT_FIELD
    unsigned int nr_running, h_nr_running;
};

int do_perf_event(struct bpf_perf_event_data *ctx)
{
    int cpu = bpf_get_smp_processor_id();
    u64 now = bpf_ktime_get_ns();

    /*
     * Fetch the run queue length from task->se.cfs_rq->nr_running. This is an
     * unstable interface and may need maintenance. Perhaps a future version
     * of BPF will support task_rq(p) or something similar as a more reliable
     * interface.
     */
    unsigned int len = 0;
    struct task_struct *task = NULL;
    struct cfs_rq_partial *my_q = NULL;
    task = (struct task_struct *)bpf_get_current_task();
    my_q = (struct cfs_rq_partial *)task->se.cfs_rq;
    len = my_q->nr_running;

    struct data_t data = {.ts = now, .cpu = cpu, .len = len};
    events.perf_submit(ctx, &data, sizeof(data));

    return 0;
}
"""
| 218 | |
# Splice the runnable_weight member into cfs_rq_partial only when the running
# kernel (4.15+) actually has it, so the struct layout matches the kernel's.
runnable_weight_decl = (
    'unsigned long runnable_weight;' if check_runnable_weight_field() else '')
bpf_text = bpf_text.replace('RUNNABLE_WEIGHT_FIELD', runnable_weight_decl)
| 223 | |
# code substitutions: optionally dump the final program text for inspection
if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()

# initialize BPF & perf_events: sample every CPU at `frequency` Hz on the
# software task-clock event, invoking do_perf_event() per sample.
# TODO: check for HW counters first and use if more accurate
b = BPF(text=bpf_text)
b.attach_perf_event(
    ev_type=PerfType.SOFTWARE,
    ev_config=PerfSWConfig.TASK_CLOCK,
    fn_name="do_perf_event",
    sample_period=0,
    sample_freq=frequency)
| 236 | |
# print the CSV header row, or an interactive banner
if args.csv:
    columns = []
    if args.timestamp:
        columns.append("TIME")
    columns.append("TIMESTAMP_ns")
    columns.extend("CPU" + str(c) for c in range(ncpu))
    if args.fullcsv:
        columns.extend("OFFSET_ns_CPU" + str(c) for c in range(ncpu))
    print(",".join(columns))
else:
    print(("Sampling run queues... Output every %s seconds. " +
        "Hit Ctrl-C to end.") % args.interval)

samples = {}   # sample timestamp (ns) -> {'cpu':, 'len':}
group = {}     # timestamps belonging to the sample group being assembled
last = 0       # previous sample timestamp processed
| 253 | |
# process event: buffer one sampled record, keyed by its kernel timestamp,
# for the grouping pass in the main loop
def print_event(cpu, data, size):
    sample = b["events"].event(data)
    samples[sample.ts] = {'cpu': sample.cpu, 'len': sample.len}
| 260 | |
# main-loop state
if args.interval:
    exiting = 0
else:
    exiting = 1
slept = 0.0

# Choose the elapsed time from one sample group to the next that identifies a
# new sample group (a group being a set of samples from all CPUs). The
# earliest timestamp is compared in each group. This trigger is also used
# for sanity testing, if a group's samples exceed half this value.
trigger = int(0.8 * (1000000000 / frequency))
| 269 | |
# read events
b["events"].open_perf_buffer(print_event, page_cnt=64)
while 1:
    # allow some buffering by calling sleep(), to reduce the context switch
    # rate and lower overhead.
    try:
        if not exiting:
            sleep(wakeup_s)
    except KeyboardInterrupt:
        exiting = 1
    b.perf_buffer_poll()
    slept += wakeup_s

    # only process and print once a full output interval has elapsed
    if slept < 0.999 * interval:   # floating point workaround
        continue
    slept = 0

    positive = 0  # number of samples where an idle CPU could have run work
    running = 0   # cumulative count of CPU samples with run queue len > 0
    idle = 0      # cumulative count of CPU samples with run queue len == 0
    if debug >= 2:
        print("DEBUG: begin samples loop, count %d" % len(samples))
    for e in sorted(samples):
        if debug >= 2:
            print("DEBUG: ts %d cpu %d len %d delta %d trig %d" % (e,
                  samples[e]['cpu'], samples[e]['len'], e - last,
                  e - last > trigger))

        # look for time jumps to identify a new sample group
        if e - last > trigger:

            # find the first (g_time) and last (g_max) group timestamps,
            # used for offsets and the skew sanity test below
            g_time = 0
            g_max = 0
            for ge in sorted(group):
                if g_time == 0:
                    g_time = ge
                g_max = ge

            # process previous sample group
            if args.csv:
                # emit one CSV row: run queue length per CPU and, with -J,
                # each CPU's sample offset (ns) from the group start
                lens = [0] * ncpu
                offs = [0] * ncpu
                for ge in sorted(group):
                    lens[samples[ge]['cpu']] = samples[ge]['len']
                    if args.fullcsv:
                        offs[samples[ge]['cpu']] = ge - g_time
                if g_time > 0:  # else first sample
                    if args.timestamp:
                        print("%-8s" % strftime("%H:%M:%S"), end=",")
                    print("%d" % g_time, end=",")
                    print(",".join(str(lens[c]) for c in range(ncpu)), end="")
                    if args.fullcsv:
                        print(",", end="")
                        print(",".join(str(offs[c]) for c in range(ncpu)))
                    else:
                        print()
            else:
                # calculate stats: CPUs running work, and threads queued
                # beyond the one on-CPU
                g_running = 0
                g_queued = 0
                for ge in group:
                    if samples[ge]['len'] > 0:
                        g_running += 1
                    if samples[ge]['len'] > 1:
                        g_queued += samples[ge]['len'] - 1
                g_idle = ncpu - g_running

                # calculate the number of threads that could have run as the
                # minimum of idle and queued
                if g_idle > 0 and g_queued > 0:
                    if g_queued > g_idle:
                        i = g_idle
                    else:
                        i = g_queued
                    positive += i
                running += g_running
                idle += g_idle

            # now sanity test, after -J output
            g_range = g_max - g_time
            if g_range > trigger / 2:
                # if a sample group exceeds half the interval, we can no
                # longer draw conclusions about some CPUs idle while others
                # have queued work. Error and exit. This can happen when
                # CPUs power down, then start again on different offsets.
                # TODO: Since this is a sampling tool, an error margin should
                # be anticipated, so an improvement may be to bump a counter
                # instead of exiting, and only exit if this counter shows
                # a skewed sample rate of over, say, 1%. Such an approach
                # would allow a small rate of outliers (sampling error),
                # and, we could tighten the trigger to be, say, trigger / 5.
                # In the case of a power down, if it's detectable, perhaps
                # the tool could reinitialize the timers (although exiting
                # is simple and works).
                print(("ERROR: CPU samples arrived at skewed offsets " +
                    "(CPUs may have powered down when idle), " +
                    "spanning %d ns (expected < %d ns). Debug with -J, " +
                    "and see the man page. As output may begin to be " +
                    "unreliable, exiting.") % (g_range, trigger / 2))
                exit()

            # these are done, remove
            for ge in sorted(group):
                del samples[ge]

            # begin next group
            group = {}
            last = e

        # stash this timestamp in a sample group dict
        group[e] = 1

    if not args.csv:
        total = running + idle
        unclaimed = util = 0

        if debug:
            print("DEBUG: hit %d running %d idle %d total %d buffered %d" % (
                positive, running, idle, total, len(samples)))

        if args.timestamp:
            print("%-8s " % strftime("%H:%M:%S"), end="")

        # output: overall %CPU utilization, and unclaimed idle (the fraction
        # of samples where queued threads could have run on idle CPUs)
        if total:
            unclaimed = float(positive) / total
            util = float(running) / total
        print("%%CPU %6.2f%%, unclaimed idle %0.2f%%" % (100 * util,
            100 * unclaimed))

    countdown -= 1
    if exiting or countdown == 0:
        exit()