blob: 601bddbc30d5ff6be912254aa32c3035736328a3 [file] [log] [blame]
Ingo Molnar07800602009-04-20 15:00:56 +02001/*
2 * kerneltop.c: show top kernel functions - performance counters showcase
3
4 Build with:
5
6 cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
7
8 Sample output:
9
10------------------------------------------------------------------------------
11 KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2)
12------------------------------------------------------------------------------
13
14 weight RIP kernel function
15 ______ ________________ _______________
16
17 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18 33.00 - ffffffff804cb740 : sock_alloc_send_skb
19 31.26 - ffffffff804ce808 : skb_push
20 22.43 - ffffffff80510004 : tcp_established_options
21 19.00 - ffffffff8027d250 : find_get_page
22 15.76 - ffffffff804e4fc9 : eth_type_trans
23 15.20 - ffffffff804d8baa : dst_release
24 14.86 - ffffffff804cf5d8 : skb_release_head_state
25 14.00 - ffffffff802217d5 : read_hpet
26 12.00 - ffffffff804ffb7f : __ip_local_out
27 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28 8.54 - ffffffff805001a3 : ip_queue_xmit
29 */
30
31/*
32 * perfstat: /usr/bin/time -alike performance counter statistics utility
33
34 It summarizes the counter events of all tasks (and child tasks),
35 covering all CPUs that the command (or workload) executes on.
36 It only counts the per-task events of the workload started,
37 independent of how many other tasks run on those CPUs.
38
39 Sample output:
40
41 $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
42
43 Performance counter stats for 'ls':
44
45 163516953 instructions
46 2295 cache-misses
47 2855182 branch-misses
48 */
49
50 /*
51 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
52 *
53 * Improvements and fixes by:
54 *
55 * Arjan van de Ven <arjan@linux.intel.com>
56 * Yanmin Zhang <yanmin.zhang@intel.com>
57 * Wu Fengguang <fengguang.wu@intel.com>
58 * Mike Galbraith <efault@gmx.de>
59 * Paul Mackerras <paulus@samba.org>
60 *
61 * Released under the GPL v2. (and only v2, not any later version)
62 */
63
64#include "util.h"
65
Ingo Molnar07800602009-04-20 15:00:56 +020066#include <getopt.h>
67#include <assert.h>
68#include <fcntl.h>
69#include <stdio.h>
70#include <errno.h>
71#include <ctype.h>
72#include <time.h>
73#include <sched.h>
74#include <pthread.h>
75
76#include <sys/syscall.h>
77#include <sys/ioctl.h>
78#include <sys/poll.h>
79#include <sys/prctl.h>
80#include <sys/wait.h>
81#include <sys/uio.h>
82#include <sys/mman.h>
83
84#include <linux/unistd.h>
85#include <linux/types.h>
86
87#include "../../include/linux/perf_counter.h"
88
89
90/*
91 * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
92 * counters in the current task.
93 */
94#define PR_TASK_PERF_COUNTERS_DISABLE 31
95#define PR_TASK_PERF_COUNTERS_ENABLE 32
96
Ingo Molnar07800602009-04-20 15:00:56 +020097#define rdclock() \
98({ \
99 struct timespec ts; \
100 \
101 clock_gettime(CLOCK_MONOTONIC, &ts); \
102 ts.tv_sec * 1000000000ULL + ts.tv_nsec; \
103})
104
105/*
106 * Pick up some kernel type conventions:
107 */
108#define __user
109#define asmlinkage
110
111#ifdef __x86_64__
112#define __NR_perf_counter_open 295
113#define rmb() asm volatile("lfence" ::: "memory")
114#define cpu_relax() asm volatile("rep; nop" ::: "memory");
115#endif
116
117#ifdef __i386__
118#define __NR_perf_counter_open 333
119#define rmb() asm volatile("lfence" ::: "memory")
120#define cpu_relax() asm volatile("rep; nop" ::: "memory");
121#endif
122
123#ifdef __powerpc__
124#define __NR_perf_counter_open 319
125#define rmb() asm volatile ("sync" ::: "memory")
126#define cpu_relax() asm volatile ("" ::: "memory");
127#endif
128
129#define unlikely(x) __builtin_expect(!!(x), 0)
130#define min(x, y) ({ \
131 typeof(x) _min1 = (x); \
132 typeof(y) _min2 = (y); \
133 (void) (&_min1 == &_min2); \
134 _min1 < _min2 ? _min1 : _min2; })
135
/*
 * Wrapper for the perf_counter_open system call: glibc has no stub
 * for it yet, so invoke it directly via syscall(2) using the
 * per-arch __NR_perf_counter_open number defined above.
 * Returns the new counter fd, or a negative value on error (errno set).
 */
asmlinkage int sys_perf_counter_open(
	struct perf_counter_hw_event	*hw_event_uptr		__user,
	pid_t				pid,
	int				cpu,
	int				group_fd,
	unsigned long			flags)
{
	return syscall(
		__NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
}
146
147#define MAX_COUNTERS 64
148#define MAX_NR_CPUS 256
149
150#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
151
152static int run_perfstat = 0;
153static int system_wide = 0;
154
155static int nr_counters = 0;
156static __u64 event_id[MAX_COUNTERS] = {
157 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
158 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
159 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
160 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
161
162 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
163 EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
164 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
165 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
166};
167static int default_interval = 100000;
168static int event_count[MAX_COUNTERS];
169static int fd[MAX_NR_CPUS][MAX_COUNTERS];
170
171static __u64 count_filter = 100;
172
173static int tid = -1;
174static int profile_cpu = -1;
175static int nr_cpus = 0;
176static int nmi = 1;
177static unsigned int realtime_prio = 0;
178static int group = 0;
179static unsigned int page_size;
180static unsigned int mmap_pages = 16;
181static int use_mmap = 0;
182static int use_munmap = 0;
183
184static char *vmlinux;
185
186static char *sym_filter;
187static unsigned long filter_start;
188static unsigned long filter_end;
189
190static int delay_secs = 2;
191static int zero;
192static int dump_symtab;
193
194static int scale;
195
196struct source_line {
197 uint64_t EIP;
198 unsigned long count;
199 char *line;
200 struct source_line *next;
201};
202
203static struct source_line *lines;
204static struct source_line **lines_tail;
205
206const unsigned int default_count[] = {
207 1000000,
208 1000000,
209 10000,
210 10000,
211 1000000,
212 10000,
213};
214
215static char *hw_event_names[] = {
216 "CPU cycles",
217 "instructions",
218 "cache references",
219 "cache misses",
220 "branches",
221 "branch misses",
222 "bus cycles",
223};
224
225static char *sw_event_names[] = {
226 "cpu clock ticks",
227 "task clock ticks",
228 "pagefaults",
229 "context switches",
230 "CPU migrations",
231 "minor faults",
232 "major faults",
233};
234
235struct event_symbol {
236 __u64 event;
237 char *symbol;
238};
239
240static struct event_symbol event_symbols[] = {
241 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", },
242 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", },
243 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", },
244 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", },
245 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", },
246 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", },
247 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", },
248 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", },
249 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", },
250
251 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", },
252 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", },
253 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", },
254 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", },
255 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", },
256 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", },
257 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", },
258 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", },
259 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", },
260 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", },
261};
262
263#define __PERF_COUNTER_FIELD(config, name) \
264 ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
265
266#define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW)
267#define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG)
268#define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE)
269#define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT)
270
/*
 * Print the symbolic event names accepted by -e/--event, each with
 * its numeric "type:id" pair, plus the raw-PMU-event syntax.
 * Shared by the kerneltop and perfstat help screens.
 */
static void display_events_help(void)
{
	unsigned int i;
	__u64 e;

	printf(
	" -e EVENT --event=EVENT # symbolic-name abbreviations");

	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
		int type, id;

		e = event_symbols[i].event;
		/* decode the packed config word back into type:id */
		type = PERF_COUNTER_TYPE(e);
		id = PERF_COUNTER_ID(e);

		printf("\n %d:%d: %-20s",
				type, id, event_symbols[i].symbol);
	}

	printf("\n"
	" rNNN: raw PMU events (eventsel+umask)\n\n");
}
293
/*
 * Print perfstat-mode usage and exit(0).  Never returns.
 */
static void display_perfstat_help(void)
{
	printf(
	"Usage: perfstat [<events...>] <cmd...>\n\n"
	"PerfStat Options (up to %d event types can be specified):\n\n",
		 MAX_COUNTERS);

	display_events_help();

	printf(
	" -l # scale counter values\n"
	" -a # system-wide collection\n");
	exit(0);
}
308
/*
 * Print usage and exit(0).  Dispatches to the perfstat help screen
 * when running in perfstat mode.  Never returns.
 */
static void display_help(void)
{
	if (run_perfstat)
		return display_perfstat_help();

	printf(
	"Usage: kerneltop [<options>]\n"
	"   Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
	"KernelTop Options (up to %d event types can be specified at once):\n\n",
		 MAX_COUNTERS);

	display_events_help();

	printf(
	" -S        --stat             # perfstat COMMAND\n"
	" -a                           # system-wide collection (for perfstat)\n\n"
	" -c CNT    --count=CNT        # event period to sample\n\n"
	" -C CPU    --cpu=CPU          # CPU (-1 for all) [default: -1]\n"
	" -p PID    --pid=PID          # PID of sampled task (-1 for all) [default: -1]\n\n"
	" -l                           # show scale factor for RR events\n"
	" -d delay  --delay=<seconds>  # sampling/display delay [default: 2]\n"
	" -f CNT    --filter=CNT       # min-event-count filter [default: 100]\n\n"
	" -r prio   --realtime=<prio>  # event acquisition runs with SCHED_FIFO policy\n"
	" -s symbol --symbol=<symbol>  # function to be showed annotated one-shot\n"
	" -x path   --vmlinux=<path>   # the vmlinux binary, required for -s use\n"
	" -z        --zero             # zero counts after display\n"
	" -D        --dump_symtab      # dump symbol table to stderr on startup\n"
	" -m pages  --mmap_pages=<pages> # number of mmap data pages\n"
	" -M        --mmap_info        # print mmap info stream\n"
	" -U        --munmap_info      # print munmap info stream\n"
	);

	exit(0);
}
343
/*
 * Human-readable name for counter nr. @ctr.
 *
 * Raw events are formatted into a static buffer, so the returned
 * pointer is only valid until the next raw-event call (not
 * thread-safe).  Hardware/software ids index the name tables above.
 */
static char *event_name(int ctr)
{
	__u64 config = event_id[ctr];
	int type = PERF_COUNTER_TYPE(config);
	int id = PERF_COUNTER_ID(config);
	static char buf[32];

	if (PERF_COUNTER_RAW(config)) {
		sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
		return buf;
	}

	switch (type) {
	case PERF_TYPE_HARDWARE:
		if (id < PERF_HW_EVENTS_MAX)
			return hw_event_names[id];
		return "unknown-hardware";

	case PERF_TYPE_SOFTWARE:
		if (id < PERF_SW_EVENTS_MAX)
			return sw_event_names[id];
		return "unknown-software";

	default:
		break;
	}

	return "unknown";
}
373
374/*
375 * Each event can have multiple symbolic names.
376 * Symbolic names are (almost) exactly matched.
377 */
378static __u64 match_event_symbols(char *str)
379{
380 __u64 config, id;
381 int type;
382 unsigned int i;
383
384 if (sscanf(str, "r%llx", &config) == 1)
385 return config | PERF_COUNTER_RAW_MASK;
386
387 if (sscanf(str, "%d:%llu", &type, &id) == 2)
388 return EID(type, id);
389
390 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
391 if (!strncmp(str, event_symbols[i].symbol,
392 strlen(event_symbols[i].symbol)))
393 return event_symbols[i].event;
394 }
395
396 return ~0ULL;
397}
398
399static int parse_events(char *str)
400{
401 __u64 config;
402
403again:
404 if (nr_counters == MAX_COUNTERS)
405 return -1;
406
407 config = match_event_symbols(str);
408 if (config == ~0ULL)
409 return -1;
410
411 event_id[nr_counters] = config;
412 nr_counters++;
413
414 str = strstr(str, ",");
415 if (str) {
416 str++;
417 goto again;
418 }
419
420 return 0;
421}
422
423
424/*
425 * perfstat
426 */
427
428char fault_here[1000000];
429
/*
 * Open the perf counter for event nr. @counter and store the fd(s)
 * in fd[][]: one counter per CPU in system-wide (-a) mode, otherwise
 * a single inherited, initially-disabled per-task counter (enabled
 * later via prctl in do_perfstat()).  Exits on syscall failure.
 */
static void create_perfstat_counter(int counter)
{
	struct perf_counter_hw_event hw_event;

	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.config = event_id[counter];
	hw_event.record_type = 0;	/* counting only, no sampling */
	hw_event.nmi = 0;
	/* -l: also read time-enabled/time-running for extrapolation */
	if (scale)
		hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
				       PERF_FORMAT_TOTAL_TIME_RUNNING;

	if (system_wide) {
		int cpu;
		for (cpu = 0; cpu < nr_cpus; cpu ++) {
			fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
			if (fd[cpu][counter] < 0) {
				printf("perfstat error: syscall returned with %d (%s)\n",
						fd[cpu][counter], strerror(errno));
				exit(-1);
			}
		}
	} else {
		/* inherited by the workload's children; starts disabled */
		hw_event.inherit = 1;
		hw_event.disabled = 1;

		fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
		if (fd[0][counter] < 0) {
			printf("perfstat error: syscall returned with %d (%s)\n",
					fd[0][counter], strerror(errno));
			exit(-1);
		}
	}
}
464
/*
 * perfstat mode: create one counter per requested event, fork+exec
 * the command from the remaining argv, then read back and print the
 * summed counts, /usr/bin/time style.  Returns 0.
 */
int do_perfstat(int argc, char *argv[])
{
	unsigned long long t0, t1;
	int counter;
	ssize_t res;
	int status;
	int pid;

	if (!system_wide)
		nr_cpus = 1;

	for (counter = 0; counter < nr_counters; counter++)
		create_perfstat_counter(counter);

	/* skip our own options; what remains is the command to run */
	argc -= optind;
	argv += optind;

	if (!argc)
		display_help();

	/*
	 * Enable counters and exec the command:
	 */
	t0 = rdclock();
	prctl(PR_TASK_PERF_COUNTERS_ENABLE);

	if ((pid = fork()) < 0)
		perror("failed to fork");
	if (!pid) {
		if (execvp(argv[0], argv)) {
			perror(argv[0]);
			exit(-1);
		}
	}
	/* reap the workload (counters were inherited by its children) */
	while (wait(&status) >= 0)
		;
	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
	t1 = rdclock();

	fflush(stdout);

	fprintf(stderr, "\n");
	fprintf(stderr, " Performance counter stats for \'%s\':\n",
		argv[0]);
	fprintf(stderr, "\n");

	for (counter = 0; counter < nr_counters; counter++) {
		int cpu, nv;
		__u64 count[3], single_count[3];
		int scaled;

		count[0] = count[1] = count[2] = 0;
		/* with -l the kernel returns {count, enabled, running} */
		nv = scale ? 3 : 1;
		for (cpu = 0; cpu < nr_cpus; cpu ++) {
			res = read(fd[cpu][counter],
					single_count, nv * sizeof(__u64));
			assert(res == nv * sizeof(__u64));

			count[0] += single_count[0];
			if (scale) {
				count[1] += single_count[1];
				count[2] += single_count[2];
			}
		}

		scaled = 0;
		if (scale) {
			if (count[2] == 0) {
				/* counter never got scheduled onto the PMU */
				fprintf(stderr, " %14s %-20s\n",
					"<not counted>", event_name(counter));
				continue;
			}
			if (count[2] < count[1]) {
				/* counter was time-multiplexed: extrapolate */
				scaled = 1;
				count[0] = (unsigned long long)
					((double)count[0] * count[1] / count[2] + 0.5);
			}
		}

		if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
		    event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {

			/* clock counters count nanoseconds */
			double msecs = (double)count[0] / 1000000;

			fprintf(stderr, " %14.6f %-20s (msecs)",
				msecs, event_name(counter));
		} else {
			fprintf(stderr, " %14Ld %-20s (events)",
				count[0], event_name(counter));
		}
		if (scaled)
			fprintf(stderr, " (scaled from %.2f%%)",
				(double) count[2] / count[1] * 100);
		fprintf(stderr, "\n");
	}
	fprintf(stderr, "\n");
	fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
			(double)(t1-t0)/1e6);
	fprintf(stderr, "\n");

	return 0;
}
567
568/*
569 * Symbols
570 */
571
572static uint64_t min_ip;
573static uint64_t max_ip = -1ll;
574
575struct sym_entry {
576 unsigned long long addr;
577 char *sym;
578 unsigned long count[MAX_COUNTERS];
579 int skip;
580 struct source_line *source;
581};
582
583#define MAX_SYMS 100000
584
585static int sym_table_count;
586
587struct sym_entry *sym_filter_entry;
588
589static struct sym_entry sym_table[MAX_SYMS];
590
591static void show_details(struct sym_entry *sym);
592
593/*
594 * Ordering weight: count-1 * count-2 * ... / count-n
595 */
/*
 * Ordering weight: count-1 * count-2 * ... / count-n
 *
 * The product of all but the last counter's hit counts, divided by
 * the last counter's count (+1 guards against division by zero).
 * NOTE(review): with nr_counters == 1 the loop never runs and the
 * post-loop 'counter' is 1, so this divides by count[1]+1 -- in
 * bounds of the MAX_COUNTERS array and zero for unused counters,
 * so it degenerates to count[0].
 */
static double sym_weight(const struct sym_entry *sym)
{
	double weight;
	int counter;

	weight = sym->count[0];

	for (counter = 1; counter < nr_counters-1; counter++)
		weight *= sym->count[counter];

	/* 'counter' now indexes the last counter (see note above) */
	weight /= (sym->count[counter] + 1);

	return weight;
}
610
/*
 * qsort() comparator: order symbols by descending sym_weight().
 *
 * The original returned the boolean 'w1 < w2' (only 0 or 1), which
 * is not a valid three-way comparison -- qsort() requires a
 * negative/zero/positive result, and an inconsistent comparator
 * yields an unspecified (implementation-dependent) order.
 */
static int compare(const void *__sym1, const void *__sym2)
{
	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
	double w1 = sym_weight(sym1), w2 = sym_weight(sym2);

	if (w1 > w2)
		return -1;	/* heavier symbol sorts first */
	if (w1 < w2)
		return 1;
	return 0;
}
617
618static long events;
619static long userspace_events;
620static const char CONSOLE_CLEAR[] = "";
621
622static struct sym_entry tmp[MAX_SYMS];
623
/*
 * Print one refresh of the top-style display: a header with the
 * irq/sec rate and active event list, then up to 19 symbols passing
 * count_filter, ordered by sym_weight().  After printing, counts
 * decay by 7/8 (or are zeroed with -z) and a pending keypress on
 * stdin exits the program.  Called from display_thread() every
 * delay_secs seconds.
 */
static void print_sym_table(void)
{
	int i, printed;
	int counter;
	float events_per_sec = events/delay_secs;
	float kevents_per_sec = (events-userspace_events)/delay_secs;
	float sum_kevents = 0.0;

	events = userspace_events = 0;
	/* sort a snapshot so the live table keeps accumulating */
	memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
	qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);

	for (i = 0; i < sym_table_count && tmp[i].count[0]; i++)
		sum_kevents += tmp[i].count[0];

	write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));

	printf(
"------------------------------------------------------------------------------\n");
	printf( " KernelTop:%8.0f irqs/sec kernel:%4.1f%% [%s, ",
		events_per_sec,
		100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
		nmi ? "NMI" : "IRQ");

	if (nr_counters == 1)
		printf("%d ", event_count[0]);

	for (counter = 0; counter < nr_counters; counter++) {
		if (counter)
			printf("/");

		printf("%s", event_name(counter));
	}

	printf( "], ");

	if (tid != -1)
		printf(" (tid: %d", tid);
	else
		printf(" (all");

	if (profile_cpu != -1)
		printf(", cpu: %d)\n", profile_cpu);
	else {
		if (tid != -1)
			printf(")\n");
		else
			printf(", %d CPUs)\n", nr_cpus);
	}

	printf("------------------------------------------------------------------------------\n\n");

	if (nr_counters == 1)
		printf(" events pcnt");
	else
		printf(" weight events pcnt");

	printf(" RIP kernel function\n"
	       " ______ ______ _____ ________________ _______________\n\n"
	);

	for (i = 0, printed = 0; i < sym_table_count; i++) {
		float pcnt;
		int count;

		if (printed <= 18 && tmp[i].count[0] >= count_filter) {
			/* this symbol's share of all kernel hits */
			pcnt = 100.0 - (100.0*((sum_kevents-tmp[i].count[0])/sum_kevents));

			if (nr_counters == 1)
				printf("%19.2f - %4.1f%% - %016llx : %s\n",
					sym_weight(tmp + i),
					pcnt, tmp[i].addr, tmp[i].sym);
			else
				printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n",
					sym_weight(tmp + i),
					tmp[i].count[0],
					pcnt, tmp[i].addr, tmp[i].sym);
			printed++;
		}
		/*
		 * Add decay to the counts:
		 */
		for (count = 0; count < nr_counters; count++)
			sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
	}

	if (sym_filter_entry)
		show_details(sym_filter_entry);

	{
		struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };

		/* non-blocking poll: any keypress exits */
		if (poll(&stdin_poll, 1, 0) == 1) {
			printf("key pressed - exiting.\n");
			exit(0);
		}
	}
}
722
723static void *display_thread(void *arg)
724{
725 printf("KernelTop refresh period: %d seconds\n", delay_secs);
726
727 while (!sleep(delay_secs))
728 print_sym_table();
729
730 return NULL;
731}
732
733static int read_symbol(FILE *in, struct sym_entry *s)
734{
735 static int filter_match = 0;
736 char *sym, stype;
737 char str[500];
738 int rc, pos;
739
740 rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
741 if (rc == EOF)
742 return -1;
743
744 assert(rc == 3);
745
746 /* skip until end of line: */
747 pos = strlen(str);
748 do {
749 rc = fgetc(in);
750 if (rc == '\n' || rc == EOF || pos >= 499)
751 break;
752 str[pos] = rc;
753 pos++;
754 } while (1);
755 str[pos] = 0;
756
757 sym = str;
758
759 /* Filter out known duplicates and non-text symbols. */
760 if (!strcmp(sym, "_text"))
761 return 1;
762 if (!min_ip && !strcmp(sym, "_stext"))
763 return 1;
764 if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
765 return 1;
766 if (stype != 'T' && stype != 't')
767 return 1;
768 if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
769 return 1;
770 if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
771 return 1;
772
773 s->sym = malloc(strlen(str));
774 assert(s->sym);
775
776 strcpy((char *)s->sym, str);
777 s->skip = 0;
778
779 /* Tag events to be skipped. */
780 if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
781 s->skip = 1;
782 else if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
783 s->skip = 1;
784 else if (!strcmp("mwait_idle", s->sym))
785 s->skip = 1;
786
787 if (filter_match == 1) {
788 filter_end = s->addr;
789 filter_match = -1;
790 if (filter_end - filter_start > 10000) {
791 printf("hm, too large filter symbol <%s> - skipping.\n",
792 sym_filter);
793 printf("symbol filter start: %016lx\n", filter_start);
794 printf(" end: %016lx\n", filter_end);
795 filter_end = filter_start = 0;
796 sym_filter = NULL;
797 sleep(1);
798 }
799 }
800 if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
801 filter_match = 1;
802 filter_start = s->addr;
803 }
804
805 return 0;
806}
807
808int compare_addr(const void *__sym1, const void *__sym2)
809{
810 const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
811
812 return sym1->addr > sym2->addr;
813}
814
815static void sort_symbol_table(void)
816{
817 int i, dups;
818
819 do {
820 qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
821 for (i = 0, dups = 0; i < sym_table_count; i++) {
822 if (sym_table[i].addr == sym_table[i+1].addr) {
823 sym_table[i+1].addr = -1ll;
824 dups++;
825 }
826 }
827 sym_table_count -= dups;
828 } while(dups);
829}
830
831static void parse_symbols(void)
832{
833 struct sym_entry *last;
834
835 FILE *kallsyms = fopen("/proc/kallsyms", "r");
836
837 if (!kallsyms) {
838 printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
839 exit(-1);
840 }
841
842 while (!feof(kallsyms)) {
843 if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
844 sym_table_count++;
845 assert(sym_table_count <= MAX_SYMS);
846 }
847 }
848
849 sort_symbol_table();
850 min_ip = sym_table[0].addr;
851 max_ip = sym_table[sym_table_count-1].addr;
852 last = sym_table + sym_table_count++;
853
854 last->addr = -1ll;
855 last->sym = "<end>";
856
857 if (filter_end) {
858 int count;
859 for (count=0; count < sym_table_count; count ++) {
860 if (!strcmp(sym_table[count].sym, sym_filter)) {
861 sym_filter_entry = &sym_table[count];
862 break;
863 }
864 }
865 }
866 if (dump_symtab) {
867 int i;
868
869 for (i = 0; i < sym_table_count; i++)
870 fprintf(stderr, "%llx %s\n",
871 sym_table[i].addr, sym_table[i].sym);
872 }
873}
874
875/*
876 * Source lines
877 */
878
879static void parse_vmlinux(char *filename)
880{
881 FILE *file;
882 char command[PATH_MAX*2];
883 if (!filename)
884 return;
885
886 sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);
887
888 file = popen(command, "r");
889 if (!file)
890 return;
891
892 lines_tail = &lines;
893 while (!feof(file)) {
894 struct source_line *src;
895 size_t dummy = 0;
896 char *c;
897
898 src = malloc(sizeof(struct source_line));
899 assert(src != NULL);
900 memset(src, 0, sizeof(struct source_line));
901
902 if (getline(&src->line, &dummy, file) < 0)
903 break;
904 if (!src->line)
905 break;
906
907 c = strchr(src->line, '\n');
908 if (c)
909 *c = 0;
910
911 src->next = NULL;
912 *lines_tail = src;
913 lines_tail = &src->next;
914
915 if (strlen(src->line)>8 && src->line[8] == ':')
916 src->EIP = strtoull(src->line, NULL, 16);
917 if (strlen(src->line)>8 && src->line[16] == ':')
918 src->EIP = strtoull(src->line, NULL, 16);
919 }
920 pclose(file);
921}
922
923static void record_precise_ip(uint64_t ip)
924{
925 struct source_line *line;
926
927 for (line = lines; line; line = line->next) {
928 if (line->EIP == ip)
929 line->count++;
930 if (line->EIP > ip)
931 break;
932 }
933}
934
935static void lookup_sym_in_vmlinux(struct sym_entry *sym)
936{
937 struct source_line *line;
938 char pattern[PATH_MAX];
939 sprintf(pattern, "<%s>:", sym->sym);
940
941 for (line = lines; line; line = line->next) {
942 if (strstr(line->line, pattern)) {
943 sym->source = line;
944 break;
945 }
946 }
947}
948
949static void show_lines(struct source_line *line_queue, int line_queue_count)
950{
951 int i;
952 struct source_line *line;
953
954 line = line_queue;
955 for (i = 0; i < line_queue_count; i++) {
956 printf("%8li\t%s\n", line->count, line->line);
957 line = line->next;
958 }
959}
960
961#define TRACE_COUNT 3
962
/*
 * Annotated display of the -s filter symbol: walk its objdump
 * source lines and print every line whose hit count passes
 * count_filter, with up to TRACE_COUNT lines of leading context.
 * Stops at the next function's ">:" header or after 300 lines.
 * Per-line hit counts are reset as lines are visited.
 */
static void show_details(struct sym_entry *sym)
{
	struct source_line *line;
	struct source_line *line_queue = NULL;
	int displayed = 0;
	int line_queue_count = 0;

	if (!sym->source)
		lookup_sym_in_vmlinux(sym);
	if (!sym->source)
		return;

	printf("Showing details for %s\n", sym->sym);

	line = sym->source;
	while (line) {
		/* next function's header ends this symbol's listing */
		if (displayed && strstr(line->line, ">:"))
			break;

		if (!line_queue_count)
			line_queue = line;
		line_queue_count ++;

		if (line->count >= count_filter) {
			/* flush the queued context plus this hot line */
			show_lines(line_queue, line_queue_count);
			line_queue_count = 0;
			line_queue = NULL;
		} else if (line_queue_count > TRACE_COUNT) {
			/* slide the context window forward */
			line_queue = line_queue->next;
			line_queue_count --;
		}

		line->count = 0;
		displayed++;
		if (displayed > 300)
			break;
		line = line->next;
	}
}
1002
1003/*
1004 * Binary search in the histogram table and record the hit:
1005 */
/*
 * Binary search in the histogram table and record the hit:
 *
 * sym_table is sorted by address (with an <end> sentinel), so the
 * symbol owning @ip is the one whose address is the greatest value
 * <= ip.  The hit is counted per-counter unless the symbol is
 * tagged 'skip' (idle routines), in which case the global event
 * count is rolled back instead.
 */
static void record_ip(uint64_t ip, int counter)
{
	int left_idx, middle_idx, right_idx, idx;
	unsigned long left, middle, right;

	/* also attribute to the exact source line, if annotated */
	record_precise_ip(ip);

	left_idx = 0;
	right_idx = sym_table_count-1;
	assert(ip <= max_ip && ip >= min_ip);

	while (left_idx + 1 < right_idx) {
		middle_idx = (left_idx + right_idx) / 2;

		left = sym_table[ left_idx].addr;
		middle = sym_table[middle_idx].addr;
		right = sym_table[ right_idx].addr;

		/* diagnostic dump before the invariant assert trips */
		if (!(left <= middle && middle <= right)) {
			printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
			printf("%d %d %d\n", left_idx, middle_idx, right_idx);
		}
		assert(left <= middle && middle <= right);
		if (!(left <= ip && ip <= right)) {
			printf(" left: %016lx\n", left);
			printf(" ip: %016lx\n", (unsigned long)ip);
			printf("right: %016lx\n", right);
		}
		assert(left <= ip && ip <= right);
		/*
		 * [ left .... target .... middle .... right ]
		 * => right := middle
		 */
		if (ip < middle) {
			right_idx = middle_idx;
			continue;
		}
		/*
		 * [ left .... middle ... target ... right ]
		 * => left := middle
		 */
		left_idx = middle_idx;
	}

	idx = left_idx;

	if (!sym_table[idx].skip)
		sym_table[idx].count[counter]++;
	else events--;
}
1056
1057static void process_event(uint64_t ip, int counter)
1058{
1059 events++;
1060
1061 if (ip < min_ip || ip > max_ip) {
1062 userspace_events++;
1063 return;
1064 }
1065
1066 record_ip(ip, counter);
1067}
1068
/*
 * Parse command-line options into the file-scope configuration
 * globals.  Invoking the binary under a name containing "perfstat"
 * (or passing -S) selects perfstat mode.  Fills in the default
 * event set and per-event sample periods when none were given.
 */
static void process_options(int argc, char **argv)
{
	int error = 0, counter;

	if (strstr(argv[0], "perfstat"))
		run_perfstat = 1;

	for (;;) {
		int option_index = 0;
		/** Options for getopt */
		static struct option long_options[] = {
			{"count", required_argument, NULL, 'c'},
			{"cpu", required_argument, NULL, 'C'},
			{"delay", required_argument, NULL, 'd'},
			{"dump_symtab", no_argument, NULL, 'D'},
			{"event", required_argument, NULL, 'e'},
			{"filter", required_argument, NULL, 'f'},
			{"group", required_argument, NULL, 'g'},
			{"help", no_argument, NULL, 'h'},
			{"nmi", required_argument, NULL, 'n'},
			{"mmap_info", no_argument, NULL, 'M'},
			{"mmap_pages", required_argument, NULL, 'm'},
			{"munmap_info", no_argument, NULL, 'U'},
			{"pid", required_argument, NULL, 'p'},
			{"realtime", required_argument, NULL, 'r'},
			{"scale", no_argument, NULL, 'l'},
			{"symbol", required_argument, NULL, 's'},
			{"stat", no_argument, NULL, 'S'},
			{"vmlinux", required_argument, NULL, 'x'},
			{"zero", no_argument, NULL, 'z'},
			{NULL, 0, NULL, 0 }
		};
		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
				    long_options, &option_index);
		if (c == -1)
			break;

		switch (c) {
		case 'a': system_wide = 1; break;
		case 'c': default_interval = atoi(optarg); break;
		case 'C':
			/* CPU and PID are mutually exclusive */
			if (tid != -1) {
				printf("WARNING: CPU switch overriding PID\n");
				sleep(1);
				tid = -1;
			}
			profile_cpu = atoi(optarg); break;
		case 'd': delay_secs = atoi(optarg); break;
		case 'D': dump_symtab = 1; break;

		case 'e': error = parse_events(optarg); break;

		case 'f': count_filter = atoi(optarg); break;
		case 'g': group = atoi(optarg); break;
		case 'h': display_help(); break;
		case 'l': scale = 1; break;
		case 'n': nmi = atoi(optarg); break;
		case 'p':
			/* CPU and PID are mutually exclusive */
			if (profile_cpu != -1) {
				printf("WARNING: PID switch overriding CPU\n");
				sleep(1);
				profile_cpu = -1;
			}
			tid = atoi(optarg); break;
		case 'r': realtime_prio = atoi(optarg); break;
		case 's': sym_filter = strdup(optarg); break;
		case 'S': run_perfstat = 1; break;
		case 'x': vmlinux = strdup(optarg); break;
		case 'z': zero = 1; break;
		case 'm': mmap_pages = atoi(optarg); break;
		case 'M': use_mmap = 1; break;
		case 'U': use_munmap = 1; break;
		default: error = 1; break;
		}
	}
	if (error)
		display_help();

	/* no -e given: fall back to defaults */
	if (!nr_counters) {
		if (run_perfstat)
			nr_counters = 8;	/* the 8 preset event_id[] entries */
		else {
			nr_counters = 1;
			event_id[0] = 0;	/* default event (type/id 0) */
		}
	}

	for (counter = 0; counter < nr_counters; counter++) {
		if (event_count[counter])
			continue;

		event_count[counter] = default_interval;
	}
}
1165
1166struct mmap_data {
1167 int counter;
1168 void *base;
1169 unsigned int mask;
1170 unsigned int prev;
1171};
1172
/*
 * Read the kernel's write position in the mmap'ed event buffer.
 * The rmb() orders the data_head load before any subsequent reads
 * of the event data it covers.
 */
static unsigned int mmap_read_head(struct mmap_data *md)
{
	struct perf_counter_mmap_page *pc = md->base;
	int head;

	head = pc->data_head;
	rmb();

	return head;
}
1183
1184struct timeval last_read, this_read;
1185
/*
 * Drain all new events from one counter's mmap ring buffer
 * (the data area starts one page past md->base, after the control
 * page).  Overflow (sample) records feed the profile via
 * process_event(); mmap/munmap info records are printed when
 * -M/-U is enabled.  md->prev tracks our read position.
 */
static void mmap_read(struct mmap_data *md)
{
	unsigned int head = mmap_read_head(md);
	unsigned int old = md->prev;
	unsigned char *data = md->base + page_size;
	int diff;

	gettimeofday(&this_read, NULL);

	/*
	 * If we're further behind than half the buffer, there's a chance
	 * the writer will bite our tail and screw up the events under us.
	 *
	 * If we somehow ended up ahead of the head, we got messed up.
	 *
	 * In either case, truncate and restart at head.
	 */
	diff = head - old;
	if (diff > md->mask / 2 || diff < 0) {
		struct timeval iv;
		unsigned long msecs;

		timersub(&this_read, &last_read, &iv);
		msecs = iv.tv_sec*1000 + iv.tv_usec/1000;

		fprintf(stderr, "WARNING: failed to keep up with mmap data."
				" Last read %lu msecs ago.\n", msecs);

		/*
		 * head points to a known good entry, start there.
		 */
		old = head;
	}

	last_read = this_read;

	for (; old != head;) {
		/* sample record: instruction pointer + task ids */
		struct ip_event {
			struct perf_event_header header;
			__u64 ip;
			__u32 pid, tid;
		};
		/* mmap/munmap info record */
		struct mmap_event {
			struct perf_event_header header;
			__u32 pid, tid;
			__u64 start;
			__u64 len;
			__u64 pgoff;
			char filename[PATH_MAX];
		};

		typedef union event_union {
			struct perf_event_header header;
			struct ip_event ip;
			struct mmap_event mmap;
		} event_t;

		event_t *event = (event_t *)&data[old & md->mask];

		event_t event_copy;

		size_t size = event->header.size;

		/*
		 * Event straddles the mmap boundary -- header should always
		 * be inside due to u64 alignment of output.
		 */
		if ((old & md->mask) + size != ((old + size) & md->mask)) {
			unsigned int offset = old;
			unsigned int len = min(sizeof(*event), size), cpy;
			void *dst = &event_copy;

			/* stitch the two wrapped halves into event_copy */
			do {
				cpy = min(md->mask + 1 - (offset & md->mask), len);
				memcpy(dst, &data[offset & md->mask], cpy);
				offset += cpy;
				dst += cpy;
				len -= cpy;
			} while (len);

			event = &event_copy;
		}

		old += size;

		if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
			if (event->header.type & PERF_RECORD_IP)
				process_event(event->ip.ip, md->counter);
		} else {
			switch (event->header.type) {
			case PERF_EVENT_MMAP:
			case PERF_EVENT_MUNMAP:
				printf("%s: %Lu %Lu %Lu %s\n",
					event->header.type == PERF_EVENT_MMAP
						? "mmap" : "munmap",
					event->mmap.start,
					event->mmap.len,
					event->mmap.pgoff,
					event->mmap.filename);
				break;
			}
		}
	}

	md->prev = old;
}
1292
/*
 * Entry point for "top" mode: open one perf counter per (cpu, counter)
 * pair, mmap each counter's sampling buffer, start the display thread,
 * then loop forever draining the buffers.
 *
 * NOTE(review): the prefix parameter is unused here — presumably part
 * of a common subcommand signature; confirm against the caller.
 * NOTE(review): the while (1) loop never exits, so the trailing
 * "return 0" is unreachable, and ret (the poll() result) is assigned
 * but never examined.
 */
int cmd_top(int argc, char **argv, const char *prefix)
{
	struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
	struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
	struct perf_counter_hw_event hw_event;
	pthread_t thread;
	int i, counter, group_fd, nr_poll = 0;
	unsigned int cpu;
	int ret;

	page_size = sysconf(_SC_PAGE_SIZE);

	process_options(argc, argv);

	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
	assert(nr_cpus <= MAX_NR_CPUS);
	assert(nr_cpus >= 0);

	/* perfstat mode is handled entirely elsewhere. */
	if (run_perfstat)
		return do_perfstat(argc, argv);

	/* Single-task or single-CPU profiling needs only one buffer set. */
	if (tid != -1 || profile_cpu != -1)
		nr_cpus = 1;

	parse_symbols();
	if (vmlinux && sym_filter_entry)
		parse_vmlinux(vmlinux);

	/* Open and mmap one counter per (cpu, event) combination. */
	for (i = 0; i < nr_cpus; i++) {
		group_fd = -1;
		for (counter = 0; counter < nr_counters; counter++) {

			/* cpu = -1 (all CPUs) when profiling a task; else pin to CPU i. */
			cpu = profile_cpu;
			if (tid == -1 && profile_cpu == -1)
				cpu = i;

			memset(&hw_event, 0, sizeof(hw_event));
			hw_event.config = event_id[counter];
			hw_event.irq_period = event_count[counter];
			hw_event.record_type = PERF_RECORD_IP | PERF_RECORD_TID;
			hw_event.nmi = nmi;
			hw_event.mmap = use_mmap;
			hw_event.munmap = use_munmap;

			fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
			if (fd[i][counter] < 0) {
				int err = errno;
				printf("kerneltop error: syscall returned with %d (%s)\n",
					fd[i][counter], strerror(err));
				if (err == EPERM)
					printf("Are you root?\n");
				exit(-1);
			}
			assert(fd[i][counter] >= 0);
			fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);

			/*
			 * First counter acts as the group leader:
			 */
			if (group && group_fd == -1)
				group_fd = fd[i][counter];

			event_array[nr_poll].fd = fd[i][counter];
			event_array[nr_poll].events = POLLIN;
			nr_poll++;

			mmap_array[i][counter].counter = counter;
			mmap_array[i][counter].prev = 0;
			mmap_array[i][counter].mask = mmap_pages*page_size - 1;
			/* One extra page for the control page, then the data area. */
			mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
					PROT_READ, MAP_SHARED, fd[i][counter], 0);
			if (mmap_array[i][counter].base == MAP_FAILED) {
				printf("kerneltop error: failed to mmap with %d (%s)\n",
					errno, strerror(errno));
				exit(-1);
			}
		}
	}

	/* The display thread renders the TUI while this thread drains buffers. */
	if (pthread_create(&thread, NULL, display_thread, NULL)) {
		printf("Could not create display thread.\n");
		exit(-1);
	}

	if (realtime_prio) {
		struct sched_param param;

		param.sched_priority = realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			printf("Could not set realtime priority.\n");
			exit(-1);
		}
	}

	while (1) {
		int hits = events;

		for (i = 0; i < nr_cpus; i++) {
			for (counter = 0; counter < nr_counters; counter++)
				mmap_read(&mmap_array[i][counter]);
		}

		/* Only block in poll() if the sweep above found nothing new. */
		if (hits == events)
			ret = poll(event_array, nr_poll, 100);
	}

	return 0;
}