blob: 03518d75d864d68a35e5592540d804bf84e7a4ef [file] [log] [blame]
Ingo Molnarddcacfa2009-04-20 15:37:32 +02001/*
2 * kerneltop.c: show top kernel functions - performance counters showcase
3
4 Build with:
5
6 cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
7
8 Sample output:
9
10------------------------------------------------------------------------------
11 KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2)
12------------------------------------------------------------------------------
13
14 weight RIP kernel function
15 ______ ________________ _______________
16
17 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18 33.00 - ffffffff804cb740 : sock_alloc_send_skb
19 31.26 - ffffffff804ce808 : skb_push
20 22.43 - ffffffff80510004 : tcp_established_options
21 19.00 - ffffffff8027d250 : find_get_page
22 15.76 - ffffffff804e4fc9 : eth_type_trans
23 15.20 - ffffffff804d8baa : dst_release
24 14.86 - ffffffff804cf5d8 : skb_release_head_state
25 14.00 - ffffffff802217d5 : read_hpet
26 12.00 - ffffffff804ffb7f : __ip_local_out
27 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28 8.54 - ffffffff805001a3 : ip_queue_xmit
29 */
30
31/*
32 * perfstat: /usr/bin/time -alike performance counter statistics utility
33
34 It summarizes the counter events of all tasks (and child tasks),
35 covering all CPUs that the command (or workload) executes on.
36 It only counts the per-task events of the workload started,
37 independent of how many other tasks run on those CPUs.
38
39 Sample output:
40
41 $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
42
43 Performance counter stats for 'ls':
44
45 163516953 instructions
46 2295 cache-misses
47 2855182 branch-misses
48 */
49
50 /*
51 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
52 *
53 * Improvements and fixes by:
54 *
55 * Arjan van de Ven <arjan@linux.intel.com>
56 * Yanmin Zhang <yanmin.zhang@intel.com>
57 * Wu Fengguang <fengguang.wu@intel.com>
58 * Mike Galbraith <efault@gmx.de>
59 * Paul Mackerras <paulus@samba.org>
60 *
61 * Released under the GPL v2. (and only v2, not any later version)
62 */
63
Ingo Molnar148be2c2009-04-27 08:02:14 +020064#include "util/util.h"
Ingo Molnarddcacfa2009-04-20 15:37:32 +020065
66#include <getopt.h>
67#include <assert.h>
68#include <fcntl.h>
69#include <stdio.h>
70#include <errno.h>
Ingo Molnarddcacfa2009-04-20 15:37:32 +020071#include <time.h>
72#include <sched.h>
73#include <pthread.h>
74
75#include <sys/syscall.h>
76#include <sys/ioctl.h>
77#include <sys/poll.h>
78#include <sys/prctl.h>
79#include <sys/wait.h>
80#include <sys/uio.h>
81#include <sys/mman.h>
82
83#include <linux/unistd.h>
84#include <linux/types.h>
85
86#include "../../include/linux/perf_counter.h"
87
Thomas Gleixner6eda5832009-05-01 18:29:57 +020088#include "perf.h"
Ingo Molnarddcacfa2009-04-20 15:37:32 +020089
/*
 * Modifier bits stored per counter in event_mask[].
 *
 * They are set while parsing the optional third field of a
 * "type:id:mask" event specification ('k' -> KERNEL, 'u' -> USER) and
 * are later copied into hw_event.exclude_kernel / hw_event.exclude_user
 * when the counter is created.
 */
#define EVENT_MASK_KERNEL	(1 << 0)
#define EVENT_MASK_USER		(1 << 1)
92
Ingo Molnarddcacfa2009-04-20 15:37:32 +020093static int system_wide = 0;
94
95static int nr_counters = 0;
96static __u64 event_id[MAX_COUNTERS] = {
97 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
98 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
99 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
100 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
101
102 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
103 EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
104 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
105 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
106};
107static int default_interval = 100000;
108static int event_count[MAX_COUNTERS];
109static int fd[MAX_NR_CPUS][MAX_COUNTERS];
Peter Zijlstra16c8a102009-05-05 17:50:27 +0200110static int event_mask[MAX_COUNTERS];
Ingo Molnarddcacfa2009-04-20 15:37:32 +0200111
/* Set by -p; reset to -1 when -C is given afterwards. */
static int			tid		= -1;
/* Set by -C; reset to -1 when -p is given afterwards. */
static int			profile_cpu	= -1;
/* Online CPU count from sysconf(); forced to 1 unless system_wide. */
static int			nr_cpus;
/* Set by -n. */
static int			nmi		= 1;
/* Set by -g. */
static int			group;
/* Filled from sysconf(_SC_PAGE_SIZE) in cmd_stat(). */
static unsigned int		page_size;

/* Set by -z. */
static int			zero;

/*
 * When non-zero, counters are read together with their enabled/running
 * times and the counts are scaled accordingly.  On by default; -l sets
 * it to 1 as well.
 */
static int			scale		= 1;
Ingo Molnarddcacfa2009-04-20 15:37:32 +0200122
/*
 * Table of default counts.  Nothing in the visible code reads it.
 * NOTE(review): presumably a leftover from the kerneltop code this file
 * was derived from - confirm before removing.
 */
static const unsigned int default_count[] = {
	[0] = 1000000,
	[1] = 1000000,
	[2] =   10000,
	[3] =   10000,
	[4] = 1000000,
	[5] =   10000,
};
131
/*
 * Display names for hardware events.  event_name() indexes this table
 * with the event id extracted from a PERF_TYPE_HARDWARE config.
 */
static char *hw_event_names[] = {
	[0] = "CPU cycles",
	[1] = "instructions",
	[2] = "cache references",
	[3] = "cache misses",
	[4] = "branches",
	[5] = "branch misses",
	[6] = "bus cycles",
};
141
/*
 * Display names for software events.  event_name() indexes this table
 * with the event id extracted from a PERF_TYPE_SOFTWARE config.
 */
static char *sw_event_names[] = {
	[0] = "cpu clock ticks",
	[1] = "task clock ticks",
	[2] = "pagefaults",
	[3] = "context switches",
	[4] = "CPU migrations",
	[5] = "minor faults",
	[6] = "major faults",
};
151
152struct event_symbol {
153 __u64 event;
154 char *symbol;
155};
156
157static struct event_symbol event_symbols[] = {
158 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", },
159 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", },
160 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", },
161 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", },
162 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", },
163 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", },
164 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", },
165 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", },
166 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", },
167
168 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", },
169 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", },
170 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", },
171 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", },
172 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", },
173 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", },
174 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", },
175 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", },
176 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", },
177 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", },
178};
179
/*
 * Extract one field from a counter config word: mask with
 * PERF_COUNTER_<name>_MASK, then shift down by PERF_COUNTER_<name>_SHIFT.
 *
 * The argument is parenthesised so that an expression such as "a | b"
 * is masked as a whole rather than only its last operand.
 */
#define __PERF_COUNTER_FIELD(config, name) \
	(((config) & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)

#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
187
188static void display_events_help(void)
189{
190 unsigned int i;
191 __u64 e;
192
193 printf(
194 " -e EVENT --event=EVENT # symbolic-name abbreviations");
195
196 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
197 int type, id;
198
199 e = event_symbols[i].event;
200 type = PERF_COUNTER_TYPE(e);
201 id = PERF_COUNTER_ID(e);
202
203 printf("\n %d:%d: %-20s",
204 type, id, event_symbols[i].symbol);
205 }
206
207 printf("\n"
208 " rNNN: raw PMU events (eventsel+umask)\n\n");
209}
210
211static void display_help(void)
212{
213 printf(
214 "Usage: perfstat [<events...>] <cmd...>\n\n"
215 "PerfStat Options (up to %d event types can be specified):\n\n",
216 MAX_COUNTERS);
217
218 display_events_help();
219
220 printf(
221 " -l # scale counter values\n"
222 " -a # system-wide collection\n");
223 exit(0);
224}
225
226static char *event_name(int ctr)
227{
228 __u64 config = event_id[ctr];
229 int type = PERF_COUNTER_TYPE(config);
230 int id = PERF_COUNTER_ID(config);
231 static char buf[32];
232
233 if (PERF_COUNTER_RAW(config)) {
234 sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
235 return buf;
236 }
237
238 switch (type) {
239 case PERF_TYPE_HARDWARE:
240 if (id < PERF_HW_EVENTS_MAX)
241 return hw_event_names[id];
242 return "unknown-hardware";
243
244 case PERF_TYPE_SOFTWARE:
245 if (id < PERF_SW_EVENTS_MAX)
246 return sw_event_names[id];
247 return "unknown-software";
248
249 default:
250 break;
251 }
252
253 return "unknown";
254}
255
256/*
257 * Each event can have multiple symbolic names.
258 * Symbolic names are (almost) exactly matched.
259 */
260static __u64 match_event_symbols(char *str)
261{
262 __u64 config, id;
263 int type;
264 unsigned int i;
Peter Zijlstra16c8a102009-05-05 17:50:27 +0200265 char mask_str[4];
Ingo Molnarddcacfa2009-04-20 15:37:32 +0200266
267 if (sscanf(str, "r%llx", &config) == 1)
268 return config | PERF_COUNTER_RAW_MASK;
269
Peter Zijlstra16c8a102009-05-05 17:50:27 +0200270 switch (sscanf(str, "%d:%llu:%2s", &type, &id, mask_str)) {
271 case 3:
272 if (strchr(mask_str, 'u'))
273 event_mask[nr_counters] |= EVENT_MASK_USER;
274 if (strchr(mask_str, 'k'))
275 event_mask[nr_counters] |= EVENT_MASK_KERNEL;
276 case 2:
277 return EID(type, id);
278
279 default:
280 break;
281 }
Ingo Molnarddcacfa2009-04-20 15:37:32 +0200282
283 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
284 if (!strncmp(str, event_symbols[i].symbol,
285 strlen(event_symbols[i].symbol)))
286 return event_symbols[i].event;
287 }
288
289 return ~0ULL;
290}
291
292static int parse_events(char *str)
293{
294 __u64 config;
295
296again:
297 if (nr_counters == MAX_COUNTERS)
298 return -1;
299
300 config = match_event_symbols(str);
301 if (config == ~0ULL)
302 return -1;
303
304 event_id[nr_counters] = config;
305 nr_counters++;
306
307 str = strstr(str, ",");
308 if (str) {
309 str++;
310 goto again;
311 }
312
313 return 0;
314}
315
316
/*
 * perfstat
 */

/*
 * Global 1000000-byte buffer with external linkage.  Nothing in the
 * visible code reads or writes it.
 */
char fault_here[1000 * 1000];
322
323static void create_perfstat_counter(int counter)
324{
325 struct perf_counter_hw_event hw_event;
326
327 memset(&hw_event, 0, sizeof(hw_event));
328 hw_event.config = event_id[counter];
329 hw_event.record_type = 0;
330 hw_event.nmi = 0;
Peter Zijlstra16c8a102009-05-05 17:50:27 +0200331 hw_event.exclude_kernel = event_mask[counter] & EVENT_MASK_KERNEL;
332 hw_event.exclude_user = event_mask[counter] & EVENT_MASK_USER;
333
Ingo Molnarddcacfa2009-04-20 15:37:32 +0200334 if (scale)
335 hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
336 PERF_FORMAT_TOTAL_TIME_RUNNING;
337
338 if (system_wide) {
339 int cpu;
340 for (cpu = 0; cpu < nr_cpus; cpu ++) {
341 fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
342 if (fd[cpu][counter] < 0) {
343 printf("perfstat error: syscall returned with %d (%s)\n",
344 fd[cpu][counter], strerror(errno));
345 exit(-1);
346 }
347 }
348 } else {
349 hw_event.inherit = 1;
350 hw_event.disabled = 1;
351
352 fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
353 if (fd[0][counter] < 0) {
354 printf("perfstat error: syscall returned with %d (%s)\n",
355 fd[0][counter], strerror(errno));
356 exit(-1);
357 }
358 }
359}
360
361int do_perfstat(int argc, char *argv[])
362{
363 unsigned long long t0, t1;
364 int counter;
365 ssize_t res;
366 int status;
367 int pid;
368
369 if (!system_wide)
370 nr_cpus = 1;
371
372 for (counter = 0; counter < nr_counters; counter++)
373 create_perfstat_counter(counter);
374
375 argc -= optind;
376 argv += optind;
377
378 if (!argc)
379 display_help();
380
381 /*
382 * Enable counters and exec the command:
383 */
384 t0 = rdclock();
385 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
386
387 if ((pid = fork()) < 0)
388 perror("failed to fork");
389 if (!pid) {
390 if (execvp(argv[0], argv)) {
391 perror(argv[0]);
392 exit(-1);
393 }
394 }
395 while (wait(&status) >= 0)
396 ;
397 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
398 t1 = rdclock();
399
400 fflush(stdout);
401
402 fprintf(stderr, "\n");
403 fprintf(stderr, " Performance counter stats for \'%s\':\n",
404 argv[0]);
405 fprintf(stderr, "\n");
406
407 for (counter = 0; counter < nr_counters; counter++) {
408 int cpu, nv;
409 __u64 count[3], single_count[3];
410 int scaled;
411
412 count[0] = count[1] = count[2] = 0;
413 nv = scale ? 3 : 1;
414 for (cpu = 0; cpu < nr_cpus; cpu ++) {
415 res = read(fd[cpu][counter],
416 single_count, nv * sizeof(__u64));
417 assert(res == nv * sizeof(__u64));
418
419 count[0] += single_count[0];
420 if (scale) {
421 count[1] += single_count[1];
422 count[2] += single_count[2];
423 }
424 }
425
426 scaled = 0;
427 if (scale) {
428 if (count[2] == 0) {
429 fprintf(stderr, " %14s %-20s\n",
430 "<not counted>", event_name(counter));
431 continue;
432 }
433 if (count[2] < count[1]) {
434 scaled = 1;
435 count[0] = (unsigned long long)
436 ((double)count[0] * count[1] / count[2] + 0.5);
437 }
438 }
439
440 if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
441 event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
442
443 double msecs = (double)count[0] / 1000000;
444
445 fprintf(stderr, " %14.6f %-20s (msecs)",
446 msecs, event_name(counter));
447 } else {
448 fprintf(stderr, " %14Ld %-20s (events)",
449 count[0], event_name(counter));
450 }
451 if (scaled)
452 fprintf(stderr, " (scaled from %.2f%%)",
453 (double) count[2] / count[1] * 100);
454 fprintf(stderr, "\n");
455 }
456 fprintf(stderr, "\n");
457 fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
458 (double)(t1-t0)/1e6);
459 fprintf(stderr, "\n");
460
461 return 0;
462}
463
464static void process_options(int argc, char **argv)
465{
466 int error = 0, counter;
467
468 for (;;) {
469 int option_index = 0;
470 /** Options for getopt */
471 static struct option long_options[] = {
472 {"count", required_argument, NULL, 'c'},
473 {"cpu", required_argument, NULL, 'C'},
474 {"delay", required_argument, NULL, 'd'},
475 {"dump_symtab", no_argument, NULL, 'D'},
476 {"event", required_argument, NULL, 'e'},
477 {"filter", required_argument, NULL, 'f'},
478 {"group", required_argument, NULL, 'g'},
479 {"help", no_argument, NULL, 'h'},
480 {"nmi", required_argument, NULL, 'n'},
481 {"munmap_info", no_argument, NULL, 'U'},
482 {"pid", required_argument, NULL, 'p'},
483 {"realtime", required_argument, NULL, 'r'},
484 {"scale", no_argument, NULL, 'l'},
485 {"symbol", required_argument, NULL, 's'},
486 {"stat", no_argument, NULL, 'S'},
487 {"vmlinux", required_argument, NULL, 'x'},
488 {"zero", no_argument, NULL, 'z'},
489 {NULL, 0, NULL, 0 }
490 };
491 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
492 long_options, &option_index);
493 if (c == -1)
494 break;
495
496 switch (c) {
497 case 'a': system_wide = 1; break;
498 case 'c': default_interval = atoi(optarg); break;
499 case 'C':
500 /* CPU and PID are mutually exclusive */
501 if (tid != -1) {
502 printf("WARNING: CPU switch overriding PID\n");
503 sleep(1);
504 tid = -1;
505 }
506 profile_cpu = atoi(optarg); break;
507
508 case 'e': error = parse_events(optarg); break;
509
510 case 'g': group = atoi(optarg); break;
511 case 'h': display_help(); break;
512 case 'l': scale = 1; break;
513 case 'n': nmi = atoi(optarg); break;
514 case 'p':
515 /* CPU and PID are mutually exclusive */
516 if (profile_cpu != -1) {
517 printf("WARNING: PID switch overriding CPU\n");
518 sleep(1);
519 profile_cpu = -1;
520 }
521 tid = atoi(optarg); break;
522 case 'z': zero = 1; break;
523 default: error = 1; break;
524 }
525 }
526 if (error)
527 display_help();
528
529 if (!nr_counters) {
530 nr_counters = 8;
531 }
532
533 for (counter = 0; counter < nr_counters; counter++) {
534 if (event_count[counter])
535 continue;
536
537 event_count[counter] = default_interval;
538 }
539}
540
/*
 * Signal handler that deliberately does nothing.  Installing it (rather
 * than blocking the signal) lets the signal be delivered to this
 * process without any effect.
 */
static void skip_signal(int signo)
{
	(void)signo;
}
544
Ingo Molnarddcacfa2009-04-20 15:37:32 +0200545int cmd_stat(int argc, char **argv, const char *prefix)
546{
Ingo Molnar58d7e992009-05-15 11:03:23 +0200547 sigset_t blocked;
548
Ingo Molnarddcacfa2009-04-20 15:37:32 +0200549 page_size = sysconf(_SC_PAGE_SIZE);
550
551 process_options(argc, argv);
552
553 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
554 assert(nr_cpus <= MAX_NR_CPUS);
555 assert(nr_cpus >= 0);
556
Ingo Molnar58d7e992009-05-15 11:03:23 +0200557 /*
558 * We dont want to block the signals - that would cause
559 * child tasks to inherit that and Ctrl-C would not work.
560 * What we want is for Ctrl-C to work in the exec()-ed
561 * task, but being ignored by perf stat itself:
562 */
563 signal(SIGINT, skip_signal);
564 signal(SIGALRM, skip_signal);
565 signal(SIGABRT, skip_signal);
566
Ingo Molnarddcacfa2009-04-20 15:37:32 +0200567 return do_perfstat(argc, argv);
568}