blob: 533f71570736ce368a13301056c748d6aa74706c [file] [log] [blame]
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001/*
Ingo Molnar57c0c152009-09-21 12:20:38 +02002 * Performance events core code:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
Ingo Molnar57c0c152009-09-21 12:20:38 +02009 * For licensing details see kernel-base/COPYING
Ingo Molnarcdd6c482009-09-21 12:02:48 +020010 */
11
12#include <linux/fs.h>
13#include <linux/mm.h>
14#include <linux/cpu.h>
15#include <linux/smp.h>
Peter Zijlstra2e80a822010-11-17 23:17:36 +010016#include <linux/idr.h>
Ingo Molnarcdd6c482009-09-21 12:02:48 +020017#include <linux/file.h>
18#include <linux/poll.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090019#include <linux/slab.h>
Frederic Weisbecker76e1d902010-04-05 15:35:57 +020020#include <linux/hash.h>
Ingo Molnarcdd6c482009-09-21 12:02:48 +020021#include <linux/sysfs.h>
22#include <linux/dcache.h>
23#include <linux/percpu.h>
24#include <linux/ptrace.h>
Peter Zijlstrac2774432010-12-08 15:29:02 +010025#include <linux/reboot.h>
Ingo Molnarcdd6c482009-09-21 12:02:48 +020026#include <linux/vmstat.h>
Peter Zijlstraabe43402010-11-17 23:17:37 +010027#include <linux/device.h>
Peter Zijlstra906010b2009-09-21 16:08:49 +020028#include <linux/vmalloc.h>
Ingo Molnarcdd6c482009-09-21 12:02:48 +020029#include <linux/hardirq.h>
30#include <linux/rculist.h>
31#include <linux/uaccess.h>
32#include <linux/syscalls.h>
33#include <linux/anon_inodes.h>
34#include <linux/kernel_stat.h>
35#include <linux/perf_event.h>
Li Zefan6fb29152009-10-15 11:21:42 +080036#include <linux/ftrace_event.h>
Jason Wessel3c502e72010-11-04 17:33:01 -050037#include <linux/hw_breakpoint.h>
Ingo Molnarcdd6c482009-09-21 12:02:48 +020038
39#include <asm/irq_regs.h>
40
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +010041struct remote_function_call {
42 struct task_struct *p;
43 int (*func)(void *info);
44 void *info;
45 int ret;
46};
47
48static void remote_function(void *data)
49{
50 struct remote_function_call *tfc = data;
51 struct task_struct *p = tfc->p;
52
53 if (p) {
54 tfc->ret = -EAGAIN;
55 if (task_cpu(p) != smp_processor_id() || !task_curr(p))
56 return;
57 }
58
59 tfc->ret = tfc->func(tfc->info);
60}
61
62/**
63 * task_function_call - call a function on the cpu on which a task runs
64 * @p: the task to evaluate
65 * @func: the function to be called
66 * @info: the function call argument
67 *
68 * Calls the function @func when the task is currently running. This might
69 * be on the current CPU, which just calls the function directly
70 *
71 * returns: @func return value, or
72 * -ESRCH - when the process isn't running
73 * -EAGAIN - when the process moved away
74 */
75static int
76task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
77{
78 struct remote_function_call data = {
79 .p = p,
80 .func = func,
81 .info = info,
82 .ret = -ESRCH, /* No such (running) process */
83 };
84
85 if (task_curr(p))
86 smp_call_function_single(task_cpu(p), remote_function, &data, 1);
87
88 return data.ret;
89}
90
91/**
92 * cpu_function_call - call a function on the cpu
93 * @func: the function to be called
94 * @info: the function call argument
95 *
96 * Calls the function @func on the remote cpu.
97 *
98 * returns: @func return value or -ENXIO when the cpu is offline
99 */
100static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
101{
102 struct remote_function_call data = {
103 .p = NULL,
104 .func = func,
105 .info = info,
106 .ret = -ENXIO, /* No such CPU */
107 };
108
109 smp_call_function_single(cpu, remote_function, &data, 1);
110
111 return data.ret;
112}
113
Stephane Eraniane5d13672011-02-14 11:20:01 +0200114#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
115 PERF_FLAG_FD_OUTPUT |\
116 PERF_FLAG_PID_CGROUP)
117
Stephane Eranian0b3fcf12011-01-03 18:20:01 +0200118enum event_type_t {
119 EVENT_FLEXIBLE = 0x1,
120 EVENT_PINNED = 0x2,
121 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
122};
123
Stephane Eraniane5d13672011-02-14 11:20:01 +0200124/*
125 * perf_sched_events : >0 events exist
126 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
127 */
128atomic_t perf_sched_events __read_mostly;
129static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
130
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200131static atomic_t nr_mmap_events __read_mostly;
132static atomic_t nr_comm_events __read_mostly;
133static atomic_t nr_task_events __read_mostly;
134
Peter Zijlstra108b02c2010-09-06 14:32:03 +0200135static LIST_HEAD(pmus);
136static DEFINE_MUTEX(pmus_lock);
137static struct srcu_struct pmus_srcu;
138
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200139/*
140 * perf event paranoia level:
141 * -1 - not paranoid at all
142 * 0 - disallow raw tracepoint access for unpriv
143 * 1 - disallow cpu events for unpriv
144 * 2 - disallow kernel profiling for unpriv
145 */
146int sysctl_perf_event_paranoid __read_mostly = 1;
147
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200148int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
149
150/*
151 * max perf event sample rate
152 */
Peter Zijlstra163ec432011-02-16 11:22:34 +0100153#define DEFAULT_MAX_SAMPLE_RATE 100000
154int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
155static int max_samples_per_tick __read_mostly =
156 DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
157
158int perf_proc_update_handler(struct ctl_table *table, int write,
159 void __user *buffer, size_t *lenp,
160 loff_t *ppos)
161{
162 int ret = proc_dointvec(table, write, buffer, lenp, ppos);
163
164 if (ret || !write)
165 return ret;
166
167 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
168
169 return 0;
170}
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200171
172static atomic64_t perf_event_id;
173
Stephane Eranian0b3fcf12011-01-03 18:20:01 +0200174static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
175 enum event_type_t event_type);
176
177static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
Stephane Eraniane5d13672011-02-14 11:20:01 +0200178 enum event_type_t event_type,
179 struct task_struct *task);
180
181static void update_context_time(struct perf_event_context *ctx);
182static u64 perf_event_time(struct perf_event *event);
Stephane Eranian0b3fcf12011-01-03 18:20:01 +0200183
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200184void __weak perf_event_print_debug(void) { }
185
Matt Fleming84c79912010-10-03 21:41:13 +0100186extern __weak const char *perf_pmu_name(void)
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200187{
Matt Fleming84c79912010-10-03 21:41:13 +0100188 return "pmu";
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200189}
190
Stephane Eranian0b3fcf12011-01-03 18:20:01 +0200191static inline u64 perf_clock(void)
192{
193 return local_clock();
194}
195
Stephane Eraniane5d13672011-02-14 11:20:01 +0200196static inline struct perf_cpu_context *
197__get_cpu_context(struct perf_event_context *ctx)
198{
199 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
200}
201
202#ifdef CONFIG_CGROUP_PERF
203
Stephane Eranian3f7cce32011-02-18 14:40:01 +0200204/*
205 * Must ensure cgroup is pinned (css_get) before calling
206 * this function. In other words, we cannot call this function
207 * if there is no cgroup event for the current CPU context.
208 */
Stephane Eraniane5d13672011-02-14 11:20:01 +0200209static inline struct perf_cgroup *
210perf_cgroup_from_task(struct task_struct *task)
211{
212 return container_of(task_subsys_state(task, perf_subsys_id),
213 struct perf_cgroup, css);
214}
215
216static inline bool
217perf_cgroup_match(struct perf_event *event)
218{
219 struct perf_event_context *ctx = event->ctx;
220 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
221
222 return !event->cgrp || event->cgrp == cpuctx->cgrp;
223}
224
225static inline void perf_get_cgroup(struct perf_event *event)
226{
227 css_get(&event->cgrp->css);
228}
229
230static inline void perf_put_cgroup(struct perf_event *event)
231{
232 css_put(&event->cgrp->css);
233}
234
235static inline void perf_detach_cgroup(struct perf_event *event)
236{
237 perf_put_cgroup(event);
238 event->cgrp = NULL;
239}
240
241static inline int is_cgroup_event(struct perf_event *event)
242{
243 return event->cgrp != NULL;
244}
245
246static inline u64 perf_cgroup_event_time(struct perf_event *event)
247{
248 struct perf_cgroup_info *t;
249
250 t = per_cpu_ptr(event->cgrp->info, event->cpu);
251 return t->time;
252}
253
254static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
255{
256 struct perf_cgroup_info *info;
257 u64 now;
258
259 now = perf_clock();
260
261 info = this_cpu_ptr(cgrp->info);
262
263 info->time += now - info->timestamp;
264 info->timestamp = now;
265}
266
267static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
268{
269 struct perf_cgroup *cgrp_out = cpuctx->cgrp;
270 if (cgrp_out)
271 __update_cgrp_time(cgrp_out);
272}
273
274static inline void update_cgrp_time_from_event(struct perf_event *event)
275{
Stephane Eranian3f7cce32011-02-18 14:40:01 +0200276 struct perf_cgroup *cgrp;
277
Stephane Eraniane5d13672011-02-14 11:20:01 +0200278 /*
Stephane Eranian3f7cce32011-02-18 14:40:01 +0200279 * ensure we access cgroup data only when needed and
280 * when we know the cgroup is pinned (css_get)
Stephane Eraniane5d13672011-02-14 11:20:01 +0200281 */
Stephane Eranian3f7cce32011-02-18 14:40:01 +0200282 if (!is_cgroup_event(event))
Stephane Eraniane5d13672011-02-14 11:20:01 +0200283 return;
284
Stephane Eranian3f7cce32011-02-18 14:40:01 +0200285 cgrp = perf_cgroup_from_task(current);
286 /*
287 * Do not update time when cgroup is not active
288 */
289 if (cgrp == event->cgrp)
290 __update_cgrp_time(event->cgrp);
Stephane Eraniane5d13672011-02-14 11:20:01 +0200291}
292
293static inline void
Stephane Eranian3f7cce32011-02-18 14:40:01 +0200294perf_cgroup_set_timestamp(struct task_struct *task,
295 struct perf_event_context *ctx)
Stephane Eraniane5d13672011-02-14 11:20:01 +0200296{
297 struct perf_cgroup *cgrp;
298 struct perf_cgroup_info *info;
299
Stephane Eranian3f7cce32011-02-18 14:40:01 +0200300 /*
301 * ctx->lock held by caller
302 * ensure we do not access cgroup data
303 * unless we have the cgroup pinned (css_get)
304 */
305 if (!task || !ctx->nr_cgroups)
Stephane Eraniane5d13672011-02-14 11:20:01 +0200306 return;
307
308 cgrp = perf_cgroup_from_task(task);
309 info = this_cpu_ptr(cgrp->info);
Stephane Eranian3f7cce32011-02-18 14:40:01 +0200310 info->timestamp = ctx->timestamp;
Stephane Eraniane5d13672011-02-14 11:20:01 +0200311}
312
313#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
314#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
315
316/*
317 * reschedule events based on the cgroup constraint of task.
318 *
319 * mode SWOUT : schedule out everything
320 * mode SWIN : schedule in based on cgroup for next
321 */
322void perf_cgroup_switch(struct task_struct *task, int mode)
323{
324 struct perf_cpu_context *cpuctx;
325 struct pmu *pmu;
326 unsigned long flags;
327
328 /*
329 * disable interrupts to avoid geting nr_cgroup
330 * changes via __perf_event_disable(). Also
331 * avoids preemption.
332 */
333 local_irq_save(flags);
334
335 /*
336 * we reschedule only in the presence of cgroup
337 * constrained events.
338 */
339 rcu_read_lock();
340
341 list_for_each_entry_rcu(pmu, &pmus, entry) {
342
343 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
344
345 perf_pmu_disable(cpuctx->ctx.pmu);
346
347 /*
348 * perf_cgroup_events says at least one
349 * context on this CPU has cgroup events.
350 *
351 * ctx->nr_cgroups reports the number of cgroup
352 * events for a context.
353 */
354 if (cpuctx->ctx.nr_cgroups > 0) {
355
356 if (mode & PERF_CGROUP_SWOUT) {
357 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
358 /*
359 * must not be done before ctxswout due
360 * to event_filter_match() in event_sched_out()
361 */
362 cpuctx->cgrp = NULL;
363 }
364
365 if (mode & PERF_CGROUP_SWIN) {
366 /* set cgrp before ctxsw in to
367 * allow event_filter_match() to not
368 * have to pass task around
369 */
370 cpuctx->cgrp = perf_cgroup_from_task(task);
371 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
372 }
373 }
374
375 perf_pmu_enable(cpuctx->ctx.pmu);
376 }
377
378 rcu_read_unlock();
379
380 local_irq_restore(flags);
381}
382
383static inline void perf_cgroup_sched_out(struct task_struct *task)
384{
385 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
386}
387
388static inline void perf_cgroup_sched_in(struct task_struct *task)
389{
390 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
391}
392
393static inline int perf_cgroup_connect(int fd, struct perf_event *event,
394 struct perf_event_attr *attr,
395 struct perf_event *group_leader)
396{
397 struct perf_cgroup *cgrp;
398 struct cgroup_subsys_state *css;
399 struct file *file;
400 int ret = 0, fput_needed;
401
402 file = fget_light(fd, &fput_needed);
403 if (!file)
404 return -EBADF;
405
406 css = cgroup_css_from_dir(file, perf_subsys_id);
Li Zefan3db272c2011-03-03 14:25:37 +0800407 if (IS_ERR(css)) {
408 ret = PTR_ERR(css);
409 goto out;
410 }
Stephane Eraniane5d13672011-02-14 11:20:01 +0200411
412 cgrp = container_of(css, struct perf_cgroup, css);
413 event->cgrp = cgrp;
414
Li Zefanf75e18c2011-03-03 14:25:50 +0800415 /* must be done before we fput() the file */
416 perf_get_cgroup(event);
417
Stephane Eraniane5d13672011-02-14 11:20:01 +0200418 /*
419 * all events in a group must monitor
420 * the same cgroup because a task belongs
421 * to only one perf cgroup at a time
422 */
423 if (group_leader && group_leader->cgrp != cgrp) {
424 perf_detach_cgroup(event);
425 ret = -EINVAL;
Stephane Eraniane5d13672011-02-14 11:20:01 +0200426 }
Li Zefan3db272c2011-03-03 14:25:37 +0800427out:
Stephane Eraniane5d13672011-02-14 11:20:01 +0200428 fput_light(file, fput_needed);
429 return ret;
430}
431
432static inline void
433perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
434{
435 struct perf_cgroup_info *t;
436 t = per_cpu_ptr(event->cgrp->info, event->cpu);
437 event->shadow_ctx_time = now - t->timestamp;
438}
439
440static inline void
441perf_cgroup_defer_enabled(struct perf_event *event)
442{
443 /*
444 * when the current task's perf cgroup does not match
445 * the event's, we need to remember to call the
446 * perf_mark_enable() function the first time a task with
447 * a matching perf cgroup is scheduled in.
448 */
449 if (is_cgroup_event(event) && !perf_cgroup_match(event))
450 event->cgrp_defer_enabled = 1;
451}
452
453static inline void
454perf_cgroup_mark_enabled(struct perf_event *event,
455 struct perf_event_context *ctx)
456{
457 struct perf_event *sub;
458 u64 tstamp = perf_event_time(event);
459
460 if (!event->cgrp_defer_enabled)
461 return;
462
463 event->cgrp_defer_enabled = 0;
464
465 event->tstamp_enabled = tstamp - event->total_time_enabled;
466 list_for_each_entry(sub, &event->sibling_list, group_entry) {
467 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
468 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
469 sub->cgrp_defer_enabled = 0;
470 }
471 }
472}
473#else /* !CONFIG_CGROUP_PERF */
474
475static inline bool
476perf_cgroup_match(struct perf_event *event)
477{
478 return true;
479}
480
481static inline void perf_detach_cgroup(struct perf_event *event)
482{}
483
484static inline int is_cgroup_event(struct perf_event *event)
485{
486 return 0;
487}
488
489static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
490{
491 return 0;
492}
493
494static inline void update_cgrp_time_from_event(struct perf_event *event)
495{
496}
497
498static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
499{
500}
501
502static inline void perf_cgroup_sched_out(struct task_struct *task)
503{
504}
505
506static inline void perf_cgroup_sched_in(struct task_struct *task)
507{
508}
509
510static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
511 struct perf_event_attr *attr,
512 struct perf_event *group_leader)
513{
514 return -EINVAL;
515}
516
517static inline void
Stephane Eranian3f7cce32011-02-18 14:40:01 +0200518perf_cgroup_set_timestamp(struct task_struct *task,
519 struct perf_event_context *ctx)
Stephane Eraniane5d13672011-02-14 11:20:01 +0200520{
521}
522
523void
524perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
525{
526}
527
528static inline void
529perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
530{
531}
532
533static inline u64 perf_cgroup_event_time(struct perf_event *event)
534{
535 return 0;
536}
537
538static inline void
539perf_cgroup_defer_enabled(struct perf_event *event)
540{
541}
542
543static inline void
544perf_cgroup_mark_enabled(struct perf_event *event,
545 struct perf_event_context *ctx)
546{
547}
548#endif
549
Peter Zijlstra33696fc2010-06-14 08:49:00 +0200550void perf_pmu_disable(struct pmu *pmu)
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200551{
Peter Zijlstra33696fc2010-06-14 08:49:00 +0200552 int *count = this_cpu_ptr(pmu->pmu_disable_count);
553 if (!(*count)++)
554 pmu->pmu_disable(pmu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200555}
556
Peter Zijlstra33696fc2010-06-14 08:49:00 +0200557void perf_pmu_enable(struct pmu *pmu)
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200558{
Peter Zijlstra33696fc2010-06-14 08:49:00 +0200559 int *count = this_cpu_ptr(pmu->pmu_disable_count);
560 if (!--(*count))
561 pmu->pmu_enable(pmu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200562}
563
Peter Zijlstrae9d2b062010-09-17 11:28:50 +0200564static DEFINE_PER_CPU(struct list_head, rotation_list);
565
566/*
567 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
568 * because they're strictly cpu affine and rotate_start is called with IRQs
569 * disabled, while rotate_context is called from IRQ context.
570 */
Peter Zijlstra108b02c2010-09-06 14:32:03 +0200571static void perf_pmu_rotate_start(struct pmu *pmu)
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +0200572{
Peter Zijlstra108b02c2010-09-06 14:32:03 +0200573 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
Peter Zijlstrae9d2b062010-09-17 11:28:50 +0200574 struct list_head *head = &__get_cpu_var(rotation_list);
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +0200575
Peter Zijlstrae9d2b062010-09-17 11:28:50 +0200576 WARN_ON(!irqs_disabled());
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +0200577
Peter Zijlstrae9d2b062010-09-17 11:28:50 +0200578 if (list_empty(&cpuctx->rotation_list))
579 list_add(&cpuctx->rotation_list, head);
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200580}
581
582static void get_ctx(struct perf_event_context *ctx)
583{
584 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
585}
586
587static void free_ctx(struct rcu_head *head)
588{
589 struct perf_event_context *ctx;
590
591 ctx = container_of(head, struct perf_event_context, rcu_head);
592 kfree(ctx);
593}
594
595static void put_ctx(struct perf_event_context *ctx)
596{
597 if (atomic_dec_and_test(&ctx->refcount)) {
598 if (ctx->parent_ctx)
599 put_ctx(ctx->parent_ctx);
600 if (ctx->task)
601 put_task_struct(ctx->task);
602 call_rcu(&ctx->rcu_head, free_ctx);
603 }
604}
605
606static void unclone_ctx(struct perf_event_context *ctx)
607{
608 if (ctx->parent_ctx) {
609 put_ctx(ctx->parent_ctx);
610 ctx->parent_ctx = NULL;
611 }
612}
613
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -0200614static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
615{
616 /*
617 * only top level events have the pid namespace they were created in
618 */
619 if (event->parent)
620 event = event->parent;
621
622 return task_tgid_nr_ns(p, event->ns);
623}
624
625static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
626{
627 /*
628 * only top level events have the pid namespace they were created in
629 */
630 if (event->parent)
631 event = event->parent;
632
633 return task_pid_nr_ns(p, event->ns);
634}
635
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200636/*
637 * If we inherit events we want to return the parent event id
638 * to userspace.
639 */
640static u64 primary_event_id(struct perf_event *event)
641{
642 u64 id = event->id;
643
644 if (event->parent)
645 id = event->parent->id;
646
647 return id;
648}
649
650/*
651 * Get the perf_event_context for a task and lock it.
652 * This has to cope with with the fact that until it is locked,
653 * the context could get moved to another task.
654 */
655static struct perf_event_context *
Peter Zijlstra8dc85d52010-09-02 16:50:03 +0200656perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200657{
658 struct perf_event_context *ctx;
659
660 rcu_read_lock();
Peter Zijlstra9ed60602010-06-11 17:36:35 +0200661retry:
Peter Zijlstra8dc85d52010-09-02 16:50:03 +0200662 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200663 if (ctx) {
664 /*
665 * If this context is a clone of another, it might
666 * get swapped for another underneath us by
667 * perf_event_task_sched_out, though the
668 * rcu_read_lock() protects us from any context
669 * getting freed. Lock the context and check if it
670 * got swapped before we could get the lock, and retry
671 * if so. If we locked the right context, then it
672 * can't get swapped on us any more.
673 */
Thomas Gleixnere625cce12009-11-17 18:02:06 +0100674 raw_spin_lock_irqsave(&ctx->lock, *flags);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +0200675 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
Thomas Gleixnere625cce12009-11-17 18:02:06 +0100676 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200677 goto retry;
678 }
679
680 if (!atomic_inc_not_zero(&ctx->refcount)) {
Thomas Gleixnere625cce12009-11-17 18:02:06 +0100681 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200682 ctx = NULL;
683 }
684 }
685 rcu_read_unlock();
686 return ctx;
687}
688
689/*
690 * Get the context for a task and increment its pin_count so it
691 * can't get swapped to another task. This also increments its
692 * reference count so that the context can't get freed.
693 */
Peter Zijlstra8dc85d52010-09-02 16:50:03 +0200694static struct perf_event_context *
695perf_pin_task_context(struct task_struct *task, int ctxn)
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200696{
697 struct perf_event_context *ctx;
698 unsigned long flags;
699
Peter Zijlstra8dc85d52010-09-02 16:50:03 +0200700 ctx = perf_lock_task_context(task, ctxn, &flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200701 if (ctx) {
702 ++ctx->pin_count;
Thomas Gleixnere625cce12009-11-17 18:02:06 +0100703 raw_spin_unlock_irqrestore(&ctx->lock, flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200704 }
705 return ctx;
706}
707
708static void perf_unpin_context(struct perf_event_context *ctx)
709{
710 unsigned long flags;
711
Thomas Gleixnere625cce12009-11-17 18:02:06 +0100712 raw_spin_lock_irqsave(&ctx->lock, flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200713 --ctx->pin_count;
Thomas Gleixnere625cce12009-11-17 18:02:06 +0100714 raw_spin_unlock_irqrestore(&ctx->lock, flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200715}
716
Peter Zijlstraf67218c2009-11-23 11:37:27 +0100717/*
718 * Update the record of the current time in a context.
719 */
720static void update_context_time(struct perf_event_context *ctx)
721{
722 u64 now = perf_clock();
723
724 ctx->time += now - ctx->timestamp;
725 ctx->timestamp = now;
726}
727
Stephane Eranian41587552011-01-03 18:20:01 +0200728static u64 perf_event_time(struct perf_event *event)
729{
730 struct perf_event_context *ctx = event->ctx;
Stephane Eraniane5d13672011-02-14 11:20:01 +0200731
732 if (is_cgroup_event(event))
733 return perf_cgroup_event_time(event);
734
Stephane Eranian41587552011-01-03 18:20:01 +0200735 return ctx ? ctx->time : 0;
736}
737
Peter Zijlstraf67218c2009-11-23 11:37:27 +0100738/*
739 * Update the total_time_enabled and total_time_running fields for a event.
740 */
741static void update_event_times(struct perf_event *event)
742{
743 struct perf_event_context *ctx = event->ctx;
744 u64 run_end;
745
746 if (event->state < PERF_EVENT_STATE_INACTIVE ||
747 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
748 return;
Stephane Eraniane5d13672011-02-14 11:20:01 +0200749 /*
750 * in cgroup mode, time_enabled represents
751 * the time the event was enabled AND active
752 * tasks were in the monitored cgroup. This is
753 * independent of the activity of the context as
754 * there may be a mix of cgroup and non-cgroup events.
755 *
756 * That is why we treat cgroup events differently
757 * here.
758 */
759 if (is_cgroup_event(event))
Stephane Eranian41587552011-01-03 18:20:01 +0200760 run_end = perf_event_time(event);
Stephane Eraniane5d13672011-02-14 11:20:01 +0200761 else if (ctx->is_active)
762 run_end = ctx->time;
Peter Zijlstraacd1d7c2009-11-23 15:00:36 +0100763 else
764 run_end = event->tstamp_stopped;
765
766 event->total_time_enabled = run_end - event->tstamp_enabled;
Peter Zijlstraf67218c2009-11-23 11:37:27 +0100767
768 if (event->state == PERF_EVENT_STATE_INACTIVE)
769 run_end = event->tstamp_stopped;
770 else
Stephane Eranian41587552011-01-03 18:20:01 +0200771 run_end = perf_event_time(event);
Peter Zijlstraf67218c2009-11-23 11:37:27 +0100772
773 event->total_time_running = run_end - event->tstamp_running;
Stephane Eraniane5d13672011-02-14 11:20:01 +0200774
Peter Zijlstraf67218c2009-11-23 11:37:27 +0100775}
776
Peter Zijlstra96c21a42010-05-11 16:19:10 +0200777/*
778 * Update total_time_enabled and total_time_running for all events in a group.
779 */
780static void update_group_times(struct perf_event *leader)
781{
782 struct perf_event *event;
783
784 update_event_times(leader);
785 list_for_each_entry(event, &leader->sibling_list, group_entry)
786 update_event_times(event);
787}
788
Frederic Weisbecker889ff012010-01-09 20:04:47 +0100789static struct list_head *
790ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
791{
792 if (event->attr.pinned)
793 return &ctx->pinned_groups;
794 else
795 return &ctx->flexible_groups;
796}
797
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200798/*
799 * Add a event from the lists for its context.
800 * Must be called with ctx->mutex and ctx->lock held.
801 */
802static void
803list_add_event(struct perf_event *event, struct perf_event_context *ctx)
804{
Peter Zijlstra8a495422010-05-27 15:47:49 +0200805 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
806 event->attach_state |= PERF_ATTACH_CONTEXT;
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200807
808 /*
Peter Zijlstra8a495422010-05-27 15:47:49 +0200809 * If we're a stand alone event or group leader, we go to the context
810 * list, group events are kept attached to the group so that
811 * perf_group_detach can, at all times, locate all siblings.
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200812 */
Peter Zijlstra8a495422010-05-27 15:47:49 +0200813 if (event->group_leader == event) {
Frederic Weisbecker889ff012010-01-09 20:04:47 +0100814 struct list_head *list;
815
Frederic Weisbeckerd6f962b2010-01-10 01:25:51 +0100816 if (is_software_event(event))
817 event->group_flags |= PERF_GROUP_SOFTWARE;
818
Frederic Weisbecker889ff012010-01-09 20:04:47 +0100819 list = ctx_group_list(event, ctx);
820 list_add_tail(&event->group_entry, list);
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200821 }
822
Peter Zijlstra08309372011-03-03 11:31:20 +0100823 if (is_cgroup_event(event))
Stephane Eraniane5d13672011-02-14 11:20:01 +0200824 ctx->nr_cgroups++;
Stephane Eraniane5d13672011-02-14 11:20:01 +0200825
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200826 list_add_rcu(&event->event_entry, &ctx->event_list);
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +0200827 if (!ctx->nr_events)
Peter Zijlstra108b02c2010-09-06 14:32:03 +0200828 perf_pmu_rotate_start(ctx->pmu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200829 ctx->nr_events++;
830 if (event->attr.inherit_stat)
831 ctx->nr_stat++;
832}
833
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -0200834/*
835 * Called at perf_event creation and when events are attached/detached from a
836 * group.
837 */
838static void perf_event__read_size(struct perf_event *event)
839{
840 int entry = sizeof(u64); /* value */
841 int size = 0;
842 int nr = 1;
843
844 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
845 size += sizeof(u64);
846
847 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
848 size += sizeof(u64);
849
850 if (event->attr.read_format & PERF_FORMAT_ID)
851 entry += sizeof(u64);
852
853 if (event->attr.read_format & PERF_FORMAT_GROUP) {
854 nr += event->group_leader->nr_siblings;
855 size += sizeof(u64);
856 }
857
858 size += entry * nr;
859 event->read_size = size;
860}
861
862static void perf_event__header_size(struct perf_event *event)
863{
864 struct perf_sample_data *data;
865 u64 sample_type = event->attr.sample_type;
866 u16 size = 0;
867
868 perf_event__read_size(event);
869
870 if (sample_type & PERF_SAMPLE_IP)
871 size += sizeof(data->ip);
872
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -0200873 if (sample_type & PERF_SAMPLE_ADDR)
874 size += sizeof(data->addr);
875
876 if (sample_type & PERF_SAMPLE_PERIOD)
877 size += sizeof(data->period);
878
879 if (sample_type & PERF_SAMPLE_READ)
880 size += event->read_size;
881
882 event->header_size = size;
883}
884
885static void perf_event__id_header_size(struct perf_event *event)
886{
887 struct perf_sample_data *data;
888 u64 sample_type = event->attr.sample_type;
889 u16 size = 0;
890
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -0200891 if (sample_type & PERF_SAMPLE_TID)
892 size += sizeof(data->tid_entry);
893
894 if (sample_type & PERF_SAMPLE_TIME)
895 size += sizeof(data->time);
896
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -0200897 if (sample_type & PERF_SAMPLE_ID)
898 size += sizeof(data->id);
899
900 if (sample_type & PERF_SAMPLE_STREAM_ID)
901 size += sizeof(data->stream_id);
902
903 if (sample_type & PERF_SAMPLE_CPU)
904 size += sizeof(data->cpu_entry);
905
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -0200906 event->id_header_size = size;
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -0200907}
908
Peter Zijlstra8a495422010-05-27 15:47:49 +0200909static void perf_group_attach(struct perf_event *event)
910{
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -0200911 struct perf_event *group_leader = event->group_leader, *pos;
Peter Zijlstra8a495422010-05-27 15:47:49 +0200912
Peter Zijlstra74c33372010-10-15 11:40:29 +0200913 /*
914 * We can have double attach due to group movement in perf_event_open.
915 */
916 if (event->attach_state & PERF_ATTACH_GROUP)
917 return;
918
Peter Zijlstra8a495422010-05-27 15:47:49 +0200919 event->attach_state |= PERF_ATTACH_GROUP;
920
921 if (group_leader == event)
922 return;
923
924 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
925 !is_software_event(event))
926 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
927
928 list_add_tail(&event->group_entry, &group_leader->sibling_list);
929 group_leader->nr_siblings++;
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -0200930
931 perf_event__header_size(group_leader);
932
933 list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
934 perf_event__header_size(pos);
Peter Zijlstra8a495422010-05-27 15:47:49 +0200935}
936
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200937/*
938 * Remove a event from the lists for its context.
939 * Must be called with ctx->mutex and ctx->lock held.
940 */
941static void
942list_del_event(struct perf_event *event, struct perf_event_context *ctx)
943{
Peter Zijlstra8a495422010-05-27 15:47:49 +0200944 /*
945 * We can have double detach due to exit/hot-unplug + close.
946 */
947 if (!(event->attach_state & PERF_ATTACH_CONTEXT))
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200948 return;
Peter Zijlstra8a495422010-05-27 15:47:49 +0200949
950 event->attach_state &= ~PERF_ATTACH_CONTEXT;
951
Peter Zijlstra08309372011-03-03 11:31:20 +0100952 if (is_cgroup_event(event))
Stephane Eraniane5d13672011-02-14 11:20:01 +0200953 ctx->nr_cgroups--;
Stephane Eraniane5d13672011-02-14 11:20:01 +0200954
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200955 ctx->nr_events--;
956 if (event->attr.inherit_stat)
957 ctx->nr_stat--;
958
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200959 list_del_rcu(&event->event_entry);
960
Peter Zijlstra8a495422010-05-27 15:47:49 +0200961 if (event->group_leader == event)
962 list_del_init(&event->group_entry);
Ingo Molnarcdd6c482009-09-21 12:02:48 +0200963
Peter Zijlstra96c21a42010-05-11 16:19:10 +0200964 update_group_times(event);
Stephane Eranianb2e74a22009-11-26 09:24:30 -0800965
966 /*
967 * If event was in error state, then keep it
968 * that way, otherwise bogus counts will be
969 * returned on read(). The only way to get out
970 * of error state is by explicit re-enabling
971 * of the event
972 */
973 if (event->state > PERF_EVENT_STATE_OFF)
974 event->state = PERF_EVENT_STATE_OFF;
Peter Zijlstra050735b2010-05-11 11:51:53 +0200975}
976
Peter Zijlstra8a495422010-05-27 15:47:49 +0200977static void perf_group_detach(struct perf_event *event)
Peter Zijlstra050735b2010-05-11 11:51:53 +0200978{
979 struct perf_event *sibling, *tmp;
Peter Zijlstra8a495422010-05-27 15:47:49 +0200980 struct list_head *list = NULL;
981
982 /*
983 * We can have double detach due to exit/hot-unplug + close.
984 */
985 if (!(event->attach_state & PERF_ATTACH_GROUP))
986 return;
987
988 event->attach_state &= ~PERF_ATTACH_GROUP;
989
990 /*
991 * If this is a sibling, remove it from its group.
992 */
993 if (event->group_leader != event) {
994 list_del_init(&event->group_entry);
995 event->group_leader->nr_siblings--;
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -0200996 goto out;
Peter Zijlstra8a495422010-05-27 15:47:49 +0200997 }
998
999 if (!list_empty(&event->group_entry))
1000 list = &event->group_entry;
Peter Zijlstra2e2af502009-11-23 11:37:25 +01001001
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001002 /*
1003 * If this was a group event with sibling events then
1004 * upgrade the siblings to singleton events by adding them
Peter Zijlstra8a495422010-05-27 15:47:49 +02001005 * to whatever list we are on.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001006 */
1007 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
Peter Zijlstra8a495422010-05-27 15:47:49 +02001008 if (list)
1009 list_move_tail(&sibling->group_entry, list);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001010 sibling->group_leader = sibling;
Frederic Weisbeckerd6f962b2010-01-10 01:25:51 +01001011
1012 /* Inherit group flags from the previous leader */
1013 sibling->group_flags = event->group_flags;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001014 }
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02001015
1016out:
1017 perf_event__header_size(event->group_leader);
1018
1019 list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1020 perf_event__header_size(tmp);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001021}
1022
Stephane Eranianfa66f072010-08-26 16:40:01 +02001023static inline int
1024event_filter_match(struct perf_event *event)
1025{
Stephane Eraniane5d13672011-02-14 11:20:01 +02001026 return (event->cpu == -1 || event->cpu == smp_processor_id())
1027 && perf_cgroup_match(event);
Stephane Eranianfa66f072010-08-26 16:40:01 +02001028}
1029
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02001030static void
1031event_sched_out(struct perf_event *event,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001032 struct perf_cpu_context *cpuctx,
1033 struct perf_event_context *ctx)
1034{
Stephane Eranian41587552011-01-03 18:20:01 +02001035 u64 tstamp = perf_event_time(event);
Stephane Eranianfa66f072010-08-26 16:40:01 +02001036 u64 delta;
1037 /*
1038 * An event which could not be activated because of
1039 * filter mismatch still needs to have its timings
1040 * maintained, otherwise bogus information is return
1041 * via read() for time_enabled, time_running:
1042 */
1043 if (event->state == PERF_EVENT_STATE_INACTIVE
1044 && !event_filter_match(event)) {
Stephane Eraniane5d13672011-02-14 11:20:01 +02001045 delta = tstamp - event->tstamp_stopped;
Stephane Eranianfa66f072010-08-26 16:40:01 +02001046 event->tstamp_running += delta;
Stephane Eranian41587552011-01-03 18:20:01 +02001047 event->tstamp_stopped = tstamp;
Stephane Eranianfa66f072010-08-26 16:40:01 +02001048 }
1049
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001050 if (event->state != PERF_EVENT_STATE_ACTIVE)
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02001051 return;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001052
1053 event->state = PERF_EVENT_STATE_INACTIVE;
1054 if (event->pending_disable) {
1055 event->pending_disable = 0;
1056 event->state = PERF_EVENT_STATE_OFF;
1057 }
Stephane Eranian41587552011-01-03 18:20:01 +02001058 event->tstamp_stopped = tstamp;
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02001059 event->pmu->del(event, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001060 event->oncpu = -1;
1061
1062 if (!is_software_event(event))
1063 cpuctx->active_oncpu--;
1064 ctx->nr_active--;
1065 if (event->attr.exclusive || !cpuctx->active_oncpu)
1066 cpuctx->exclusive = 0;
1067}
1068
1069static void
1070group_sched_out(struct perf_event *group_event,
1071 struct perf_cpu_context *cpuctx,
1072 struct perf_event_context *ctx)
1073{
1074 struct perf_event *event;
Stephane Eranianfa66f072010-08-26 16:40:01 +02001075 int state = group_event->state;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001076
1077 event_sched_out(group_event, cpuctx, ctx);
1078
1079 /*
1080 * Schedule out siblings (if any):
1081 */
1082 list_for_each_entry(event, &group_event->sibling_list, group_entry)
1083 event_sched_out(event, cpuctx, ctx);
1084
Stephane Eranianfa66f072010-08-26 16:40:01 +02001085 if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001086 cpuctx->exclusive = 0;
1087}
1088
1089/*
1090 * Cross CPU call to remove a performance event
1091 *
1092 * We disable the event on the hardware level first. After that we
1093 * remove it from the context list.
1094 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001095static int __perf_remove_from_context(void *info)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001096{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001097 struct perf_event *event = info;
1098 struct perf_event_context *ctx = event->ctx;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02001099 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001100
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001101 raw_spin_lock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001102 event_sched_out(event, cpuctx, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001103 list_del_event(event, ctx);
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001104 raw_spin_unlock(&ctx->lock);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001105
1106 return 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001107}
1108
1109
1110/*
1111 * Remove the event from a task's (or a CPU's) list of events.
1112 *
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001113 * CPU events are removed with a smp call. For task events we only
1114 * call when the task is on a CPU.
1115 *
1116 * If event->ctx is a cloned context, callers must make sure that
1117 * every task struct that event->ctx->task could possibly point to
1118 * remains valid. This is OK when called from perf_release since
1119 * that only calls us on the top-level context, which can't be a clone.
1120 * When called from perf_event_exit_task, it's OK because the
1121 * context has been detached from its task.
1122 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001123static void perf_remove_from_context(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001124{
1125 struct perf_event_context *ctx = event->ctx;
1126 struct task_struct *task = ctx->task;
1127
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001128 lockdep_assert_held(&ctx->mutex);
1129
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001130 if (!task) {
1131 /*
1132 * Per cpu events are removed via an smp call and
André Goddard Rosaaf901ca2009-11-14 13:09:05 -02001133 * the removal is always successful.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001134 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001135 cpu_function_call(event->cpu, __perf_remove_from_context, event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001136 return;
1137 }
1138
1139retry:
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001140 if (!task_function_call(task, __perf_remove_from_context, event))
1141 return;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001142
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001143 raw_spin_lock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001144 /*
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001145 * If we failed to find a running task, but find the context active now
1146 * that we've acquired the ctx->lock, retry.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001147 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001148 if (ctx->is_active) {
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001149 raw_spin_unlock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001150 goto retry;
1151 }
1152
1153 /*
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001154 * Since the task isn't running, its safe to remove the event, us
1155 * holding the ctx->lock ensures the task won't get scheduled in.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001156 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001157 list_del_event(event, ctx);
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001158 raw_spin_unlock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001159}
1160
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001161/*
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001162 * Cross CPU call to disable a performance event
1163 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001164static int __perf_event_disable(void *info)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001165{
1166 struct perf_event *event = info;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001167 struct perf_event_context *ctx = event->ctx;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02001168 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001169
1170 /*
1171 * If this is a per-task event, need to check whether this
1172 * event's task is the current task on this cpu.
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001173 *
1174 * Can trigger due to concurrent perf_event_context_sched_out()
1175 * flipping contexts around.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001176 */
1177 if (ctx->task && cpuctx->task_ctx != ctx)
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001178 return -EINVAL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001179
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001180 raw_spin_lock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001181
1182 /*
1183 * If the event is on, turn it off.
1184 * If it is in error state, leave it in error state.
1185 */
1186 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
1187 update_context_time(ctx);
Stephane Eraniane5d13672011-02-14 11:20:01 +02001188 update_cgrp_time_from_event(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001189 update_group_times(event);
1190 if (event == event->group_leader)
1191 group_sched_out(event, cpuctx, ctx);
1192 else
1193 event_sched_out(event, cpuctx, ctx);
1194 event->state = PERF_EVENT_STATE_OFF;
1195 }
1196
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001197 raw_spin_unlock(&ctx->lock);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001198
1199 return 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001200}
1201
1202/*
1203 * Disable a event.
1204 *
1205 * If event->ctx is a cloned context, callers must make sure that
1206 * every task struct that event->ctx->task could possibly point to
1207 * remains valid. This condition is satisifed when called through
1208 * perf_event_for_each_child or perf_event_for_each because they
1209 * hold the top-level event's child_mutex, so any descendant that
1210 * goes to exit will block in sync_child_event.
1211 * When called from perf_pending_event it's OK because event->ctx
1212 * is the current context on this CPU and preemption is disabled,
1213 * hence we can't get into perf_event_task_sched_out for this context.
1214 */
Frederic Weisbecker44234ad2009-12-09 09:25:48 +01001215void perf_event_disable(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001216{
1217 struct perf_event_context *ctx = event->ctx;
1218 struct task_struct *task = ctx->task;
1219
1220 if (!task) {
1221 /*
1222 * Disable the event on the cpu that it's on
1223 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001224 cpu_function_call(event->cpu, __perf_event_disable, event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001225 return;
1226 }
1227
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001228retry:
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001229 if (!task_function_call(task, __perf_event_disable, event))
1230 return;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001231
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001232 raw_spin_lock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001233 /*
1234 * If the event is still active, we need to retry the cross-call.
1235 */
1236 if (event->state == PERF_EVENT_STATE_ACTIVE) {
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001237 raw_spin_unlock_irq(&ctx->lock);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001238 /*
1239 * Reload the task pointer, it might have been changed by
1240 * a concurrent perf_event_context_sched_out().
1241 */
1242 task = ctx->task;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001243 goto retry;
1244 }
1245
1246 /*
1247 * Since we have the lock this context can't be scheduled
1248 * in, so we can change the state safely.
1249 */
1250 if (event->state == PERF_EVENT_STATE_INACTIVE) {
1251 update_group_times(event);
1252 event->state = PERF_EVENT_STATE_OFF;
1253 }
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001254 raw_spin_unlock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001255}
1256
Stephane Eraniane5d13672011-02-14 11:20:01 +02001257static void perf_set_shadow_time(struct perf_event *event,
1258 struct perf_event_context *ctx,
1259 u64 tstamp)
1260{
1261 /*
1262 * use the correct time source for the time snapshot
1263 *
1264 * We could get by without this by leveraging the
1265 * fact that to get to this function, the caller
1266 * has most likely already called update_context_time()
1267 * and update_cgrp_time_xx() and thus both timestamp
1268 * are identical (or very close). Given that tstamp is,
1269 * already adjusted for cgroup, we could say that:
1270 * tstamp - ctx->timestamp
1271 * is equivalent to
1272 * tstamp - cgrp->timestamp.
1273 *
1274 * Then, in perf_output_read(), the calculation would
1275 * work with no changes because:
1276 * - event is guaranteed scheduled in
1277 * - no scheduled out in between
1278 * - thus the timestamp would be the same
1279 *
1280 * But this is a bit hairy.
1281 *
1282 * So instead, we have an explicit cgroup call to remain
1283 * within the time time source all along. We believe it
1284 * is cleaner and simpler to understand.
1285 */
1286 if (is_cgroup_event(event))
1287 perf_cgroup_set_shadow_time(event, tstamp);
1288 else
1289 event->shadow_ctx_time = tstamp - ctx->timestamp;
1290}
1291
Peter Zijlstra4fe757d2011-02-15 22:26:07 +01001292#define MAX_INTERRUPTS (~0ULL)
1293
1294static void perf_log_throttle(struct perf_event *event, int enable);
1295
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001296static int
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02001297event_sched_in(struct perf_event *event,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001298 struct perf_cpu_context *cpuctx,
Peter Zijlstra6e377382010-02-11 13:21:58 +01001299 struct perf_event_context *ctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001300{
Stephane Eranian41587552011-01-03 18:20:01 +02001301 u64 tstamp = perf_event_time(event);
1302
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001303 if (event->state <= PERF_EVENT_STATE_OFF)
1304 return 0;
1305
1306 event->state = PERF_EVENT_STATE_ACTIVE;
Peter Zijlstra6e377382010-02-11 13:21:58 +01001307 event->oncpu = smp_processor_id();
Peter Zijlstra4fe757d2011-02-15 22:26:07 +01001308
1309 /*
1310 * Unthrottle events, since we scheduled we might have missed several
1311 * ticks already, also for a heavily scheduling task there is little
1312 * guarantee it'll get a tick in a timely manner.
1313 */
1314 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
1315 perf_log_throttle(event, 1);
1316 event->hw.interrupts = 0;
1317 }
1318
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001319 /*
1320 * The new state must be visible before we turn it on in the hardware:
1321 */
1322 smp_wmb();
1323
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02001324 if (event->pmu->add(event, PERF_EF_START)) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001325 event->state = PERF_EVENT_STATE_INACTIVE;
1326 event->oncpu = -1;
1327 return -EAGAIN;
1328 }
1329
Stephane Eranian41587552011-01-03 18:20:01 +02001330 event->tstamp_running += tstamp - event->tstamp_stopped;
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02001331
Stephane Eraniane5d13672011-02-14 11:20:01 +02001332 perf_set_shadow_time(event, ctx, tstamp);
Stephane Eranianeed01522010-10-26 16:08:01 +02001333
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001334 if (!is_software_event(event))
1335 cpuctx->active_oncpu++;
1336 ctx->nr_active++;
1337
1338 if (event->attr.exclusive)
1339 cpuctx->exclusive = 1;
1340
1341 return 0;
1342}
1343
1344static int
1345group_sched_in(struct perf_event *group_event,
1346 struct perf_cpu_context *cpuctx,
Peter Zijlstra6e377382010-02-11 13:21:58 +01001347 struct perf_event_context *ctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001348{
Lin Ming6bde9b62010-04-23 13:56:00 +08001349 struct perf_event *event, *partial_group = NULL;
Peter Zijlstra51b0fe32010-06-11 13:35:57 +02001350 struct pmu *pmu = group_event->pmu;
Stephane Eraniand7842da2010-10-20 15:25:01 +02001351 u64 now = ctx->time;
1352 bool simulate = false;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001353
1354 if (group_event->state == PERF_EVENT_STATE_OFF)
1355 return 0;
1356
Peter Zijlstraad5133b2010-06-15 12:22:39 +02001357 pmu->start_txn(pmu);
Lin Ming6bde9b62010-04-23 13:56:00 +08001358
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02001359 if (event_sched_in(group_event, cpuctx, ctx)) {
Peter Zijlstraad5133b2010-06-15 12:22:39 +02001360 pmu->cancel_txn(pmu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001361 return -EAGAIN;
Stephane Eranian90151c352010-05-25 16:23:10 +02001362 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001363
1364 /*
1365 * Schedule in siblings as one group (if any):
1366 */
1367 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02001368 if (event_sched_in(event, cpuctx, ctx)) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001369 partial_group = event;
1370 goto group_error;
1371 }
1372 }
1373
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02001374 if (!pmu->commit_txn(pmu))
Paul Mackerras6e851582010-05-08 20:58:00 +10001375 return 0;
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02001376
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001377group_error:
1378 /*
1379 * Groups can be scheduled in as one unit only, so undo any
1380 * partial group before returning:
Stephane Eraniand7842da2010-10-20 15:25:01 +02001381 * The events up to the failed event are scheduled out normally,
1382 * tstamp_stopped will be updated.
1383 *
1384 * The failed events and the remaining siblings need to have
1385 * their timings updated as if they had gone thru event_sched_in()
1386 * and event_sched_out(). This is required to get consistent timings
1387 * across the group. This also takes care of the case where the group
1388 * could never be scheduled by ensuring tstamp_stopped is set to mark
1389 * the time the event was actually stopped, such that time delta
1390 * calculation in update_event_times() is correct.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001391 */
1392 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1393 if (event == partial_group)
Stephane Eraniand7842da2010-10-20 15:25:01 +02001394 simulate = true;
1395
1396 if (simulate) {
1397 event->tstamp_running += now - event->tstamp_stopped;
1398 event->tstamp_stopped = now;
1399 } else {
1400 event_sched_out(event, cpuctx, ctx);
1401 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001402 }
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02001403 event_sched_out(group_event, cpuctx, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001404
Peter Zijlstraad5133b2010-06-15 12:22:39 +02001405 pmu->cancel_txn(pmu);
Stephane Eranian90151c352010-05-25 16:23:10 +02001406
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001407 return -EAGAIN;
1408}
1409
1410/*
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001411 * Work out whether we can put this event group on the CPU now.
1412 */
1413static int group_can_go_on(struct perf_event *event,
1414 struct perf_cpu_context *cpuctx,
1415 int can_add_hw)
1416{
1417 /*
1418 * Groups consisting entirely of software events can always go on.
1419 */
Frederic Weisbeckerd6f962b2010-01-10 01:25:51 +01001420 if (event->group_flags & PERF_GROUP_SOFTWARE)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001421 return 1;
1422 /*
1423 * If an exclusive group is already on, no other hardware
1424 * events can go on.
1425 */
1426 if (cpuctx->exclusive)
1427 return 0;
1428 /*
1429 * If this group is exclusive and there are already
1430 * events on the CPU, it can't go on.
1431 */
1432 if (event->attr.exclusive && cpuctx->active_oncpu)
1433 return 0;
1434 /*
1435 * Otherwise, try to add it if all previous groups were able
1436 * to go on.
1437 */
1438 return can_add_hw;
1439}
1440
1441static void add_event_to_ctx(struct perf_event *event,
1442 struct perf_event_context *ctx)
1443{
Stephane Eranian41587552011-01-03 18:20:01 +02001444 u64 tstamp = perf_event_time(event);
1445
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001446 list_add_event(event, ctx);
Peter Zijlstra8a495422010-05-27 15:47:49 +02001447 perf_group_attach(event);
Stephane Eranian41587552011-01-03 18:20:01 +02001448 event->tstamp_enabled = tstamp;
1449 event->tstamp_running = tstamp;
1450 event->tstamp_stopped = tstamp;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001451}
1452
Stephane Eraniane5d13672011-02-14 11:20:01 +02001453static void perf_event_context_sched_in(struct perf_event_context *ctx,
1454 struct task_struct *tsk);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001455
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001456/*
1457 * Cross CPU call to install and enable a performance event
1458 *
1459 * Must be called with ctx->mutex held
1460 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001461static int __perf_install_in_context(void *info)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001462{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001463 struct perf_event *event = info;
1464 struct perf_event_context *ctx = event->ctx;
1465 struct perf_event *leader = event->group_leader;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02001466 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001467 int err;
1468
1469 /*
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001470 * In case we're installing a new context to an already running task,
1471 * could also happen before perf_event_task_sched_in() on architectures
1472 * which do context switches with IRQs enabled.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001473 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001474 if (ctx->task && !cpuctx->task_ctx)
Stephane Eraniane5d13672011-02-14 11:20:01 +02001475 perf_event_context_sched_in(ctx, ctx->task);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001476
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001477 raw_spin_lock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001478 ctx->is_active = 1;
1479 update_context_time(ctx);
Stephane Eraniane5d13672011-02-14 11:20:01 +02001480 /*
1481 * update cgrp time only if current cgrp
1482 * matches event->cgrp. Must be done before
1483 * calling add_event_to_ctx()
1484 */
1485 update_cgrp_time_from_event(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001486
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001487 add_event_to_ctx(event, ctx);
1488
Stephane Eranian5632ab12011-01-03 18:20:01 +02001489 if (!event_filter_match(event))
Peter Zijlstraf4c41762009-12-16 17:55:54 +01001490 goto unlock;
1491
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001492 /*
1493 * Don't put the event on if it is disabled or if
1494 * it is in a group and the group isn't on.
1495 */
1496 if (event->state != PERF_EVENT_STATE_INACTIVE ||
1497 (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
1498 goto unlock;
1499
1500 /*
1501 * An exclusive event can't go on if there are already active
1502 * hardware events, and no hardware event can go on if there
1503 * is already an exclusive event on.
1504 */
1505 if (!group_can_go_on(event, cpuctx, 1))
1506 err = -EEXIST;
1507 else
Peter Zijlstra6e377382010-02-11 13:21:58 +01001508 err = event_sched_in(event, cpuctx, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001509
1510 if (err) {
1511 /*
1512 * This event couldn't go on. If it is in a group
1513 * then we have to pull the whole group off.
1514 * If the event group is pinned then put it in error state.
1515 */
1516 if (leader != event)
1517 group_sched_out(leader, cpuctx, ctx);
1518 if (leader->attr.pinned) {
1519 update_group_times(leader);
1520 leader->state = PERF_EVENT_STATE_ERROR;
1521 }
1522 }
1523
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001524unlock:
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001525 raw_spin_unlock(&ctx->lock);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001526
1527 return 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001528}
1529
1530/*
1531 * Attach a performance event to a context
1532 *
1533 * First we add the event to the list with the hardware enable bit
1534 * in event->hw_config cleared.
1535 *
1536 * If the event is attached to a task which is on a CPU we use a smp
1537 * call to enable it in the task context. The task might have been
1538 * scheduled away, but we check this in the smp call again.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001539 */
1540static void
1541perf_install_in_context(struct perf_event_context *ctx,
1542 struct perf_event *event,
1543 int cpu)
1544{
1545 struct task_struct *task = ctx->task;
1546
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001547 lockdep_assert_held(&ctx->mutex);
1548
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02001549 event->ctx = ctx;
1550
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001551 if (!task) {
1552 /*
1553 * Per cpu events are installed via an smp call and
André Goddard Rosaaf901ca2009-11-14 13:09:05 -02001554 * the install is always successful.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001555 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001556 cpu_function_call(cpu, __perf_install_in_context, event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001557 return;
1558 }
1559
1560retry:
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001561 if (!task_function_call(task, __perf_install_in_context, event))
1562 return;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001563
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001564 raw_spin_lock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001565 /*
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001566 * If we failed to find a running task, but find the context active now
1567 * that we've acquired the ctx->lock, retry.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001568 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001569 if (ctx->is_active) {
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001570 raw_spin_unlock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001571 goto retry;
1572 }
1573
1574 /*
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001575 * Since the task isn't running, its safe to add the event, us holding
1576 * the ctx->lock ensures the task won't get scheduled in.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001577 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001578 add_event_to_ctx(event, ctx);
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001579 raw_spin_unlock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001580}
1581
1582/*
1583 * Put a event into inactive state and update time fields.
1584 * Enabling the leader of a group effectively enables all
1585 * the group members that aren't explicitly disabled, so we
1586 * have to update their ->tstamp_enabled also.
1587 * Note: this works for group members as well as group leaders
1588 * since the non-leader members' sibling_lists will be empty.
1589 */
1590static void __perf_event_mark_enabled(struct perf_event *event,
1591 struct perf_event_context *ctx)
1592{
1593 struct perf_event *sub;
Stephane Eranian41587552011-01-03 18:20:01 +02001594 u64 tstamp = perf_event_time(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001595
1596 event->state = PERF_EVENT_STATE_INACTIVE;
Stephane Eranian41587552011-01-03 18:20:01 +02001597 event->tstamp_enabled = tstamp - event->total_time_enabled;
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001598 list_for_each_entry(sub, &event->sibling_list, group_entry) {
Stephane Eranian41587552011-01-03 18:20:01 +02001599 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
1600 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001601 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001602}
1603
1604/*
1605 * Cross CPU call to enable a performance event
1606 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001607static int __perf_event_enable(void *info)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001608{
1609 struct perf_event *event = info;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001610 struct perf_event_context *ctx = event->ctx;
1611 struct perf_event *leader = event->group_leader;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02001612 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001613 int err;
1614
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001615 if (WARN_ON_ONCE(!ctx->is_active))
1616 return -EINVAL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001617
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001618 raw_spin_lock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001619 update_context_time(ctx);
1620
1621 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1622 goto unlock;
Stephane Eraniane5d13672011-02-14 11:20:01 +02001623
1624 /*
1625 * set current task's cgroup time reference point
1626 */
Stephane Eranian3f7cce32011-02-18 14:40:01 +02001627 perf_cgroup_set_timestamp(current, ctx);
Stephane Eraniane5d13672011-02-14 11:20:01 +02001628
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001629 __perf_event_mark_enabled(event, ctx);
1630
Stephane Eraniane5d13672011-02-14 11:20:01 +02001631 if (!event_filter_match(event)) {
1632 if (is_cgroup_event(event))
1633 perf_cgroup_defer_enabled(event);
Peter Zijlstraf4c41762009-12-16 17:55:54 +01001634 goto unlock;
Stephane Eraniane5d13672011-02-14 11:20:01 +02001635 }
Peter Zijlstraf4c41762009-12-16 17:55:54 +01001636
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001637 /*
1638 * If the event is in a group and isn't the group leader,
1639 * then don't put it on unless the group is on.
1640 */
1641 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
1642 goto unlock;
1643
1644 if (!group_can_go_on(event, cpuctx, 1)) {
1645 err = -EEXIST;
1646 } else {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001647 if (event == leader)
Peter Zijlstra6e377382010-02-11 13:21:58 +01001648 err = group_sched_in(event, cpuctx, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001649 else
Peter Zijlstra6e377382010-02-11 13:21:58 +01001650 err = event_sched_in(event, cpuctx, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001651 }
1652
1653 if (err) {
1654 /*
1655 * If this event can't go on and it's part of a
1656 * group, then the whole group has to come off.
1657 */
1658 if (leader != event)
1659 group_sched_out(leader, cpuctx, ctx);
1660 if (leader->attr.pinned) {
1661 update_group_times(leader);
1662 leader->state = PERF_EVENT_STATE_ERROR;
1663 }
1664 }
1665
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001666unlock:
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001667 raw_spin_unlock(&ctx->lock);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001668
1669 return 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001670}
1671
1672/*
1673 * Enable a event.
1674 *
1675 * If event->ctx is a cloned context, callers must make sure that
1676 * every task struct that event->ctx->task could possibly point to
1677 * remains valid. This condition is satisfied when called through
1678 * perf_event_for_each_child or perf_event_for_each as described
1679 * for perf_event_disable.
1680 */
Frederic Weisbecker44234ad2009-12-09 09:25:48 +01001681void perf_event_enable(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001682{
1683 struct perf_event_context *ctx = event->ctx;
1684 struct task_struct *task = ctx->task;
1685
1686 if (!task) {
1687 /*
1688 * Enable the event on the cpu that it's on
1689 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001690 cpu_function_call(event->cpu, __perf_event_enable, event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001691 return;
1692 }
1693
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001694 raw_spin_lock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001695 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1696 goto out;
1697
1698 /*
1699 * If the event is in error state, clear that first.
1700 * That way, if we see the event in error state below, we
1701 * know that it has gone back into error state, as distinct
1702 * from the task having been scheduled away before the
1703 * cross-call arrived.
1704 */
1705 if (event->state == PERF_EVENT_STATE_ERROR)
1706 event->state = PERF_EVENT_STATE_OFF;
1707
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001708retry:
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001709 if (!ctx->is_active) {
1710 __perf_event_mark_enabled(event, ctx);
1711 goto out;
1712 }
1713
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001714 raw_spin_unlock_irq(&ctx->lock);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001715
1716 if (!task_function_call(task, __perf_event_enable, event))
1717 return;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001718
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001719 raw_spin_lock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001720
1721 /*
1722 * If the context is active and the event is still off,
1723 * we need to retry the cross-call.
1724 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001725 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
1726 /*
1727 * task could have been flipped by a concurrent
1728 * perf_event_context_sched_out()
1729 */
1730 task = ctx->task;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001731 goto retry;
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001732 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001733
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001734out:
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001735 raw_spin_unlock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001736}
1737
1738static int perf_event_refresh(struct perf_event *event, int refresh)
1739{
1740 /*
1741 * not supported on inherited events
1742 */
Franck Bui-Huu2e939d12010-11-23 16:21:44 +01001743 if (event->attr.inherit || !is_sampling_event(event))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001744 return -EINVAL;
1745
1746 atomic_add(refresh, &event->event_limit);
1747 perf_event_enable(event);
1748
1749 return 0;
1750}
1751
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01001752static void ctx_sched_out(struct perf_event_context *ctx,
1753 struct perf_cpu_context *cpuctx,
1754 enum event_type_t event_type)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001755{
1756 struct perf_event *event;
1757
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001758 raw_spin_lock(&ctx->lock);
Peter Zijlstra1b9a6442010-09-07 18:32:22 +02001759 perf_pmu_disable(ctx->pmu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001760 ctx->is_active = 0;
1761 if (likely(!ctx->nr_events))
1762 goto out;
1763 update_context_time(ctx);
Stephane Eraniane5d13672011-02-14 11:20:01 +02001764 update_cgrp_time_from_cpuctx(cpuctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001765
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01001766 if (!ctx->nr_active)
Peter Zijlstra24cd7f52010-06-11 17:32:03 +02001767 goto out;
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01001768
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001769 if (event_type & EVENT_PINNED) {
Frederic Weisbecker889ff012010-01-09 20:04:47 +01001770 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1771 group_sched_out(event, cpuctx, ctx);
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001772 }
Frederic Weisbecker889ff012010-01-09 20:04:47 +01001773
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001774 if (event_type & EVENT_FLEXIBLE) {
Frederic Weisbecker889ff012010-01-09 20:04:47 +01001775 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
Xiao Guangrong8c9ed8e2009-09-25 13:51:17 +08001776 group_sched_out(event, cpuctx, ctx);
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001777 }
Peter Zijlstra9ed60602010-06-11 17:36:35 +02001778out:
Peter Zijlstra1b9a6442010-09-07 18:32:22 +02001779 perf_pmu_enable(ctx->pmu);
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001780 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001781}
1782
1783/*
1784 * Test whether two contexts are equivalent, i.e. whether they
1785 * have both been cloned from the same version of the same context
1786 * and they both have the same number of enabled events.
1787 * If the number of enabled events is the same, then the set
1788 * of enabled events should be the same, because these are both
1789 * inherited contexts, therefore we can't access individual events
1790 * in them directly with an fd; we can only enable/disable all
1791 * events via prctl, or enable/disable all events in a family
1792 * via ioctl, which will have the same effect on both contexts.
1793 */
1794static int context_equiv(struct perf_event_context *ctx1,
1795 struct perf_event_context *ctx2)
1796{
1797 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1798 && ctx1->parent_gen == ctx2->parent_gen
1799 && !ctx1->pin_count && !ctx2->pin_count;
1800}
1801
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001802static void __perf_event_sync_stat(struct perf_event *event,
1803 struct perf_event *next_event)
1804{
1805 u64 value;
1806
1807 if (!event->attr.inherit_stat)
1808 return;
1809
1810 /*
1811 * Update the event value, we cannot use perf_event_read()
1812 * because we're in the middle of a context switch and have IRQs
1813 * disabled, which upsets smp_call_function_single(), however
1814 * we know the event must be on the current CPU, therefore we
1815 * don't need to use it.
1816 */
1817 switch (event->state) {
1818 case PERF_EVENT_STATE_ACTIVE:
Peter Zijlstra3dbebf12009-11-20 22:19:52 +01001819 event->pmu->read(event);
1820 /* fall-through */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001821
1822 case PERF_EVENT_STATE_INACTIVE:
1823 update_event_times(event);
1824 break;
1825
1826 default:
1827 break;
1828 }
1829
1830 /*
1831 * In order to keep per-task stats reliable we need to flip the event
1832 * values when we flip the contexts.
1833 */
Peter Zijlstrae7850592010-05-21 14:43:08 +02001834 value = local64_read(&next_event->count);
1835 value = local64_xchg(&event->count, value);
1836 local64_set(&next_event->count, value);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001837
1838 swap(event->total_time_enabled, next_event->total_time_enabled);
1839 swap(event->total_time_running, next_event->total_time_running);
1840
1841 /*
1842 * Since we swizzled the values, update the user visible data too.
1843 */
1844 perf_event_update_userpage(event);
1845 perf_event_update_userpage(next_event);
1846}
1847
1848#define list_next_entry(pos, member) \
1849 list_entry(pos->member.next, typeof(*pos), member)
1850
1851static void perf_event_sync_stat(struct perf_event_context *ctx,
1852 struct perf_event_context *next_ctx)
1853{
1854 struct perf_event *event, *next_event;
1855
1856 if (!ctx->nr_stat)
1857 return;
1858
Peter Zijlstra02ffdbc2009-11-20 22:19:50 +01001859 update_context_time(ctx);
1860
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001861 event = list_first_entry(&ctx->event_list,
1862 struct perf_event, event_entry);
1863
1864 next_event = list_first_entry(&next_ctx->event_list,
1865 struct perf_event, event_entry);
1866
1867 while (&event->event_entry != &ctx->event_list &&
1868 &next_event->event_entry != &next_ctx->event_list) {
1869
1870 __perf_event_sync_stat(event, next_event);
1871
1872 event = list_next_entry(event, event_entry);
1873 next_event = list_next_entry(next_event, event_entry);
1874 }
1875}
1876
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001877static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1878 struct task_struct *next)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001879{
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02001880 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001881 struct perf_event_context *next_ctx;
1882 struct perf_event_context *parent;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02001883 struct perf_cpu_context *cpuctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001884 int do_switch = 1;
1885
Peter Zijlstra108b02c2010-09-06 14:32:03 +02001886 if (likely(!ctx))
1887 return;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001888
Peter Zijlstra108b02c2010-09-06 14:32:03 +02001889 cpuctx = __get_cpu_context(ctx);
1890 if (!cpuctx->task_ctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001891 return;
1892
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001893 rcu_read_lock();
1894 parent = rcu_dereference(ctx->parent_ctx);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02001895 next_ctx = next->perf_event_ctxp[ctxn];
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001896 if (parent && next_ctx &&
1897 rcu_dereference(next_ctx->parent_ctx) == parent) {
1898 /*
1899 * Looks like the two contexts are clones, so we might be
1900 * able to optimize the context switch. We lock both
1901 * contexts and check that they are clones under the
1902 * lock (including re-checking that neither has been
1903 * uncloned in the meantime). It doesn't matter which
1904 * order we take the locks because no other cpu could
1905 * be trying to lock both of these tasks.
1906 */
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001907 raw_spin_lock(&ctx->lock);
1908 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001909 if (context_equiv(ctx, next_ctx)) {
1910 /*
1911 * XXX do we need a memory barrier of sorts
1912 * wrt to rcu_dereference() of perf_event_ctxp
1913 */
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02001914 task->perf_event_ctxp[ctxn] = next_ctx;
1915 next->perf_event_ctxp[ctxn] = ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001916 ctx->task = next;
1917 next_ctx->task = task;
1918 do_switch = 0;
1919
1920 perf_event_sync_stat(ctx, next_ctx);
1921 }
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001922 raw_spin_unlock(&next_ctx->lock);
1923 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001924 }
1925 rcu_read_unlock();
1926
1927 if (do_switch) {
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01001928 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001929 cpuctx->task_ctx = NULL;
1930 }
1931}
1932
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02001933#define for_each_task_context_nr(ctxn) \
1934 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
1935
1936/*
1937 * Called from scheduler to remove the events of the current task,
1938 * with interrupts disabled.
1939 *
1940 * We stop each event and update the event value in event->count.
1941 *
1942 * This does not protect us against NMI, but disable()
1943 * sets the disabled bit in the control field of event _before_
1944 * accessing the event control register. If a NMI hits, then it will
1945 * not restart the event.
1946 */
Peter Zijlstra82cd6de2010-10-14 17:57:23 +02001947void __perf_event_task_sched_out(struct task_struct *task,
1948 struct task_struct *next)
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02001949{
1950 int ctxn;
1951
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02001952 for_each_task_context_nr(ctxn)
1953 perf_event_context_sched_out(task, ctxn, next);
Stephane Eraniane5d13672011-02-14 11:20:01 +02001954
1955 /*
1956 * if cgroup events exist on this CPU, then we need
1957 * to check if we have to switch out PMU state.
1958 * cgroup event are system-wide mode only
1959 */
1960 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
1961 perf_cgroup_sched_out(task);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02001962}
1963
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01001964static void task_ctx_sched_out(struct perf_event_context *ctx,
1965 enum event_type_t event_type)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001966{
Peter Zijlstra108b02c2010-09-06 14:32:03 +02001967 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001968
1969 if (!cpuctx->task_ctx)
1970 return;
1971
1972 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1973 return;
1974
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01001975 ctx_sched_out(ctx, cpuctx, event_type);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001976 cpuctx->task_ctx = NULL;
1977}
1978
1979/*
1980 * Called with IRQs disabled
1981 */
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01001982static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1983 enum event_type_t event_type)
1984{
1985 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001986}
1987
1988static void
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01001989ctx_pinned_sched_in(struct perf_event_context *ctx,
Peter Zijlstra6e377382010-02-11 13:21:58 +01001990 struct perf_cpu_context *cpuctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001991{
1992 struct perf_event *event;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001993
Frederic Weisbecker889ff012010-01-09 20:04:47 +01001994 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1995 if (event->state <= PERF_EVENT_STATE_OFF)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001996 continue;
Stephane Eranian5632ab12011-01-03 18:20:01 +02001997 if (!event_filter_match(event))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001998 continue;
1999
Stephane Eraniane5d13672011-02-14 11:20:01 +02002000 /* may need to reset tstamp_enabled */
2001 if (is_cgroup_event(event))
2002 perf_cgroup_mark_enabled(event, ctx);
2003
Xiao Guangrong8c9ed8e2009-09-25 13:51:17 +08002004 if (group_can_go_on(event, cpuctx, 1))
Peter Zijlstra6e377382010-02-11 13:21:58 +01002005 group_sched_in(event, cpuctx, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002006
2007 /*
2008 * If this pinned group hasn't been scheduled,
2009 * put it in error state.
2010 */
2011 if (event->state == PERF_EVENT_STATE_INACTIVE) {
2012 update_group_times(event);
2013 event->state = PERF_EVENT_STATE_ERROR;
2014 }
2015 }
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002016}
2017
2018static void
2019ctx_flexible_sched_in(struct perf_event_context *ctx,
Peter Zijlstra6e377382010-02-11 13:21:58 +01002020 struct perf_cpu_context *cpuctx)
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002021{
2022 struct perf_event *event;
2023 int can_add_hw = 1;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002024
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002025 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2026 /* Ignore events in OFF or ERROR state */
2027 if (event->state <= PERF_EVENT_STATE_OFF)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002028 continue;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002029 /*
2030 * Listen to the 'cpu' scheduling filter constraint
2031 * of events:
2032 */
Stephane Eranian5632ab12011-01-03 18:20:01 +02002033 if (!event_filter_match(event))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002034 continue;
2035
Stephane Eraniane5d13672011-02-14 11:20:01 +02002036 /* may need to reset tstamp_enabled */
2037 if (is_cgroup_event(event))
2038 perf_cgroup_mark_enabled(event, ctx);
2039
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002040 if (group_can_go_on(event, cpuctx, can_add_hw)) {
Peter Zijlstra6e377382010-02-11 13:21:58 +01002041 if (group_sched_in(event, cpuctx, ctx))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002042 can_add_hw = 0;
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002043 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002044 }
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002045}
2046
2047static void
2048ctx_sched_in(struct perf_event_context *ctx,
2049 struct perf_cpu_context *cpuctx,
Stephane Eraniane5d13672011-02-14 11:20:01 +02002050 enum event_type_t event_type,
2051 struct task_struct *task)
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002052{
Stephane Eraniane5d13672011-02-14 11:20:01 +02002053 u64 now;
2054
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002055 raw_spin_lock(&ctx->lock);
2056 ctx->is_active = 1;
2057 if (likely(!ctx->nr_events))
2058 goto out;
2059
Stephane Eraniane5d13672011-02-14 11:20:01 +02002060 now = perf_clock();
2061 ctx->timestamp = now;
Stephane Eranian3f7cce32011-02-18 14:40:01 +02002062 perf_cgroup_set_timestamp(task, ctx);
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002063 /*
2064 * First go through the list and put on any pinned groups
2065 * in order to give them the best chance of going on.
2066 */
2067 if (event_type & EVENT_PINNED)
Peter Zijlstra6e377382010-02-11 13:21:58 +01002068 ctx_pinned_sched_in(ctx, cpuctx);
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002069
2070 /* Then walk through the lower prio flexible groups */
2071 if (event_type & EVENT_FLEXIBLE)
Peter Zijlstra6e377382010-02-11 13:21:58 +01002072 ctx_flexible_sched_in(ctx, cpuctx);
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002073
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002074out:
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002075 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002076}
2077
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002078static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
Stephane Eraniane5d13672011-02-14 11:20:01 +02002079 enum event_type_t event_type,
2080 struct task_struct *task)
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002081{
2082 struct perf_event_context *ctx = &cpuctx->ctx;
2083
Stephane Eraniane5d13672011-02-14 11:20:01 +02002084 ctx_sched_in(ctx, cpuctx, event_type, task);
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002085}
2086
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002087static void task_ctx_sched_in(struct perf_event_context *ctx,
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002088 enum event_type_t event_type)
2089{
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002090 struct perf_cpu_context *cpuctx;
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002091
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002092 cpuctx = __get_cpu_context(ctx);
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002093 if (cpuctx->task_ctx == ctx)
2094 return;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002095
Stephane Eraniane5d13672011-02-14 11:20:01 +02002096 ctx_sched_in(ctx, cpuctx, event_type, NULL);
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002097 cpuctx->task_ctx = ctx;
2098}
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002099
Stephane Eraniane5d13672011-02-14 11:20:01 +02002100static void perf_event_context_sched_in(struct perf_event_context *ctx,
2101 struct task_struct *task)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002102{
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002103 struct perf_cpu_context *cpuctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002104
Peter Zijlstra108b02c2010-09-06 14:32:03 +02002105 cpuctx = __get_cpu_context(ctx);
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002106 if (cpuctx->task_ctx == ctx)
2107 return;
2108
Peter Zijlstra1b9a6442010-09-07 18:32:22 +02002109 perf_pmu_disable(ctx->pmu);
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002110 /*
2111 * We want to keep the following priority order:
2112 * cpu pinned (that don't need to move), task pinned,
2113 * cpu flexible, task flexible.
2114 */
2115 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2116
Stephane Eraniane5d13672011-02-14 11:20:01 +02002117 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2118 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2119 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002120
2121 cpuctx->task_ctx = ctx;
eranian@google.com9b33fa62010-03-10 22:26:05 -08002122
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002123 /*
2124 * Since these rotations are per-cpu, we need to ensure the
2125 * cpu-context we got scheduled on is actually rotating.
2126 */
Peter Zijlstra108b02c2010-09-06 14:32:03 +02002127 perf_pmu_rotate_start(ctx->pmu);
Peter Zijlstra1b9a6442010-09-07 18:32:22 +02002128 perf_pmu_enable(ctx->pmu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002129}
2130
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002131/*
2132 * Called from scheduler to add the events of the current task
2133 * with interrupts disabled.
2134 *
2135 * We restore the event value and then enable it.
2136 *
2137 * This does not protect us against NMI, but enable()
2138 * sets the enabled bit in the control field of event _before_
2139 * accessing the event control register. If a NMI hits, then it will
2140 * keep the event running.
2141 */
Peter Zijlstra82cd6de2010-10-14 17:57:23 +02002142void __perf_event_task_sched_in(struct task_struct *task)
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002143{
2144 struct perf_event_context *ctx;
2145 int ctxn;
2146
2147 for_each_task_context_nr(ctxn) {
2148 ctx = task->perf_event_ctxp[ctxn];
2149 if (likely(!ctx))
2150 continue;
2151
Stephane Eraniane5d13672011-02-14 11:20:01 +02002152 perf_event_context_sched_in(ctx, task);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002153 }
Stephane Eraniane5d13672011-02-14 11:20:01 +02002154 /*
2155 * if cgroup events exist on this CPU, then we need
2156 * to check if we have to switch in PMU state.
2157 * cgroup event are system-wide mode only
2158 */
2159 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2160 perf_cgroup_sched_in(task);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002161}
2162
Peter Zijlstraabd50712010-01-26 18:50:16 +01002163static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
2164{
2165 u64 frequency = event->attr.sample_freq;
2166 u64 sec = NSEC_PER_SEC;
2167 u64 divisor, dividend;
2168
2169 int count_fls, nsec_fls, frequency_fls, sec_fls;
2170
2171 count_fls = fls64(count);
2172 nsec_fls = fls64(nsec);
2173 frequency_fls = fls64(frequency);
2174 sec_fls = 30;
2175
2176 /*
2177 * We got @count in @nsec, with a target of sample_freq HZ
2178 * the target period becomes:
2179 *
2180 * @count * 10^9
2181 * period = -------------------
2182 * @nsec * sample_freq
2183 *
2184 */
2185
2186 /*
2187 * Reduce accuracy by one bit such that @a and @b converge
2188 * to a similar magnitude.
2189 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002190#define REDUCE_FLS(a, b) \
Peter Zijlstraabd50712010-01-26 18:50:16 +01002191do { \
2192 if (a##_fls > b##_fls) { \
2193 a >>= 1; \
2194 a##_fls--; \
2195 } else { \
2196 b >>= 1; \
2197 b##_fls--; \
2198 } \
2199} while (0)
2200
2201 /*
2202 * Reduce accuracy until either term fits in a u64, then proceed with
2203 * the other, so that finally we can do a u64/u64 division.
2204 */
2205 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
2206 REDUCE_FLS(nsec, frequency);
2207 REDUCE_FLS(sec, count);
2208 }
2209
2210 if (count_fls + sec_fls > 64) {
2211 divisor = nsec * frequency;
2212
2213 while (count_fls + sec_fls > 64) {
2214 REDUCE_FLS(count, sec);
2215 divisor >>= 1;
2216 }
2217
2218 dividend = count * sec;
2219 } else {
2220 dividend = count * sec;
2221
2222 while (nsec_fls + frequency_fls > 64) {
2223 REDUCE_FLS(nsec, frequency);
2224 dividend >>= 1;
2225 }
2226
2227 divisor = nsec * frequency;
2228 }
2229
Peter Zijlstraf6ab91a2010-06-04 15:18:01 +02002230 if (!divisor)
2231 return dividend;
2232
Peter Zijlstraabd50712010-01-26 18:50:16 +01002233 return div64_u64(dividend, divisor);
2234}
2235
2236static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002237{
2238 struct hw_perf_event *hwc = &event->hw;
Peter Zijlstraf6ab91a2010-06-04 15:18:01 +02002239 s64 period, sample_period;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002240 s64 delta;
2241
Peter Zijlstraabd50712010-01-26 18:50:16 +01002242 period = perf_calculate_period(event, nsec, count);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002243
2244 delta = (s64)(period - hwc->sample_period);
2245 delta = (delta + 7) / 8; /* low pass filter */
2246
2247 sample_period = hwc->sample_period + delta;
2248
2249 if (!sample_period)
2250 sample_period = 1;
2251
2252 hwc->sample_period = sample_period;
Peter Zijlstraabd50712010-01-26 18:50:16 +01002253
Peter Zijlstrae7850592010-05-21 14:43:08 +02002254 if (local64_read(&hwc->period_left) > 8*sample_period) {
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02002255 event->pmu->stop(event, PERF_EF_UPDATE);
Peter Zijlstrae7850592010-05-21 14:43:08 +02002256 local64_set(&hwc->period_left, 0);
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02002257 event->pmu->start(event, PERF_EF_RELOAD);
Peter Zijlstraabd50712010-01-26 18:50:16 +01002258 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002259}
2260
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002261static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002262{
2263 struct perf_event *event;
2264 struct hw_perf_event *hwc;
Peter Zijlstraabd50712010-01-26 18:50:16 +01002265 u64 interrupts, now;
2266 s64 delta;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002267
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002268 raw_spin_lock(&ctx->lock);
Paul Mackerras03541f82009-10-14 16:58:03 +11002269 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002270 if (event->state != PERF_EVENT_STATE_ACTIVE)
2271 continue;
2272
Stephane Eranian5632ab12011-01-03 18:20:01 +02002273 if (!event_filter_match(event))
Peter Zijlstra5d27c232009-12-17 13:16:32 +01002274 continue;
2275
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002276 hwc = &event->hw;
2277
2278 interrupts = hwc->interrupts;
2279 hwc->interrupts = 0;
2280
2281 /*
2282 * unthrottle events on the tick
2283 */
2284 if (interrupts == MAX_INTERRUPTS) {
2285 perf_log_throttle(event, 1);
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02002286 event->pmu->start(event, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002287 }
2288
2289 if (!event->attr.freq || !event->attr.sample_freq)
2290 continue;
2291
Peter Zijlstraabd50712010-01-26 18:50:16 +01002292 event->pmu->read(event);
Peter Zijlstrae7850592010-05-21 14:43:08 +02002293 now = local64_read(&event->count);
Peter Zijlstraabd50712010-01-26 18:50:16 +01002294 delta = now - hwc->freq_count_stamp;
2295 hwc->freq_count_stamp = now;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002296
Peter Zijlstraabd50712010-01-26 18:50:16 +01002297 if (delta > 0)
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002298 perf_adjust_period(event, period, delta);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002299 }
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002300 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002301}
2302
2303/*
2304 * Round-robin a context's events:
2305 */
2306static void rotate_ctx(struct perf_event_context *ctx)
2307{
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002308 raw_spin_lock(&ctx->lock);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002309
Thomas Gleixnerdddd3372010-11-24 10:05:55 +01002310 /*
2311 * Rotate the first entry last of non-pinned groups. Rotation might be
2312 * disabled by the inheritance code.
2313 */
2314 if (!ctx->rotate_disable)
2315 list_rotate_left(&ctx->flexible_groups);
Frederic Weisbeckere2864172010-01-09 21:05:28 +01002316
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002317 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002318}
2319
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002320/*
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002321 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
2322 * because they're strictly cpu affine and rotate_start is called with IRQs
2323 * disabled, while rotate_context is called from IRQ context.
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002324 */
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002325static void perf_rotate_context(struct perf_cpu_context *cpuctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002326{
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002327 u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002328 struct perf_event_context *ctx = NULL;
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002329 int rotate = 0, remove = 1;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002330
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002331 if (cpuctx->ctx.nr_events) {
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002332 remove = 0;
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002333 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
2334 rotate = 1;
2335 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002336
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002337 ctx = cpuctx->task_ctx;
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002338 if (ctx && ctx->nr_events) {
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002339 remove = 0;
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002340 if (ctx->nr_events != ctx->nr_active)
2341 rotate = 1;
2342 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002343
Peter Zijlstra1b9a6442010-09-07 18:32:22 +02002344 perf_pmu_disable(cpuctx->ctx.pmu);
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002345 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002346 if (ctx)
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002347 perf_ctx_adjust_freq(ctx, interval);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002348
Peter Zijlstrad4944a02010-03-08 13:51:20 +01002349 if (!rotate)
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002350 goto done;
Peter Zijlstrad4944a02010-03-08 13:51:20 +01002351
Frederic Weisbecker7defb0f2010-01-17 12:15:31 +01002352 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002353 if (ctx)
Frederic Weisbecker7defb0f2010-01-17 12:15:31 +01002354 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002355
2356 rotate_ctx(&cpuctx->ctx);
2357 if (ctx)
2358 rotate_ctx(ctx);
2359
Stephane Eraniane5d13672011-02-14 11:20:01 +02002360 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002361 if (ctx)
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002362 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002363
2364done:
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002365 if (remove)
2366 list_del_init(&cpuctx->rotation_list);
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02002367
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02002368 perf_pmu_enable(cpuctx->ctx.pmu);
2369}
2370
2371void perf_event_task_tick(void)
2372{
2373 struct list_head *head = &__get_cpu_var(rotation_list);
2374 struct perf_cpu_context *cpuctx, *tmp;
2375
2376 WARN_ON(!irqs_disabled());
2377
2378 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
2379 if (cpuctx->jiffies_interval == 1 ||
2380 !(jiffies % cpuctx->jiffies_interval))
2381 perf_rotate_context(cpuctx);
2382 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002383}
2384
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002385static int event_enable_on_exec(struct perf_event *event,
2386 struct perf_event_context *ctx)
2387{
2388 if (!event->attr.enable_on_exec)
2389 return 0;
2390
2391 event->attr.enable_on_exec = 0;
2392 if (event->state >= PERF_EVENT_STATE_INACTIVE)
2393 return 0;
2394
2395 __perf_event_mark_enabled(event, ctx);
2396
2397 return 1;
2398}
2399
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002400/*
2401 * Enable all of a task's events that have been marked enable-on-exec.
2402 * This expects task == current.
2403 */
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002404static void perf_event_enable_on_exec(struct perf_event_context *ctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002405{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002406 struct perf_event *event;
2407 unsigned long flags;
2408 int enabled = 0;
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002409 int ret;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002410
2411 local_irq_save(flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002412 if (!ctx || !ctx->nr_events)
2413 goto out;
2414
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002415 task_ctx_sched_out(ctx, EVENT_ALL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002416
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002417 raw_spin_lock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002418
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002419 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2420 ret = event_enable_on_exec(event, ctx);
2421 if (ret)
2422 enabled = 1;
2423 }
2424
2425 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2426 ret = event_enable_on_exec(event, ctx);
2427 if (ret)
2428 enabled = 1;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002429 }
2430
2431 /*
2432 * Unclone this context if we enabled any event.
2433 */
2434 if (enabled)
2435 unclone_ctx(ctx);
2436
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002437 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002438
Stephane Eraniane5d13672011-02-14 11:20:01 +02002439 perf_event_context_sched_in(ctx, ctx->task);
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002440out:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002441 local_irq_restore(flags);
2442}
2443
2444/*
2445 * Cross CPU call to read the hardware event
2446 */
2447static void __perf_event_read(void *info)
2448{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002449 struct perf_event *event = info;
2450 struct perf_event_context *ctx = event->ctx;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02002451 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002452
2453 /*
2454 * If this is a task context, we need to check whether it is
2455 * the current task context of this cpu. If not it has been
2456 * scheduled out before the smp call arrived. In that case
2457 * event->count would have been updated to a recent sample
2458 * when the event was scheduled out.
2459 */
2460 if (ctx->task && cpuctx->task_ctx != ctx)
2461 return;
2462
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002463 raw_spin_lock(&ctx->lock);
Stephane Eraniane5d13672011-02-14 11:20:01 +02002464 if (ctx->is_active) {
Peter Zijlstra542e72f2011-01-26 15:38:35 +01002465 update_context_time(ctx);
Stephane Eraniane5d13672011-02-14 11:20:01 +02002466 update_cgrp_time_from_event(event);
2467 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002468 update_event_times(event);
Peter Zijlstra542e72f2011-01-26 15:38:35 +01002469 if (event->state == PERF_EVENT_STATE_ACTIVE)
2470 event->pmu->read(event);
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002471 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002472}
2473
Peter Zijlstrab5e58792010-05-21 14:43:12 +02002474static inline u64 perf_event_count(struct perf_event *event)
2475{
Peter Zijlstrae7850592010-05-21 14:43:08 +02002476 return local64_read(&event->count) + atomic64_read(&event->child_count);
Peter Zijlstrab5e58792010-05-21 14:43:12 +02002477}
2478
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002479static u64 perf_event_read(struct perf_event *event)
2480{
2481 /*
2482 * If event is enabled and currently active on a CPU, update the
2483 * value in the event structure:
2484 */
2485 if (event->state == PERF_EVENT_STATE_ACTIVE) {
2486 smp_call_function_single(event->oncpu,
2487 __perf_event_read, event, 1);
2488 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
Peter Zijlstra2b8988c2009-11-20 22:19:54 +01002489 struct perf_event_context *ctx = event->ctx;
2490 unsigned long flags;
2491
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002492 raw_spin_lock_irqsave(&ctx->lock, flags);
Stephane Eranianc530ccd2010-10-15 15:26:01 +02002493 /*
2494 * may read while context is not active
2495 * (e.g., thread is blocked), in that case
2496 * we cannot update context time
2497 */
Stephane Eraniane5d13672011-02-14 11:20:01 +02002498 if (ctx->is_active) {
Stephane Eranianc530ccd2010-10-15 15:26:01 +02002499 update_context_time(ctx);
Stephane Eraniane5d13672011-02-14 11:20:01 +02002500 update_cgrp_time_from_event(event);
2501 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002502 update_event_times(event);
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002503 raw_spin_unlock_irqrestore(&ctx->lock, flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002504 }
2505
Peter Zijlstrab5e58792010-05-21 14:43:12 +02002506 return perf_event_count(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002507}
2508
2509/*
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02002510 * Callchain support
2511 */
2512
2513struct callchain_cpus_entries {
2514 struct rcu_head rcu_head;
2515 struct perf_callchain_entry *cpu_entries[0];
2516};
2517
Frederic Weisbecker7ae07ea2010-08-14 20:45:13 +02002518static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02002519static atomic_t nr_callchain_events;
2520static DEFINE_MUTEX(callchain_mutex);
2521struct callchain_cpus_entries *callchain_cpus_entries;
2522
2523
2524__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
2525 struct pt_regs *regs)
2526{
2527}
2528
2529__weak void perf_callchain_user(struct perf_callchain_entry *entry,
2530 struct pt_regs *regs)
2531{
2532}
2533
2534static void release_callchain_buffers_rcu(struct rcu_head *head)
2535{
2536 struct callchain_cpus_entries *entries;
2537 int cpu;
2538
2539 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
2540
2541 for_each_possible_cpu(cpu)
2542 kfree(entries->cpu_entries[cpu]);
2543
2544 kfree(entries);
2545}
2546
2547static void release_callchain_buffers(void)
2548{
2549 struct callchain_cpus_entries *entries;
2550
2551 entries = callchain_cpus_entries;
2552 rcu_assign_pointer(callchain_cpus_entries, NULL);
2553 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
2554}
2555
2556static int alloc_callchain_buffers(void)
2557{
2558 int cpu;
2559 int size;
2560 struct callchain_cpus_entries *entries;
2561
2562 /*
2563 * We can't use the percpu allocation API for data that can be
2564 * accessed from NMI. Use a temporary manual per cpu allocation
2565 * until that gets sorted out.
2566 */
Eric Dumazet88d4f0d2011-01-25 19:40:51 +01002567 size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02002568
2569 entries = kzalloc(size, GFP_KERNEL);
2570 if (!entries)
2571 return -ENOMEM;
2572
Frederic Weisbecker7ae07ea2010-08-14 20:45:13 +02002573 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02002574
2575 for_each_possible_cpu(cpu) {
2576 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
2577 cpu_to_node(cpu));
2578 if (!entries->cpu_entries[cpu])
2579 goto fail;
2580 }
2581
2582 rcu_assign_pointer(callchain_cpus_entries, entries);
2583
2584 return 0;
2585
2586fail:
2587 for_each_possible_cpu(cpu)
2588 kfree(entries->cpu_entries[cpu]);
2589 kfree(entries);
2590
2591 return -ENOMEM;
2592}
2593
2594static int get_callchain_buffers(void)
2595{
2596 int err = 0;
2597 int count;
2598
2599 mutex_lock(&callchain_mutex);
2600
2601 count = atomic_inc_return(&nr_callchain_events);
2602 if (WARN_ON_ONCE(count < 1)) {
2603 err = -EINVAL;
2604 goto exit;
2605 }
2606
2607 if (count > 1) {
2608 /* If the allocation failed, give up */
2609 if (!callchain_cpus_entries)
2610 err = -ENOMEM;
2611 goto exit;
2612 }
2613
2614 err = alloc_callchain_buffers();
2615 if (err)
2616 release_callchain_buffers();
2617exit:
2618 mutex_unlock(&callchain_mutex);
2619
2620 return err;
2621}
2622
2623static void put_callchain_buffers(void)
2624{
2625 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
2626 release_callchain_buffers();
2627 mutex_unlock(&callchain_mutex);
2628 }
2629}
2630
2631static int get_recursion_context(int *recursion)
2632{
2633 int rctx;
2634
2635 if (in_nmi())
2636 rctx = 3;
2637 else if (in_irq())
2638 rctx = 2;
2639 else if (in_softirq())
2640 rctx = 1;
2641 else
2642 rctx = 0;
2643
2644 if (recursion[rctx])
2645 return -1;
2646
2647 recursion[rctx]++;
2648 barrier();
2649
2650 return rctx;
2651}
2652
2653static inline void put_recursion_context(int *recursion, int rctx)
2654{
2655 barrier();
2656 recursion[rctx]--;
2657}
2658
2659static struct perf_callchain_entry *get_callchain_entry(int *rctx)
2660{
2661 int cpu;
2662 struct callchain_cpus_entries *entries;
2663
2664 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
2665 if (*rctx == -1)
2666 return NULL;
2667
2668 entries = rcu_dereference(callchain_cpus_entries);
2669 if (!entries)
2670 return NULL;
2671
2672 cpu = smp_processor_id();
2673
2674 return &entries->cpu_entries[cpu][*rctx];
2675}
2676
2677static void
2678put_callchain_entry(int rctx)
2679{
2680 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
2681}
2682
2683static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2684{
2685 int rctx;
2686 struct perf_callchain_entry *entry;
2687
2688
2689 entry = get_callchain_entry(&rctx);
2690 if (rctx == -1)
2691 return NULL;
2692
2693 if (!entry)
2694 goto exit_put;
2695
2696 entry->nr = 0;
2697
2698 if (!user_mode(regs)) {
2699 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
2700 perf_callchain_kernel(entry, regs);
2701 if (current->mm)
2702 regs = task_pt_regs(current);
2703 else
2704 regs = NULL;
2705 }
2706
2707 if (regs) {
2708 perf_callchain_store(entry, PERF_CONTEXT_USER);
2709 perf_callchain_user(entry, regs);
2710 }
2711
2712exit_put:
2713 put_callchain_entry(rctx);
2714
2715 return entry;
2716}
2717
2718/*
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002719 * Initialize the perf_event context in a task_struct:
2720 */
Peter Zijlstraeb184472010-09-07 15:55:13 +02002721static void __perf_event_init_context(struct perf_event_context *ctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002722{
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002723 raw_spin_lock_init(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002724 mutex_init(&ctx->mutex);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002725 INIT_LIST_HEAD(&ctx->pinned_groups);
2726 INIT_LIST_HEAD(&ctx->flexible_groups);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002727 INIT_LIST_HEAD(&ctx->event_list);
2728 atomic_set(&ctx->refcount, 1);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002729}
2730
Peter Zijlstraeb184472010-09-07 15:55:13 +02002731static struct perf_event_context *
2732alloc_perf_context(struct pmu *pmu, struct task_struct *task)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002733{
2734 struct perf_event_context *ctx;
Peter Zijlstraeb184472010-09-07 15:55:13 +02002735
2736 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
2737 if (!ctx)
2738 return NULL;
2739
2740 __perf_event_init_context(ctx);
2741 if (task) {
2742 ctx->task = task;
2743 get_task_struct(task);
2744 }
2745 ctx->pmu = pmu;
2746
2747 return ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002748}
2749
Matt Helsley2ebd4ff2010-09-13 13:01:19 -07002750static struct task_struct *
2751find_lively_task_by_vpid(pid_t vpid)
2752{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002753 struct task_struct *task;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002754 int err;
2755
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002756 rcu_read_lock();
Matt Helsley2ebd4ff2010-09-13 13:01:19 -07002757 if (!vpid)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002758 task = current;
2759 else
Matt Helsley2ebd4ff2010-09-13 13:01:19 -07002760 task = find_task_by_vpid(vpid);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002761 if (task)
2762 get_task_struct(task);
2763 rcu_read_unlock();
2764
2765 if (!task)
2766 return ERR_PTR(-ESRCH);
2767
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002768 /* Reuse ptrace permission checks for now. */
2769 err = -EACCES;
2770 if (!ptrace_may_access(task, PTRACE_MODE_READ))
2771 goto errout;
2772
Matt Helsley2ebd4ff2010-09-13 13:01:19 -07002773 return task;
2774errout:
2775 put_task_struct(task);
2776 return ERR_PTR(err);
2777
2778}
2779
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002780/*
2781 * Returns a matching context with refcount and pincount.
2782 */
Peter Zijlstra108b02c2010-09-06 14:32:03 +02002783static struct perf_event_context *
Matt Helsley38a81da2010-09-13 13:01:20 -07002784find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002785{
2786 struct perf_event_context *ctx;
2787 struct perf_cpu_context *cpuctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002788 unsigned long flags;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002789 int ctxn, err;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002790
Oleg Nesterov22a4ec72011-01-18 17:10:08 +01002791 if (!task) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002792 /* Must be root to operate on a CPU event: */
2793 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
2794 return ERR_PTR(-EACCES);
2795
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002796 /*
2797 * We could be clever and allow to attach a event to an
2798 * offline CPU and activate it when the CPU comes up, but
2799 * that's for later.
2800 */
2801 if (!cpu_online(cpu))
2802 return ERR_PTR(-ENODEV);
2803
Peter Zijlstra108b02c2010-09-06 14:32:03 +02002804 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002805 ctx = &cpuctx->ctx;
2806 get_ctx(ctx);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002807 ++ctx->pin_count;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002808
2809 return ctx;
2810 }
2811
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002812 err = -EINVAL;
2813 ctxn = pmu->task_ctx_nr;
2814 if (ctxn < 0)
2815 goto errout;
2816
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002817retry:
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002818 ctx = perf_lock_task_context(task, ctxn, &flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002819 if (ctx) {
2820 unclone_ctx(ctx);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002821 ++ctx->pin_count;
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002822 raw_spin_unlock_irqrestore(&ctx->lock, flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002823 }
2824
2825 if (!ctx) {
Peter Zijlstraeb184472010-09-07 15:55:13 +02002826 ctx = alloc_perf_context(pmu, task);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002827 err = -ENOMEM;
2828 if (!ctx)
2829 goto errout;
Peter Zijlstraeb184472010-09-07 15:55:13 +02002830
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002831 get_ctx(ctx);
Peter Zijlstraeb184472010-09-07 15:55:13 +02002832
Oleg Nesterovdbe08d82011-01-19 19:22:07 +01002833 err = 0;
2834 mutex_lock(&task->perf_event_mutex);
2835 /*
2836 * If it has already passed perf_event_exit_task().
2837 * we must see PF_EXITING, it takes this mutex too.
2838 */
2839 if (task->flags & PF_EXITING)
2840 err = -ESRCH;
2841 else if (task->perf_event_ctxp[ctxn])
2842 err = -EAGAIN;
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002843 else {
2844 ++ctx->pin_count;
Oleg Nesterovdbe08d82011-01-19 19:22:07 +01002845 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002846 }
Oleg Nesterovdbe08d82011-01-19 19:22:07 +01002847 mutex_unlock(&task->perf_event_mutex);
2848
2849 if (unlikely(err)) {
Peter Zijlstraeb184472010-09-07 15:55:13 +02002850 put_task_struct(task);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002851 kfree(ctx);
Oleg Nesterovdbe08d82011-01-19 19:22:07 +01002852
2853 if (err == -EAGAIN)
2854 goto retry;
2855 goto errout;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002856 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002857 }
2858
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002859 return ctx;
2860
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002861errout:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002862 return ERR_PTR(err);
2863}
2864
Li Zefan6fb29152009-10-15 11:21:42 +08002865static void perf_event_free_filter(struct perf_event *event);
2866
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002867static void free_event_rcu(struct rcu_head *head)
2868{
2869 struct perf_event *event;
2870
2871 event = container_of(head, struct perf_event, rcu_head);
2872 if (event->ns)
2873 put_pid_ns(event->ns);
Li Zefan6fb29152009-10-15 11:21:42 +08002874 perf_event_free_filter(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002875 kfree(event);
2876}
2877
Peter Zijlstraca5135e2010-05-28 19:33:23 +02002878static void perf_buffer_put(struct perf_buffer *buffer);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002879
2880static void free_event(struct perf_event *event)
2881{
Peter Zijlstrae360adb2010-10-14 14:01:34 +08002882 irq_work_sync(&event->pending);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002883
2884 if (!event->parent) {
Peter Zijlstra82cd6de2010-10-14 17:57:23 +02002885 if (event->attach_state & PERF_ATTACH_TASK)
Stephane Eraniane5d13672011-02-14 11:20:01 +02002886 jump_label_dec(&perf_sched_events);
Eric B Munson3af9e852010-05-18 15:30:49 +01002887 if (event->attr.mmap || event->attr.mmap_data)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002888 atomic_dec(&nr_mmap_events);
2889 if (event->attr.comm)
2890 atomic_dec(&nr_comm_events);
2891 if (event->attr.task)
2892 atomic_dec(&nr_task_events);
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02002893 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
2894 put_callchain_buffers();
Peter Zijlstra08309372011-03-03 11:31:20 +01002895 if (is_cgroup_event(event)) {
2896 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2897 jump_label_dec(&perf_sched_events);
2898 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002899 }
2900
Peter Zijlstraca5135e2010-05-28 19:33:23 +02002901 if (event->buffer) {
2902 perf_buffer_put(event->buffer);
2903 event->buffer = NULL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002904 }
2905
Stephane Eraniane5d13672011-02-14 11:20:01 +02002906 if (is_cgroup_event(event))
2907 perf_detach_cgroup(event);
2908
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002909 if (event->destroy)
2910 event->destroy(event);
2911
Peter Zijlstra0c67b402010-09-13 11:15:58 +02002912 if (event->ctx)
2913 put_ctx(event->ctx);
2914
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002915 call_rcu(&event->rcu_head, free_event_rcu);
2916}
2917
Arjan van de Venfb0459d2009-09-25 12:25:56 +02002918int perf_event_release_kernel(struct perf_event *event)
2919{
2920 struct perf_event_context *ctx = event->ctx;
2921
Peter Zijlstra050735b2010-05-11 11:51:53 +02002922 /*
2923 * Remove from the PMU, can't get re-enabled since we got
2924 * here because the last ref went.
2925 */
2926 perf_event_disable(event);
2927
Arjan van de Venfb0459d2009-09-25 12:25:56 +02002928 WARN_ON_ONCE(ctx->parent_ctx);
Peter Zijlstraa0507c82010-05-06 15:42:53 +02002929 /*
2930 * There are two ways this annotation is useful:
2931 *
2932 * 1) there is a lock recursion from perf_event_exit_task
2933 * see the comment there.
2934 *
2935 * 2) there is a lock-inversion with mmap_sem through
2936 * perf_event_read_group(), which takes faults while
2937 * holding ctx->mutex, however this is called after
2938 * the last filedesc died, so there is no possibility
2939 * to trigger the AB-BA case.
2940 */
2941 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
Peter Zijlstra050735b2010-05-11 11:51:53 +02002942 raw_spin_lock_irq(&ctx->lock);
Peter Zijlstra8a495422010-05-27 15:47:49 +02002943 perf_group_detach(event);
Peter Zijlstra050735b2010-05-11 11:51:53 +02002944 list_del_event(event, ctx);
Peter Zijlstra050735b2010-05-11 11:51:53 +02002945 raw_spin_unlock_irq(&ctx->lock);
Arjan van de Venfb0459d2009-09-25 12:25:56 +02002946 mutex_unlock(&ctx->mutex);
2947
Arjan van de Venfb0459d2009-09-25 12:25:56 +02002948 free_event(event);
2949
2950 return 0;
2951}
2952EXPORT_SYMBOL_GPL(perf_event_release_kernel);
2953
Peter Zijlstraa66a3052009-11-23 11:37:23 +01002954/*
2955 * Called when the last reference to the file is gone.
2956 */
2957static int perf_release(struct inode *inode, struct file *file)
2958{
2959 struct perf_event *event = file->private_data;
Peter Zijlstra88821352010-11-09 19:01:43 +01002960 struct task_struct *owner;
Peter Zijlstraa66a3052009-11-23 11:37:23 +01002961
2962 file->private_data = NULL;
2963
Peter Zijlstra88821352010-11-09 19:01:43 +01002964 rcu_read_lock();
2965 owner = ACCESS_ONCE(event->owner);
2966 /*
2967 * Matches the smp_wmb() in perf_event_exit_task(). If we observe
2968 * !owner it means the list deletion is complete and we can indeed
2969 * free this event, otherwise we need to serialize on
2970 * owner->perf_event_mutex.
2971 */
2972 smp_read_barrier_depends();
2973 if (owner) {
2974 /*
2975 * Since delayed_put_task_struct() also drops the last
2976 * task reference we can safely take a new reference
2977 * while holding the rcu_read_lock().
2978 */
2979 get_task_struct(owner);
2980 }
2981 rcu_read_unlock();
2982
2983 if (owner) {
2984 mutex_lock(&owner->perf_event_mutex);
2985 /*
2986 * We have to re-check the event->owner field, if it is cleared
2987 * we raced with perf_event_exit_task(), acquiring the mutex
2988 * ensured they're done, and we can proceed with freeing the
2989 * event.
2990 */
2991 if (event->owner)
2992 list_del_init(&event->owner_entry);
2993 mutex_unlock(&owner->perf_event_mutex);
2994 put_task_struct(owner);
2995 }
2996
Peter Zijlstraa66a3052009-11-23 11:37:23 +01002997 return perf_event_release_kernel(event);
2998}
2999
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003000u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003001{
3002 struct perf_event *child;
3003 u64 total = 0;
3004
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003005 *enabled = 0;
3006 *running = 0;
3007
Peter Zijlstra6f105812009-11-20 22:19:56 +01003008 mutex_lock(&event->child_mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003009 total += perf_event_read(event);
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003010 *enabled += event->total_time_enabled +
3011 atomic64_read(&event->child_total_time_enabled);
3012 *running += event->total_time_running +
3013 atomic64_read(&event->child_total_time_running);
3014
3015 list_for_each_entry(child, &event->child_list, child_list) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003016 total += perf_event_read(child);
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003017 *enabled += child->total_time_enabled;
3018 *running += child->total_time_running;
3019 }
Peter Zijlstra6f105812009-11-20 22:19:56 +01003020 mutex_unlock(&event->child_mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003021
3022 return total;
3023}
Arjan van de Venfb0459d2009-09-25 12:25:56 +02003024EXPORT_SYMBOL_GPL(perf_event_read_value);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003025
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003026static int perf_event_read_group(struct perf_event *event,
3027 u64 read_format, char __user *buf)
3028{
3029 struct perf_event *leader = event->group_leader, *sub;
Peter Zijlstra6f105812009-11-20 22:19:56 +01003030 int n = 0, size = 0, ret = -EFAULT;
3031 struct perf_event_context *ctx = leader->ctx;
Peter Zijlstraabf48682009-11-20 22:19:49 +01003032 u64 values[5];
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003033 u64 count, enabled, running;
Peter Zijlstraabf48682009-11-20 22:19:49 +01003034
Peter Zijlstra6f105812009-11-20 22:19:56 +01003035 mutex_lock(&ctx->mutex);
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003036 count = perf_event_read_value(leader, &enabled, &running);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003037
3038 values[n++] = 1 + leader->nr_siblings;
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003039 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3040 values[n++] = enabled;
3041 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3042 values[n++] = running;
Peter Zijlstraabf48682009-11-20 22:19:49 +01003043 values[n++] = count;
3044 if (read_format & PERF_FORMAT_ID)
3045 values[n++] = primary_event_id(leader);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003046
3047 size = n * sizeof(u64);
3048
3049 if (copy_to_user(buf, values, size))
Peter Zijlstra6f105812009-11-20 22:19:56 +01003050 goto unlock;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003051
Peter Zijlstra6f105812009-11-20 22:19:56 +01003052 ret = size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003053
3054 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
Peter Zijlstraabf48682009-11-20 22:19:49 +01003055 n = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003056
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003057 values[n++] = perf_event_read_value(sub, &enabled, &running);
Peter Zijlstraabf48682009-11-20 22:19:49 +01003058 if (read_format & PERF_FORMAT_ID)
3059 values[n++] = primary_event_id(sub);
3060
3061 size = n * sizeof(u64);
3062
Stephane Eranian184d3da2009-11-23 21:40:49 -08003063 if (copy_to_user(buf + ret, values, size)) {
Peter Zijlstra6f105812009-11-20 22:19:56 +01003064 ret = -EFAULT;
3065 goto unlock;
3066 }
Peter Zijlstraabf48682009-11-20 22:19:49 +01003067
3068 ret += size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003069 }
Peter Zijlstra6f105812009-11-20 22:19:56 +01003070unlock:
3071 mutex_unlock(&ctx->mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003072
Peter Zijlstraabf48682009-11-20 22:19:49 +01003073 return ret;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003074}
3075
3076static int perf_event_read_one(struct perf_event *event,
3077 u64 read_format, char __user *buf)
3078{
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003079 u64 enabled, running;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003080 u64 values[4];
3081 int n = 0;
3082
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003083 values[n++] = perf_event_read_value(event, &enabled, &running);
3084 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3085 values[n++] = enabled;
3086 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3087 values[n++] = running;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003088 if (read_format & PERF_FORMAT_ID)
3089 values[n++] = primary_event_id(event);
3090
3091 if (copy_to_user(buf, values, n * sizeof(u64)))
3092 return -EFAULT;
3093
3094 return n * sizeof(u64);
3095}
3096
3097/*
3098 * Read the performance event - simple non blocking version for now
3099 */
3100static ssize_t
3101perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
3102{
3103 u64 read_format = event->attr.read_format;
3104 int ret;
3105
3106 /*
3107 * Return end-of-file for a read on a event that is in
3108 * error state (i.e. because it was pinned but it couldn't be
3109 * scheduled on to the CPU at some point).
3110 */
3111 if (event->state == PERF_EVENT_STATE_ERROR)
3112 return 0;
3113
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02003114 if (count < event->read_size)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003115 return -ENOSPC;
3116
3117 WARN_ON_ONCE(event->ctx->parent_ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003118 if (read_format & PERF_FORMAT_GROUP)
3119 ret = perf_event_read_group(event, read_format, buf);
3120 else
3121 ret = perf_event_read_one(event, read_format, buf);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003122
3123 return ret;
3124}
3125
3126static ssize_t
3127perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3128{
3129 struct perf_event *event = file->private_data;
3130
3131 return perf_read_hw(event, buf, count);
3132}
3133
3134static unsigned int perf_poll(struct file *file, poll_table *wait)
3135{
3136 struct perf_event *event = file->private_data;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003137 struct perf_buffer *buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003138 unsigned int events = POLL_HUP;
3139
3140 rcu_read_lock();
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003141 buffer = rcu_dereference(event->buffer);
3142 if (buffer)
3143 events = atomic_xchg(&buffer->poll, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003144 rcu_read_unlock();
3145
3146 poll_wait(file, &event->waitq, wait);
3147
3148 return events;
3149}
3150
3151static void perf_event_reset(struct perf_event *event)
3152{
3153 (void)perf_event_read(event);
Peter Zijlstrae7850592010-05-21 14:43:08 +02003154 local64_set(&event->count, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003155 perf_event_update_userpage(event);
3156}
3157
3158/*
3159 * Holding the top-level event's child_mutex means that any
3160 * descendant process that has inherited this event will block
3161 * in sync_child_event if it goes to exit, thus satisfying the
3162 * task existence requirements of perf_event_enable/disable.
3163 */
3164static void perf_event_for_each_child(struct perf_event *event,
3165 void (*func)(struct perf_event *))
3166{
3167 struct perf_event *child;
3168
3169 WARN_ON_ONCE(event->ctx->parent_ctx);
3170 mutex_lock(&event->child_mutex);
3171 func(event);
3172 list_for_each_entry(child, &event->child_list, child_list)
3173 func(child);
3174 mutex_unlock(&event->child_mutex);
3175}
3176
3177static void perf_event_for_each(struct perf_event *event,
3178 void (*func)(struct perf_event *))
3179{
3180 struct perf_event_context *ctx = event->ctx;
3181 struct perf_event *sibling;
3182
3183 WARN_ON_ONCE(ctx->parent_ctx);
3184 mutex_lock(&ctx->mutex);
3185 event = event->group_leader;
3186
3187 perf_event_for_each_child(event, func);
3188 func(event);
3189 list_for_each_entry(sibling, &event->sibling_list, group_entry)
3190 perf_event_for_each_child(event, func);
3191 mutex_unlock(&ctx->mutex);
3192}
3193
3194static int perf_event_period(struct perf_event *event, u64 __user *arg)
3195{
3196 struct perf_event_context *ctx = event->ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003197 int ret = 0;
3198 u64 value;
3199
Franck Bui-Huu6c7e5502010-11-23 16:21:43 +01003200 if (!is_sampling_event(event))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003201 return -EINVAL;
3202
John Blackwoodad0cf342010-09-28 18:03:11 -04003203 if (copy_from_user(&value, arg, sizeof(value)))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003204 return -EFAULT;
3205
3206 if (!value)
3207 return -EINVAL;
3208
Thomas Gleixnere625cce12009-11-17 18:02:06 +01003209 raw_spin_lock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003210 if (event->attr.freq) {
3211 if (value > sysctl_perf_event_sample_rate) {
3212 ret = -EINVAL;
3213 goto unlock;
3214 }
3215
3216 event->attr.sample_freq = value;
3217 } else {
3218 event->attr.sample_period = value;
3219 event->hw.sample_period = value;
3220 }
3221unlock:
Thomas Gleixnere625cce12009-11-17 18:02:06 +01003222 raw_spin_unlock_irq(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003223
3224 return ret;
3225}
3226
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003227static const struct file_operations perf_fops;
3228
3229static struct perf_event *perf_fget_light(int fd, int *fput_needed)
3230{
3231 struct file *file;
3232
3233 file = fget_light(fd, fput_needed);
3234 if (!file)
3235 return ERR_PTR(-EBADF);
3236
3237 if (file->f_op != &perf_fops) {
3238 fput_light(file, *fput_needed);
3239 *fput_needed = 0;
3240 return ERR_PTR(-EBADF);
3241 }
3242
3243 return file->private_data;
3244}
3245
3246static int perf_event_set_output(struct perf_event *event,
3247 struct perf_event *output_event);
Li Zefan6fb29152009-10-15 11:21:42 +08003248static int perf_event_set_filter(struct perf_event *event, void __user *arg);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003249
3250static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3251{
3252 struct perf_event *event = file->private_data;
3253 void (*func)(struct perf_event *);
3254 u32 flags = arg;
3255
3256 switch (cmd) {
3257 case PERF_EVENT_IOC_ENABLE:
3258 func = perf_event_enable;
3259 break;
3260 case PERF_EVENT_IOC_DISABLE:
3261 func = perf_event_disable;
3262 break;
3263 case PERF_EVENT_IOC_RESET:
3264 func = perf_event_reset;
3265 break;
3266
3267 case PERF_EVENT_IOC_REFRESH:
3268 return perf_event_refresh(event, arg);
3269
3270 case PERF_EVENT_IOC_PERIOD:
3271 return perf_event_period(event, (u64 __user *)arg);
3272
3273 case PERF_EVENT_IOC_SET_OUTPUT:
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003274 {
3275 struct perf_event *output_event = NULL;
3276 int fput_needed = 0;
3277 int ret;
3278
3279 if (arg != -1) {
3280 output_event = perf_fget_light(arg, &fput_needed);
3281 if (IS_ERR(output_event))
3282 return PTR_ERR(output_event);
3283 }
3284
3285 ret = perf_event_set_output(event, output_event);
3286 if (output_event)
3287 fput_light(output_event->filp, fput_needed);
3288
3289 return ret;
3290 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003291
Li Zefan6fb29152009-10-15 11:21:42 +08003292 case PERF_EVENT_IOC_SET_FILTER:
3293 return perf_event_set_filter(event, (void __user *)arg);
3294
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003295 default:
3296 return -ENOTTY;
3297 }
3298
3299 if (flags & PERF_IOC_FLAG_GROUP)
3300 perf_event_for_each(event, func);
3301 else
3302 perf_event_for_each_child(event, func);
3303
3304 return 0;
3305}
3306
3307int perf_event_task_enable(void)
3308{
3309 struct perf_event *event;
3310
3311 mutex_lock(&current->perf_event_mutex);
3312 list_for_each_entry(event, &current->perf_event_list, owner_entry)
3313 perf_event_for_each_child(event, perf_event_enable);
3314 mutex_unlock(&current->perf_event_mutex);
3315
3316 return 0;
3317}
3318
3319int perf_event_task_disable(void)
3320{
3321 struct perf_event *event;
3322
3323 mutex_lock(&current->perf_event_mutex);
3324 list_for_each_entry(event, &current->perf_event_list, owner_entry)
3325 perf_event_for_each_child(event, perf_event_disable);
3326 mutex_unlock(&current->perf_event_mutex);
3327
3328 return 0;
3329}
3330
3331#ifndef PERF_EVENT_INDEX_OFFSET
3332# define PERF_EVENT_INDEX_OFFSET 0
3333#endif
3334
3335static int perf_event_index(struct perf_event *event)
3336{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02003337 if (event->hw.state & PERF_HES_STOPPED)
3338 return 0;
3339
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003340 if (event->state != PERF_EVENT_STATE_ACTIVE)
3341 return 0;
3342
3343 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
3344}
3345
3346/*
3347 * Callers need to ensure there can be no nesting of this function, otherwise
3348 * the seqlock logic goes bad. We can not serialize this because the arch
3349 * code calls this from NMI context.
3350 */
3351void perf_event_update_userpage(struct perf_event *event)
3352{
3353 struct perf_event_mmap_page *userpg;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003354 struct perf_buffer *buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003355
3356 rcu_read_lock();
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003357 buffer = rcu_dereference(event->buffer);
3358 if (!buffer)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003359 goto unlock;
3360
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003361 userpg = buffer->user_page;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003362
3363 /*
3364 * Disable preemption so as to not let the corresponding user-space
3365 * spin too long if we get preempted.
3366 */
3367 preempt_disable();
3368 ++userpg->lock;
3369 barrier();
3370 userpg->index = perf_event_index(event);
Peter Zijlstrab5e58792010-05-21 14:43:12 +02003371 userpg->offset = perf_event_count(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003372 if (event->state == PERF_EVENT_STATE_ACTIVE)
Peter Zijlstrae7850592010-05-21 14:43:08 +02003373 userpg->offset -= local64_read(&event->hw.prev_count);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003374
3375 userpg->time_enabled = event->total_time_enabled +
3376 atomic64_read(&event->child_total_time_enabled);
3377
3378 userpg->time_running = event->total_time_running +
3379 atomic64_read(&event->child_total_time_running);
3380
3381 barrier();
3382 ++userpg->lock;
3383 preempt_enable();
3384unlock:
3385 rcu_read_unlock();
3386}
3387
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003388static unsigned long perf_data_size(struct perf_buffer *buffer);
3389
3390static void
3391perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
3392{
3393 long max_size = perf_data_size(buffer);
3394
3395 if (watermark)
3396 buffer->watermark = min(max_size, watermark);
3397
3398 if (!buffer->watermark)
3399 buffer->watermark = max_size / 2;
3400
3401 if (flags & PERF_BUFFER_WRITABLE)
3402 buffer->writable = 1;
3403
3404 atomic_set(&buffer->refcount, 1);
3405}
3406
Peter Zijlstra906010b2009-09-21 16:08:49 +02003407#ifndef CONFIG_PERF_USE_VMALLOC
3408
3409/*
3410 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
3411 */
3412
3413static struct page *
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003414perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003415{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003416 if (pgoff > buffer->nr_pages)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003417 return NULL;
3418
3419 if (pgoff == 0)
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003420 return virt_to_page(buffer->user_page);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003421
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003422 return virt_to_page(buffer->data_pages[pgoff - 1]);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003423}
3424
Peter Zijlstraa19d35c2010-05-17 18:48:00 +02003425static void *perf_mmap_alloc_page(int cpu)
3426{
3427 struct page *page;
3428 int node;
3429
3430 node = (cpu == -1) ? cpu : cpu_to_node(cpu);
3431 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
3432 if (!page)
3433 return NULL;
3434
3435 return page_address(page);
3436}
3437
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003438static struct perf_buffer *
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003439perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003440{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003441 struct perf_buffer *buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003442 unsigned long size;
3443 int i;
3444
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003445 size = sizeof(struct perf_buffer);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003446 size += nr_pages * sizeof(void *);
3447
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003448 buffer = kzalloc(size, GFP_KERNEL);
3449 if (!buffer)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003450 goto fail;
3451
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003452 buffer->user_page = perf_mmap_alloc_page(cpu);
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003453 if (!buffer->user_page)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003454 goto fail_user_page;
3455
3456 for (i = 0; i < nr_pages; i++) {
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003457 buffer->data_pages[i] = perf_mmap_alloc_page(cpu);
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003458 if (!buffer->data_pages[i])
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003459 goto fail_data_pages;
3460 }
3461
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003462 buffer->nr_pages = nr_pages;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003463
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003464 perf_buffer_init(buffer, watermark, flags);
3465
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003466 return buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003467
3468fail_data_pages:
3469 for (i--; i >= 0; i--)
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003470 free_page((unsigned long)buffer->data_pages[i]);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003471
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003472 free_page((unsigned long)buffer->user_page);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003473
3474fail_user_page:
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003475 kfree(buffer);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003476
3477fail:
Peter Zijlstra906010b2009-09-21 16:08:49 +02003478 return NULL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003479}
3480
3481static void perf_mmap_free_page(unsigned long addr)
3482{
3483 struct page *page = virt_to_page((void *)addr);
3484
3485 page->mapping = NULL;
3486 __free_page(page);
3487}
3488
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003489static void perf_buffer_free(struct perf_buffer *buffer)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003490{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003491 int i;
3492
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003493 perf_mmap_free_page((unsigned long)buffer->user_page);
3494 for (i = 0; i < buffer->nr_pages; i++)
3495 perf_mmap_free_page((unsigned long)buffer->data_pages[i]);
3496 kfree(buffer);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003497}
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003498
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003499static inline int page_order(struct perf_buffer *buffer)
Peter Zijlstra3cafa9f2010-05-20 19:07:56 +02003500{
3501 return 0;
3502}
3503
Peter Zijlstra906010b2009-09-21 16:08:49 +02003504#else
3505
3506/*
3507 * Back perf_mmap() with vmalloc memory.
3508 *
3509 * Required for architectures that have d-cache aliasing issues.
3510 */
3511
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003512static inline int page_order(struct perf_buffer *buffer)
Peter Zijlstra3cafa9f2010-05-20 19:07:56 +02003513{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003514 return buffer->page_order;
Peter Zijlstra3cafa9f2010-05-20 19:07:56 +02003515}
3516
Peter Zijlstra906010b2009-09-21 16:08:49 +02003517static struct page *
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003518perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003519{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003520 if (pgoff > (1UL << page_order(buffer)))
Peter Zijlstra906010b2009-09-21 16:08:49 +02003521 return NULL;
3522
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003523 return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003524}
3525
3526static void perf_mmap_unmark_page(void *addr)
3527{
3528 struct page *page = vmalloc_to_page(addr);
3529
3530 page->mapping = NULL;
3531}
3532
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003533static void perf_buffer_free_work(struct work_struct *work)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003534{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003535 struct perf_buffer *buffer;
Peter Zijlstra906010b2009-09-21 16:08:49 +02003536 void *base;
3537 int i, nr;
3538
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003539 buffer = container_of(work, struct perf_buffer, work);
3540 nr = 1 << page_order(buffer);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003541
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003542 base = buffer->user_page;
Peter Zijlstra906010b2009-09-21 16:08:49 +02003543 for (i = 0; i < nr + 1; i++)
3544 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
3545
3546 vfree(base);
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003547 kfree(buffer);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003548}
3549
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003550static void perf_buffer_free(struct perf_buffer *buffer)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003551{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003552 schedule_work(&buffer->work);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003553}
3554
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003555static struct perf_buffer *
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003556perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003557{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003558 struct perf_buffer *buffer;
Peter Zijlstra906010b2009-09-21 16:08:49 +02003559 unsigned long size;
3560 void *all_buf;
3561
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003562 size = sizeof(struct perf_buffer);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003563 size += sizeof(void *);
3564
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003565 buffer = kzalloc(size, GFP_KERNEL);
3566 if (!buffer)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003567 goto fail;
3568
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003569 INIT_WORK(&buffer->work, perf_buffer_free_work);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003570
3571 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
3572 if (!all_buf)
3573 goto fail_all_buf;
3574
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003575 buffer->user_page = all_buf;
3576 buffer->data_pages[0] = all_buf + PAGE_SIZE;
3577 buffer->page_order = ilog2(nr_pages);
3578 buffer->nr_pages = 1;
Peter Zijlstra906010b2009-09-21 16:08:49 +02003579
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003580 perf_buffer_init(buffer, watermark, flags);
3581
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003582 return buffer;
Peter Zijlstra906010b2009-09-21 16:08:49 +02003583
3584fail_all_buf:
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003585 kfree(buffer);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003586
3587fail:
3588 return NULL;
3589}
3590
3591#endif
3592
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003593static unsigned long perf_data_size(struct perf_buffer *buffer)
Peter Zijlstra3cafa9f2010-05-20 19:07:56 +02003594{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003595 return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer));
Peter Zijlstra3cafa9f2010-05-20 19:07:56 +02003596}
3597
Peter Zijlstra906010b2009-09-21 16:08:49 +02003598static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3599{
3600 struct perf_event *event = vma->vm_file->private_data;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003601 struct perf_buffer *buffer;
Peter Zijlstra906010b2009-09-21 16:08:49 +02003602 int ret = VM_FAULT_SIGBUS;
3603
3604 if (vmf->flags & FAULT_FLAG_MKWRITE) {
3605 if (vmf->pgoff == 0)
3606 ret = 0;
3607 return ret;
3608 }
3609
3610 rcu_read_lock();
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003611 buffer = rcu_dereference(event->buffer);
3612 if (!buffer)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003613 goto unlock;
3614
3615 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
3616 goto unlock;
3617
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003618 vmf->page = perf_mmap_to_page(buffer, vmf->pgoff);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003619 if (!vmf->page)
3620 goto unlock;
3621
3622 get_page(vmf->page);
3623 vmf->page->mapping = vma->vm_file->f_mapping;
3624 vmf->page->index = vmf->pgoff;
3625
3626 ret = 0;
3627unlock:
3628 rcu_read_unlock();
3629
3630 return ret;
3631}
3632
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003633static void perf_buffer_free_rcu(struct rcu_head *rcu_head)
Peter Zijlstra906010b2009-09-21 16:08:49 +02003634{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003635 struct perf_buffer *buffer;
Peter Zijlstra906010b2009-09-21 16:08:49 +02003636
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003637 buffer = container_of(rcu_head, struct perf_buffer, rcu_head);
3638 perf_buffer_free(buffer);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003639}
3640
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003641static struct perf_buffer *perf_buffer_get(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003642{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003643 struct perf_buffer *buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003644
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003645 rcu_read_lock();
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003646 buffer = rcu_dereference(event->buffer);
3647 if (buffer) {
3648 if (!atomic_inc_not_zero(&buffer->refcount))
3649 buffer = NULL;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003650 }
3651 rcu_read_unlock();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003652
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003653 return buffer;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003654}
3655
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003656static void perf_buffer_put(struct perf_buffer *buffer)
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003657{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003658 if (!atomic_dec_and_test(&buffer->refcount))
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003659 return;
3660
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003661 call_rcu(&buffer->rcu_head, perf_buffer_free_rcu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003662}
3663
3664static void perf_mmap_open(struct vm_area_struct *vma)
3665{
3666 struct perf_event *event = vma->vm_file->private_data;
3667
3668 atomic_inc(&event->mmap_count);
3669}
3670
3671static void perf_mmap_close(struct vm_area_struct *vma)
3672{
3673 struct perf_event *event = vma->vm_file->private_data;
3674
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003675 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003676 unsigned long size = perf_data_size(event->buffer);
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003677 struct user_struct *user = event->mmap_user;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003678 struct perf_buffer *buffer = event->buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003679
Peter Zijlstra906010b2009-09-21 16:08:49 +02003680 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003681 vma->vm_mm->locked_vm -= event->mmap_locked;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003682 rcu_assign_pointer(event->buffer, NULL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003683 mutex_unlock(&event->mmap_mutex);
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003684
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003685 perf_buffer_put(buffer);
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003686 free_uid(user);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003687 }
3688}
3689
Alexey Dobriyanf0f37e22009-09-27 22:29:37 +04003690static const struct vm_operations_struct perf_mmap_vmops = {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003691 .open = perf_mmap_open,
3692 .close = perf_mmap_close,
3693 .fault = perf_mmap_fault,
3694 .page_mkwrite = perf_mmap_fault,
3695};
3696
3697static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3698{
3699 struct perf_event *event = file->private_data;
3700 unsigned long user_locked, user_lock_limit;
3701 struct user_struct *user = current_user();
3702 unsigned long locked, lock_limit;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003703 struct perf_buffer *buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003704 unsigned long vma_size;
3705 unsigned long nr_pages;
3706 long user_extra, extra;
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003707 int ret = 0, flags = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003708
Peter Zijlstrac7920612010-05-18 10:33:24 +02003709 /*
3710 * Don't allow mmap() of inherited per-task counters. This would
3711 * create a performance issue due to all children writing to the
3712 * same buffer.
3713 */
3714 if (event->cpu == -1 && event->attr.inherit)
3715 return -EINVAL;
3716
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003717 if (!(vma->vm_flags & VM_SHARED))
3718 return -EINVAL;
3719
3720 vma_size = vma->vm_end - vma->vm_start;
3721 nr_pages = (vma_size / PAGE_SIZE) - 1;
3722
3723 /*
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003724 * If we have buffer pages ensure they're a power-of-two number, so we
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003725 * can do bitmasks instead of modulo.
3726 */
3727 if (nr_pages != 0 && !is_power_of_2(nr_pages))
3728 return -EINVAL;
3729
3730 if (vma_size != PAGE_SIZE * (1 + nr_pages))
3731 return -EINVAL;
3732
3733 if (vma->vm_pgoff != 0)
3734 return -EINVAL;
3735
3736 WARN_ON_ONCE(event->ctx->parent_ctx);
3737 mutex_lock(&event->mmap_mutex);
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003738 if (event->buffer) {
3739 if (event->buffer->nr_pages == nr_pages)
3740 atomic_inc(&event->buffer->refcount);
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003741 else
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003742 ret = -EINVAL;
3743 goto unlock;
3744 }
3745
3746 user_extra = nr_pages + 1;
3747 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
3748
3749 /*
3750 * Increase the limit linearly with more CPUs:
3751 */
3752 user_lock_limit *= num_online_cpus();
3753
3754 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
3755
3756 extra = 0;
3757 if (user_locked > user_lock_limit)
3758 extra = user_locked - user_lock_limit;
3759
Jiri Slaby78d7d402010-03-05 13:42:54 -08003760 lock_limit = rlimit(RLIMIT_MEMLOCK);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003761 lock_limit >>= PAGE_SHIFT;
3762 locked = vma->vm_mm->locked_vm + extra;
3763
3764 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
3765 !capable(CAP_IPC_LOCK)) {
3766 ret = -EPERM;
3767 goto unlock;
3768 }
3769
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003770 WARN_ON(event->buffer);
Peter Zijlstra906010b2009-09-21 16:08:49 +02003771
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003772 if (vma->vm_flags & VM_WRITE)
3773 flags |= PERF_BUFFER_WRITABLE;
3774
3775 buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark,
3776 event->cpu, flags);
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003777 if (!buffer) {
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003778 ret = -ENOMEM;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003779 goto unlock;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003780 }
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02003781 rcu_assign_pointer(event->buffer, buffer);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003782
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003783 atomic_long_add(user_extra, &user->locked_vm);
3784 event->mmap_locked = extra;
3785 event->mmap_user = get_current_user();
3786 vma->vm_mm->locked_vm += event->mmap_locked;
3787
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003788unlock:
Peter Zijlstraac9721f2010-05-27 12:54:41 +02003789 if (!ret)
3790 atomic_inc(&event->mmap_count);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003791 mutex_unlock(&event->mmap_mutex);
3792
3793 vma->vm_flags |= VM_RESERVED;
3794 vma->vm_ops = &perf_mmap_vmops;
3795
3796 return ret;
3797}
3798
3799static int perf_fasync(int fd, struct file *filp, int on)
3800{
3801 struct inode *inode = filp->f_path.dentry->d_inode;
3802 struct perf_event *event = filp->private_data;
3803 int retval;
3804
3805 mutex_lock(&inode->i_mutex);
3806 retval = fasync_helper(fd, filp, on, &event->fasync);
3807 mutex_unlock(&inode->i_mutex);
3808
3809 if (retval < 0)
3810 return retval;
3811
3812 return 0;
3813}
3814
3815static const struct file_operations perf_fops = {
Arnd Bergmann3326c1c2010-03-23 19:09:33 +01003816 .llseek = no_llseek,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003817 .release = perf_release,
3818 .read = perf_read,
3819 .poll = perf_poll,
3820 .unlocked_ioctl = perf_ioctl,
3821 .compat_ioctl = perf_ioctl,
3822 .mmap = perf_mmap,
3823 .fasync = perf_fasync,
3824};
3825
3826/*
3827 * Perf event wakeup
3828 *
3829 * If there's data, ensure we set the poll() state and publish everything
3830 * to user-space before waking everybody up.
3831 */
3832
3833void perf_event_wakeup(struct perf_event *event)
3834{
3835 wake_up_all(&event->waitq);
3836
3837 if (event->pending_kill) {
3838 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
3839 event->pending_kill = 0;
3840 }
3841}
3842
Peter Zijlstrae360adb2010-10-14 14:01:34 +08003843static void perf_pending_event(struct irq_work *entry)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003844{
3845 struct perf_event *event = container_of(entry,
3846 struct perf_event, pending);
3847
3848 if (event->pending_disable) {
3849 event->pending_disable = 0;
3850 __perf_event_disable(event);
3851 }
3852
3853 if (event->pending_wakeup) {
3854 event->pending_wakeup = 0;
3855 perf_event_wakeup(event);
3856 }
3857}
3858
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003859/*
Zhang, Yanmin39447b32010-04-19 13:32:41 +08003860 * We assume there is only KVM supporting the callbacks.
3861 * Later on, we might change it to a list if there is
3862 * another virtualization implementation supporting the callbacks.
3863 */
3864struct perf_guest_info_callbacks *perf_guest_cbs;
3865
3866int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3867{
3868 perf_guest_cbs = cbs;
3869 return 0;
3870}
3871EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
3872
3873int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3874{
3875 perf_guest_cbs = NULL;
3876 return 0;
3877}
3878EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
3879
3880/*
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003881 * Output
3882 */
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003883static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003884 unsigned long offset, unsigned long head)
3885{
3886 unsigned long mask;
3887
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003888 if (!buffer->writable)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003889 return true;
3890
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003891 mask = perf_data_size(buffer) - 1;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003892
3893 offset = (offset - tail) & mask;
3894 head = (head - tail) & mask;
3895
3896 if ((int)(head - offset) < 0)
3897 return false;
3898
3899 return true;
3900}
3901
3902static void perf_output_wakeup(struct perf_output_handle *handle)
3903{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003904 atomic_set(&handle->buffer->poll, POLL_IN);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003905
3906 if (handle->nmi) {
3907 handle->event->pending_wakeup = 1;
Peter Zijlstrae360adb2010-10-14 14:01:34 +08003908 irq_work_queue(&handle->event->pending);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003909 } else
3910 perf_event_wakeup(handle->event);
3911}
3912
3913/*
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003914 * We need to ensure a later event_id doesn't publish a head when a former
Peter Zijlstraef607772010-05-18 10:50:41 +02003915 * event isn't done writing. However since we need to deal with NMIs we
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003916 * cannot fully serialize things.
3917 *
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003918 * We only publish the head (and generate a wakeup) when the outer-most
Peter Zijlstraef607772010-05-18 10:50:41 +02003919 * event completes.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003920 */
Peter Zijlstraef607772010-05-18 10:50:41 +02003921static void perf_output_get_handle(struct perf_output_handle *handle)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003922{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003923 struct perf_buffer *buffer = handle->buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003924
Peter Zijlstraef607772010-05-18 10:50:41 +02003925 preempt_disable();
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003926 local_inc(&buffer->nest);
3927 handle->wakeup = local_read(&buffer->wakeup);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003928}
3929
Peter Zijlstraef607772010-05-18 10:50:41 +02003930static void perf_output_put_handle(struct perf_output_handle *handle)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003931{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003932 struct perf_buffer *buffer = handle->buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003933 unsigned long head;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003934
3935again:
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003936 head = local_read(&buffer->head);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003937
3938 /*
Peter Zijlstraef607772010-05-18 10:50:41 +02003939 * IRQ/NMI can happen here, which means we can miss a head update.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003940 */
3941
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003942 if (!local_dec_and_test(&buffer->nest))
Frederic Weisbeckeracd35a42010-05-20 21:28:34 +02003943 goto out;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003944
3945 /*
Peter Zijlstraef607772010-05-18 10:50:41 +02003946 * Publish the known good head. Rely on the full barrier implied
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003947 * by atomic_dec_and_test() order the buffer->head read and this
Peter Zijlstraef607772010-05-18 10:50:41 +02003948 * write.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003949 */
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003950 buffer->user_page->data_head = head;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003951
Peter Zijlstraef607772010-05-18 10:50:41 +02003952 /*
3953 * Now check if we missed an update, rely on the (compiler)
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003954 * barrier in atomic_dec_and_test() to re-read buffer->head.
Peter Zijlstraef607772010-05-18 10:50:41 +02003955 */
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003956 if (unlikely(head != local_read(&buffer->head))) {
3957 local_inc(&buffer->nest);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003958 goto again;
3959 }
3960
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003961 if (handle->wakeup != local_read(&buffer->wakeup))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003962 perf_output_wakeup(handle);
Peter Zijlstraef607772010-05-18 10:50:41 +02003963
Peter Zijlstra9ed60602010-06-11 17:36:35 +02003964out:
Peter Zijlstraef607772010-05-18 10:50:41 +02003965 preempt_enable();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003966}
3967
Peter Zijlstraa94ffaa2010-05-20 19:50:07 +02003968__always_inline void perf_output_copy(struct perf_output_handle *handle,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003969 const void *buf, unsigned int len)
3970{
Peter Zijlstra5d967a82010-05-20 16:46:39 +02003971 do {
Peter Zijlstraa94ffaa2010-05-20 19:50:07 +02003972 unsigned long size = min_t(unsigned long, handle->size, len);
Peter Zijlstra5d967a82010-05-20 16:46:39 +02003973
3974 memcpy(handle->addr, buf, size);
3975
3976 len -= size;
3977 handle->addr += size;
Frederic Weisbecker74048f82010-05-27 21:34:58 +02003978 buf += size;
Peter Zijlstra5d967a82010-05-20 16:46:39 +02003979 handle->size -= size;
3980 if (!handle->size) {
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003981 struct perf_buffer *buffer = handle->buffer;
Peter Zijlstra3cafa9f2010-05-20 19:07:56 +02003982
Peter Zijlstra5d967a82010-05-20 16:46:39 +02003983 handle->page++;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02003984 handle->page &= buffer->nr_pages - 1;
3985 handle->addr = buffer->data_pages[handle->page];
3986 handle->size = PAGE_SIZE << page_order(buffer);
Peter Zijlstra5d967a82010-05-20 16:46:39 +02003987 }
3988 } while (len);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003989}
3990
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02003991static void __perf_event_header__init_id(struct perf_event_header *header,
3992 struct perf_sample_data *data,
3993 struct perf_event *event)
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -02003994{
3995 u64 sample_type = event->attr.sample_type;
3996
3997 data->type = sample_type;
3998 header->size += event->id_header_size;
3999
4000 if (sample_type & PERF_SAMPLE_TID) {
4001 /* namespace issues */
4002 data->tid_entry.pid = perf_event_pid(event, current);
4003 data->tid_entry.tid = perf_event_tid(event, current);
4004 }
4005
4006 if (sample_type & PERF_SAMPLE_TIME)
4007 data->time = perf_clock();
4008
4009 if (sample_type & PERF_SAMPLE_ID)
4010 data->id = primary_event_id(event);
4011
4012 if (sample_type & PERF_SAMPLE_STREAM_ID)
4013 data->stream_id = event->id;
4014
4015 if (sample_type & PERF_SAMPLE_CPU) {
4016 data->cpu_entry.cpu = raw_smp_processor_id();
4017 data->cpu_entry.reserved = 0;
4018 }
4019}
4020
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004021static void perf_event_header__init_id(struct perf_event_header *header,
4022 struct perf_sample_data *data,
4023 struct perf_event *event)
4024{
4025 if (event->attr.sample_id_all)
4026 __perf_event_header__init_id(header, data, event);
4027}
4028
4029static void __perf_event__output_id_sample(struct perf_output_handle *handle,
4030 struct perf_sample_data *data)
4031{
4032 u64 sample_type = data->type;
4033
4034 if (sample_type & PERF_SAMPLE_TID)
4035 perf_output_put(handle, data->tid_entry);
4036
4037 if (sample_type & PERF_SAMPLE_TIME)
4038 perf_output_put(handle, data->time);
4039
4040 if (sample_type & PERF_SAMPLE_ID)
4041 perf_output_put(handle, data->id);
4042
4043 if (sample_type & PERF_SAMPLE_STREAM_ID)
4044 perf_output_put(handle, data->stream_id);
4045
4046 if (sample_type & PERF_SAMPLE_CPU)
4047 perf_output_put(handle, data->cpu_entry);
4048}
4049
4050static void perf_event__output_id_sample(struct perf_event *event,
4051 struct perf_output_handle *handle,
4052 struct perf_sample_data *sample)
4053{
4054 if (event->attr.sample_id_all)
4055 __perf_event__output_id_sample(handle, sample);
4056}
4057
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004058int perf_output_begin(struct perf_output_handle *handle,
4059 struct perf_event *event, unsigned int size,
4060 int nmi, int sample)
4061{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004062 struct perf_buffer *buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004063 unsigned long tail, offset, head;
4064 int have_lost;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004065 struct perf_sample_data sample_data;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004066 struct {
4067 struct perf_event_header header;
4068 u64 id;
4069 u64 lost;
4070 } lost_event;
4071
4072 rcu_read_lock();
4073 /*
4074 * For inherited events we send all the output towards the parent.
4075 */
4076 if (event->parent)
4077 event = event->parent;
4078
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004079 buffer = rcu_dereference(event->buffer);
4080 if (!buffer)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004081 goto out;
4082
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004083 handle->buffer = buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004084 handle->event = event;
4085 handle->nmi = nmi;
4086 handle->sample = sample;
4087
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004088 if (!buffer->nr_pages)
Stephane Eranian00d1d0b2010-05-17 12:46:01 +02004089 goto out;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004090
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004091 have_lost = local_read(&buffer->lost);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004092 if (have_lost) {
4093 lost_event.header.size = sizeof(lost_event);
4094 perf_event_header__init_id(&lost_event.header, &sample_data,
4095 event);
4096 size += lost_event.header.size;
4097 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004098
Peter Zijlstraef607772010-05-18 10:50:41 +02004099 perf_output_get_handle(handle);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004100
4101 do {
4102 /*
4103 * Userspace could choose to issue a mb() before updating the
4104 * tail pointer. So that all reads will be completed before the
4105 * write is issued.
4106 */
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004107 tail = ACCESS_ONCE(buffer->user_page->data_tail);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004108 smp_rmb();
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004109 offset = head = local_read(&buffer->head);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004110 head += size;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004111 if (unlikely(!perf_output_space(buffer, tail, offset, head)))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004112 goto fail;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004113 } while (local_cmpxchg(&buffer->head, offset, head) != offset);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004114
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004115 if (head - local_read(&buffer->wakeup) > buffer->watermark)
4116 local_add(buffer->watermark, &buffer->wakeup);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004117
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004118 handle->page = offset >> (PAGE_SHIFT + page_order(buffer));
4119 handle->page &= buffer->nr_pages - 1;
4120 handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1);
4121 handle->addr = buffer->data_pages[handle->page];
Peter Zijlstra5d967a82010-05-20 16:46:39 +02004122 handle->addr += handle->size;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004123 handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size;
Peter Zijlstra5d967a82010-05-20 16:46:39 +02004124
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004125 if (have_lost) {
4126 lost_event.header.type = PERF_RECORD_LOST;
4127 lost_event.header.misc = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004128 lost_event.id = event->id;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004129 lost_event.lost = local_xchg(&buffer->lost, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004130
4131 perf_output_put(handle, lost_event);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004132 perf_event__output_id_sample(event, handle, &sample_data);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004133 }
4134
4135 return 0;
4136
4137fail:
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004138 local_inc(&buffer->lost);
Peter Zijlstraef607772010-05-18 10:50:41 +02004139 perf_output_put_handle(handle);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004140out:
4141 rcu_read_unlock();
4142
4143 return -ENOSPC;
4144}
4145
4146void perf_output_end(struct perf_output_handle *handle)
4147{
4148 struct perf_event *event = handle->event;
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004149 struct perf_buffer *buffer = handle->buffer;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004150
4151 int wakeup_events = event->attr.wakeup_events;
4152
4153 if (handle->sample && wakeup_events) {
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004154 int events = local_inc_return(&buffer->events);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004155 if (events >= wakeup_events) {
Peter Zijlstraca5135e2010-05-28 19:33:23 +02004156 local_sub(wakeup_events, &buffer->events);
4157 local_inc(&buffer->wakeup);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004158 }
4159 }
4160
Peter Zijlstraef607772010-05-18 10:50:41 +02004161 perf_output_put_handle(handle);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004162 rcu_read_unlock();
4163}
4164
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004165static void perf_output_read_one(struct perf_output_handle *handle,
Stephane Eranianeed01522010-10-26 16:08:01 +02004166 struct perf_event *event,
4167 u64 enabled, u64 running)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004168{
4169 u64 read_format = event->attr.read_format;
4170 u64 values[4];
4171 int n = 0;
4172
Peter Zijlstrab5e58792010-05-21 14:43:12 +02004173 values[n++] = perf_event_count(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004174 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
Stephane Eranianeed01522010-10-26 16:08:01 +02004175 values[n++] = enabled +
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004176 atomic64_read(&event->child_total_time_enabled);
4177 }
4178 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
Stephane Eranianeed01522010-10-26 16:08:01 +02004179 values[n++] = running +
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004180 atomic64_read(&event->child_total_time_running);
4181 }
4182 if (read_format & PERF_FORMAT_ID)
4183 values[n++] = primary_event_id(event);
4184
4185 perf_output_copy(handle, values, n * sizeof(u64));
4186}
4187
4188/*
4189 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
4190 */
4191static void perf_output_read_group(struct perf_output_handle *handle,
Stephane Eranianeed01522010-10-26 16:08:01 +02004192 struct perf_event *event,
4193 u64 enabled, u64 running)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004194{
4195 struct perf_event *leader = event->group_leader, *sub;
4196 u64 read_format = event->attr.read_format;
4197 u64 values[5];
4198 int n = 0;
4199
4200 values[n++] = 1 + leader->nr_siblings;
4201
4202 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
Stephane Eranianeed01522010-10-26 16:08:01 +02004203 values[n++] = enabled;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004204
4205 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
Stephane Eranianeed01522010-10-26 16:08:01 +02004206 values[n++] = running;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004207
4208 if (leader != event)
4209 leader->pmu->read(leader);
4210
Peter Zijlstrab5e58792010-05-21 14:43:12 +02004211 values[n++] = perf_event_count(leader);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004212 if (read_format & PERF_FORMAT_ID)
4213 values[n++] = primary_event_id(leader);
4214
4215 perf_output_copy(handle, values, n * sizeof(u64));
4216
4217 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4218 n = 0;
4219
4220 if (sub != event)
4221 sub->pmu->read(sub);
4222
Peter Zijlstrab5e58792010-05-21 14:43:12 +02004223 values[n++] = perf_event_count(sub);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004224 if (read_format & PERF_FORMAT_ID)
4225 values[n++] = primary_event_id(sub);
4226
4227 perf_output_copy(handle, values, n * sizeof(u64));
4228 }
4229}
4230
Stephane Eranianeed01522010-10-26 16:08:01 +02004231#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
4232 PERF_FORMAT_TOTAL_TIME_RUNNING)
4233
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004234static void perf_output_read(struct perf_output_handle *handle,
4235 struct perf_event *event)
4236{
Stephane Eranianeed01522010-10-26 16:08:01 +02004237 u64 enabled = 0, running = 0, now, ctx_time;
4238 u64 read_format = event->attr.read_format;
4239
4240 /*
4241 * compute total_time_enabled, total_time_running
4242 * based on snapshot values taken when the event
4243 * was last scheduled in.
4244 *
4245 * we cannot simply called update_context_time()
4246 * because of locking issue as we are called in
4247 * NMI context
4248 */
4249 if (read_format & PERF_FORMAT_TOTAL_TIMES) {
4250 now = perf_clock();
4251 ctx_time = event->shadow_ctx_time + now;
4252 enabled = ctx_time - event->tstamp_enabled;
4253 running = ctx_time - event->tstamp_running;
4254 }
4255
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004256 if (event->attr.read_format & PERF_FORMAT_GROUP)
Stephane Eranianeed01522010-10-26 16:08:01 +02004257 perf_output_read_group(handle, event, enabled, running);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004258 else
Stephane Eranianeed01522010-10-26 16:08:01 +02004259 perf_output_read_one(handle, event, enabled, running);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004260}
4261
4262void perf_output_sample(struct perf_output_handle *handle,
4263 struct perf_event_header *header,
4264 struct perf_sample_data *data,
4265 struct perf_event *event)
4266{
4267 u64 sample_type = data->type;
4268
4269 perf_output_put(handle, *header);
4270
4271 if (sample_type & PERF_SAMPLE_IP)
4272 perf_output_put(handle, data->ip);
4273
4274 if (sample_type & PERF_SAMPLE_TID)
4275 perf_output_put(handle, data->tid_entry);
4276
4277 if (sample_type & PERF_SAMPLE_TIME)
4278 perf_output_put(handle, data->time);
4279
4280 if (sample_type & PERF_SAMPLE_ADDR)
4281 perf_output_put(handle, data->addr);
4282
4283 if (sample_type & PERF_SAMPLE_ID)
4284 perf_output_put(handle, data->id);
4285
4286 if (sample_type & PERF_SAMPLE_STREAM_ID)
4287 perf_output_put(handle, data->stream_id);
4288
4289 if (sample_type & PERF_SAMPLE_CPU)
4290 perf_output_put(handle, data->cpu_entry);
4291
4292 if (sample_type & PERF_SAMPLE_PERIOD)
4293 perf_output_put(handle, data->period);
4294
4295 if (sample_type & PERF_SAMPLE_READ)
4296 perf_output_read(handle, event);
4297
4298 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
4299 if (data->callchain) {
4300 int size = 1;
4301
4302 if (data->callchain)
4303 size += data->callchain->nr;
4304
4305 size *= sizeof(u64);
4306
4307 perf_output_copy(handle, data->callchain, size);
4308 } else {
4309 u64 nr = 0;
4310 perf_output_put(handle, nr);
4311 }
4312 }
4313
4314 if (sample_type & PERF_SAMPLE_RAW) {
4315 if (data->raw) {
4316 perf_output_put(handle, data->raw->size);
4317 perf_output_copy(handle, data->raw->data,
4318 data->raw->size);
4319 } else {
4320 struct {
4321 u32 size;
4322 u32 data;
4323 } raw = {
4324 .size = sizeof(u32),
4325 .data = 0,
4326 };
4327 perf_output_put(handle, raw);
4328 }
4329 }
4330}
4331
4332void perf_prepare_sample(struct perf_event_header *header,
4333 struct perf_sample_data *data,
4334 struct perf_event *event,
4335 struct pt_regs *regs)
4336{
4337 u64 sample_type = event->attr.sample_type;
4338
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004339 header->type = PERF_RECORD_SAMPLE;
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02004340 header->size = sizeof(*header) + event->header_size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004341
4342 header->misc = 0;
4343 header->misc |= perf_misc_flags(regs);
4344
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004345 __perf_event_header__init_id(header, data, event);
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -02004346
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02004347 if (sample_type & PERF_SAMPLE_IP)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004348 data->ip = perf_instruction_pointer(regs);
4349
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004350 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
4351 int size = 1;
4352
4353 data->callchain = perf_callchain(regs);
4354
4355 if (data->callchain)
4356 size += data->callchain->nr;
4357
4358 header->size += size * sizeof(u64);
4359 }
4360
4361 if (sample_type & PERF_SAMPLE_RAW) {
4362 int size = sizeof(u32);
4363
4364 if (data->raw)
4365 size += data->raw->size;
4366 else
4367 size += sizeof(u32);
4368
4369 WARN_ON_ONCE(size & (sizeof(u64)-1));
4370 header->size += size;
4371 }
4372}
4373
4374static void perf_event_output(struct perf_event *event, int nmi,
4375 struct perf_sample_data *data,
4376 struct pt_regs *regs)
4377{
4378 struct perf_output_handle handle;
4379 struct perf_event_header header;
4380
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02004381 /* protect the callchain buffers */
4382 rcu_read_lock();
4383
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004384 perf_prepare_sample(&header, data, event, regs);
4385
4386 if (perf_output_begin(&handle, event, header.size, nmi, 1))
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02004387 goto exit;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004388
4389 perf_output_sample(&handle, &header, data, event);
4390
4391 perf_output_end(&handle);
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02004392
4393exit:
4394 rcu_read_unlock();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004395}
4396
4397/*
4398 * read event_id
4399 */
4400
4401struct perf_read_event {
4402 struct perf_event_header header;
4403
4404 u32 pid;
4405 u32 tid;
4406};
4407
4408static void
4409perf_event_read_event(struct perf_event *event,
4410 struct task_struct *task)
4411{
4412 struct perf_output_handle handle;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004413 struct perf_sample_data sample;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004414 struct perf_read_event read_event = {
4415 .header = {
4416 .type = PERF_RECORD_READ,
4417 .misc = 0,
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02004418 .size = sizeof(read_event) + event->read_size,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004419 },
4420 .pid = perf_event_pid(event, task),
4421 .tid = perf_event_tid(event, task),
4422 };
4423 int ret;
4424
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004425 perf_event_header__init_id(&read_event.header, &sample, event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004426 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
4427 if (ret)
4428 return;
4429
4430 perf_output_put(&handle, read_event);
4431 perf_output_read(&handle, event);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004432 perf_event__output_id_sample(event, &handle, &sample);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004433
4434 perf_output_end(&handle);
4435}
4436
4437/*
4438 * task tracking -- fork/exit
4439 *
Eric B Munson3af9e852010-05-18 15:30:49 +01004440 * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004441 */
4442
4443struct perf_task_event {
4444 struct task_struct *task;
4445 struct perf_event_context *task_ctx;
4446
4447 struct {
4448 struct perf_event_header header;
4449
4450 u32 pid;
4451 u32 ppid;
4452 u32 tid;
4453 u32 ptid;
4454 u64 time;
4455 } event_id;
4456};
4457
4458static void perf_event_task_output(struct perf_event *event,
4459 struct perf_task_event *task_event)
4460{
4461 struct perf_output_handle handle;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004462 struct perf_sample_data sample;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004463 struct task_struct *task = task_event->task;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004464 int ret, size = task_event->event_id.header.size;
Mike Galbraith8bb39f92010-03-26 11:11:33 +01004465
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004466 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004467
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004468 ret = perf_output_begin(&handle, event,
4469 task_event->event_id.header.size, 0, 0);
Peter Zijlstraef607772010-05-18 10:50:41 +02004470 if (ret)
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004471 goto out;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004472
4473 task_event->event_id.pid = perf_event_pid(event, task);
4474 task_event->event_id.ppid = perf_event_pid(event, current);
4475
4476 task_event->event_id.tid = perf_event_tid(event, task);
4477 task_event->event_id.ptid = perf_event_tid(event, current);
4478
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004479 perf_output_put(&handle, task_event->event_id);
4480
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004481 perf_event__output_id_sample(event, &handle, &sample);
4482
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004483 perf_output_end(&handle);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004484out:
4485 task_event->event_id.header.size = size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004486}
4487
4488static int perf_event_task_match(struct perf_event *event)
4489{
Peter Zijlstra6f93d0a2010-02-14 11:12:04 +01004490 if (event->state < PERF_EVENT_STATE_INACTIVE)
Peter Zijlstra22e19082010-01-18 09:12:32 +01004491 return 0;
4492
Stephane Eranian5632ab12011-01-03 18:20:01 +02004493 if (!event_filter_match(event))
Peter Zijlstra5d27c232009-12-17 13:16:32 +01004494 return 0;
4495
Eric B Munson3af9e852010-05-18 15:30:49 +01004496 if (event->attr.comm || event->attr.mmap ||
4497 event->attr.mmap_data || event->attr.task)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004498 return 1;
4499
4500 return 0;
4501}
4502
4503static void perf_event_task_ctx(struct perf_event_context *ctx,
4504 struct perf_task_event *task_event)
4505{
4506 struct perf_event *event;
4507
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004508 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4509 if (perf_event_task_match(event))
4510 perf_event_task_output(event, task_event);
4511 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004512}
4513
4514static void perf_event_task_event(struct perf_task_event *task_event)
4515{
4516 struct perf_cpu_context *cpuctx;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004517 struct perf_event_context *ctx;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004518 struct pmu *pmu;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004519 int ctxn;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004520
Peter Zijlstrad6ff86c2009-11-20 22:19:46 +01004521 rcu_read_lock();
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004522 list_for_each_entry_rcu(pmu, &pmus, entry) {
Peter Zijlstra41945f62010-09-16 19:17:24 +02004523 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
Peter Zijlstra51676952010-12-07 14:18:20 +01004524 if (cpuctx->active_pmu != pmu)
4525 goto next;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004526 perf_event_task_ctx(&cpuctx->ctx, task_event);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004527
4528 ctx = task_event->task_ctx;
4529 if (!ctx) {
4530 ctxn = pmu->task_ctx_nr;
4531 if (ctxn < 0)
Peter Zijlstra41945f62010-09-16 19:17:24 +02004532 goto next;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004533 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4534 }
4535 if (ctx)
4536 perf_event_task_ctx(ctx, task_event);
Peter Zijlstra41945f62010-09-16 19:17:24 +02004537next:
4538 put_cpu_ptr(pmu->pmu_cpu_context);
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004539 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004540 rcu_read_unlock();
4541}
4542
4543static void perf_event_task(struct task_struct *task,
4544 struct perf_event_context *task_ctx,
4545 int new)
4546{
4547 struct perf_task_event task_event;
4548
4549 if (!atomic_read(&nr_comm_events) &&
4550 !atomic_read(&nr_mmap_events) &&
4551 !atomic_read(&nr_task_events))
4552 return;
4553
4554 task_event = (struct perf_task_event){
4555 .task = task,
4556 .task_ctx = task_ctx,
4557 .event_id = {
4558 .header = {
4559 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
4560 .misc = 0,
4561 .size = sizeof(task_event.event_id),
4562 },
4563 /* .pid */
4564 /* .ppid */
4565 /* .tid */
4566 /* .ptid */
Peter Zijlstra6f93d0a2010-02-14 11:12:04 +01004567 .time = perf_clock(),
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004568 },
4569 };
4570
4571 perf_event_task_event(&task_event);
4572}
4573
4574void perf_event_fork(struct task_struct *task)
4575{
4576 perf_event_task(task, NULL, 1);
4577}
4578
4579/*
4580 * comm tracking
4581 */
4582
4583struct perf_comm_event {
4584 struct task_struct *task;
4585 char *comm;
4586 int comm_size;
4587
4588 struct {
4589 struct perf_event_header header;
4590
4591 u32 pid;
4592 u32 tid;
4593 } event_id;
4594};
4595
4596static void perf_event_comm_output(struct perf_event *event,
4597 struct perf_comm_event *comm_event)
4598{
4599 struct perf_output_handle handle;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004600 struct perf_sample_data sample;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004601 int size = comm_event->event_id.header.size;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004602 int ret;
4603
4604 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4605 ret = perf_output_begin(&handle, event,
4606 comm_event->event_id.header.size, 0, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004607
4608 if (ret)
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004609 goto out;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004610
4611 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
4612 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
4613
4614 perf_output_put(&handle, comm_event->event_id);
4615 perf_output_copy(&handle, comm_event->comm,
4616 comm_event->comm_size);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004617
4618 perf_event__output_id_sample(event, &handle, &sample);
4619
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004620 perf_output_end(&handle);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004621out:
4622 comm_event->event_id.header.size = size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004623}
4624
4625static int perf_event_comm_match(struct perf_event *event)
4626{
Peter Zijlstra6f93d0a2010-02-14 11:12:04 +01004627 if (event->state < PERF_EVENT_STATE_INACTIVE)
Peter Zijlstra22e19082010-01-18 09:12:32 +01004628 return 0;
4629
Stephane Eranian5632ab12011-01-03 18:20:01 +02004630 if (!event_filter_match(event))
Peter Zijlstra5d27c232009-12-17 13:16:32 +01004631 return 0;
4632
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004633 if (event->attr.comm)
4634 return 1;
4635
4636 return 0;
4637}
4638
4639static void perf_event_comm_ctx(struct perf_event_context *ctx,
4640 struct perf_comm_event *comm_event)
4641{
4642 struct perf_event *event;
4643
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004644 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4645 if (perf_event_comm_match(event))
4646 perf_event_comm_output(event, comm_event);
4647 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004648}
4649
4650static void perf_event_comm_event(struct perf_comm_event *comm_event)
4651{
4652 struct perf_cpu_context *cpuctx;
4653 struct perf_event_context *ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004654 char comm[TASK_COMM_LEN];
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004655 unsigned int size;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004656 struct pmu *pmu;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004657 int ctxn;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004658
4659 memset(comm, 0, sizeof(comm));
Márton Németh96b02d72009-11-21 23:10:15 +01004660 strlcpy(comm, comm_event->task->comm, sizeof(comm));
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004661 size = ALIGN(strlen(comm)+1, sizeof(u64));
4662
4663 comm_event->comm = comm;
4664 comm_event->comm_size = size;
4665
4666 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
Peter Zijlstraf6595f32009-11-20 22:19:47 +01004667 rcu_read_lock();
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004668 list_for_each_entry_rcu(pmu, &pmus, entry) {
Peter Zijlstra41945f62010-09-16 19:17:24 +02004669 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
Peter Zijlstra51676952010-12-07 14:18:20 +01004670 if (cpuctx->active_pmu != pmu)
4671 goto next;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004672 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004673
4674 ctxn = pmu->task_ctx_nr;
4675 if (ctxn < 0)
Peter Zijlstra41945f62010-09-16 19:17:24 +02004676 goto next;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004677
4678 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4679 if (ctx)
4680 perf_event_comm_ctx(ctx, comm_event);
Peter Zijlstra41945f62010-09-16 19:17:24 +02004681next:
4682 put_cpu_ptr(pmu->pmu_cpu_context);
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004683 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004684 rcu_read_unlock();
4685}
4686
4687void perf_event_comm(struct task_struct *task)
4688{
4689 struct perf_comm_event comm_event;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004690 struct perf_event_context *ctx;
4691 int ctxn;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004692
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004693 for_each_task_context_nr(ctxn) {
4694 ctx = task->perf_event_ctxp[ctxn];
4695 if (!ctx)
4696 continue;
4697
4698 perf_event_enable_on_exec(ctx);
4699 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004700
4701 if (!atomic_read(&nr_comm_events))
4702 return;
4703
4704 comm_event = (struct perf_comm_event){
4705 .task = task,
4706 /* .comm */
4707 /* .comm_size */
4708 .event_id = {
4709 .header = {
4710 .type = PERF_RECORD_COMM,
4711 .misc = 0,
4712 /* .size */
4713 },
4714 /* .pid */
4715 /* .tid */
4716 },
4717 };
4718
4719 perf_event_comm_event(&comm_event);
4720}
4721
4722/*
4723 * mmap tracking
4724 */
4725
4726struct perf_mmap_event {
4727 struct vm_area_struct *vma;
4728
4729 const char *file_name;
4730 int file_size;
4731
4732 struct {
4733 struct perf_event_header header;
4734
4735 u32 pid;
4736 u32 tid;
4737 u64 start;
4738 u64 len;
4739 u64 pgoff;
4740 } event_id;
4741};
4742
4743static void perf_event_mmap_output(struct perf_event *event,
4744 struct perf_mmap_event *mmap_event)
4745{
4746 struct perf_output_handle handle;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004747 struct perf_sample_data sample;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004748 int size = mmap_event->event_id.header.size;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004749 int ret;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004750
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004751 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4752 ret = perf_output_begin(&handle, event,
4753 mmap_event->event_id.header.size, 0, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004754 if (ret)
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004755 goto out;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004756
4757 mmap_event->event_id.pid = perf_event_pid(event, current);
4758 mmap_event->event_id.tid = perf_event_tid(event, current);
4759
4760 perf_output_put(&handle, mmap_event->event_id);
4761 perf_output_copy(&handle, mmap_event->file_name,
4762 mmap_event->file_size);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004763
4764 perf_event__output_id_sample(event, &handle, &sample);
4765
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004766 perf_output_end(&handle);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004767out:
4768 mmap_event->event_id.header.size = size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004769}
4770
4771static int perf_event_mmap_match(struct perf_event *event,
Eric B Munson3af9e852010-05-18 15:30:49 +01004772 struct perf_mmap_event *mmap_event,
4773 int executable)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004774{
Peter Zijlstra6f93d0a2010-02-14 11:12:04 +01004775 if (event->state < PERF_EVENT_STATE_INACTIVE)
Peter Zijlstra22e19082010-01-18 09:12:32 +01004776 return 0;
4777
Stephane Eranian5632ab12011-01-03 18:20:01 +02004778 if (!event_filter_match(event))
Peter Zijlstra5d27c232009-12-17 13:16:32 +01004779 return 0;
4780
Eric B Munson3af9e852010-05-18 15:30:49 +01004781 if ((!executable && event->attr.mmap_data) ||
4782 (executable && event->attr.mmap))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004783 return 1;
4784
4785 return 0;
4786}
4787
4788static void perf_event_mmap_ctx(struct perf_event_context *ctx,
Eric B Munson3af9e852010-05-18 15:30:49 +01004789 struct perf_mmap_event *mmap_event,
4790 int executable)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004791{
4792 struct perf_event *event;
4793
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004794 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
Eric B Munson3af9e852010-05-18 15:30:49 +01004795 if (perf_event_mmap_match(event, mmap_event, executable))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004796 perf_event_mmap_output(event, mmap_event);
4797 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004798}
4799
4800static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
4801{
4802 struct perf_cpu_context *cpuctx;
4803 struct perf_event_context *ctx;
4804 struct vm_area_struct *vma = mmap_event->vma;
4805 struct file *file = vma->vm_file;
4806 unsigned int size;
4807 char tmp[16];
4808 char *buf = NULL;
4809 const char *name;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004810 struct pmu *pmu;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004811 int ctxn;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004812
4813 memset(tmp, 0, sizeof(tmp));
4814
4815 if (file) {
4816 /*
4817 * d_path works from the end of the buffer backwards, so we
4818 * need to add enough zero bytes after the string to handle
4819 * the 64bit alignment we do later.
4820 */
4821 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
4822 if (!buf) {
4823 name = strncpy(tmp, "//enomem", sizeof(tmp));
4824 goto got_name;
4825 }
4826 name = d_path(&file->f_path, buf, PATH_MAX);
4827 if (IS_ERR(name)) {
4828 name = strncpy(tmp, "//toolong", sizeof(tmp));
4829 goto got_name;
4830 }
4831 } else {
4832 if (arch_vma_name(mmap_event->vma)) {
4833 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
4834 sizeof(tmp));
4835 goto got_name;
4836 }
4837
4838 if (!vma->vm_mm) {
4839 name = strncpy(tmp, "[vdso]", sizeof(tmp));
4840 goto got_name;
Eric B Munson3af9e852010-05-18 15:30:49 +01004841 } else if (vma->vm_start <= vma->vm_mm->start_brk &&
4842 vma->vm_end >= vma->vm_mm->brk) {
4843 name = strncpy(tmp, "[heap]", sizeof(tmp));
4844 goto got_name;
4845 } else if (vma->vm_start <= vma->vm_mm->start_stack &&
4846 vma->vm_end >= vma->vm_mm->start_stack) {
4847 name = strncpy(tmp, "[stack]", sizeof(tmp));
4848 goto got_name;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004849 }
4850
4851 name = strncpy(tmp, "//anon", sizeof(tmp));
4852 goto got_name;
4853 }
4854
4855got_name:
4856 size = ALIGN(strlen(name)+1, sizeof(u64));
4857
4858 mmap_event->file_name = name;
4859 mmap_event->file_size = size;
4860
4861 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
4862
Peter Zijlstraf6d9dd22009-11-20 22:19:48 +01004863 rcu_read_lock();
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004864 list_for_each_entry_rcu(pmu, &pmus, entry) {
Peter Zijlstra41945f62010-09-16 19:17:24 +02004865 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
Peter Zijlstra51676952010-12-07 14:18:20 +01004866 if (cpuctx->active_pmu != pmu)
4867 goto next;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004868 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
4869 vma->vm_flags & VM_EXEC);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004870
4871 ctxn = pmu->task_ctx_nr;
4872 if (ctxn < 0)
Peter Zijlstra41945f62010-09-16 19:17:24 +02004873 goto next;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02004874
4875 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4876 if (ctx) {
4877 perf_event_mmap_ctx(ctx, mmap_event,
4878 vma->vm_flags & VM_EXEC);
4879 }
Peter Zijlstra41945f62010-09-16 19:17:24 +02004880next:
4881 put_cpu_ptr(pmu->pmu_cpu_context);
Peter Zijlstra108b02c2010-09-06 14:32:03 +02004882 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004883 rcu_read_unlock();
4884
4885 kfree(buf);
4886}
4887
Eric B Munson3af9e852010-05-18 15:30:49 +01004888void perf_event_mmap(struct vm_area_struct *vma)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004889{
4890 struct perf_mmap_event mmap_event;
4891
4892 if (!atomic_read(&nr_mmap_events))
4893 return;
4894
4895 mmap_event = (struct perf_mmap_event){
4896 .vma = vma,
4897 /* .file_name */
4898 /* .file_size */
4899 .event_id = {
4900 .header = {
4901 .type = PERF_RECORD_MMAP,
Zhang, Yanmin39447b32010-04-19 13:32:41 +08004902 .misc = PERF_RECORD_MISC_USER,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004903 /* .size */
4904 },
4905 /* .pid */
4906 /* .tid */
4907 .start = vma->vm_start,
4908 .len = vma->vm_end - vma->vm_start,
Peter Zijlstra3a0304e2010-02-26 10:33:41 +01004909 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004910 },
4911 };
4912
4913 perf_event_mmap_event(&mmap_event);
4914}
4915
4916/*
4917 * IRQ throttle logging
4918 */
4919
4920static void perf_log_throttle(struct perf_event *event, int enable)
4921{
4922 struct perf_output_handle handle;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004923 struct perf_sample_data sample;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004924 int ret;
4925
4926 struct {
4927 struct perf_event_header header;
4928 u64 time;
4929 u64 id;
4930 u64 stream_id;
4931 } throttle_event = {
4932 .header = {
4933 .type = PERF_RECORD_THROTTLE,
4934 .misc = 0,
4935 .size = sizeof(throttle_event),
4936 },
4937 .time = perf_clock(),
4938 .id = primary_event_id(event),
4939 .stream_id = event->id,
4940 };
4941
4942 if (enable)
4943 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
4944
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004945 perf_event_header__init_id(&throttle_event.header, &sample, event);
4946
4947 ret = perf_output_begin(&handle, event,
4948 throttle_event.header.size, 1, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004949 if (ret)
4950 return;
4951
4952 perf_output_put(&handle, throttle_event);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02004953 perf_event__output_id_sample(event, &handle, &sample);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004954 perf_output_end(&handle);
4955}
4956
4957/*
4958 * Generic event overflow handling, sampling.
4959 */
4960
4961static int __perf_event_overflow(struct perf_event *event, int nmi,
4962 int throttle, struct perf_sample_data *data,
4963 struct pt_regs *regs)
4964{
4965 int events = atomic_read(&event->event_limit);
4966 struct hw_perf_event *hwc = &event->hw;
4967 int ret = 0;
4968
Peter Zijlstra96398822010-11-24 18:55:29 +01004969 /*
4970 * Non-sampling counters might still use the PMI to fold short
4971 * hardware counters, ignore those.
4972 */
4973 if (unlikely(!is_sampling_event(event)))
4974 return 0;
4975
Peter Zijlstra163ec432011-02-16 11:22:34 +01004976 if (unlikely(hwc->interrupts >= max_samples_per_tick)) {
4977 if (throttle) {
4978 hwc->interrupts = MAX_INTERRUPTS;
4979 perf_log_throttle(event, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004980 ret = 1;
4981 }
Peter Zijlstra163ec432011-02-16 11:22:34 +01004982 } else
4983 hwc->interrupts++;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004984
4985 if (event->attr.freq) {
4986 u64 now = perf_clock();
Peter Zijlstraabd50712010-01-26 18:50:16 +01004987 s64 delta = now - hwc->freq_time_stamp;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004988
Peter Zijlstraabd50712010-01-26 18:50:16 +01004989 hwc->freq_time_stamp = now;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004990
Peter Zijlstraabd50712010-01-26 18:50:16 +01004991 if (delta > 0 && delta < 2*TICK_NSEC)
4992 perf_adjust_period(event, delta, hwc->last_period);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004993 }
4994
4995 /*
4996 * XXX event_limit might not quite work as expected on inherited
4997 * events
4998 */
4999
5000 event->pending_kill = POLL_IN;
5001 if (events && atomic_dec_and_test(&event->event_limit)) {
5002 ret = 1;
5003 event->pending_kill = POLL_HUP;
5004 if (nmi) {
5005 event->pending_disable = 1;
Peter Zijlstrae360adb2010-10-14 14:01:34 +08005006 irq_work_queue(&event->pending);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005007 } else
5008 perf_event_disable(event);
5009 }
5010
Peter Zijlstra453f19e2009-11-20 22:19:43 +01005011 if (event->overflow_handler)
5012 event->overflow_handler(event, nmi, data, regs);
5013 else
5014 perf_event_output(event, nmi, data, regs);
5015
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005016 return ret;
5017}
5018
5019int perf_event_overflow(struct perf_event *event, int nmi,
5020 struct perf_sample_data *data,
5021 struct pt_regs *regs)
5022{
5023 return __perf_event_overflow(event, nmi, 1, data, regs);
5024}
5025
5026/*
5027 * Generic software event infrastructure
5028 */
5029
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005030struct swevent_htable {
5031 struct swevent_hlist *swevent_hlist;
5032 struct mutex hlist_mutex;
5033 int hlist_refcount;
5034
5035 /* Recursion avoidance in each contexts */
5036 int recursion[PERF_NR_CONTEXTS];
5037};
5038
5039static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
5040
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005041/*
5042 * We directly increment event->count and keep a second value in
5043 * event->hw.period_left to count intervals. This period event
5044 * is kept in the range [-sample_period, 0] so that we can use the
5045 * sign as trigger.
5046 */
5047
5048static u64 perf_swevent_set_period(struct perf_event *event)
5049{
5050 struct hw_perf_event *hwc = &event->hw;
5051 u64 period = hwc->last_period;
5052 u64 nr, offset;
5053 s64 old, val;
5054
5055 hwc->last_period = hwc->sample_period;
5056
5057again:
Peter Zijlstrae7850592010-05-21 14:43:08 +02005058 old = val = local64_read(&hwc->period_left);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005059 if (val < 0)
5060 return 0;
5061
5062 nr = div64_u64(period + val, period);
5063 offset = nr * period;
5064 val -= offset;
Peter Zijlstrae7850592010-05-21 14:43:08 +02005065 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005066 goto again;
5067
5068 return nr;
5069}
5070
Peter Zijlstra0cff7842009-11-20 22:19:44 +01005071static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005072 int nmi, struct perf_sample_data *data,
5073 struct pt_regs *regs)
5074{
5075 struct hw_perf_event *hwc = &event->hw;
5076 int throttle = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005077
5078 data->period = event->hw.last_period;
Peter Zijlstra0cff7842009-11-20 22:19:44 +01005079 if (!overflow)
5080 overflow = perf_swevent_set_period(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005081
5082 if (hwc->interrupts == MAX_INTERRUPTS)
5083 return;
5084
5085 for (; overflow; overflow--) {
5086 if (__perf_event_overflow(event, nmi, throttle,
5087 data, regs)) {
5088 /*
5089 * We inhibit the overflow from happening when
5090 * hwc->interrupts == MAX_INTERRUPTS.
5091 */
5092 break;
5093 }
5094 throttle = 1;
5095 }
5096}
5097
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005098static void perf_swevent_event(struct perf_event *event, u64 nr,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005099 int nmi, struct perf_sample_data *data,
5100 struct pt_regs *regs)
5101{
5102 struct hw_perf_event *hwc = &event->hw;
5103
Peter Zijlstrae7850592010-05-21 14:43:08 +02005104 local64_add(nr, &event->count);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005105
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005106 if (!regs)
5107 return;
5108
Franck Bui-Huu6c7e5502010-11-23 16:21:43 +01005109 if (!is_sampling_event(event))
Peter Zijlstra0cff7842009-11-20 22:19:44 +01005110 return;
5111
5112 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
5113 return perf_swevent_overflow(event, 1, nmi, data, regs);
5114
Peter Zijlstrae7850592010-05-21 14:43:08 +02005115 if (local64_add_negative(nr, &hwc->period_left))
Peter Zijlstra0cff7842009-11-20 22:19:44 +01005116 return;
5117
5118 perf_swevent_overflow(event, 0, nmi, data, regs);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005119}
5120
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01005121static int perf_exclude_event(struct perf_event *event,
5122 struct pt_regs *regs)
5123{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005124 if (event->hw.state & PERF_HES_STOPPED)
Frederic Weisbecker91b2f482011-03-07 21:27:08 +01005125 return 1;
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005126
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01005127 if (regs) {
5128 if (event->attr.exclude_user && user_mode(regs))
5129 return 1;
5130
5131 if (event->attr.exclude_kernel && !user_mode(regs))
5132 return 1;
5133 }
5134
5135 return 0;
5136}
5137
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005138static int perf_swevent_match(struct perf_event *event,
5139 enum perf_type_id type,
Li Zefan6fb29152009-10-15 11:21:42 +08005140 u32 event_id,
5141 struct perf_sample_data *data,
5142 struct pt_regs *regs)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005143{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005144 if (event->attr.type != type)
5145 return 0;
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01005146
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005147 if (event->attr.config != event_id)
5148 return 0;
5149
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01005150 if (perf_exclude_event(event, regs))
5151 return 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005152
5153 return 1;
5154}
5155
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005156static inline u64 swevent_hash(u64 type, u32 event_id)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005157{
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005158 u64 val = event_id | (type << 32);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005159
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005160 return hash_64(val, SWEVENT_HLIST_BITS);
5161}
5162
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005163static inline struct hlist_head *
5164__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005165{
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005166 u64 hash = swevent_hash(type, event_id);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005167
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005168 return &hlist->heads[hash];
5169}
5170
5171/* For the read side: events when they trigger */
5172static inline struct hlist_head *
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005173find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005174{
5175 struct swevent_hlist *hlist;
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005176
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005177 hlist = rcu_dereference(swhash->swevent_hlist);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005178 if (!hlist)
5179 return NULL;
5180
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005181 return __find_swevent_head(hlist, type, event_id);
5182}
5183
5184/* For the event head insertion and removal in the hlist */
5185static inline struct hlist_head *
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005186find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005187{
5188 struct swevent_hlist *hlist;
5189 u32 event_id = event->attr.config;
5190 u64 type = event->attr.type;
5191
5192 /*
5193 * Event scheduling is always serialized against hlist allocation
5194 * and release. Which makes the protected version suitable here.
5195 * The context lock guarantees that.
5196 */
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005197 hlist = rcu_dereference_protected(swhash->swevent_hlist,
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005198 lockdep_is_held(&event->ctx->lock));
5199 if (!hlist)
5200 return NULL;
5201
5202 return __find_swevent_head(hlist, type, event_id);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005203}
5204
5205static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5206 u64 nr, int nmi,
5207 struct perf_sample_data *data,
5208 struct pt_regs *regs)
5209{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005210 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005211 struct perf_event *event;
5212 struct hlist_node *node;
5213 struct hlist_head *head;
5214
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005215 rcu_read_lock();
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005216 head = find_swevent_head_rcu(swhash, type, event_id);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005217 if (!head)
5218 goto end;
5219
5220 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
Li Zefan6fb29152009-10-15 11:21:42 +08005221 if (perf_swevent_match(event, type, event_id, data, regs))
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005222 perf_swevent_event(event, nr, nmi, data, regs);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005223 }
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005224end:
5225 rcu_read_unlock();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005226}
5227
Peter Zijlstra4ed7c922009-11-23 11:37:29 +01005228int perf_swevent_get_recursion_context(void)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005229{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005230 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01005231
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005232 return get_recursion_context(swhash->recursion);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005233}
Ingo Molnar645e8cc2009-11-22 12:20:19 +01005234EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005235
Jesper Juhlfa9f90b2010-11-28 21:39:34 +01005236inline void perf_swevent_put_recursion_context(int rctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005237{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005238 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02005239
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005240 put_recursion_context(swhash->recursion, rctx);
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01005241}
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005242
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005243void __perf_sw_event(u32 event_id, u64 nr, int nmi,
5244 struct pt_regs *regs, u64 addr)
5245{
Ingo Molnara4234bf2009-11-23 10:57:59 +01005246 struct perf_sample_data data;
Peter Zijlstra4ed7c922009-11-23 11:37:29 +01005247 int rctx;
5248
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005249 preempt_disable_notrace();
Peter Zijlstra4ed7c922009-11-23 11:37:29 +01005250 rctx = perf_swevent_get_recursion_context();
5251 if (rctx < 0)
5252 return;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005253
Peter Zijlstradc1d6282010-03-03 15:55:04 +01005254 perf_sample_data_init(&data, addr);
Ingo Molnara4234bf2009-11-23 10:57:59 +01005255
5256 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
Peter Zijlstra4ed7c922009-11-23 11:37:29 +01005257
5258 perf_swevent_put_recursion_context(rctx);
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005259 preempt_enable_notrace();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005260}
5261
5262static void perf_swevent_read(struct perf_event *event)
5263{
5264}
5265
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005266static int perf_swevent_add(struct perf_event *event, int flags)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005267{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005268 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005269 struct hw_perf_event *hwc = &event->hw;
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005270 struct hlist_head *head;
5271
Franck Bui-Huu6c7e5502010-11-23 16:21:43 +01005272 if (is_sampling_event(event)) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005273 hwc->last_period = hwc->sample_period;
5274 perf_swevent_set_period(event);
5275 }
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005276
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005277 hwc->state = !(flags & PERF_EF_START);
5278
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005279 head = find_swevent_head(swhash, event);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005280 if (WARN_ON_ONCE(!head))
5281 return -EINVAL;
5282
5283 hlist_add_head_rcu(&event->hlist_entry, head);
5284
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005285 return 0;
5286}
5287
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005288static void perf_swevent_del(struct perf_event *event, int flags)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005289{
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005290 hlist_del_rcu(&event->hlist_entry);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005291}
5292
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005293static void perf_swevent_start(struct perf_event *event, int flags)
Peter Zijlstrac6df8d52010-06-03 11:21:20 +02005294{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005295 event->hw.state = 0;
Peter Zijlstrac6df8d52010-06-03 11:21:20 +02005296}
5297
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005298static void perf_swevent_stop(struct perf_event *event, int flags)
Peter Zijlstrac6df8d52010-06-03 11:21:20 +02005299{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005300 event->hw.state = PERF_HES_STOPPED;
Peter Zijlstrac6df8d52010-06-03 11:21:20 +02005301}
5302
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005303/* Deref the hlist from the update side */
5304static inline struct swevent_hlist *
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005305swevent_hlist_deref(struct swevent_htable *swhash)
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005306{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005307 return rcu_dereference_protected(swhash->swevent_hlist,
5308 lockdep_is_held(&swhash->hlist_mutex));
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005309}
5310
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005311static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
5312{
5313 struct swevent_hlist *hlist;
5314
5315 hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
5316 kfree(hlist);
5317}
5318
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005319static void swevent_hlist_release(struct swevent_htable *swhash)
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005320{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005321 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005322
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02005323 if (!hlist)
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005324 return;
5325
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005326 rcu_assign_pointer(swhash->swevent_hlist, NULL);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005327 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
5328}
5329
5330static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
5331{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005332 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005333
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005334 mutex_lock(&swhash->hlist_mutex);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005335
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005336 if (!--swhash->hlist_refcount)
5337 swevent_hlist_release(swhash);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005338
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005339 mutex_unlock(&swhash->hlist_mutex);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005340}
5341
5342static void swevent_hlist_put(struct perf_event *event)
5343{
5344 int cpu;
5345
5346 if (event->cpu != -1) {
5347 swevent_hlist_put_cpu(event, event->cpu);
5348 return;
5349 }
5350
5351 for_each_possible_cpu(cpu)
5352 swevent_hlist_put_cpu(event, cpu);
5353}
5354
5355static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
5356{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005357 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005358 int err = 0;
5359
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005360 mutex_lock(&swhash->hlist_mutex);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005361
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005362 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005363 struct swevent_hlist *hlist;
5364
5365 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
5366 if (!hlist) {
5367 err = -ENOMEM;
5368 goto exit;
5369 }
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005370 rcu_assign_pointer(swhash->swevent_hlist, hlist);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005371 }
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005372 swhash->hlist_refcount++;
Peter Zijlstra9ed60602010-06-11 17:36:35 +02005373exit:
Peter Zijlstrab28ab832010-09-06 14:48:15 +02005374 mutex_unlock(&swhash->hlist_mutex);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005375
5376 return err;
5377}
5378
5379static int swevent_hlist_get(struct perf_event *event)
5380{
5381 int err;
5382 int cpu, failed_cpu;
5383
5384 if (event->cpu != -1)
5385 return swevent_hlist_get_cpu(event, event->cpu);
5386
5387 get_online_cpus();
5388 for_each_possible_cpu(cpu) {
5389 err = swevent_hlist_get_cpu(event, cpu);
5390 if (err) {
5391 failed_cpu = cpu;
5392 goto fail;
5393 }
5394 }
5395 put_online_cpus();
5396
5397 return 0;
Peter Zijlstra9ed60602010-06-11 17:36:35 +02005398fail:
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005399 for_each_possible_cpu(cpu) {
5400 if (cpu == failed_cpu)
5401 break;
5402 swevent_hlist_put_cpu(event, cpu);
5403 }
5404
5405 put_online_cpus();
5406 return err;
5407}
5408
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005409atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
Frederic Weisbecker95476b62010-04-14 23:42:18 +02005410
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005411static void sw_perf_event_destroy(struct perf_event *event)
5412{
5413 u64 event_id = event->attr.config;
5414
5415 WARN_ON(event->parent);
5416
Peter Zijlstra7e54a5a2010-10-14 22:32:45 +02005417 jump_label_dec(&perf_swevent_enabled[event_id]);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005418 swevent_hlist_put(event);
5419}
5420
5421static int perf_swevent_init(struct perf_event *event)
5422{
5423 int event_id = event->attr.config;
5424
5425 if (event->attr.type != PERF_TYPE_SOFTWARE)
5426 return -ENOENT;
5427
5428 switch (event_id) {
5429 case PERF_COUNT_SW_CPU_CLOCK:
5430 case PERF_COUNT_SW_TASK_CLOCK:
5431 return -ENOENT;
5432
5433 default:
5434 break;
5435 }
5436
Dan Carpenterce677832010-10-24 21:50:42 +02005437 if (event_id >= PERF_COUNT_SW_MAX)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005438 return -ENOENT;
5439
5440 if (!event->parent) {
5441 int err;
5442
5443 err = swevent_hlist_get(event);
5444 if (err)
5445 return err;
5446
Peter Zijlstra7e54a5a2010-10-14 22:32:45 +02005447 jump_label_inc(&perf_swevent_enabled[event_id]);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005448 event->destroy = sw_perf_event_destroy;
5449 }
5450
5451 return 0;
5452}
5453
5454static struct pmu perf_swevent = {
Peter Zijlstra89a1e182010-09-07 17:34:50 +02005455 .task_ctx_nr = perf_sw_context,
5456
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005457 .event_init = perf_swevent_init,
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005458 .add = perf_swevent_add,
5459 .del = perf_swevent_del,
5460 .start = perf_swevent_start,
5461 .stop = perf_swevent_stop,
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005462 .read = perf_swevent_read,
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005463};
Frederic Weisbecker95476b62010-04-14 23:42:18 +02005464
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005465#ifdef CONFIG_EVENT_TRACING
5466
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005467static int perf_tp_filter_match(struct perf_event *event,
Frederic Weisbecker95476b62010-04-14 23:42:18 +02005468 struct perf_sample_data *data)
5469{
5470 void *record = data->raw->data;
5471
5472 if (likely(!event->filter) || filter_match_preds(event->filter, record))
5473 return 1;
5474 return 0;
5475}
5476
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005477static int perf_tp_event_match(struct perf_event *event,
5478 struct perf_sample_data *data,
5479 struct pt_regs *regs)
5480{
Frederic Weisbeckera0f7d0f2011-03-07 21:27:09 +01005481 if (event->hw.state & PERF_HES_STOPPED)
5482 return 0;
Peter Zijlstra580d6072010-05-20 20:54:31 +02005483 /*
5484 * All tracepoints are from kernel-space.
5485 */
5486 if (event->attr.exclude_kernel)
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005487 return 0;
5488
5489 if (!perf_tp_filter_match(event, data))
5490 return 0;
5491
5492 return 1;
5493}
5494
5495void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
Peter Zijlstraecc55f82010-05-21 15:11:34 +02005496 struct pt_regs *regs, struct hlist_head *head, int rctx)
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005497{
5498 struct perf_sample_data data;
5499 struct perf_event *event;
5500 struct hlist_node *node;
5501
5502 struct perf_raw_record raw = {
5503 .size = entry_size,
5504 .data = record,
5505 };
5506
5507 perf_sample_data_init(&data, addr);
5508 data.raw = &raw;
5509
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005510 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
5511 if (perf_tp_event_match(event, &data, regs))
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005512 perf_swevent_event(event, count, 1, &data, regs);
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005513 }
Peter Zijlstraecc55f82010-05-21 15:11:34 +02005514
5515 perf_swevent_put_recursion_context(rctx);
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005516}
5517EXPORT_SYMBOL_GPL(perf_tp_event);
5518
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005519static void tp_perf_event_destroy(struct perf_event *event)
5520{
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005521 perf_trace_destroy(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005522}
5523
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005524static int perf_tp_event_init(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005525{
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02005526 int err;
5527
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005528 if (event->attr.type != PERF_TYPE_TRACEPOINT)
5529 return -ENOENT;
5530
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02005531 err = perf_trace_init(event);
5532 if (err)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005533 return err;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005534
5535 event->destroy = tp_perf_event_destroy;
5536
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005537 return 0;
5538}
5539
5540static struct pmu perf_tracepoint = {
Peter Zijlstra89a1e182010-09-07 17:34:50 +02005541 .task_ctx_nr = perf_sw_context,
5542
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005543 .event_init = perf_tp_event_init,
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005544 .add = perf_trace_add,
5545 .del = perf_trace_del,
5546 .start = perf_swevent_start,
5547 .stop = perf_swevent_stop,
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005548 .read = perf_swevent_read,
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005549};
5550
5551static inline void perf_tp_register(void)
5552{
Peter Zijlstra2e80a822010-11-17 23:17:36 +01005553 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005554}
Li Zefan6fb29152009-10-15 11:21:42 +08005555
5556static int perf_event_set_filter(struct perf_event *event, void __user *arg)
5557{
5558 char *filter_str;
5559 int ret;
5560
5561 if (event->attr.type != PERF_TYPE_TRACEPOINT)
5562 return -EINVAL;
5563
5564 filter_str = strndup_user(arg, PAGE_SIZE);
5565 if (IS_ERR(filter_str))
5566 return PTR_ERR(filter_str);
5567
5568 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
5569
5570 kfree(filter_str);
5571 return ret;
5572}
5573
5574static void perf_event_free_filter(struct perf_event *event)
5575{
5576 ftrace_profile_free_filter(event);
5577}
5578
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005579#else
Li Zefan6fb29152009-10-15 11:21:42 +08005580
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005581static inline void perf_tp_register(void)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005582{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005583}
Li Zefan6fb29152009-10-15 11:21:42 +08005584
5585static int perf_event_set_filter(struct perf_event *event, void __user *arg)
5586{
5587 return -ENOENT;
5588}
5589
5590static void perf_event_free_filter(struct perf_event *event)
5591{
5592}
5593
Li Zefan07b139c2009-12-21 14:27:35 +08005594#endif /* CONFIG_EVENT_TRACING */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005595
Frederic Weisbecker24f1e32c2009-09-09 19:22:48 +02005596#ifdef CONFIG_HAVE_HW_BREAKPOINT
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01005597void perf_bp_event(struct perf_event *bp, void *data)
Frederic Weisbecker24f1e32c2009-09-09 19:22:48 +02005598{
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01005599 struct perf_sample_data sample;
5600 struct pt_regs *regs = data;
5601
Peter Zijlstradc1d6282010-03-03 15:55:04 +01005602 perf_sample_data_init(&sample, bp->attr.bp_addr);
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01005603
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005604 if (!bp->hw.state && !perf_exclude_event(bp, regs))
5605 perf_swevent_event(bp, 1, 1, &sample, regs);
Frederic Weisbecker24f1e32c2009-09-09 19:22:48 +02005606}
5607#endif
5608
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005609/*
5610 * hrtimer based swevent callback
5611 */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005612
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005613static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005614{
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005615 enum hrtimer_restart ret = HRTIMER_RESTART;
5616 struct perf_sample_data data;
5617 struct pt_regs *regs;
5618 struct perf_event *event;
5619 u64 period;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005620
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005621 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
Peter Zijlstraba3dd362011-02-15 12:41:46 +01005622
5623 if (event->state != PERF_EVENT_STATE_ACTIVE)
5624 return HRTIMER_NORESTART;
5625
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005626 event->pmu->read(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005627
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005628 perf_sample_data_init(&data, 0);
5629 data.period = event->hw.last_period;
5630 regs = get_irq_regs();
5631
5632 if (regs && !perf_exclude_event(event, regs)) {
5633 if (!(event->attr.exclude_idle && current->pid == 0))
5634 if (perf_event_overflow(event, 0, &data, regs))
5635 ret = HRTIMER_NORESTART;
5636 }
5637
5638 period = max_t(u64, 10000, event->hw.sample_period);
5639 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
5640
5641 return ret;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005642}
5643
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005644static void perf_swevent_start_hrtimer(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005645{
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005646 struct hw_perf_event *hwc = &event->hw;
Franck Bui-Huu5d508e82010-11-23 16:21:45 +01005647 s64 period;
5648
5649 if (!is_sampling_event(event))
5650 return;
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005651
Franck Bui-Huu5d508e82010-11-23 16:21:45 +01005652 period = local64_read(&hwc->period_left);
5653 if (period) {
5654 if (period < 0)
5655 period = 10000;
Peter Zijlstrafa407f32010-06-24 12:35:12 +02005656
Franck Bui-Huu5d508e82010-11-23 16:21:45 +01005657 local64_set(&hwc->period_left, 0);
5658 } else {
5659 period = max_t(u64, 10000, hwc->sample_period);
5660 }
5661 __hrtimer_start_range_ns(&hwc->hrtimer,
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005662 ns_to_ktime(period), 0,
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02005663 HRTIMER_MODE_REL_PINNED, 0);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005664}
5665
5666static void perf_swevent_cancel_hrtimer(struct perf_event *event)
5667{
5668 struct hw_perf_event *hwc = &event->hw;
5669
Franck Bui-Huu6c7e5502010-11-23 16:21:43 +01005670 if (is_sampling_event(event)) {
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005671 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
Peter Zijlstrafa407f32010-06-24 12:35:12 +02005672 local64_set(&hwc->period_left, ktime_to_ns(remaining));
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005673
5674 hrtimer_cancel(&hwc->hrtimer);
5675 }
5676}
5677
Peter Zijlstraba3dd362011-02-15 12:41:46 +01005678static void perf_swevent_init_hrtimer(struct perf_event *event)
5679{
5680 struct hw_perf_event *hwc = &event->hw;
5681
5682 if (!is_sampling_event(event))
5683 return;
5684
5685 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5686 hwc->hrtimer.function = perf_swevent_hrtimer;
5687
5688 /*
5689 * Since hrtimers have a fixed rate, we can do a static freq->period
5690 * mapping and avoid the whole period adjust feedback stuff.
5691 */
5692 if (event->attr.freq) {
5693 long freq = event->attr.sample_freq;
5694
5695 event->attr.sample_period = NSEC_PER_SEC / freq;
5696 hwc->sample_period = event->attr.sample_period;
5697 local64_set(&hwc->period_left, hwc->sample_period);
5698 event->attr.freq = 0;
5699 }
5700}
5701
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005702/*
5703 * Software event: cpu wall time clock
5704 */
5705
5706static void cpu_clock_event_update(struct perf_event *event)
5707{
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005708 s64 prev;
5709 u64 now;
5710
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005711 now = local_clock();
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005712 prev = local64_xchg(&event->hw.prev_count, now);
5713 local64_add(now - prev, &event->count);
5714}
5715
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005716static void cpu_clock_event_start(struct perf_event *event, int flags)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005717{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005718 local64_set(&event->hw.prev_count, local_clock());
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005719 perf_swevent_start_hrtimer(event);
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005720}
5721
5722static void cpu_clock_event_stop(struct perf_event *event, int flags)
5723{
5724 perf_swevent_cancel_hrtimer(event);
5725 cpu_clock_event_update(event);
5726}
5727
5728static int cpu_clock_event_add(struct perf_event *event, int flags)
5729{
5730 if (flags & PERF_EF_START)
5731 cpu_clock_event_start(event, flags);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005732
5733 return 0;
5734}
5735
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005736static void cpu_clock_event_del(struct perf_event *event, int flags)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005737{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005738 cpu_clock_event_stop(event, flags);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005739}
5740
5741static void cpu_clock_event_read(struct perf_event *event)
5742{
5743 cpu_clock_event_update(event);
5744}
5745
5746static int cpu_clock_event_init(struct perf_event *event)
5747{
5748 if (event->attr.type != PERF_TYPE_SOFTWARE)
5749 return -ENOENT;
5750
5751 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5752 return -ENOENT;
5753
Peter Zijlstraba3dd362011-02-15 12:41:46 +01005754 perf_swevent_init_hrtimer(event);
5755
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005756 return 0;
5757}
5758
5759static struct pmu perf_cpu_clock = {
Peter Zijlstra89a1e182010-09-07 17:34:50 +02005760 .task_ctx_nr = perf_sw_context,
5761
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005762 .event_init = cpu_clock_event_init,
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005763 .add = cpu_clock_event_add,
5764 .del = cpu_clock_event_del,
5765 .start = cpu_clock_event_start,
5766 .stop = cpu_clock_event_stop,
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005767 .read = cpu_clock_event_read,
5768};
5769
5770/*
5771 * Software event: task time clock
5772 */
5773
5774static void task_clock_event_update(struct perf_event *event, u64 now)
5775{
5776 u64 prev;
5777 s64 delta;
5778
5779 prev = local64_xchg(&event->hw.prev_count, now);
5780 delta = now - prev;
5781 local64_add(delta, &event->count);
5782}
5783
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005784static void task_clock_event_start(struct perf_event *event, int flags)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005785{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005786 local64_set(&event->hw.prev_count, event->ctx->time);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005787 perf_swevent_start_hrtimer(event);
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005788}
5789
5790static void task_clock_event_stop(struct perf_event *event, int flags)
5791{
5792 perf_swevent_cancel_hrtimer(event);
5793 task_clock_event_update(event, event->ctx->time);
5794}
5795
5796static int task_clock_event_add(struct perf_event *event, int flags)
5797{
5798 if (flags & PERF_EF_START)
5799 task_clock_event_start(event, flags);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005800
5801 return 0;
5802}
5803
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005804static void task_clock_event_del(struct perf_event *event, int flags)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005805{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005806 task_clock_event_stop(event, PERF_EF_UPDATE);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005807}
5808
5809static void task_clock_event_read(struct perf_event *event)
5810{
Peter Zijlstra768a06e2011-02-22 16:52:24 +01005811 u64 now = perf_clock();
5812 u64 delta = now - event->ctx->timestamp;
5813 u64 time = event->ctx->time + delta;
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005814
5815 task_clock_event_update(event, time);
5816}
5817
5818static int task_clock_event_init(struct perf_event *event)
5819{
5820 if (event->attr.type != PERF_TYPE_SOFTWARE)
5821 return -ENOENT;
5822
5823 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5824 return -ENOENT;
5825
Peter Zijlstraba3dd362011-02-15 12:41:46 +01005826 perf_swevent_init_hrtimer(event);
5827
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005828 return 0;
5829}
5830
5831static struct pmu perf_task_clock = {
Peter Zijlstra89a1e182010-09-07 17:34:50 +02005832 .task_ctx_nr = perf_sw_context,
5833
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005834 .event_init = task_clock_event_init,
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02005835 .add = task_clock_event_add,
5836 .del = task_clock_event_del,
5837 .start = task_clock_event_start,
5838 .stop = task_clock_event_stop,
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005839 .read = task_clock_event_read,
5840};
5841
Peter Zijlstraad5133b2010-06-15 12:22:39 +02005842static void perf_pmu_nop_void(struct pmu *pmu)
5843{
5844}
5845
5846static int perf_pmu_nop_int(struct pmu *pmu)
5847{
5848 return 0;
5849}
5850
5851static void perf_pmu_start_txn(struct pmu *pmu)
5852{
5853 perf_pmu_disable(pmu);
5854}
5855
5856static int perf_pmu_commit_txn(struct pmu *pmu)
5857{
5858 perf_pmu_enable(pmu);
5859 return 0;
5860}
5861
5862static void perf_pmu_cancel_txn(struct pmu *pmu)
5863{
5864 perf_pmu_enable(pmu);
5865}
5866
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02005867/*
5868 * Ensures all contexts with the same task_ctx_nr have the same
5869 * pmu_cpu_context too.
5870 */
5871static void *find_pmu_context(int ctxn)
5872{
5873 struct pmu *pmu;
5874
5875 if (ctxn < 0)
5876 return NULL;
5877
5878 list_for_each_entry(pmu, &pmus, entry) {
5879 if (pmu->task_ctx_nr == ctxn)
5880 return pmu->pmu_cpu_context;
5881 }
5882
5883 return NULL;
5884}
5885
Peter Zijlstra51676952010-12-07 14:18:20 +01005886static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02005887{
Peter Zijlstra51676952010-12-07 14:18:20 +01005888 int cpu;
5889
5890 for_each_possible_cpu(cpu) {
5891 struct perf_cpu_context *cpuctx;
5892
5893 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5894
5895 if (cpuctx->active_pmu == old_pmu)
5896 cpuctx->active_pmu = pmu;
5897 }
5898}
5899
5900static void free_pmu_context(struct pmu *pmu)
5901{
5902 struct pmu *i;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02005903
5904 mutex_lock(&pmus_lock);
5905 /*
5906 * Like a real lame refcount.
5907 */
Peter Zijlstra51676952010-12-07 14:18:20 +01005908 list_for_each_entry(i, &pmus, entry) {
5909 if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
5910 update_pmu_context(i, pmu);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02005911 goto out;
Peter Zijlstra51676952010-12-07 14:18:20 +01005912 }
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02005913 }
5914
Peter Zijlstra51676952010-12-07 14:18:20 +01005915 free_percpu(pmu->pmu_cpu_context);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02005916out:
5917 mutex_unlock(&pmus_lock);
5918}
Peter Zijlstra2e80a822010-11-17 23:17:36 +01005919static struct idr pmu_idr;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02005920
Peter Zijlstraabe43402010-11-17 23:17:37 +01005921static ssize_t
5922type_show(struct device *dev, struct device_attribute *attr, char *page)
5923{
5924 struct pmu *pmu = dev_get_drvdata(dev);
5925
5926 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
5927}
5928
5929static struct device_attribute pmu_dev_attrs[] = {
5930 __ATTR_RO(type),
5931 __ATTR_NULL,
5932};
5933
5934static int pmu_bus_running;
5935static struct bus_type pmu_bus = {
5936 .name = "event_source",
5937 .dev_attrs = pmu_dev_attrs,
5938};
5939
5940static void pmu_dev_release(struct device *dev)
5941{
5942 kfree(dev);
5943}
5944
5945static int pmu_dev_alloc(struct pmu *pmu)
5946{
5947 int ret = -ENOMEM;
5948
5949 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
5950 if (!pmu->dev)
5951 goto out;
5952
5953 device_initialize(pmu->dev);
5954 ret = dev_set_name(pmu->dev, "%s", pmu->name);
5955 if (ret)
5956 goto free_dev;
5957
5958 dev_set_drvdata(pmu->dev, pmu);
5959 pmu->dev->bus = &pmu_bus;
5960 pmu->dev->release = pmu_dev_release;
5961 ret = device_add(pmu->dev);
5962 if (ret)
5963 goto free_dev;
5964
5965out:
5966 return ret;
5967
5968free_dev:
5969 put_device(pmu->dev);
5970 goto out;
5971}
5972
Peter Zijlstra547e9fd2011-01-19 12:51:39 +01005973static struct lock_class_key cpuctx_mutex;
5974
Peter Zijlstra2e80a822010-11-17 23:17:36 +01005975int perf_pmu_register(struct pmu *pmu, char *name, int type)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005976{
Peter Zijlstra108b02c2010-09-06 14:32:03 +02005977 int cpu, ret;
Peter Zijlstra33696fc2010-06-14 08:49:00 +02005978
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02005979 mutex_lock(&pmus_lock);
Peter Zijlstra33696fc2010-06-14 08:49:00 +02005980 ret = -ENOMEM;
5981 pmu->pmu_disable_count = alloc_percpu(int);
5982 if (!pmu->pmu_disable_count)
5983 goto unlock;
Peter Zijlstraad5133b2010-06-15 12:22:39 +02005984
Peter Zijlstra2e80a822010-11-17 23:17:36 +01005985 pmu->type = -1;
5986 if (!name)
5987 goto skip_type;
5988 pmu->name = name;
5989
5990 if (type < 0) {
5991 int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
5992 if (!err)
5993 goto free_pdc;
5994
5995 err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
5996 if (err) {
5997 ret = err;
5998 goto free_pdc;
5999 }
6000 }
6001 pmu->type = type;
6002
Peter Zijlstraabe43402010-11-17 23:17:37 +01006003 if (pmu_bus_running) {
6004 ret = pmu_dev_alloc(pmu);
6005 if (ret)
6006 goto free_idr;
6007 }
6008
Peter Zijlstra2e80a822010-11-17 23:17:36 +01006009skip_type:
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006010 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
6011 if (pmu->pmu_cpu_context)
6012 goto got_cpu_context;
6013
Peter Zijlstra108b02c2010-09-06 14:32:03 +02006014 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
6015 if (!pmu->pmu_cpu_context)
Peter Zijlstraabe43402010-11-17 23:17:37 +01006016 goto free_dev;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02006017
6018 for_each_possible_cpu(cpu) {
6019 struct perf_cpu_context *cpuctx;
6020
6021 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
Peter Zijlstraeb184472010-09-07 15:55:13 +02006022 __perf_event_init_context(&cpuctx->ctx);
Peter Zijlstra547e9fd2011-01-19 12:51:39 +01006023 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006024 cpuctx->ctx.type = cpu_context;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02006025 cpuctx->ctx.pmu = pmu;
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02006026 cpuctx->jiffies_interval = 1;
6027 INIT_LIST_HEAD(&cpuctx->rotation_list);
Peter Zijlstra51676952010-12-07 14:18:20 +01006028 cpuctx->active_pmu = pmu;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02006029 }
6030
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006031got_cpu_context:
Peter Zijlstraad5133b2010-06-15 12:22:39 +02006032 if (!pmu->start_txn) {
6033 if (pmu->pmu_enable) {
6034 /*
6035 * If we have pmu_enable/pmu_disable calls, install
6036 * transaction stubs that use that to try and batch
6037 * hardware accesses.
6038 */
6039 pmu->start_txn = perf_pmu_start_txn;
6040 pmu->commit_txn = perf_pmu_commit_txn;
6041 pmu->cancel_txn = perf_pmu_cancel_txn;
6042 } else {
6043 pmu->start_txn = perf_pmu_nop_void;
6044 pmu->commit_txn = perf_pmu_nop_int;
6045 pmu->cancel_txn = perf_pmu_nop_void;
6046 }
6047 }
6048
6049 if (!pmu->pmu_enable) {
6050 pmu->pmu_enable = perf_pmu_nop_void;
6051 pmu->pmu_disable = perf_pmu_nop_void;
6052 }
6053
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006054 list_add_rcu(&pmu->entry, &pmus);
Peter Zijlstra33696fc2010-06-14 08:49:00 +02006055 ret = 0;
6056unlock:
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006057 mutex_unlock(&pmus_lock);
6058
Peter Zijlstra33696fc2010-06-14 08:49:00 +02006059 return ret;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02006060
Peter Zijlstraabe43402010-11-17 23:17:37 +01006061free_dev:
6062 device_del(pmu->dev);
6063 put_device(pmu->dev);
6064
Peter Zijlstra2e80a822010-11-17 23:17:36 +01006065free_idr:
6066 if (pmu->type >= PERF_TYPE_MAX)
6067 idr_remove(&pmu_idr, pmu->type);
6068
Peter Zijlstra108b02c2010-09-06 14:32:03 +02006069free_pdc:
6070 free_percpu(pmu->pmu_disable_count);
6071 goto unlock;
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006072}
6073
6074void perf_pmu_unregister(struct pmu *pmu)
6075{
6076 mutex_lock(&pmus_lock);
6077 list_del_rcu(&pmu->entry);
6078 mutex_unlock(&pmus_lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006079
6080 /*
Peter Zijlstracde8e882010-09-13 11:06:55 +02006081 * We dereference the pmu list under both SRCU and regular RCU, so
6082 * synchronize against both of those.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006083 */
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006084 synchronize_srcu(&pmus_srcu);
Peter Zijlstracde8e882010-09-13 11:06:55 +02006085 synchronize_rcu();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006086
Peter Zijlstra33696fc2010-06-14 08:49:00 +02006087 free_percpu(pmu->pmu_disable_count);
Peter Zijlstra2e80a822010-11-17 23:17:36 +01006088 if (pmu->type >= PERF_TYPE_MAX)
6089 idr_remove(&pmu_idr, pmu->type);
Peter Zijlstraabe43402010-11-17 23:17:37 +01006090 device_del(pmu->dev);
6091 put_device(pmu->dev);
Peter Zijlstra51676952010-12-07 14:18:20 +01006092 free_pmu_context(pmu);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006093}
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006094
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006095struct pmu *perf_init_event(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006096{
Peter Zijlstra51b0fe32010-06-11 13:35:57 +02006097 struct pmu *pmu = NULL;
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006098 int idx;
Lin Ming940c5b22011-02-27 21:13:31 +08006099 int ret;
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006100
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006101 idx = srcu_read_lock(&pmus_srcu);
Peter Zijlstra2e80a822010-11-17 23:17:36 +01006102
6103 rcu_read_lock();
6104 pmu = idr_find(&pmu_idr, event->attr.type);
6105 rcu_read_unlock();
Lin Ming940c5b22011-02-27 21:13:31 +08006106 if (pmu) {
6107 ret = pmu->event_init(event);
6108 if (ret)
6109 pmu = ERR_PTR(ret);
Peter Zijlstra2e80a822010-11-17 23:17:36 +01006110 goto unlock;
Lin Ming940c5b22011-02-27 21:13:31 +08006111 }
Peter Zijlstra2e80a822010-11-17 23:17:36 +01006112
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006113 list_for_each_entry_rcu(pmu, &pmus, entry) {
Lin Ming940c5b22011-02-27 21:13:31 +08006114 ret = pmu->event_init(event);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006115 if (!ret)
Peter Zijlstrae5f4d332010-09-10 17:38:06 +02006116 goto unlock;
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006117
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006118 if (ret != -ENOENT) {
6119 pmu = ERR_PTR(ret);
Peter Zijlstrae5f4d332010-09-10 17:38:06 +02006120 goto unlock;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006121 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006122 }
Peter Zijlstrae5f4d332010-09-10 17:38:06 +02006123 pmu = ERR_PTR(-ENOENT);
6124unlock:
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006125 srcu_read_unlock(&pmus_srcu, idx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006126
6127 return pmu;
6128}
6129
6130/*
6131 * Allocate and initialize a event structure
6132 */
6133static struct perf_event *
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006134perf_event_alloc(struct perf_event_attr *attr, int cpu,
Peter Zijlstrad580ff82010-10-14 17:43:23 +02006135 struct task_struct *task,
6136 struct perf_event *group_leader,
6137 struct perf_event *parent_event,
6138 perf_overflow_handler_t overflow_handler)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006139{
Peter Zijlstra51b0fe32010-06-11 13:35:57 +02006140 struct pmu *pmu;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006141 struct perf_event *event;
6142 struct hw_perf_event *hwc;
6143 long err;
6144
Oleg Nesterov66832eb2011-01-18 17:10:32 +01006145 if ((unsigned)cpu >= nr_cpu_ids) {
6146 if (!task || cpu != -1)
6147 return ERR_PTR(-EINVAL);
6148 }
6149
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006150 event = kzalloc(sizeof(*event), GFP_KERNEL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006151 if (!event)
6152 return ERR_PTR(-ENOMEM);
6153
6154 /*
6155 * Single events are their own group leaders, with an
6156 * empty sibling list:
6157 */
6158 if (!group_leader)
6159 group_leader = event;
6160
6161 mutex_init(&event->child_mutex);
6162 INIT_LIST_HEAD(&event->child_list);
6163
6164 INIT_LIST_HEAD(&event->group_entry);
6165 INIT_LIST_HEAD(&event->event_entry);
6166 INIT_LIST_HEAD(&event->sibling_list);
6167 init_waitqueue_head(&event->waitq);
Peter Zijlstrae360adb2010-10-14 14:01:34 +08006168 init_irq_work(&event->pending, perf_pending_event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006169
6170 mutex_init(&event->mmap_mutex);
6171
6172 event->cpu = cpu;
6173 event->attr = *attr;
6174 event->group_leader = group_leader;
6175 event->pmu = NULL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006176 event->oncpu = -1;
6177
6178 event->parent = parent_event;
6179
6180 event->ns = get_pid_ns(current->nsproxy->pid_ns);
6181 event->id = atomic64_inc_return(&perf_event_id);
6182
6183 event->state = PERF_EVENT_STATE_INACTIVE;
6184
Peter Zijlstrad580ff82010-10-14 17:43:23 +02006185 if (task) {
6186 event->attach_state = PERF_ATTACH_TASK;
6187#ifdef CONFIG_HAVE_HW_BREAKPOINT
6188 /*
6189 * hw_breakpoint is a bit difficult here..
6190 */
6191 if (attr->type == PERF_TYPE_BREAKPOINT)
6192 event->hw.bp_target = task;
6193#endif
6194 }
6195
Frederic Weisbeckerb326e952009-12-05 09:44:31 +01006196 if (!overflow_handler && parent_event)
6197 overflow_handler = parent_event->overflow_handler;
Oleg Nesterov66832eb2011-01-18 17:10:32 +01006198
Frederic Weisbeckerb326e952009-12-05 09:44:31 +01006199 event->overflow_handler = overflow_handler;
Frederic Weisbecker97eaf532009-10-18 15:33:50 +02006200
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006201 if (attr->disabled)
6202 event->state = PERF_EVENT_STATE_OFF;
6203
6204 pmu = NULL;
6205
6206 hwc = &event->hw;
6207 hwc->sample_period = attr->sample_period;
6208 if (attr->freq && attr->sample_freq)
6209 hwc->sample_period = 1;
6210 hwc->last_period = hwc->sample_period;
6211
Peter Zijlstrae7850592010-05-21 14:43:08 +02006212 local64_set(&hwc->period_left, hwc->sample_period);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006213
6214 /*
6215 * we currently do not support PERF_FORMAT_GROUP on inherited events
6216 */
6217 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
6218 goto done;
6219
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006220 pmu = perf_init_event(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006221
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006222done:
6223 err = 0;
6224 if (!pmu)
6225 err = -EINVAL;
6226 else if (IS_ERR(pmu))
6227 err = PTR_ERR(pmu);
6228
6229 if (err) {
6230 if (event->ns)
6231 put_pid_ns(event->ns);
6232 kfree(event);
6233 return ERR_PTR(err);
6234 }
6235
6236 event->pmu = pmu;
6237
6238 if (!event->parent) {
Peter Zijlstra82cd6de2010-10-14 17:57:23 +02006239 if (event->attach_state & PERF_ATTACH_TASK)
Stephane Eraniane5d13672011-02-14 11:20:01 +02006240 jump_label_inc(&perf_sched_events);
Eric B Munson3af9e852010-05-18 15:30:49 +01006241 if (event->attr.mmap || event->attr.mmap_data)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006242 atomic_inc(&nr_mmap_events);
6243 if (event->attr.comm)
6244 atomic_inc(&nr_comm_events);
6245 if (event->attr.task)
6246 atomic_inc(&nr_task_events);
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02006247 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
6248 err = get_callchain_buffers();
6249 if (err) {
6250 free_event(event);
6251 return ERR_PTR(err);
6252 }
6253 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006254 }
6255
6256 return event;
6257}
6258
6259static int perf_copy_attr(struct perf_event_attr __user *uattr,
6260 struct perf_event_attr *attr)
6261{
6262 u32 size;
6263 int ret;
6264
6265 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
6266 return -EFAULT;
6267
6268 /*
6269 * zero the full structure, so that a short copy will be nice.
6270 */
6271 memset(attr, 0, sizeof(*attr));
6272
6273 ret = get_user(size, &uattr->size);
6274 if (ret)
6275 return ret;
6276
6277 if (size > PAGE_SIZE) /* silly large */
6278 goto err_size;
6279
6280 if (!size) /* abi compat */
6281 size = PERF_ATTR_SIZE_VER0;
6282
6283 if (size < PERF_ATTR_SIZE_VER0)
6284 goto err_size;
6285
6286 /*
6287 * If we're handed a bigger struct than we know of,
6288 * ensure all the unknown bits are 0 - i.e. new
6289 * user-space does not rely on any kernel feature
6290 * extensions we dont know about yet.
6291 */
6292 if (size > sizeof(*attr)) {
6293 unsigned char __user *addr;
6294 unsigned char __user *end;
6295 unsigned char val;
6296
6297 addr = (void __user *)uattr + sizeof(*attr);
6298 end = (void __user *)uattr + size;
6299
6300 for (; addr < end; addr++) {
6301 ret = get_user(val, addr);
6302 if (ret)
6303 return ret;
6304 if (val)
6305 goto err_size;
6306 }
6307 size = sizeof(*attr);
6308 }
6309
6310 ret = copy_from_user(attr, uattr, size);
6311 if (ret)
6312 return -EFAULT;
6313
6314 /*
6315 * If the type exists, the corresponding creation will verify
6316 * the attr->config.
6317 */
6318 if (attr->type >= PERF_TYPE_MAX)
6319 return -EINVAL;
6320
Mahesh Salgaonkarcd757642010-01-30 10:25:18 +05306321 if (attr->__reserved_1)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006322 return -EINVAL;
6323
6324 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
6325 return -EINVAL;
6326
6327 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
6328 return -EINVAL;
6329
6330out:
6331 return ret;
6332
6333err_size:
6334 put_user(sizeof(*attr), &uattr->size);
6335 ret = -E2BIG;
6336 goto out;
6337}
6338
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006339static int
6340perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006341{
Peter Zijlstraca5135e2010-05-28 19:33:23 +02006342 struct perf_buffer *buffer = NULL, *old_buffer = NULL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006343 int ret = -EINVAL;
6344
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006345 if (!output_event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006346 goto set;
6347
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006348 /* don't allow circular references */
6349 if (event == output_event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006350 goto out;
6351
Peter Zijlstra0f139302010-05-20 14:35:15 +02006352 /*
6353 * Don't allow cross-cpu buffers
6354 */
6355 if (output_event->cpu != event->cpu)
6356 goto out;
6357
6358 /*
6359 * If its not a per-cpu buffer, it must be the same task.
6360 */
6361 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
6362 goto out;
6363
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006364set:
6365 mutex_lock(&event->mmap_mutex);
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006366 /* Can't redirect output if we've got an active mmap() */
6367 if (atomic_read(&event->mmap_count))
6368 goto unlock;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006369
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006370 if (output_event) {
6371 /* get the buffer we want to redirect to */
Peter Zijlstraca5135e2010-05-28 19:33:23 +02006372 buffer = perf_buffer_get(output_event);
6373 if (!buffer)
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006374 goto unlock;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006375 }
6376
Peter Zijlstraca5135e2010-05-28 19:33:23 +02006377 old_buffer = event->buffer;
6378 rcu_assign_pointer(event->buffer, buffer);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006379 ret = 0;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006380unlock:
6381 mutex_unlock(&event->mmap_mutex);
6382
Peter Zijlstraca5135e2010-05-28 19:33:23 +02006383 if (old_buffer)
6384 perf_buffer_put(old_buffer);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006385out:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006386 return ret;
6387}
6388
6389/**
6390 * sys_perf_event_open - open a performance event, associate it to a task/cpu
6391 *
6392 * @attr_uptr: event_id type attributes for monitoring/sampling
6393 * @pid: target pid
6394 * @cpu: target cpu
6395 * @group_fd: group leader event fd
6396 */
6397SYSCALL_DEFINE5(perf_event_open,
6398 struct perf_event_attr __user *, attr_uptr,
6399 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
6400{
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006401 struct perf_event *group_leader = NULL, *output_event = NULL;
6402 struct perf_event *event, *sibling;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006403 struct perf_event_attr attr;
6404 struct perf_event_context *ctx;
6405 struct file *event_file = NULL;
6406 struct file *group_file = NULL;
Matt Helsley38a81da2010-09-13 13:01:20 -07006407 struct task_struct *task = NULL;
Peter Zijlstra89a1e182010-09-07 17:34:50 +02006408 struct pmu *pmu;
Al Viroea635c62010-05-26 17:40:29 -04006409 int event_fd;
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006410 int move_group = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006411 int fput_needed = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006412 int err;
6413
6414 /* for future expandability... */
Stephane Eraniane5d13672011-02-14 11:20:01 +02006415 if (flags & ~PERF_FLAG_ALL)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006416 return -EINVAL;
6417
6418 err = perf_copy_attr(attr_uptr, &attr);
6419 if (err)
6420 return err;
6421
6422 if (!attr.exclude_kernel) {
6423 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
6424 return -EACCES;
6425 }
6426
6427 if (attr.freq) {
6428 if (attr.sample_freq > sysctl_perf_event_sample_rate)
6429 return -EINVAL;
6430 }
6431
Stephane Eraniane5d13672011-02-14 11:20:01 +02006432 /*
6433 * In cgroup mode, the pid argument is used to pass the fd
6434 * opened to the cgroup directory in cgroupfs. The cpu argument
6435 * designates the cpu on which to monitor threads from that
6436 * cgroup.
6437 */
6438 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
6439 return -EINVAL;
6440
Al Viroea635c62010-05-26 17:40:29 -04006441 event_fd = get_unused_fd_flags(O_RDWR);
6442 if (event_fd < 0)
6443 return event_fd;
6444
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006445 if (group_fd != -1) {
6446 group_leader = perf_fget_light(group_fd, &fput_needed);
6447 if (IS_ERR(group_leader)) {
6448 err = PTR_ERR(group_leader);
Stephane Eraniand14b12d2010-09-17 11:28:47 +02006449 goto err_fd;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006450 }
6451 group_file = group_leader->filp;
6452 if (flags & PERF_FLAG_FD_OUTPUT)
6453 output_event = group_leader;
6454 if (flags & PERF_FLAG_FD_NO_GROUP)
6455 group_leader = NULL;
6456 }
6457
Stephane Eraniane5d13672011-02-14 11:20:01 +02006458 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
Peter Zijlstrac6be5a52010-10-14 16:59:46 +02006459 task = find_lively_task_by_vpid(pid);
6460 if (IS_ERR(task)) {
6461 err = PTR_ERR(task);
6462 goto err_group_fd;
6463 }
6464 }
6465
Peter Zijlstrad580ff82010-10-14 17:43:23 +02006466 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL);
Stephane Eraniand14b12d2010-09-17 11:28:47 +02006467 if (IS_ERR(event)) {
6468 err = PTR_ERR(event);
Peter Zijlstrac6be5a52010-10-14 16:59:46 +02006469 goto err_task;
Stephane Eraniand14b12d2010-09-17 11:28:47 +02006470 }
6471
Stephane Eraniane5d13672011-02-14 11:20:01 +02006472 if (flags & PERF_FLAG_PID_CGROUP) {
6473 err = perf_cgroup_connect(pid, event, &attr, group_leader);
6474 if (err)
6475 goto err_alloc;
Peter Zijlstra08309372011-03-03 11:31:20 +01006476 /*
6477 * one more event:
6478 * - that has cgroup constraint on event->cpu
6479 * - that may need work on context switch
6480 */
6481 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6482 jump_label_inc(&perf_sched_events);
Stephane Eraniane5d13672011-02-14 11:20:01 +02006483 }
6484
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006485 /*
Peter Zijlstra89a1e182010-09-07 17:34:50 +02006486 * Special case software events and allow them to be part of
6487 * any hardware group.
6488 */
6489 pmu = event->pmu;
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006490
6491 if (group_leader &&
6492 (is_software_event(event) != is_software_event(group_leader))) {
6493 if (is_software_event(event)) {
6494 /*
6495 * If event and group_leader are not both a software
6496 * event, and event is, then group leader is not.
6497 *
6498 * Allow the addition of software events to !software
6499 * groups, this is safe because software events never
6500 * fail to schedule.
6501 */
6502 pmu = group_leader->pmu;
6503 } else if (is_software_event(group_leader) &&
6504 (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
6505 /*
6506 * In case the group is a pure software group, and we
6507 * try to add a hardware event, move the whole group to
6508 * the hardware context.
6509 */
6510 move_group = 1;
6511 }
6512 }
Peter Zijlstra89a1e182010-09-07 17:34:50 +02006513
6514 /*
6515 * Get the target context (task or percpu):
6516 */
Matt Helsley38a81da2010-09-13 13:01:20 -07006517 ctx = find_get_context(pmu, task, cpu);
Peter Zijlstra89a1e182010-09-07 17:34:50 +02006518 if (IS_ERR(ctx)) {
6519 err = PTR_ERR(ctx);
Peter Zijlstrac6be5a52010-10-14 16:59:46 +02006520 goto err_alloc;
Peter Zijlstra89a1e182010-09-07 17:34:50 +02006521 }
6522
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006523 /*
6524 * Look up the group leader (we will attach this event to it):
6525 */
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006526 if (group_leader) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006527 err = -EINVAL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006528
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006529 /*
6530 * Do not allow a recursive hierarchy (this new sibling
6531 * becoming part of another group-sibling):
6532 */
6533 if (group_leader->group_leader != group_leader)
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006534 goto err_context;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006535 /*
6536 * Do not allow to attach to a group in a different
6537 * task or CPU context:
6538 */
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006539 if (move_group) {
6540 if (group_leader->ctx->type != ctx->type)
6541 goto err_context;
6542 } else {
6543 if (group_leader->ctx != ctx)
6544 goto err_context;
6545 }
6546
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006547 /*
6548 * Only a group leader can be exclusive or pinned
6549 */
6550 if (attr.exclusive || attr.pinned)
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006551 goto err_context;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006552 }
6553
6554 if (output_event) {
6555 err = perf_event_set_output(event, output_event);
6556 if (err)
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006557 goto err_context;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02006558 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006559
Al Viroea635c62010-05-26 17:40:29 -04006560 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
6561 if (IS_ERR(event_file)) {
6562 err = PTR_ERR(event_file);
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006563 goto err_context;
Al Viroea635c62010-05-26 17:40:29 -04006564 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006565
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006566 if (move_group) {
6567 struct perf_event_context *gctx = group_leader->ctx;
6568
6569 mutex_lock(&gctx->mutex);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01006570 perf_remove_from_context(group_leader);
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006571 list_for_each_entry(sibling, &group_leader->sibling_list,
6572 group_entry) {
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01006573 perf_remove_from_context(sibling);
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006574 put_ctx(gctx);
6575 }
6576 mutex_unlock(&gctx->mutex);
6577 put_ctx(gctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006578 }
6579
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006580 event->filp = event_file;
6581 WARN_ON_ONCE(ctx->parent_ctx);
6582 mutex_lock(&ctx->mutex);
Peter Zijlstrab04243e2010-09-17 11:28:48 +02006583
6584 if (move_group) {
6585 perf_install_in_context(ctx, group_leader, cpu);
6586 get_ctx(ctx);
6587 list_for_each_entry(sibling, &group_leader->sibling_list,
6588 group_entry) {
6589 perf_install_in_context(ctx, sibling, cpu);
6590 get_ctx(ctx);
6591 }
6592 }
6593
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006594 perf_install_in_context(ctx, event, cpu);
6595 ++ctx->generation;
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01006596 perf_unpin_context(ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006597 mutex_unlock(&ctx->mutex);
6598
6599 event->owner = current;
Peter Zijlstra88821352010-11-09 19:01:43 +01006600
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006601 mutex_lock(&current->perf_event_mutex);
6602 list_add_tail(&event->owner_entry, &current->perf_event_list);
6603 mutex_unlock(&current->perf_event_mutex);
6604
Peter Zijlstra8a495422010-05-27 15:47:49 +02006605 /*
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02006606 * Precalculate sample_data sizes
6607 */
6608 perf_event__header_size(event);
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -02006609 perf_event__id_header_size(event);
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02006610
6611 /*
Peter Zijlstra8a495422010-05-27 15:47:49 +02006612 * Drop the reference on the group_event after placing the
6613 * new event on the sibling_list. This ensures destruction
6614 * of the group leader will find the pointer to itself in
6615 * perf_group_detach().
6616 */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006617 fput_light(group_file, fput_needed);
Al Viroea635c62010-05-26 17:40:29 -04006618 fd_install(event_fd, event_file);
6619 return event_fd;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006620
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006621err_context:
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01006622 perf_unpin_context(ctx);
Al Viroea635c62010-05-26 17:40:29 -04006623 put_ctx(ctx);
Peter Zijlstrac6be5a52010-10-14 16:59:46 +02006624err_alloc:
6625 free_event(event);
Peter Zijlstrae7d0bc02010-10-14 16:54:51 +02006626err_task:
6627 if (task)
6628 put_task_struct(task);
Peter Zijlstra89a1e182010-09-07 17:34:50 +02006629err_group_fd:
6630 fput_light(group_file, fput_needed);
Al Viroea635c62010-05-26 17:40:29 -04006631err_fd:
6632 put_unused_fd(event_fd);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006633 return err;
6634}
6635
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006636/**
6637 * perf_event_create_kernel_counter
6638 *
6639 * @attr: attributes of the counter to create
6640 * @cpu: cpu in which the counter is bound
Matt Helsley38a81da2010-09-13 13:01:20 -07006641 * @task: task to profile (NULL for percpu)
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006642 */
6643struct perf_event *
6644perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
Matt Helsley38a81da2010-09-13 13:01:20 -07006645 struct task_struct *task,
Frederic Weisbeckerb326e952009-12-05 09:44:31 +01006646 perf_overflow_handler_t overflow_handler)
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006647{
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006648 struct perf_event_context *ctx;
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006649 struct perf_event *event;
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006650 int err;
6651
6652 /*
6653 * Get the target context (task or percpu):
6654 */
6655
Peter Zijlstrad580ff82010-10-14 17:43:23 +02006656 event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler);
Frederic Weisbeckerc6567f62009-11-26 05:35:41 +01006657 if (IS_ERR(event)) {
6658 err = PTR_ERR(event);
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006659 goto err;
6660 }
6661
Matt Helsley38a81da2010-09-13 13:01:20 -07006662 ctx = find_get_context(event->pmu, task, cpu);
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006663 if (IS_ERR(ctx)) {
6664 err = PTR_ERR(ctx);
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006665 goto err_free;
Frederic Weisbeckerc6567f62009-11-26 05:35:41 +01006666 }
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006667
6668 event->filp = NULL;
6669 WARN_ON_ONCE(ctx->parent_ctx);
6670 mutex_lock(&ctx->mutex);
6671 perf_install_in_context(ctx, event, cpu);
6672 ++ctx->generation;
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01006673 perf_unpin_context(ctx);
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006674 mutex_unlock(&ctx->mutex);
6675
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006676 return event;
6677
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02006678err_free:
6679 free_event(event);
6680err:
Frederic Weisbeckerc6567f62009-11-26 05:35:41 +01006681 return ERR_PTR(err);
Arjan van de Venfb0459d2009-09-25 12:25:56 +02006682}
6683EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
6684
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006685static void sync_child_event(struct perf_event *child_event,
6686 struct task_struct *child)
6687{
6688 struct perf_event *parent_event = child_event->parent;
6689 u64 child_val;
6690
6691 if (child_event->attr.inherit_stat)
6692 perf_event_read_event(child_event, child);
6693
Peter Zijlstrab5e58792010-05-21 14:43:12 +02006694 child_val = perf_event_count(child_event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006695
6696 /*
6697 * Add back the child's count to the parent's count:
6698 */
Peter Zijlstraa6e6dea2010-05-21 14:27:58 +02006699 atomic64_add(child_val, &parent_event->child_count);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006700 atomic64_add(child_event->total_time_enabled,
6701 &parent_event->child_total_time_enabled);
6702 atomic64_add(child_event->total_time_running,
6703 &parent_event->child_total_time_running);
6704
6705 /*
6706 * Remove this event from the parent's list
6707 */
6708 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
6709 mutex_lock(&parent_event->child_mutex);
6710 list_del_init(&child_event->child_list);
6711 mutex_unlock(&parent_event->child_mutex);
6712
6713 /*
6714 * Release the parent event, if this was the last
6715 * reference to it.
6716 */
6717 fput(parent_event->filp);
6718}
6719
6720static void
6721__perf_event_exit_task(struct perf_event *child_event,
6722 struct perf_event_context *child_ctx,
6723 struct task_struct *child)
6724{
6725 struct perf_event *parent_event;
6726
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01006727 perf_remove_from_context(child_event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006728
6729 parent_event = child_event->parent;
6730 /*
6731 * It can happen that parent exits first, and has events
6732 * that are still around due to the child reference. These
6733 * events need to be zapped - but otherwise linger.
6734 */
6735 if (parent_event) {
6736 sync_child_event(child_event, child);
6737 free_event(child_event);
6738 }
6739}
6740
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006741static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006742{
6743 struct perf_event *child_event, *tmp;
6744 struct perf_event_context *child_ctx;
6745 unsigned long flags;
6746
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006747 if (likely(!child->perf_event_ctxp[ctxn])) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006748 perf_event_task(child, NULL, 0);
6749 return;
6750 }
6751
6752 local_irq_save(flags);
6753 /*
6754 * We can't reschedule here because interrupts are disabled,
6755 * and either child is current or it is a task that can't be
6756 * scheduled, so we are now safe from rescheduling changing
6757 * our context.
6758 */
Oleg Nesterov806839b2011-01-21 18:45:47 +01006759 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
Peter Zijlstra82cd6de2010-10-14 17:57:23 +02006760 task_ctx_sched_out(child_ctx, EVENT_ALL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006761
6762 /*
6763 * Take the context lock here so that if find_get_context is
6764 * reading child->perf_event_ctxp, we wait until it has
6765 * incremented the context's refcount before we do put_ctx below.
6766 */
Thomas Gleixnere625cce12009-11-17 18:02:06 +01006767 raw_spin_lock(&child_ctx->lock);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006768 child->perf_event_ctxp[ctxn] = NULL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006769 /*
6770 * If this context is a clone; unclone it so it can't get
6771 * swapped to another process while we're removing all
6772 * the events from it.
6773 */
6774 unclone_ctx(child_ctx);
Peter Zijlstra5e942bb2009-11-23 11:37:26 +01006775 update_context_time(child_ctx);
Thomas Gleixnere625cce12009-11-17 18:02:06 +01006776 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006777
6778 /*
6779 * Report the task dead after unscheduling the events so that we
6780 * won't get any samples after PERF_RECORD_EXIT. We can however still
6781 * get a few PERF_RECORD_READ events.
6782 */
6783 perf_event_task(child, child_ctx, 0);
6784
6785 /*
6786 * We can recurse on the same lock type through:
6787 *
6788 * __perf_event_exit_task()
6789 * sync_child_event()
6790 * fput(parent_event->filp)
6791 * perf_release()
6792 * mutex_lock(&ctx->mutex)
6793 *
6794 * But since its the parent context it won't be the same instance.
6795 */
Peter Zijlstraa0507c82010-05-06 15:42:53 +02006796 mutex_lock(&child_ctx->mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006797
6798again:
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006799 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
6800 group_entry)
6801 __perf_event_exit_task(child_event, child_ctx, child);
6802
6803 list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006804 group_entry)
6805 __perf_event_exit_task(child_event, child_ctx, child);
6806
6807 /*
6808 * If the last event was a group event, it will have appended all
6809 * its siblings to the list, but we obtained 'tmp' before that which
6810 * will still point to the list head terminating the iteration.
6811 */
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006812 if (!list_empty(&child_ctx->pinned_groups) ||
6813 !list_empty(&child_ctx->flexible_groups))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006814 goto again;
6815
6816 mutex_unlock(&child_ctx->mutex);
6817
6818 put_ctx(child_ctx);
6819}
6820
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006821/*
6822 * When a child task exits, feed back event values to parent events.
6823 */
6824void perf_event_exit_task(struct task_struct *child)
6825{
Peter Zijlstra88821352010-11-09 19:01:43 +01006826 struct perf_event *event, *tmp;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006827 int ctxn;
6828
Peter Zijlstra88821352010-11-09 19:01:43 +01006829 mutex_lock(&child->perf_event_mutex);
6830 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
6831 owner_entry) {
6832 list_del_init(&event->owner_entry);
6833
6834 /*
6835 * Ensure the list deletion is visible before we clear
6836 * the owner, closes a race against perf_release() where
6837 * we need to serialize on the owner->perf_event_mutex.
6838 */
6839 smp_wmb();
6840 event->owner = NULL;
6841 }
6842 mutex_unlock(&child->perf_event_mutex);
6843
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006844 for_each_task_context_nr(ctxn)
6845 perf_event_exit_task_context(child, ctxn);
6846}
6847
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006848static void perf_free_event(struct perf_event *event,
6849 struct perf_event_context *ctx)
6850{
6851 struct perf_event *parent = event->parent;
6852
6853 if (WARN_ON_ONCE(!parent))
6854 return;
6855
6856 mutex_lock(&parent->child_mutex);
6857 list_del_init(&event->child_list);
6858 mutex_unlock(&parent->child_mutex);
6859
6860 fput(parent->filp);
6861
Peter Zijlstra8a495422010-05-27 15:47:49 +02006862 perf_group_detach(event);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01006863 list_del_event(event, ctx);
6864 free_event(event);
6865}
6866
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006867/*
6868 * free an unexposed, unused context as created by inheritance by
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006869 * perf_event_init_task below, used by fork() in case of fail.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006870 */
6871void perf_event_free_task(struct task_struct *task)
6872{
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006873 struct perf_event_context *ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006874 struct perf_event *event, *tmp;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006875 int ctxn;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006876
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006877 for_each_task_context_nr(ctxn) {
6878 ctx = task->perf_event_ctxp[ctxn];
6879 if (!ctx)
6880 continue;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006881
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006882 mutex_lock(&ctx->mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006883again:
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006884 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
6885 group_entry)
6886 perf_free_event(event, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006887
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006888 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
6889 group_entry)
6890 perf_free_event(event, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006891
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006892 if (!list_empty(&ctx->pinned_groups) ||
6893 !list_empty(&ctx->flexible_groups))
6894 goto again;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006895
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006896 mutex_unlock(&ctx->mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006897
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02006898 put_ctx(ctx);
6899 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006900}
6901
Peter Zijlstra4e231c72010-09-09 21:01:59 +02006902void perf_event_delayed_put(struct task_struct *task)
6903{
6904 int ctxn;
6905
6906 for_each_task_context_nr(ctxn)
6907 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
6908}
6909
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02006910/*
6911 * inherit a event from parent task to child task:
6912 */
6913static struct perf_event *
6914inherit_event(struct perf_event *parent_event,
6915 struct task_struct *parent,
6916 struct perf_event_context *parent_ctx,
6917 struct task_struct *child,
6918 struct perf_event *group_leader,
6919 struct perf_event_context *child_ctx)
6920{
6921 struct perf_event *child_event;
Peter Zijlstracee010e2010-09-10 12:51:54 +02006922 unsigned long flags;
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02006923
6924 /*
6925 * Instead of creating recursive hierarchies of events,
6926 * we link inherited events back to the original parent,
6927 * which has a filp for sure, which we use as the reference
6928 * count:
6929 */
6930 if (parent_event->parent)
6931 parent_event = parent_event->parent;
6932
6933 child_event = perf_event_alloc(&parent_event->attr,
6934 parent_event->cpu,
Peter Zijlstrad580ff82010-10-14 17:43:23 +02006935 child,
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02006936 group_leader, parent_event,
6937 NULL);
6938 if (IS_ERR(child_event))
6939 return child_event;
6940 get_ctx(child_ctx);
6941
6942 /*
6943 * Make the child state follow the state of the parent event,
6944 * not its attr.disabled bit. We hold the parent's mutex,
6945 * so we won't race with perf_event_{en, dis}able_family.
6946 */
6947 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
6948 child_event->state = PERF_EVENT_STATE_INACTIVE;
6949 else
6950 child_event->state = PERF_EVENT_STATE_OFF;
6951
6952 if (parent_event->attr.freq) {
6953 u64 sample_period = parent_event->hw.sample_period;
6954 struct hw_perf_event *hwc = &child_event->hw;
6955
6956 hwc->sample_period = sample_period;
6957 hwc->last_period = sample_period;
6958
6959 local64_set(&hwc->period_left, sample_period);
6960 }
6961
6962 child_event->ctx = child_ctx;
6963 child_event->overflow_handler = parent_event->overflow_handler;
6964
6965 /*
Thomas Gleixner614b6782010-12-03 16:24:32 -02006966 * Precalculate sample_data sizes
6967 */
6968 perf_event__header_size(child_event);
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -02006969 perf_event__id_header_size(child_event);
Thomas Gleixner614b6782010-12-03 16:24:32 -02006970
6971 /*
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02006972 * Link it up in the child's context:
6973 */
Peter Zijlstracee010e2010-09-10 12:51:54 +02006974 raw_spin_lock_irqsave(&child_ctx->lock, flags);
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02006975 add_event_to_ctx(child_event, child_ctx);
Peter Zijlstracee010e2010-09-10 12:51:54 +02006976 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02006977
6978 /*
6979 * Get a reference to the parent filp - we will fput it
6980 * when the child event exits. This is safe to do because
6981 * we are in the parent and we know that the filp still
6982 * exists and has a nonzero count:
6983 */
6984 atomic_long_inc(&parent_event->filp->f_count);
6985
6986 /*
6987 * Link this into the parent event's child list
6988 */
6989 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
6990 mutex_lock(&parent_event->child_mutex);
6991 list_add_tail(&child_event->child_list, &parent_event->child_list);
6992 mutex_unlock(&parent_event->child_mutex);
6993
6994 return child_event;
6995}
6996
6997static int inherit_group(struct perf_event *parent_event,
6998 struct task_struct *parent,
6999 struct perf_event_context *parent_ctx,
7000 struct task_struct *child,
7001 struct perf_event_context *child_ctx)
7002{
7003 struct perf_event *leader;
7004 struct perf_event *sub;
7005 struct perf_event *child_ctr;
7006
7007 leader = inherit_event(parent_event, parent, parent_ctx,
7008 child, NULL, child_ctx);
7009 if (IS_ERR(leader))
7010 return PTR_ERR(leader);
7011 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
7012 child_ctr = inherit_event(sub, parent, parent_ctx,
7013 child, leader, child_ctx);
7014 if (IS_ERR(child_ctr))
7015 return PTR_ERR(child_ctr);
7016 }
7017 return 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007018}
7019
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007020static int
7021inherit_task_group(struct perf_event *event, struct task_struct *parent,
7022 struct perf_event_context *parent_ctx,
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007023 struct task_struct *child, int ctxn,
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007024 int *inherited_all)
7025{
7026 int ret;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007027 struct perf_event_context *child_ctx;
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007028
7029 if (!event->attr.inherit) {
7030 *inherited_all = 0;
7031 return 0;
7032 }
7033
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01007034 child_ctx = child->perf_event_ctxp[ctxn];
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007035 if (!child_ctx) {
7036 /*
7037 * This is executed from the parent task context, so
7038 * inherit events that have been marked for cloning.
7039 * First allocate and initialize a context for the
7040 * child.
7041 */
7042
Peter Zijlstraeb184472010-09-07 15:55:13 +02007043 child_ctx = alloc_perf_context(event->pmu, child);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007044 if (!child_ctx)
7045 return -ENOMEM;
7046
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007047 child->perf_event_ctxp[ctxn] = child_ctx;
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007048 }
7049
7050 ret = inherit_group(event, parent, parent_ctx,
7051 child, child_ctx);
7052
7053 if (ret)
7054 *inherited_all = 0;
7055
7056 return ret;
7057}
7058
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007059/*
7060 * Initialize the perf_event context in task_struct
7061 */
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007062int perf_event_init_context(struct task_struct *child, int ctxn)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007063{
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007064 struct perf_event_context *child_ctx, *parent_ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007065 struct perf_event_context *cloned_ctx;
7066 struct perf_event *event;
7067 struct task_struct *parent = current;
7068 int inherited_all = 1;
Thomas Gleixnerdddd3372010-11-24 10:05:55 +01007069 unsigned long flags;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007070 int ret = 0;
7071
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007072 if (likely(!parent->perf_event_ctxp[ctxn]))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007073 return 0;
7074
7075 /*
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007076 * If the parent's context is a clone, pin it so it won't get
7077 * swapped under us.
7078 */
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007079 parent_ctx = perf_pin_task_context(parent, ctxn);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007080
7081 /*
7082 * No need to check if parent_ctx != NULL here; since we saw
7083 * it non-NULL earlier, the only reason for it to become NULL
7084 * is if we exit, and since we're currently in the middle of
7085 * a fork we can't be exiting at the same time.
7086 */
7087
7088 /*
7089 * Lock the parent list. No need to lock the child - not PID
7090 * hashed yet and not running, so nobody can access it.
7091 */
7092 mutex_lock(&parent_ctx->mutex);
7093
7094 /*
7095 * We dont have to disable NMIs - we are only looking at
7096 * the list, not manipulating it:
7097 */
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007098 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007099 ret = inherit_task_group(event, parent, parent_ctx,
7100 child, ctxn, &inherited_all);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007101 if (ret)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007102 break;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007103 }
7104
Thomas Gleixnerdddd3372010-11-24 10:05:55 +01007105 /*
7106 * We can't hold ctx->lock when iterating the ->flexible_group list due
7107 * to allocations, but we need to prevent rotation because
7108 * rotate_ctx() will change the list from interrupt context.
7109 */
7110 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
7111 parent_ctx->rotate_disable = 1;
7112 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
7113
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007114 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007115 ret = inherit_task_group(event, parent, parent_ctx,
7116 child, ctxn, &inherited_all);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007117 if (ret)
7118 break;
7119 }
7120
Thomas Gleixnerdddd3372010-11-24 10:05:55 +01007121 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
7122 parent_ctx->rotate_disable = 0;
Thomas Gleixnerdddd3372010-11-24 10:05:55 +01007123
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007124 child_ctx = child->perf_event_ctxp[ctxn];
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007125
Peter Zijlstra05cbaa22009-12-30 16:00:35 +01007126 if (child_ctx && inherited_all) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007127 /*
7128 * Mark the child context as a clone of the parent
7129 * context, or of whatever the parent is a clone of.
Peter Zijlstrac5ed5142011-01-17 13:45:37 +01007130 *
7131 * Note that if the parent is a clone, the holding of
7132 * parent_ctx->lock avoids it from being uncloned.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007133 */
Peter Zijlstrac5ed5142011-01-17 13:45:37 +01007134 cloned_ctx = parent_ctx->parent_ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007135 if (cloned_ctx) {
7136 child_ctx->parent_ctx = cloned_ctx;
7137 child_ctx->parent_gen = parent_ctx->parent_gen;
7138 } else {
7139 child_ctx->parent_ctx = parent_ctx;
7140 child_ctx->parent_gen = parent_ctx->generation;
7141 }
7142 get_ctx(child_ctx->parent_ctx);
7143 }
7144
Peter Zijlstrac5ed5142011-01-17 13:45:37 +01007145 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007146 mutex_unlock(&parent_ctx->mutex);
7147
7148 perf_unpin_context(parent_ctx);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01007149 put_ctx(parent_ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007150
7151 return ret;
7152}
7153
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007154/*
7155 * Initialize the perf_event context in task_struct
7156 */
7157int perf_event_init_task(struct task_struct *child)
7158{
7159 int ctxn, ret;
7160
Oleg Nesterov8550d7c2011-01-19 19:22:28 +01007161 memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
7162 mutex_init(&child->perf_event_mutex);
7163 INIT_LIST_HEAD(&child->perf_event_list);
7164
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007165 for_each_task_context_nr(ctxn) {
7166 ret = perf_event_init_context(child, ctxn);
7167 if (ret)
7168 return ret;
7169 }
7170
7171 return 0;
7172}
7173
Paul Mackerras220b1402010-03-10 20:45:52 +11007174static void __init perf_event_init_all_cpus(void)
7175{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02007176 struct swevent_htable *swhash;
Paul Mackerras220b1402010-03-10 20:45:52 +11007177 int cpu;
Paul Mackerras220b1402010-03-10 20:45:52 +11007178
7179 for_each_possible_cpu(cpu) {
Peter Zijlstrab28ab832010-09-06 14:48:15 +02007180 swhash = &per_cpu(swevent_htable, cpu);
7181 mutex_init(&swhash->hlist_mutex);
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02007182 INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
Paul Mackerras220b1402010-03-10 20:45:52 +11007183 }
7184}
7185
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007186static void __cpuinit perf_event_init_cpu(int cpu)
7187{
Peter Zijlstra108b02c2010-09-06 14:32:03 +02007188 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007189
Peter Zijlstrab28ab832010-09-06 14:48:15 +02007190 mutex_lock(&swhash->hlist_mutex);
7191 if (swhash->hlist_refcount > 0) {
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02007192 struct swevent_hlist *hlist;
7193
Peter Zijlstrab28ab832010-09-06 14:48:15 +02007194 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
7195 WARN_ON(!hlist);
7196 rcu_assign_pointer(swhash->swevent_hlist, hlist);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02007197 }
Peter Zijlstrab28ab832010-09-06 14:48:15 +02007198 mutex_unlock(&swhash->hlist_mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007199}
7200
Peter Zijlstrac2774432010-12-08 15:29:02 +01007201#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02007202static void perf_pmu_rotate_stop(struct pmu *pmu)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007203{
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02007204 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
7205
7206 WARN_ON(!irqs_disabled());
7207
7208 list_del_init(&cpuctx->rotation_list);
7209}
7210
Peter Zijlstra108b02c2010-09-06 14:32:03 +02007211static void __perf_event_exit_context(void *__info)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007212{
Peter Zijlstra108b02c2010-09-06 14:32:03 +02007213 struct perf_event_context *ctx = __info;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007214 struct perf_event *event, *tmp;
7215
Peter Zijlstra108b02c2010-09-06 14:32:03 +02007216 perf_pmu_rotate_stop(ctx->pmu);
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02007217
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007218 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01007219 __perf_remove_from_context(event);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01007220 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01007221 __perf_remove_from_context(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007222}
Peter Zijlstra108b02c2010-09-06 14:32:03 +02007223
7224static void perf_event_exit_cpu_context(int cpu)
7225{
7226 struct perf_event_context *ctx;
7227 struct pmu *pmu;
7228 int idx;
7229
7230 idx = srcu_read_lock(&pmus_srcu);
7231 list_for_each_entry_rcu(pmu, &pmus, entry) {
Peter Zijlstra917bdd12010-09-17 11:28:49 +02007232 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02007233
7234 mutex_lock(&ctx->mutex);
7235 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
7236 mutex_unlock(&ctx->mutex);
7237 }
7238 srcu_read_unlock(&pmus_srcu, idx);
Peter Zijlstra108b02c2010-09-06 14:32:03 +02007239}
7240
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007241static void perf_event_exit_cpu(int cpu)
7242{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02007243 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007244
Peter Zijlstrab28ab832010-09-06 14:48:15 +02007245 mutex_lock(&swhash->hlist_mutex);
7246 swevent_hlist_release(swhash);
7247 mutex_unlock(&swhash->hlist_mutex);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02007248
Peter Zijlstra108b02c2010-09-06 14:32:03 +02007249 perf_event_exit_cpu_context(cpu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007250}
7251#else
7252static inline void perf_event_exit_cpu(int cpu) { }
7253#endif
7254
Peter Zijlstrac2774432010-12-08 15:29:02 +01007255static int
7256perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
7257{
7258 int cpu;
7259
7260 for_each_online_cpu(cpu)
7261 perf_event_exit_cpu(cpu);
7262
7263 return NOTIFY_OK;
7264}
7265
7266/*
7267 * Run the perf reboot notifier at the very last possible moment so that
7268 * the generic watchdog code runs as long as possible.
7269 */
7270static struct notifier_block perf_reboot_notifier = {
7271 .notifier_call = perf_reboot,
7272 .priority = INT_MIN,
7273};
7274
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007275static int __cpuinit
7276perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
7277{
7278 unsigned int cpu = (long)hcpu;
7279
Peter Zijlstra5e116372010-06-11 13:35:08 +02007280 switch (action & ~CPU_TASKS_FROZEN) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007281
7282 case CPU_UP_PREPARE:
Peter Zijlstra5e116372010-06-11 13:35:08 +02007283 case CPU_DOWN_FAILED:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007284 perf_event_init_cpu(cpu);
7285 break;
7286
Peter Zijlstra5e116372010-06-11 13:35:08 +02007287 case CPU_UP_CANCELED:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007288 case CPU_DOWN_PREPARE:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007289 perf_event_exit_cpu(cpu);
7290 break;
7291
7292 default:
7293 break;
7294 }
7295
7296 return NOTIFY_OK;
7297}
7298
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007299void __init perf_event_init(void)
7300{
Jason Wessel3c502e72010-11-04 17:33:01 -05007301 int ret;
7302
Peter Zijlstra2e80a822010-11-17 23:17:36 +01007303 idr_init(&pmu_idr);
7304
Paul Mackerras220b1402010-03-10 20:45:52 +11007305 perf_event_init_all_cpus();
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007306 init_srcu_struct(&pmus_srcu);
Peter Zijlstra2e80a822010-11-17 23:17:36 +01007307 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
7308 perf_pmu_register(&perf_cpu_clock, NULL, -1);
7309 perf_pmu_register(&perf_task_clock, NULL, -1);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007310 perf_tp_register();
7311 perf_cpu_notifier(perf_cpu_notify);
Peter Zijlstrac2774432010-12-08 15:29:02 +01007312 register_reboot_notifier(&perf_reboot_notifier);
Jason Wessel3c502e72010-11-04 17:33:01 -05007313
7314 ret = init_hw_breakpoint();
7315 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007316}
Peter Zijlstraabe43402010-11-17 23:17:37 +01007317
7318static int __init perf_event_sysfs_init(void)
7319{
7320 struct pmu *pmu;
7321 int ret;
7322
7323 mutex_lock(&pmus_lock);
7324
7325 ret = bus_register(&pmu_bus);
7326 if (ret)
7327 goto unlock;
7328
7329 list_for_each_entry(pmu, &pmus, entry) {
7330 if (!pmu->name || pmu->type < 0)
7331 continue;
7332
7333 ret = pmu_dev_alloc(pmu);
7334 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
7335 }
7336 pmu_bus_running = 1;
7337 ret = 0;
7338
7339unlock:
7340 mutex_unlock(&pmus_lock);
7341
7342 return ret;
7343}
7344device_initcall(perf_event_sysfs_init);
Stephane Eraniane5d13672011-02-14 11:20:01 +02007345
7346#ifdef CONFIG_CGROUP_PERF
7347static struct cgroup_subsys_state *perf_cgroup_create(
7348 struct cgroup_subsys *ss, struct cgroup *cont)
7349{
7350 struct perf_cgroup *jc;
Stephane Eraniane5d13672011-02-14 11:20:01 +02007351
Li Zefan1b15d052011-03-03 14:26:06 +08007352 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
Stephane Eraniane5d13672011-02-14 11:20:01 +02007353 if (!jc)
7354 return ERR_PTR(-ENOMEM);
7355
Stephane Eraniane5d13672011-02-14 11:20:01 +02007356 jc->info = alloc_percpu(struct perf_cgroup_info);
7357 if (!jc->info) {
7358 kfree(jc);
7359 return ERR_PTR(-ENOMEM);
7360 }
7361
Stephane Eraniane5d13672011-02-14 11:20:01 +02007362 return &jc->css;
7363}
7364
7365static void perf_cgroup_destroy(struct cgroup_subsys *ss,
7366 struct cgroup *cont)
7367{
7368 struct perf_cgroup *jc;
7369 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
7370 struct perf_cgroup, css);
7371 free_percpu(jc->info);
7372 kfree(jc);
7373}
7374
7375static int __perf_cgroup_move(void *info)
7376{
7377 struct task_struct *task = info;
7378 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
7379 return 0;
7380}
7381
7382static void perf_cgroup_move(struct task_struct *task)
7383{
7384 task_function_call(task, __perf_cgroup_move, task);
7385}
7386
7387static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7388 struct cgroup *old_cgrp, struct task_struct *task,
7389 bool threadgroup)
7390{
7391 perf_cgroup_move(task);
7392 if (threadgroup) {
7393 struct task_struct *c;
7394 rcu_read_lock();
7395 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
7396 perf_cgroup_move(c);
7397 }
7398 rcu_read_unlock();
7399 }
7400}
7401
7402static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7403 struct cgroup *old_cgrp, struct task_struct *task)
7404{
7405 /*
7406 * cgroup_exit() is called in the copy_process() failure path.
7407 * Ignore this case since the task hasn't ran yet, this avoids
7408 * trying to poke a half freed task state from generic code.
7409 */
7410 if (!(task->flags & PF_EXITING))
7411 return;
7412
7413 perf_cgroup_move(task);
7414}
7415
7416struct cgroup_subsys perf_subsys = {
7417 .name = "perf_event",
7418 .subsys_id = perf_subsys_id,
7419 .create = perf_cgroup_create,
7420 .destroy = perf_cgroup_destroy,
7421 .exit = perf_cgroup_exit,
7422 .attach = perf_cgroup_attach,
7423};
7424#endif /* CONFIG_CGROUP_PERF */