/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
 *  Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/tick.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/reboot.h>
#include <linux/vmstat.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
#include <linux/trace_events.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/compat.h>
#include <linux/bpf.h>
#include <linux/filter.h>

#include "internal.h"

#include <asm/irq_regs.h>

typedef int (*remote_function_f)(void *);

struct remote_function_call {
        struct task_struct      *p;
        remote_function_f       func;
        void                    *info;
        int                     ret;
};

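/*
 * SMP cross-call callback: runs on the chosen CPU with a
 * remote_function_call as argument. If a target task was requested and it
 * is no longer current on this CPU, ->ret is left at -EAGAIN so the caller
 * can retry on the task's new CPU.
 */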
static void remote_function(void *data)
{
        struct remote_function_call *tfc = data;
        struct task_struct *p = tfc->p;

        if (p) {
                tfc->ret = -EAGAIN;
                if (task_cpu(p) != smp_processor_id() || !task_curr(p))
                        return;
        }

        tfc->ret = tfc->func(tfc->info);
}

/**
 * task_function_call - call a function on the cpu on which a task runs
 * @p:          the task to evaluate
 * @func:       the function to be called
 * @info:       the function call argument
 *
 * Calls the function @func when the task is currently running. This might
 * be on the current CPU, which just calls the function directly.
 *
 * returns: @func return value, or
 *          -ESRCH  - when the process isn't running
 *          -EAGAIN - when the process moved away
 */
static int
task_function_call(struct task_struct *p, remote_function_f func, void *info)
{
        struct remote_function_call data = {
                .p      = p,
                .func   = func,
                .info   = info,
                .ret    = -ESRCH, /* No such (running) process */
        };

        if (task_curr(p))
                smp_call_function_single(task_cpu(p), remote_function, &data, 1);

        return data.ret;
}

/**
 * cpu_function_call - call a function on the cpu
 * @cpu:        target cpu to call the function on
 * @func:       the function to be called
 * @info:       the function call argument
 *
 * Calls the function @func on the remote cpu.
 *
 * returns: @func return value or -ENXIO when the cpu is offline
 */
static int cpu_function_call(int cpu, remote_function_f func, void *info)
{
        struct remote_function_call data = {
                .p      = NULL,
                .func   = func,
                .info   = info,
                .ret    = -ENXIO, /* No such CPU */
        };

        smp_call_function_single(cpu, remote_function, &data, 1);

        return data.ret;
}

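/*
 * Return this CPU's cpu context for the PMU that @ctx belongs to.
 */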
static inline struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
        return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
}

static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
                          struct perf_event_context *ctx)
{
        raw_spin_lock(&cpuctx->ctx.lock);
        if (ctx)
                raw_spin_lock(&ctx->lock);
}

static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
                            struct perf_event_context *ctx)
{
        if (ctx)
                raw_spin_unlock(&ctx->lock);
        raw_spin_unlock(&cpuctx->ctx.lock);
}

#define TASK_TOMBSTONE ((void *)-1L)

static bool is_kernel_event(struct perf_event *event)
{
        return READ_ONCE(event->owner) == TASK_TOMBSTONE;
}

/*
 * On task ctx scheduling...
 *
 * When !ctx->nr_events a task context will not be scheduled. This means
 * we can disable the scheduler hooks (for performance) without leaving
 * pending task ctx state.
 *
 * This however results in two special cases:
 *
 *  - removing the last event from a task ctx; this is relatively
 *    straightforward and is done in __perf_remove_from_context.
 *
 *  - adding the first event to a task ctx; this is tricky because we cannot
 *    rely on ctx->is_active and therefore cannot use event_function_call().
 *    See perf_install_in_context().
 *
 * This is because we need a ctx->lock serialized variable (ctx->is_active)
 * to reliably determine if a particular task/context is scheduled in. The
 * task_curr() use in task_function_call() is racy in that a remote context
 * switch is not a single atomic operation.
 *
 * As is, the situation is 'safe' because we set rq->curr before we do the
 * actual context switch. This means that task_curr() will fail early, but
 * we'll continue spinning on ctx->is_active until we've passed
 * perf_event_task_sched_out().
 *
 * Without this ctx->lock serialized variable we could have a race where we
 * find that the task (and hence the context) is not active while in fact
 * it is.
 *
 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
 */

typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
                        struct perf_event_context *, void *);

struct event_function_struct {
        struct perf_event *event;
        event_f func;
        void *data;
};

static int event_function(void *info)
{
        struct event_function_struct *efs = info;
        struct perf_event *event = efs->event;
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        struct perf_event_context *task_ctx = cpuctx->task_ctx;
        int ret = 0;

        WARN_ON_ONCE(!irqs_disabled());

        perf_ctx_lock(cpuctx, task_ctx);
        /*
         * Since we do the IPI call without holding ctx->lock things can have
         * changed; double-check that we hit the task we set out to hit.
         */
        if (ctx->task) {
                if (ctx->task != current) {
                        ret = -EAGAIN;
                        goto unlock;
                }

                /*
                 * We only use event_function_call() on established contexts,
                 * and event_function() is only ever called when active (or
                 * rather, we'll have bailed in task_function_call() or the
                 * above ctx->task != current test), therefore we must have
                 * ctx->is_active here.
                 */
                WARN_ON_ONCE(!ctx->is_active);
                /*
                 * And since we have ctx->is_active, cpuctx->task_ctx must
                 * match.
                 */
                WARN_ON_ONCE(task_ctx != ctx);
        } else {
                WARN_ON_ONCE(&cpuctx->ctx != ctx);
        }

        efs->func(event, cpuctx, ctx, efs->data);
unlock:
        perf_ctx_unlock(cpuctx, task_ctx);

        return ret;
}

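/*
 * Variant of event_function_call() for when we are already on the CPU that
 * owns @event's context, with IRQs disabled: invoke event_function()
 * directly and warn if @func could not be run.
 */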
static void event_function_local(struct perf_event *event, event_f func, void *data)
{
        struct event_function_struct efs = {
                .event = event,
                .func = func,
                .data = data,
        };

        int ret = event_function(&efs);
        WARN_ON_ONCE(ret);
}

static void event_function_call(struct perf_event *event, event_f func, void *data)
{
        struct perf_event_context *ctx = event->ctx;
        struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
        struct event_function_struct efs = {
                .event = event,
                .func = func,
                .data = data,
        };

        if (!event->parent) {
                /*
                 * If this is a !child event, we must hold ctx::mutex to
                 * stabilize the event->ctx relation. See
                 * perf_event_ctx_lock().
                 */
                lockdep_assert_held(&ctx->mutex);
        }

        if (!task) {
                cpu_function_call(event->cpu, event_function, &efs);
                return;
        }

again:
        if (task == TASK_TOMBSTONE)
                return;

        if (!task_function_call(task, event_function, &efs))
                return;

        raw_spin_lock_irq(&ctx->lock);
        /*
         * Reload the task pointer, it might have been changed by
         * a concurrent perf_event_context_sched_out().
         */
        task = ctx->task;
        if (task != TASK_TOMBSTONE) {
                if (ctx->is_active) {
                        raw_spin_unlock_irq(&ctx->lock);
                        goto again;
                }
                func(event, NULL, ctx, data);
        }
        raw_spin_unlock_irq(&ctx->lock);
}

#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
                       PERF_FLAG_FD_OUTPUT  |\
                       PERF_FLAG_PID_CGROUP |\
                       PERF_FLAG_FD_CLOEXEC)

/*
 * branch priv levels that need permission checks
 */
#define PERF_SAMPLE_BRANCH_PERM_PLM \
        (PERF_SAMPLE_BRANCH_KERNEL |\
         PERF_SAMPLE_BRANCH_HV)

enum event_type_t {
        EVENT_FLEXIBLE = 0x1,
        EVENT_PINNED = 0x2,
        EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};

/*
 * perf_sched_events : >0 events exist
 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
 */

static void perf_sched_delayed(struct work_struct *work);
DEFINE_STATIC_KEY_FALSE(perf_sched_events);
static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;

static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(int, perf_sched_cb_usages);

static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;

static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;

/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 1;

/* Minimum for 512 kiB + 1 user control page */
int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */

/*
 * max perf event sample rate
 */
#define DEFAULT_MAX_SAMPLE_RATE         100000
#define DEFAULT_SAMPLE_PERIOD_NS        (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
#define DEFAULT_CPU_TIME_MAX_PERCENT    25

int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;

static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;

static int perf_sample_allowed_ns __read_mostly =
        DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;

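/*
 * Recompute perf_sample_allowed_ns: the fraction of the sample period
 * (sysctl_perf_cpu_time_max_percent percent of it) that the perf
 * interrupt is allowed to consume.
 */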
static void update_perf_cpu_limits(void)
{
        u64 tmp = perf_sample_period_ns;

        tmp *= sysctl_perf_cpu_time_max_percent;
        do_div(tmp, 100);
        ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
}

static int perf_rotate_context(struct perf_cpu_context *cpuctx);

int perf_proc_update_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp,
                loff_t *ppos)
{
        int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

        if (ret || !write)
                return ret;

        max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
        perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
        update_perf_cpu_limits();

        return 0;
}

int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;

int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
                                void __user *buffer, size_t *lenp,
                                loff_t *ppos)
{
        int ret = proc_dointvec(table, write, buffer, lenp, ppos);

        if (ret || !write)
                return ret;

        update_perf_cpu_limits();

        return 0;
}

/*
 * perf samples are done in some very critical code paths (NMIs).
 * If they take too much CPU time, the system can lock up and not
 * get any real work done.  This will drop the sample rate when
 * we detect that events are taking too long.
 */
#define NR_ACCUMULATED_SAMPLES 128
static DEFINE_PER_CPU(u64, running_sample_length);

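/*
 * irq_work callback: report, ratelimited and outside NMI context, that the
 * perf interrupt overran its budget and the sample rate was lowered.
 */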
static void perf_duration_warn(struct irq_work *w)
{
        u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
        u64 avg_local_sample_len;
        u64 local_samples_len;

        local_samples_len = __this_cpu_read(running_sample_length);
        avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;

        printk_ratelimited(KERN_WARNING
                        "perf interrupt took too long (%lld > %lld), lowering "
                        "kernel.perf_event_max_sample_rate to %d\n",
                        avg_local_sample_len, allowed_ns >> 1,
                        sysctl_perf_event_sample_rate);
}

static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);

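/*
 * Called with the duration of the last perf interrupt. Fold it into a
 * decaying per-cpu running length; once the implied average exceeds
 * perf_sample_allowed_ns, halve max_samples_per_tick, lower the sample
 * rate accordingly and warn the user via the irq_work above.
 */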
void perf_sample_event_took(u64 sample_len_ns)
{
        u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
        u64 avg_local_sample_len;
        u64 local_samples_len;

        if (allowed_ns == 0)
                return;

        /* decay the counter by 1 average sample */
        local_samples_len = __this_cpu_read(running_sample_length);
        local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
        local_samples_len += sample_len_ns;
        __this_cpu_write(running_sample_length, local_samples_len);

        /*
         * note: this will be biased artificially low until we have
         * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
         * from having to maintain a count.
         */
        avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;

        if (avg_local_sample_len <= allowed_ns)
                return;

        if (max_samples_per_tick <= 1)
                return;

        max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
        sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
        perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;

        update_perf_cpu_limits();

        if (!irq_work_queue(&perf_duration_work)) {
                early_printk("perf interrupt took too long (%lld > %lld), lowering "
                             "kernel.perf_event_max_sample_rate to %d\n",
                             avg_local_sample_len, allowed_ns >> 1,
                             sysctl_perf_event_sample_rate);
        }
}

static atomic64_t perf_event_id;

static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
                              enum event_type_t event_type);

static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
                             enum event_type_t event_type,
                             struct task_struct *task);

static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);

void __weak perf_event_print_debug(void) { }

extern __weak const char *perf_pmu_name(void)
{
        return "pmu";
}

static inline u64 perf_clock(void)
{
        return local_clock();
}

static inline u64 perf_event_clock(struct perf_event *event)
{
        return event->clock();
}

#ifdef CONFIG_CGROUP_PERF

static inline bool
perf_cgroup_match(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);

        /* @event doesn't care about cgroup */
        if (!event->cgrp)
                return true;

        /* wants specific cgroup scope but @cpuctx isn't associated with any */
        if (!cpuctx->cgrp)
                return false;

        /*
         * Cgroup scoping is recursive.  An event enabled for a cgroup is
         * also enabled for all its descendant cgroups.  If @cpuctx's
         * cgroup is a descendant of @event's (the test covers identity
         * case), it's a match.
         */
        return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
                                    event->cgrp->css.cgroup);
}

static inline void perf_detach_cgroup(struct perf_event *event)
{
        css_put(&event->cgrp->css);
        event->cgrp = NULL;
}

static inline int is_cgroup_event(struct perf_event *event)
{
        return event->cgrp != NULL;
}

static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
        struct perf_cgroup_info *t;

        t = per_cpu_ptr(event->cgrp->info, event->cpu);
        return t->time;
}

static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
{
        struct perf_cgroup_info *info;
        u64 now;

        now = perf_clock();

        info = this_cpu_ptr(cgrp->info);

        info->time += now - info->timestamp;
        info->timestamp = now;
}

static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
        struct perf_cgroup *cgrp_out = cpuctx->cgrp;
        if (cgrp_out)
                __update_cgrp_time(cgrp_out);
}

static inline void update_cgrp_time_from_event(struct perf_event *event)
{
        struct perf_cgroup *cgrp;

        /*
         * ensure we access cgroup data only when needed and
         * when we know the cgroup is pinned (css_get)
         */
        if (!is_cgroup_event(event))
                return;

        cgrp = perf_cgroup_from_task(current, event->ctx);
        /*
         * Do not update time when cgroup is not active
         */
        if (cgrp == event->cgrp)
                __update_cgrp_time(event->cgrp);
}

static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
                          struct perf_event_context *ctx)
{
        struct perf_cgroup *cgrp;
        struct perf_cgroup_info *info;

        /*
         * ctx->lock held by caller
         * ensure we do not access cgroup data
         * unless we have the cgroup pinned (css_get)
         */
        if (!task || !ctx->nr_cgroups)
                return;

        cgrp = perf_cgroup_from_task(task, ctx);
        info = this_cpu_ptr(cgrp->info);
        info->timestamp = ctx->timestamp;
}

#define PERF_CGROUP_SWOUT       0x1 /* cgroup switch out every event */
#define PERF_CGROUP_SWIN        0x2 /* cgroup switch in events based on task */

/*
 * reschedule events based on the cgroup constraint of task.
 *
 * mode SWOUT : schedule out everything
 * mode SWIN : schedule in based on cgroup for next
 */
static void perf_cgroup_switch(struct task_struct *task, int mode)
{
        struct perf_cpu_context *cpuctx;
        struct pmu *pmu;
        unsigned long flags;

        /*
         * disable interrupts to avoid getting nr_cgroup
         * changes via __perf_event_disable(). Also
         * avoids preemption.
         */
        local_irq_save(flags);

        /*
         * we reschedule only in the presence of cgroup
         * constrained events.
         */

        list_for_each_entry_rcu(pmu, &pmus, entry) {
                cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
                if (cpuctx->unique_pmu != pmu)
                        continue; /* ensure we process each cpuctx once */

                /*
                 * perf_cgroup_events says at least one
                 * context on this CPU has cgroup events.
                 *
                 * ctx->nr_cgroups reports the number of cgroup
                 * events for a context.
                 */
                if (cpuctx->ctx.nr_cgroups > 0) {
                        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
                        perf_pmu_disable(cpuctx->ctx.pmu);

                        if (mode & PERF_CGROUP_SWOUT) {
                                cpu_ctx_sched_out(cpuctx, EVENT_ALL);
                                /*
                                 * must not be done before ctxswout due
                                 * to event_filter_match() in event_sched_out()
                                 */
                                cpuctx->cgrp = NULL;
                        }

                        if (mode & PERF_CGROUP_SWIN) {
                                WARN_ON_ONCE(cpuctx->cgrp);
                                /*
                                 * set cgrp before ctxsw in to allow
                                 * event_filter_match() to not have to pass
                                 * task around
                                 * we pass the cpuctx->ctx to perf_cgroup_from_task()
                                 * because cgroup events are only per-cpu
                                 */
                                cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx);
                                cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
                        }
                        perf_pmu_enable(cpuctx->ctx.pmu);
                        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
                }
        }

        local_irq_restore(flags);
}

static inline void perf_cgroup_sched_out(struct task_struct *task,
                                         struct task_struct *next)
{
        struct perf_cgroup *cgrp1;
        struct perf_cgroup *cgrp2 = NULL;

        rcu_read_lock();
        /*
         * we come here when we know perf_cgroup_events > 0
         * we do not need to pass the ctx here because we know
         * we are holding the rcu lock
         */
        cgrp1 = perf_cgroup_from_task(task, NULL);
        cgrp2 = perf_cgroup_from_task(next, NULL);

        /*
         * only schedule out current cgroup events if we know
         * that we are switching to a different cgroup. Otherwise,
         * do not touch the cgroup events.
         */
        if (cgrp1 != cgrp2)
                perf_cgroup_switch(task, PERF_CGROUP_SWOUT);

        rcu_read_unlock();
}

static inline void perf_cgroup_sched_in(struct task_struct *prev,
                                        struct task_struct *task)
{
        struct perf_cgroup *cgrp1;
        struct perf_cgroup *cgrp2 = NULL;

        rcu_read_lock();
        /*
         * we come here when we know perf_cgroup_events > 0
         * we do not need to pass the ctx here because we know
         * we are holding the rcu lock
         */
        cgrp1 = perf_cgroup_from_task(task, NULL);
        cgrp2 = perf_cgroup_from_task(prev, NULL);

        /*
         * only need to schedule in cgroup events if we are changing
         * the cgroup during ctxsw. Cgroup events were not scheduled
         * out during the ctxsw out if that was not the case.
         */
        if (cgrp1 != cgrp2)
                perf_cgroup_switch(task, PERF_CGROUP_SWIN);

        rcu_read_unlock();
}

static inline int perf_cgroup_connect(int fd, struct perf_event *event,
                                      struct perf_event_attr *attr,
                                      struct perf_event *group_leader)
{
        struct perf_cgroup *cgrp;
        struct cgroup_subsys_state *css;
        struct fd f = fdget(fd);
        int ret = 0;

        if (!f.file)
                return -EBADF;

        css = css_tryget_online_from_dir(f.file->f_path.dentry,
                                         &perf_event_cgrp_subsys);
        if (IS_ERR(css)) {
                ret = PTR_ERR(css);
                goto out;
        }

        cgrp = container_of(css, struct perf_cgroup, css);
        event->cgrp = cgrp;

        /*
         * all events in a group must monitor
         * the same cgroup because a task belongs
         * to only one perf cgroup at a time
         */
        if (group_leader && group_leader->cgrp != cgrp) {
                perf_detach_cgroup(event);
                ret = -EINVAL;
        }
out:
        fdput(f);
        return ret;
}

static inline void
perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
{
        struct perf_cgroup_info *t;
        t = per_cpu_ptr(event->cgrp->info, event->cpu);
        event->shadow_ctx_time = now - t->timestamp;
}

static inline void
perf_cgroup_defer_enabled(struct perf_event *event)
{
        /*
         * when the current task's perf cgroup does not match
         * the event's, we need to remember to call the
         * perf_mark_enable() function the first time a task with
         * a matching perf cgroup is scheduled in.
         */
        if (is_cgroup_event(event) && !perf_cgroup_match(event))
                event->cgrp_defer_enabled = 1;
}

static inline void
perf_cgroup_mark_enabled(struct perf_event *event,
                         struct perf_event_context *ctx)
{
        struct perf_event *sub;
        u64 tstamp = perf_event_time(event);

        if (!event->cgrp_defer_enabled)
                return;

        event->cgrp_defer_enabled = 0;

        event->tstamp_enabled = tstamp - event->total_time_enabled;
        list_for_each_entry(sub, &event->sibling_list, group_entry) {
                if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
                        sub->tstamp_enabled = tstamp - sub->total_time_enabled;
                        sub->cgrp_defer_enabled = 0;
                }
        }
}
#else /* !CONFIG_CGROUP_PERF */

static inline bool
perf_cgroup_match(struct perf_event *event)
{
        return true;
}

static inline void perf_detach_cgroup(struct perf_event *event)
{}

static inline int is_cgroup_event(struct perf_event *event)
{
        return 0;
}

static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
{
        return 0;
}

static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}

static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
}

static inline void perf_cgroup_sched_out(struct task_struct *task,
                                         struct task_struct *next)
{
}

static inline void perf_cgroup_sched_in(struct task_struct *prev,
                                        struct task_struct *task)
{
}

static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
                                      struct perf_event_attr *attr,
                                      struct perf_event *group_leader)
{
        return -EINVAL;
}

static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
                          struct perf_event_context *ctx)
{
}

void
perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
{
}

static inline void
perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
{
}

static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
        return 0;
}

static inline void
perf_cgroup_defer_enabled(struct perf_event *event)
{
}

static inline void
perf_cgroup_mark_enabled(struct perf_event *event,
                         struct perf_event_context *ctx)
{
}
#endif

/*
 * set default to be dependent on timer tick just
 * like original code
 */
#define PERF_CPU_HRTIMER (1000 / HZ)
/*
 * function must be called with interrupts disabled
 */
static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
        struct perf_cpu_context *cpuctx;
        int rotations = 0;

        WARN_ON(!irqs_disabled());

        cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
        rotations = perf_rotate_context(cpuctx);

        raw_spin_lock(&cpuctx->hrtimer_lock);
        if (rotations)
                hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
        else
                cpuctx->hrtimer_active = 0;
        raw_spin_unlock(&cpuctx->hrtimer_lock);

        return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
}

static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
{
        struct hrtimer *timer = &cpuctx->hrtimer;
        struct pmu *pmu = cpuctx->ctx.pmu;
        u64 interval;

        /* no multiplexing needed for SW PMU */
        if (pmu->task_ctx_nr == perf_sw_context)
                return;

        /*
         * check default is sane, if not set then force to
         * default interval (1/tick)
         */
        interval = pmu->hrtimer_interval_ms;
        if (interval < 1)
                interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;

        cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);

        raw_spin_lock_init(&cpuctx->hrtimer_lock);
        hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
        timer->function = perf_mux_hrtimer_handler;
}

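/*
 * Arm the multiplexing hrtimer for @cpuctx if it is not already active.
 */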
static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
{
        struct hrtimer *timer = &cpuctx->hrtimer;
        struct pmu *pmu = cpuctx->ctx.pmu;
        unsigned long flags;

        /* not for SW PMU */
        if (pmu->task_ctx_nr == perf_sw_context)
                return 0;

        raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
        if (!cpuctx->hrtimer_active) {
                cpuctx->hrtimer_active = 1;
                hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
                hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
        }
        raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);

        return 0;
}

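/*
 * perf_pmu_disable()/perf_pmu_enable() nest through a per-cpu count so
 * that only the outermost pair actually calls into the PMU driver.
 */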
void perf_pmu_disable(struct pmu *pmu)
{
        int *count = this_cpu_ptr(pmu->pmu_disable_count);
        if (!(*count)++)
                pmu->pmu_disable(pmu);
}

void perf_pmu_enable(struct pmu *pmu)
{
        int *count = this_cpu_ptr(pmu->pmu_disable_count);
        if (!--(*count))
                pmu->pmu_enable(pmu);
}

static DEFINE_PER_CPU(struct list_head, active_ctx_list);

/*
 * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
 * perf_event_task_tick() are fully serialized because they're strictly cpu
 * affine and perf_event_ctx{activate,deactivate} are called with IRQs
 * disabled, while perf_event_task_tick is called from IRQ context.
 */
static void perf_event_ctx_activate(struct perf_event_context *ctx)
{
        struct list_head *head = this_cpu_ptr(&active_ctx_list);

        WARN_ON(!irqs_disabled());

        WARN_ON(!list_empty(&ctx->active_ctx_list));

        list_add(&ctx->active_ctx_list, head);
}

static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
{
        WARN_ON(!irqs_disabled());

        WARN_ON(list_empty(&ctx->active_ctx_list));

        list_del_init(&ctx->active_ctx_list);
}

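/*
 * Context reference counting: get_ctx() must only be used on a context
 * already known to have a non-zero refcount; the final put_ctx() drops the
 * parent and task references and frees the context after an RCU grace
 * period, see free_ctx().
 */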
static void get_ctx(struct perf_event_context *ctx)
{
        WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
}

static void free_ctx(struct rcu_head *head)
{
        struct perf_event_context *ctx;

        ctx = container_of(head, struct perf_event_context, rcu_head);
        kfree(ctx->task_ctx_data);
        kfree(ctx);
}

static void put_ctx(struct perf_event_context *ctx)
{
        if (atomic_dec_and_test(&ctx->refcount)) {
                if (ctx->parent_ctx)
                        put_ctx(ctx->parent_ctx);
                if (ctx->task && ctx->task != TASK_TOMBSTONE)
                        put_task_struct(ctx->task);
                call_rcu(&ctx->rcu_head, free_ctx);
        }
}

/*
 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
 * perf_pmu_migrate_context() we need some magic.
 *
 * Those places that change perf_event::ctx will hold both
 * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
 *
 * Lock ordering is by mutex address. There are two other sites where
 * perf_event_context::mutex nests and those are:
 *
 *  - perf_event_exit_task_context()     [ child , 0 ]
 *      perf_event_exit_event()
 *        put_event()                    [ parent, 1 ]
 *
 *  - perf_event_init_context()          [ parent, 0 ]
 *      inherit_task_group()
 *        inherit_group()
 *          inherit_event()
 *            perf_event_alloc()
 *              perf_init_event()
 *                perf_try_init_event()  [ child , 1 ]
 *
 * While it appears there is an obvious deadlock here -- the parent and child
 * nesting levels are inverted between the two. This is in fact safe because
 * life-time rules separate them. That is an exiting task cannot fork, and a
 * spawning task cannot (yet) exit.
 *
 * But remember that these are parent<->child context relations, and
 * migration does not affect children, therefore these two orderings should not
 * interact.
 *
 * The change in perf_event::ctx does not affect children (as claimed above)
 * because the sys_perf_event_open() case will install a new event and break
 * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
 * concerned with cpuctx and that doesn't have children.
 *
 * The places that change perf_event::ctx will issue:
 *
 *   perf_remove_from_context();
 *   synchronize_rcu();
 *   perf_install_in_context();
 *
 * to affect the change. The remove_from_context() + synchronize_rcu() should
 * quiesce the event, after which we can install it in the new location. This
 * means that only external vectors (perf_fops, prctl) can perturb the event
 * while in transit. Therefore all such accessors should also acquire
 * perf_event_context::mutex to serialize against this.
 *
 * However; because event->ctx can change while we're waiting to acquire
 * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
 * function.
 *
 * Lock order:
 *    task_struct::perf_event_mutex
 *      perf_event_context::mutex
 *        perf_event::child_mutex;
 *          perf_event_context::lock
 *        perf_event::mmap_mutex
 *        mmap_sem
 */
static struct perf_event_context *
perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
{
        struct perf_event_context *ctx;

again:
        rcu_read_lock();
        ctx = ACCESS_ONCE(event->ctx);
        if (!atomic_inc_not_zero(&ctx->refcount)) {
                rcu_read_unlock();
                goto again;
        }
        rcu_read_unlock();

        mutex_lock_nested(&ctx->mutex, nesting);
        if (event->ctx != ctx) {
                mutex_unlock(&ctx->mutex);
                put_ctx(ctx);
                goto again;
        }

        return ctx;
}

static inline struct perf_event_context *
perf_event_ctx_lock(struct perf_event *event)
{
        return perf_event_ctx_lock_nested(event, 0);
}

static void perf_event_ctx_unlock(struct perf_event *event,
                                  struct perf_event_context *ctx)
{
        mutex_unlock(&ctx->mutex);
        put_ctx(ctx);
}

/*
 * This must be done under the ctx->lock, such as to serialize against
 * context_equiv(), therefore we cannot call put_ctx() since that might end up
 * calling scheduler related locks and ctx->lock nests inside those.
 */
static __must_check struct perf_event_context *
unclone_ctx(struct perf_event_context *ctx)
{
        struct perf_event_context *parent_ctx = ctx->parent_ctx;

        lockdep_assert_held(&ctx->lock);

        if (parent_ctx)
                ctx->parent_ctx = NULL;
        ctx->generation++;

        return parent_ctx;
}

static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
{
        /*
         * only top level events have the pid namespace they were created in
         */
        if (event->parent)
                event = event->parent;

        return task_tgid_nr_ns(p, event->ns);
}

static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
{
        /*
         * only top level events have the pid namespace they were created in
         */
        if (event->parent)
                event = event->parent;

        return task_pid_nr_ns(p, event->ns);
}

/*
 * If we inherit events we want to return the parent event id
 * to userspace.
 */
static u64 primary_event_id(struct perf_event *event)
{
        u64 id = event->id;

        if (event->parent)
                id = event->parent->id;

        return id;
}

/*
 * Get the perf_event_context for a task and lock it.
 *
 * This has to cope with the fact that until it is locked,
 * the context could get moved to another task.
 */
static struct perf_event_context *
perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
{
        struct perf_event_context *ctx;

retry:
        /*
         * One of the few rules of preemptible RCU is that one cannot do
         * rcu_read_unlock() while holding a scheduler (or nested) lock when
         * part of the read side critical section was irqs-enabled -- see
         * rcu_read_unlock_special().
         *
         * Since ctx->lock nests under rq->lock we must ensure the entire read
         * side critical section has interrupts disabled.
         */
        local_irq_save(*flags);
        rcu_read_lock();
        ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
        if (ctx) {
                /*
                 * If this context is a clone of another, it might
                 * get swapped for another underneath us by
                 * perf_event_task_sched_out, though the
                 * rcu_read_lock() protects us from any context
                 * getting freed.  Lock the context and check if it
                 * got swapped before we could get the lock, and retry
                 * if so.  If we locked the right context, then it
                 * can't get swapped on us any more.
                 */
                raw_spin_lock(&ctx->lock);
                if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
                        raw_spin_unlock(&ctx->lock);
                        rcu_read_unlock();
                        local_irq_restore(*flags);
                        goto retry;
                }

                if (ctx->task == TASK_TOMBSTONE ||
                    !atomic_inc_not_zero(&ctx->refcount)) {
                        raw_spin_unlock(&ctx->lock);
                        ctx = NULL;
                } else {
                        WARN_ON_ONCE(ctx->task != task);
                }
        }
        rcu_read_unlock();
        if (!ctx)
                local_irq_restore(*flags);
        return ctx;
}

/*
 * Get the context for a task and increment its pin_count so it
 * can't get swapped to another task.  This also increments its
 * reference count so that the context can't get freed.
 */
static struct perf_event_context *
perf_pin_task_context(struct task_struct *task, int ctxn)
{
        struct perf_event_context *ctx;
        unsigned long flags;

        ctx = perf_lock_task_context(task, ctxn, &flags);
        if (ctx) {
                ++ctx->pin_count;
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
        }
        return ctx;
}

static void perf_unpin_context(struct perf_event_context *ctx)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&ctx->lock, flags);
        --ctx->pin_count;
        raw_spin_unlock_irqrestore(&ctx->lock, flags);
}

/*
 * Update the record of the current time in a context.
 */
static void update_context_time(struct perf_event_context *ctx)
{
        u64 now = perf_clock();

        ctx->time += now - ctx->timestamp;
        ctx->timestamp = now;
}

static u64 perf_event_time(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;

        if (is_cgroup_event(event))
                return perf_cgroup_event_time(event);

        return ctx ? ctx->time : 0;
}

/*
 * Update the total_time_enabled and total_time_running fields for an event.
 * The caller of this function needs to hold the ctx->lock.
 */
static void update_event_times(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;
        u64 run_end;

        if (event->state < PERF_EVENT_STATE_INACTIVE ||
            event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
                return;
        /*
         * in cgroup mode, time_enabled represents
         * the time the event was enabled AND active
         * tasks were in the monitored cgroup. This is
         * independent of the activity of the context as
         * there may be a mix of cgroup and non-cgroup events.
         *
         * That is why we treat cgroup events differently
         * here.
         */
        if (is_cgroup_event(event))
                run_end = perf_cgroup_event_time(event);
        else if (ctx->is_active)
                run_end = ctx->time;
        else
                run_end = event->tstamp_stopped;

        event->total_time_enabled = run_end - event->tstamp_enabled;

        if (event->state == PERF_EVENT_STATE_INACTIVE)
                run_end = event->tstamp_stopped;
        else
                run_end = perf_event_time(event);

        event->total_time_running = run_end - event->tstamp_running;
}

/*
 * Update total_time_enabled and total_time_running for all events in a group.
 */
static void update_group_times(struct perf_event *leader)
{
        struct perf_event *event;

        update_event_times(leader);
        list_for_each_entry(event, &leader->sibling_list, group_entry)
                update_event_times(event);
}

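/*
 * Return the group list an event belongs on: ctx->pinned_groups for
 * pinned events, ctx->flexible_groups for everything else.
 */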
Frederic Weisbecker889ff012010-01-09 20:04:47 +01001347static struct list_head *
1348ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
1349{
1350 if (event->attr.pinned)
1351 return &ctx->pinned_groups;
1352 else
1353 return &ctx->flexible_groups;
1354}
1355
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001356/*
1357 * Add a event from the lists for its context.
1358 * Must be called with ctx->mutex and ctx->lock held.
1359 */
1360static void
1361list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1362{
Peter Zijlstrac994d612016-01-08 09:20:23 +01001363 lockdep_assert_held(&ctx->lock);
1364
Peter Zijlstra8a495422010-05-27 15:47:49 +02001365 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1366 event->attach_state |= PERF_ATTACH_CONTEXT;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001367
1368 /*
Peter Zijlstra8a495422010-05-27 15:47:49 +02001369 * If we're a standalone event or group leader, we go to the context
1370 * list; group events are kept attached to the group so that
1371 * perf_group_detach can, at all times, locate all siblings.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001372 */
Peter Zijlstra8a495422010-05-27 15:47:49 +02001373 if (event->group_leader == event) {
Frederic Weisbecker889ff012010-01-09 20:04:47 +01001374 struct list_head *list;
1375
Frederic Weisbeckerd6f962b2010-01-10 01:25:51 +01001376 if (is_software_event(event))
1377 event->group_flags |= PERF_GROUP_SOFTWARE;
1378
Frederic Weisbecker889ff012010-01-09 20:04:47 +01001379 list = ctx_group_list(event, ctx);
1380 list_add_tail(&event->group_entry, list);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001381 }
1382
Peter Zijlstra08309372011-03-03 11:31:20 +01001383 if (is_cgroup_event(event))
Stephane Eraniane5d13672011-02-14 11:20:01 +02001384 ctx->nr_cgroups++;
Stephane Eraniane5d13672011-02-14 11:20:01 +02001385
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001386 list_add_rcu(&event->event_entry, &ctx->event_list);
1387 ctx->nr_events++;
1388 if (event->attr.inherit_stat)
1389 ctx->nr_stat++;
Peter Zijlstra5a3126d2013-10-07 17:12:48 +02001390
1391 ctx->generation++;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001392}
1393
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02001394/*
Jiri Olsa0231bb52013-02-01 11:23:45 +01001395 * Initialize event state based on the perf_event_attr::disabled.
1396 */
1397static inline void perf_event__state_init(struct perf_event *event)
1398{
1399 event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1400 PERF_EVENT_STATE_INACTIVE;
1401}
1402
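/*
 * Precompute the size of the record that read() returns for this event,
 * driven by attr.read_format (cf. the read_format layout in
 * include/uapi/linux/perf_event.h).  With PERF_FORMAT_GROUP it is roughly:
 *
 *	u64 nr;
 *	u64 time_enabled;	(if PERF_FORMAT_TOTAL_TIME_ENABLED)
 *	u64 time_running;	(if PERF_FORMAT_TOTAL_TIME_RUNNING)
 *	{ u64 value; u64 id; }	cntr[nr];   (id only if PERF_FORMAT_ID)
 *
 * and without PERF_FORMAT_GROUP a single value plus the optional
 * time/id fields, which is what the sizing below works out to.
 */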
Peter Zijlstraa7239682015-09-09 19:06:33 +02001403static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02001404{
1405 int entry = sizeof(u64); /* value */
1406 int size = 0;
1407 int nr = 1;
1408
1409 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1410 size += sizeof(u64);
1411
1412 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1413 size += sizeof(u64);
1414
1415 if (event->attr.read_format & PERF_FORMAT_ID)
1416 entry += sizeof(u64);
1417
1418 if (event->attr.read_format & PERF_FORMAT_GROUP) {
Peter Zijlstraa7239682015-09-09 19:06:33 +02001419 nr += nr_siblings;
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02001420 size += sizeof(u64);
1421 }
1422
1423 size += entry * nr;
1424 event->read_size = size;
1425}
1426
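/*
 * Precompute the fixed part of the sample record size for the sample_type
 * bits handled here; variable-length parts (callchains, raw data, branch
 * stacks, ...) are accounted for when the sample is actually generated.
 */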
Peter Zijlstraa7239682015-09-09 19:06:33 +02001427static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02001428{
1429 struct perf_sample_data *data;
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02001430 u16 size = 0;
1431
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02001432 if (sample_type & PERF_SAMPLE_IP)
1433 size += sizeof(data->ip);
1434
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -02001435 if (sample_type & PERF_SAMPLE_ADDR)
1436 size += sizeof(data->addr);
1437
1438 if (sample_type & PERF_SAMPLE_PERIOD)
1439 size += sizeof(data->period);
1440
Andi Kleenc3feedf2013-01-24 16:10:28 +01001441 if (sample_type & PERF_SAMPLE_WEIGHT)
1442 size += sizeof(data->weight);
1443
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -02001444 if (sample_type & PERF_SAMPLE_READ)
1445 size += event->read_size;
1446
Stephane Eraniand6be9ad2013-01-24 16:10:31 +01001447 if (sample_type & PERF_SAMPLE_DATA_SRC)
1448 size += sizeof(data->data_src.val);
1449
Andi Kleenfdfbbd02013-09-20 07:40:39 -07001450 if (sample_type & PERF_SAMPLE_TRANSACTION)
1451 size += sizeof(data->txn);
1452
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -02001453 event->header_size = size;
1454}
1455
Peter Zijlstraa7239682015-09-09 19:06:33 +02001456/*
1457 * Called at perf_event creation and when events are attached/detached from a
1458 * group.
1459 */
1460static void perf_event__header_size(struct perf_event *event)
1461{
1462 __perf_event_read_size(event,
1463 event->group_leader->nr_siblings);
1464 __perf_event_header_size(event, event->attr.sample_type);
1465}
1466
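/*
 * Precompute the size of the ID sample fields (TID, TIME, IDENTIFIER, ID,
 * STREAM_ID, CPU) that get appended to records when sample_id_all is in
 * use, so the output code can reserve space for them up front.
 */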
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -02001467static void perf_event__id_header_size(struct perf_event *event)
1468{
1469 struct perf_sample_data *data;
1470 u64 sample_type = event->attr.sample_type;
1471 u16 size = 0;
1472
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02001473 if (sample_type & PERF_SAMPLE_TID)
1474 size += sizeof(data->tid_entry);
1475
1476 if (sample_type & PERF_SAMPLE_TIME)
1477 size += sizeof(data->time);
1478
Adrian Hunterff3d5272013-08-27 11:23:07 +03001479 if (sample_type & PERF_SAMPLE_IDENTIFIER)
1480 size += sizeof(data->id);
1481
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02001482 if (sample_type & PERF_SAMPLE_ID)
1483 size += sizeof(data->id);
1484
1485 if (sample_type & PERF_SAMPLE_STREAM_ID)
1486 size += sizeof(data->stream_id);
1487
1488 if (sample_type & PERF_SAMPLE_CPU)
1489 size += sizeof(data->cpu_entry);
1490
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -02001491 event->id_header_size = size;
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02001492}
1493
Peter Zijlstraa7239682015-09-09 19:06:33 +02001494static bool perf_event_validate_size(struct perf_event *event)
1495{
1496 /*
1497 * The values computed here will be over-written when we actually
1498 * attach the event.
1499 */
1500 __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1501 __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1502 perf_event__id_header_size(event);
1503
1504 /*
1505 * Sum the lot; should not exceed the 64k limit we have on records.
1506 * Conservative limit to allow for callchains and other variable fields.
1507 */
1508 if (event->read_size + event->header_size +
1509 event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1510 return false;
1511
1512 return true;
1513}
1514
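/*
 * Attach an event to its group leader's sibling list.  A group that was
 * pure-software loses PERF_GROUP_SOFTWARE as soon as a hardware event
 * joins, and every member's header size is recomputed because the
 * read_format group record just grew by one entry.
 */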
Peter Zijlstra8a495422010-05-27 15:47:49 +02001515static void perf_group_attach(struct perf_event *event)
1516{
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02001517 struct perf_event *group_leader = event->group_leader, *pos;
Peter Zijlstra8a495422010-05-27 15:47:49 +02001518
Peter Zijlstra74c33372010-10-15 11:40:29 +02001519 /*
1520 * We can have double attach due to group movement in perf_event_open.
1521 */
1522 if (event->attach_state & PERF_ATTACH_GROUP)
1523 return;
1524
Peter Zijlstra8a495422010-05-27 15:47:49 +02001525 event->attach_state |= PERF_ATTACH_GROUP;
1526
1527 if (group_leader == event)
1528 return;
1529
Peter Zijlstra652884f2015-01-23 11:20:10 +01001530 WARN_ON_ONCE(group_leader->ctx != event->ctx);
1531
Peter Zijlstra8a495422010-05-27 15:47:49 +02001532 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
1533 !is_software_event(event))
1534 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
1535
1536 list_add_tail(&event->group_entry, &group_leader->sibling_list);
1537 group_leader->nr_siblings++;
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02001538
1539 perf_event__header_size(group_leader);
1540
1541 list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
1542 perf_event__header_size(pos);
Peter Zijlstra8a495422010-05-27 15:47:49 +02001543}
1544
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001545/*
1546 * Remove an event from the lists for its context.
1547 * Must be called with ctx->mutex and ctx->lock held.
1548 */
1549static void
1550list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1551{
Stephane Eranian68cacd22011-03-23 16:03:06 +01001552 struct perf_cpu_context *cpuctx;
Peter Zijlstra652884f2015-01-23 11:20:10 +01001553
1554 WARN_ON_ONCE(event->ctx != ctx);
1555 lockdep_assert_held(&ctx->lock);
1556
Peter Zijlstra8a495422010-05-27 15:47:49 +02001557 /*
1558 * We can have double detach due to exit/hot-unplug + close.
1559 */
1560 if (!(event->attach_state & PERF_ATTACH_CONTEXT))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001561 return;
Peter Zijlstra8a495422010-05-27 15:47:49 +02001562
1563 event->attach_state &= ~PERF_ATTACH_CONTEXT;
1564
Stephane Eranian68cacd22011-03-23 16:03:06 +01001565 if (is_cgroup_event(event)) {
Stephane Eraniane5d13672011-02-14 11:20:01 +02001566 ctx->nr_cgroups--;
Peter Zijlstra70a01652016-01-08 09:29:16 +01001567 /*
1568 * Because cgroup events are always per-cpu events, this will
1569 * always be called from the right CPU.
1570 */
Stephane Eranian68cacd22011-03-23 16:03:06 +01001571 cpuctx = __get_cpu_context(ctx);
1572 /*
Peter Zijlstra70a01652016-01-08 09:29:16 +01001573 * If there are no more cgroup events then clear cgrp to avoid
1574 * a stale pointer in update_cgrp_time_from_cpuctx().
Stephane Eranian68cacd22011-03-23 16:03:06 +01001575 */
1576 if (!ctx->nr_cgroups)
1577 cpuctx->cgrp = NULL;
1578 }
Stephane Eraniane5d13672011-02-14 11:20:01 +02001579
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001580 ctx->nr_events--;
1581 if (event->attr.inherit_stat)
1582 ctx->nr_stat--;
1583
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001584 list_del_rcu(&event->event_entry);
1585
Peter Zijlstra8a495422010-05-27 15:47:49 +02001586 if (event->group_leader == event)
1587 list_del_init(&event->group_entry);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001588
Peter Zijlstra96c21a42010-05-11 16:19:10 +02001589 update_group_times(event);
Stephane Eranianb2e74a22009-11-26 09:24:30 -08001590
1591 /*
1592 * If the event was in error state, then keep it
1593 * that way; otherwise bogus counts will be
1594 * returned on read(). The only way to get out
1595 * of error state is by explicitly re-enabling
1596 * the event.
1597 */
1598 if (event->state > PERF_EVENT_STATE_OFF)
1599 event->state = PERF_EVENT_STATE_OFF;
Peter Zijlstra5a3126d2013-10-07 17:12:48 +02001600
1601 ctx->generation++;
Peter Zijlstra050735b2010-05-11 11:51:53 +02001602}
1603
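/*
 * Detach an event from its group.  If it is a sibling it is simply
 * removed from the leader's list; if it is a leader, its siblings are
 * upgraded to singleton events (see the comment below) and header sizes
 * are recomputed for whatever remains of the group.
 */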
Peter Zijlstra8a495422010-05-27 15:47:49 +02001604static void perf_group_detach(struct perf_event *event)
Peter Zijlstra050735b2010-05-11 11:51:53 +02001605{
1606 struct perf_event *sibling, *tmp;
Peter Zijlstra8a495422010-05-27 15:47:49 +02001607 struct list_head *list = NULL;
1608
1609 /*
1610 * We can have double detach due to exit/hot-unplug + close.
1611 */
1612 if (!(event->attach_state & PERF_ATTACH_GROUP))
1613 return;
1614
1615 event->attach_state &= ~PERF_ATTACH_GROUP;
1616
1617 /*
1618 * If this is a sibling, remove it from its group.
1619 */
1620 if (event->group_leader != event) {
1621 list_del_init(&event->group_entry);
1622 event->group_leader->nr_siblings--;
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02001623 goto out;
Peter Zijlstra8a495422010-05-27 15:47:49 +02001624 }
1625
1626 if (!list_empty(&event->group_entry))
1627 list = &event->group_entry;
Peter Zijlstra2e2af502009-11-23 11:37:25 +01001628
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001629 /*
1630 * If this was a group event with sibling events then
1631 * upgrade the siblings to singleton events by adding them
Peter Zijlstra8a495422010-05-27 15:47:49 +02001632 * to whatever list we are on.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001633 */
1634 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
Peter Zijlstra8a495422010-05-27 15:47:49 +02001635 if (list)
1636 list_move_tail(&sibling->group_entry, list);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001637 sibling->group_leader = sibling;
Frederic Weisbeckerd6f962b2010-01-10 01:25:51 +01001638
1639 /* Inherit group flags from the previous leader */
1640 sibling->group_flags = event->group_flags;
Peter Zijlstra652884f2015-01-23 11:20:10 +01001641
1642 WARN_ON_ONCE(sibling->ctx != event->ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001643 }
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02001644
1645out:
1646 perf_event__header_size(event->group_leader);
1647
1648 list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1649 perf_event__header_size(tmp);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001650}
1651
Jiri Olsafadfe7b2014-08-01 14:33:02 +02001652static bool is_orphaned_event(struct perf_event *event)
1653{
Peter Zijlstraa69b0ca2016-02-24 18:45:44 +01001654 return event->state == PERF_EVENT_STATE_DEAD;
Jiri Olsafadfe7b2014-08-01 14:33:02 +02001655}
1656
Mark Rutland66eb5792015-05-13 17:12:23 +01001657static inline int pmu_filter_match(struct perf_event *event)
1658{
1659 struct pmu *pmu = event->pmu;
1660 return pmu->filter_match ? pmu->filter_match(event) : 1;
1661}
1662
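/*
 * An event can only be scheduled on this CPU if its CPU filter, its
 * cgroup constraint and any PMU-specific filter all match.
 */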
Stephane Eranianfa66f072010-08-26 16:40:01 +02001663static inline int
1664event_filter_match(struct perf_event *event)
1665{
Stephane Eraniane5d13672011-02-14 11:20:01 +02001666 return (event->cpu == -1 || event->cpu == smp_processor_id())
Mark Rutland66eb5792015-05-13 17:12:23 +01001667 && perf_cgroup_match(event) && pmu_filter_match(event);
Stephane Eranianfa66f072010-08-26 16:40:01 +02001668}
1669
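/*
 * Stop an event on the PMU and fold it back to INACTIVE (or OFF if a
 * disable was pending), updating its timestamps and the context's count
 * of active events.  Events that never got scheduled because their
 * filter did not match still have their timings advanced here so that
 * time_enabled/time_running stay meaningful.
 */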
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02001670static void
1671event_sched_out(struct perf_event *event,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001672 struct perf_cpu_context *cpuctx,
1673 struct perf_event_context *ctx)
1674{
Stephane Eranian41587552011-01-03 18:20:01 +02001675 u64 tstamp = perf_event_time(event);
Stephane Eranianfa66f072010-08-26 16:40:01 +02001676 u64 delta;
Peter Zijlstra652884f2015-01-23 11:20:10 +01001677
1678 WARN_ON_ONCE(event->ctx != ctx);
1679 lockdep_assert_held(&ctx->lock);
1680
Stephane Eranianfa66f072010-08-26 16:40:01 +02001681 /*
1682 * An event which could not be activated because of
1683 * filter mismatch still needs to have its timings
1684 * maintained, otherwise bogus information is returned
1685 * via read() for time_enabled, time_running:
1686 */
1687 if (event->state == PERF_EVENT_STATE_INACTIVE
1688 && !event_filter_match(event)) {
Stephane Eraniane5d13672011-02-14 11:20:01 +02001689 delta = tstamp - event->tstamp_stopped;
Stephane Eranianfa66f072010-08-26 16:40:01 +02001690 event->tstamp_running += delta;
Stephane Eranian41587552011-01-03 18:20:01 +02001691 event->tstamp_stopped = tstamp;
Stephane Eranianfa66f072010-08-26 16:40:01 +02001692 }
1693
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001694 if (event->state != PERF_EVENT_STATE_ACTIVE)
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02001695 return;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001696
Alexander Shishkin44377272013-12-16 14:17:36 +02001697 perf_pmu_disable(event->pmu);
1698
Peter Zijlstra28a967c2016-02-24 18:45:46 +01001699 event->tstamp_stopped = tstamp;
1700 event->pmu->del(event, 0);
1701 event->oncpu = -1;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001702 event->state = PERF_EVENT_STATE_INACTIVE;
1703 if (event->pending_disable) {
1704 event->pending_disable = 0;
1705 event->state = PERF_EVENT_STATE_OFF;
1706 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001707
1708 if (!is_software_event(event))
1709 cpuctx->active_oncpu--;
Mark Rutland2fde4f92015-01-07 15:01:54 +00001710 if (!--ctx->nr_active)
1711 perf_event_ctx_deactivate(ctx);
Peter Zijlstra0f5a2602011-11-16 14:38:16 +01001712 if (event->attr.freq && event->attr.sample_freq)
1713 ctx->nr_freq--;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001714 if (event->attr.exclusive || !cpuctx->active_oncpu)
1715 cpuctx->exclusive = 0;
Alexander Shishkin44377272013-12-16 14:17:36 +02001716
1717 perf_pmu_enable(event->pmu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001718}
1719
1720static void
1721group_sched_out(struct perf_event *group_event,
1722 struct perf_cpu_context *cpuctx,
1723 struct perf_event_context *ctx)
1724{
1725 struct perf_event *event;
Stephane Eranianfa66f072010-08-26 16:40:01 +02001726 int state = group_event->state;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001727
1728 event_sched_out(group_event, cpuctx, ctx);
1729
1730 /*
1731 * Schedule out siblings (if any):
1732 */
1733 list_for_each_entry(event, &group_event->sibling_list, group_entry)
1734 event_sched_out(event, cpuctx, ctx);
1735
Stephane Eranianfa66f072010-08-26 16:40:01 +02001736 if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001737 cpuctx->exclusive = 0;
1738}
1739
Peter Zijlstra45a0e072016-01-26 13:09:48 +01001740#define DETACH_GROUP 0x01UL
Peter Zijlstra00179602015-11-30 16:26:35 +01001741
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001742/*
1743 * Cross CPU call to remove a performance event
1744 *
1745 * We disable the event on the hardware level first. After that we
1746 * remove it from the context list.
1747 */
Peter Zijlstrafae3fde2016-01-11 15:00:50 +01001748static void
1749__perf_remove_from_context(struct perf_event *event,
1750 struct perf_cpu_context *cpuctx,
1751 struct perf_event_context *ctx,
1752 void *info)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001753{
Peter Zijlstra45a0e072016-01-26 13:09:48 +01001754 unsigned long flags = (unsigned long)info;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001755
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001756 event_sched_out(event, cpuctx, ctx);
Peter Zijlstra45a0e072016-01-26 13:09:48 +01001757 if (flags & DETACH_GROUP)
Peter Zijlstra46ce0fe2014-05-02 16:56:01 +02001758 perf_group_detach(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001759 list_del_event(event, ctx);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001760
Peter Zijlstra39a43642016-01-11 12:46:35 +01001761 if (!ctx->nr_events && ctx->is_active) {
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01001762 ctx->is_active = 0;
Peter Zijlstra39a43642016-01-11 12:46:35 +01001763 if (ctx->task) {
1764 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
1765 cpuctx->task_ctx = NULL;
1766 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001767 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001768}
1769
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001770/*
1771 * Remove the event from a task's (or a CPU's) list of events.
1772 *
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001773 * If event->ctx is a cloned context, callers must make sure that
1774 * every task struct that event->ctx->task could possibly point to
1775 * remains valid. This is OK when called from perf_release since
1776 * that only calls us on the top-level context, which can't be a clone.
1777 * When called from perf_event_exit_task, it's OK because the
1778 * context has been detached from its task.
1779 */
Peter Zijlstra45a0e072016-01-26 13:09:48 +01001780static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001781{
Peter Zijlstrafae3fde2016-01-11 15:00:50 +01001782 lockdep_assert_held(&event->ctx->mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001783
Peter Zijlstra45a0e072016-01-26 13:09:48 +01001784 event_function_call(event, __perf_remove_from_context, (void *)flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001785}
1786
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001787/*
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001788 * Cross CPU call to disable a performance event
1789 */
Peter Zijlstrafae3fde2016-01-11 15:00:50 +01001790static void __perf_event_disable(struct perf_event *event,
1791 struct perf_cpu_context *cpuctx,
1792 struct perf_event_context *ctx,
1793 void *info)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001794{
Peter Zijlstrafae3fde2016-01-11 15:00:50 +01001795 if (event->state < PERF_EVENT_STATE_INACTIVE)
1796 return;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001797
Peter Zijlstrafae3fde2016-01-11 15:00:50 +01001798 update_context_time(ctx);
1799 update_cgrp_time_from_event(event);
1800 update_group_times(event);
1801 if (event == event->group_leader)
1802 group_sched_out(event, cpuctx, ctx);
1803 else
1804 event_sched_out(event, cpuctx, ctx);
1805 event->state = PERF_EVENT_STATE_OFF;
Peter Zijlstra7b648012015-12-03 18:35:21 +01001806}
1807
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001808/*
1809 * Disable an event.
1810 *
1811 * If event->ctx is a cloned context, callers must make sure that
1812 * every task struct that event->ctx->task could possibly point to
1813 * remains valid. This condition is satisfied when called through
1814 * perf_event_for_each_child or perf_event_for_each because they
1815 * hold the top-level event's child_mutex, so any descendant that
Peter Zijlstra8ba289b2016-01-26 13:06:56 +01001816 * goes to exit will block in perf_event_exit_event().
1817 *
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001818 * When called from perf_pending_event it's OK because event->ctx
1819 * is the current context on this CPU and preemption is disabled,
1820 * hence we can't get into perf_event_task_sched_out for this context.
1821 */
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01001822static void _perf_event_disable(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001823{
1824 struct perf_event_context *ctx = event->ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001825
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001826 raw_spin_lock_irq(&ctx->lock);
Peter Zijlstra7b648012015-12-03 18:35:21 +01001827 if (event->state <= PERF_EVENT_STATE_OFF) {
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001828 raw_spin_unlock_irq(&ctx->lock);
Peter Zijlstra7b648012015-12-03 18:35:21 +01001829 return;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001830 }
Thomas Gleixnere625cce12009-11-17 18:02:06 +01001831 raw_spin_unlock_irq(&ctx->lock);
Peter Zijlstra7b648012015-12-03 18:35:21 +01001832
Peter Zijlstrafae3fde2016-01-11 15:00:50 +01001833 event_function_call(event, __perf_event_disable, NULL);
1834}
1835
1836void perf_event_disable_local(struct perf_event *event)
1837{
1838 event_function_local(event, __perf_event_disable, NULL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001839}
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01001840
1841/*
1842 * Strictly speaking kernel users cannot create groups and therefore this
1843 * interface does not need the perf_event_ctx_lock() magic.
1844 */
1845void perf_event_disable(struct perf_event *event)
1846{
1847 struct perf_event_context *ctx;
1848
1849 ctx = perf_event_ctx_lock(event);
1850 _perf_event_disable(event);
1851 perf_event_ctx_unlock(event, ctx);
1852}
Robert Richterdcfce4a2011-10-11 17:11:08 +02001853EXPORT_SYMBOL_GPL(perf_event_disable);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001854
Stephane Eraniane5d13672011-02-14 11:20:01 +02001855static void perf_set_shadow_time(struct perf_event *event,
1856 struct perf_event_context *ctx,
1857 u64 tstamp)
1858{
1859 /*
1860 * use the correct time source for the time snapshot
1861 *
1862 * We could get by without this by leveraging the
1863 * fact that to get to this function, the caller
1864 * has most likely already called update_context_time()
1865 * and update_cgrp_time_xx() and thus both timestamp
1866 * are identical (or very close). Given that tstamp is,
1867 * already adjusted for cgroup, we could say that:
1868 * tstamp - ctx->timestamp
1869 * is equivalent to
1870 * tstamp - cgrp->timestamp.
1871 *
1872 * Then, in perf_output_read(), the calculation would
1873 * work with no changes because:
1874 * - event is guaranteed scheduled in
1875 * - no scheduled out in between
1876 * - thus the timestamp would be the same
1877 *
1878 * But this is a bit hairy.
1879 *
1880 * So instead, we have an explicit cgroup call to remain
1881 * within the same time source all along. We believe it
1882 * is cleaner and simpler to understand.
1883 */
1884 if (is_cgroup_event(event))
1885 perf_cgroup_set_shadow_time(event, tstamp);
1886 else
1887 event->shadow_ctx_time = tstamp - ctx->timestamp;
1888}
1889
Peter Zijlstra4fe757d2011-02-15 22:26:07 +01001890#define MAX_INTERRUPTS (~0ULL)
1891
1892static void perf_log_throttle(struct perf_event *event, int enable);
Alexander Shishkinec0d7722015-01-14 14:18:23 +02001893static void perf_log_itrace_start(struct perf_event *event);
Peter Zijlstra4fe757d2011-02-15 22:26:07 +01001894
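/*
 * Start an event on the PMU: mark it ACTIVE and publish event->oncpu
 * before pmu->add() (note the barrier below), reset the interrupt count
 * if the event came back from being throttled, and update the context's
 * active/frequency bookkeeping.  Returns -EAGAIN if the PMU has no room
 * for the event.
 */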
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001895static int
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02001896event_sched_in(struct perf_event *event,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001897 struct perf_cpu_context *cpuctx,
Peter Zijlstra6e377382010-02-11 13:21:58 +01001898 struct perf_event_context *ctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001899{
Stephane Eranian41587552011-01-03 18:20:01 +02001900 u64 tstamp = perf_event_time(event);
Alexander Shishkin44377272013-12-16 14:17:36 +02001901 int ret = 0;
Stephane Eranian41587552011-01-03 18:20:01 +02001902
Peter Zijlstra63342412014-05-05 11:49:16 +02001903 lockdep_assert_held(&ctx->lock);
1904
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001905 if (event->state <= PERF_EVENT_STATE_OFF)
1906 return 0;
1907
1908 event->state = PERF_EVENT_STATE_ACTIVE;
Peter Zijlstra6e377382010-02-11 13:21:58 +01001909 event->oncpu = smp_processor_id();
Peter Zijlstra4fe757d2011-02-15 22:26:07 +01001910
1911 /*
1912 * Unthrottle events: since we just scheduled, we might have missed several
1913 * ticks already, and for a heavily scheduling task there is little
1914 * guarantee it'll get a tick in a timely manner.
1915 */
1916 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
1917 perf_log_throttle(event, 1);
1918 event->hw.interrupts = 0;
1919 }
1920
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001921 /*
1922 * The new state must be visible before we turn it on in the hardware:
1923 */
1924 smp_wmb();
1925
Alexander Shishkin44377272013-12-16 14:17:36 +02001926 perf_pmu_disable(event->pmu);
1927
Shaohua Li72f669c2015-02-05 15:55:31 -08001928 perf_set_shadow_time(event, ctx, tstamp);
1929
Alexander Shishkinec0d7722015-01-14 14:18:23 +02001930 perf_log_itrace_start(event);
1931
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02001932 if (event->pmu->add(event, PERF_EF_START)) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001933 event->state = PERF_EVENT_STATE_INACTIVE;
1934 event->oncpu = -1;
Alexander Shishkin44377272013-12-16 14:17:36 +02001935 ret = -EAGAIN;
1936 goto out;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001937 }
1938
Peter Zijlstra00a29162015-07-27 10:35:07 +02001939 event->tstamp_running += tstamp - event->tstamp_stopped;
1940
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001941 if (!is_software_event(event))
1942 cpuctx->active_oncpu++;
Mark Rutland2fde4f92015-01-07 15:01:54 +00001943 if (!ctx->nr_active++)
1944 perf_event_ctx_activate(ctx);
Peter Zijlstra0f5a2602011-11-16 14:38:16 +01001945 if (event->attr.freq && event->attr.sample_freq)
1946 ctx->nr_freq++;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001947
1948 if (event->attr.exclusive)
1949 cpuctx->exclusive = 1;
1950
Alexander Shishkin44377272013-12-16 14:17:36 +02001951out:
1952 perf_pmu_enable(event->pmu);
1953
1954 return ret;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001955}
1956
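/*
 * Schedule in a whole group as one unit inside a PMU transaction.  If
 * the leader or any sibling fails to go on, the transaction is cancelled
 * and everything scheduled so far is rolled back, with timestamps
 * simulated for the members that never made it onto the PMU.
 */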
1957static int
1958group_sched_in(struct perf_event *group_event,
1959 struct perf_cpu_context *cpuctx,
Peter Zijlstra6e377382010-02-11 13:21:58 +01001960 struct perf_event_context *ctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001961{
Lin Ming6bde9b62010-04-23 13:56:00 +08001962 struct perf_event *event, *partial_group = NULL;
Peter Zijlstra4a234592014-02-24 12:43:31 +01001963 struct pmu *pmu = ctx->pmu;
Stephane Eraniand7842da2010-10-20 15:25:01 +02001964 u64 now = ctx->time;
1965 bool simulate = false;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001966
1967 if (group_event->state == PERF_EVENT_STATE_OFF)
1968 return 0;
1969
Sukadev Bhattiprolufbbe0702015-09-03 20:07:45 -07001970 pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
Lin Ming6bde9b62010-04-23 13:56:00 +08001971
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02001972 if (event_sched_in(group_event, cpuctx, ctx)) {
Peter Zijlstraad5133b2010-06-15 12:22:39 +02001973 pmu->cancel_txn(pmu);
Peter Zijlstra272325c2015-04-15 11:41:58 +02001974 perf_mux_hrtimer_restart(cpuctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001975 return -EAGAIN;
Stephane Eranian90151c352010-05-25 16:23:10 +02001976 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001977
1978 /*
1979 * Schedule in siblings as one group (if any):
1980 */
1981 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02001982 if (event_sched_in(event, cpuctx, ctx)) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001983 partial_group = event;
1984 goto group_error;
1985 }
1986 }
1987
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02001988 if (!pmu->commit_txn(pmu))
Paul Mackerras6e851582010-05-08 20:58:00 +10001989 return 0;
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02001990
Ingo Molnarcdd6c482009-09-21 12:02:48 +02001991group_error:
1992 /*
1993 * Groups can be scheduled in as one unit only, so undo any
1994 * partial group before returning:
Stephane Eraniand7842da2010-10-20 15:25:01 +02001995 * The events up to the failed event are scheduled out normally;
1996 * tstamp_stopped will be updated.
1997 *
1998 * The failed events and the remaining siblings need to have
1999 * their timings updated as if they had gone through event_sched_in()
2000 * and event_sched_out(). This is required to get consistent timings
2001 * across the group. This also takes care of the case where the group
2002 * could never be scheduled by ensuring tstamp_stopped is set to mark
2003 * the time the event was actually stopped, such that time delta
2004 * calculation in update_event_times() is correct.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002005 */
2006 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
2007 if (event == partial_group)
Stephane Eraniand7842da2010-10-20 15:25:01 +02002008 simulate = true;
2009
2010 if (simulate) {
2011 event->tstamp_running += now - event->tstamp_stopped;
2012 event->tstamp_stopped = now;
2013 } else {
2014 event_sched_out(event, cpuctx, ctx);
2015 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002016 }
Stephane Eranian9ffcfa62010-10-20 15:25:01 +02002017 event_sched_out(group_event, cpuctx, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002018
Peter Zijlstraad5133b2010-06-15 12:22:39 +02002019 pmu->cancel_txn(pmu);
Stephane Eranian90151c352010-05-25 16:23:10 +02002020
Peter Zijlstra272325c2015-04-15 11:41:58 +02002021 perf_mux_hrtimer_restart(cpuctx);
Stephane Eranian9e630202013-04-03 14:21:33 +02002022
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002023 return -EAGAIN;
2024}
2025
2026/*
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002027 * Work out whether we can put this event group on the CPU now.
2028 */
2029static int group_can_go_on(struct perf_event *event,
2030 struct perf_cpu_context *cpuctx,
2031 int can_add_hw)
2032{
2033 /*
2034 * Groups consisting entirely of software events can always go on.
2035 */
Frederic Weisbeckerd6f962b2010-01-10 01:25:51 +01002036 if (event->group_flags & PERF_GROUP_SOFTWARE)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002037 return 1;
2038 /*
2039 * If an exclusive group is already on, no other hardware
2040 * events can go on.
2041 */
2042 if (cpuctx->exclusive)
2043 return 0;
2044 /*
2045 * If this group is exclusive and there are already
2046 * events on the CPU, it can't go on.
2047 */
2048 if (event->attr.exclusive && cpuctx->active_oncpu)
2049 return 0;
2050 /*
2051 * Otherwise, try to add it if all previous groups were able
2052 * to go on.
2053 */
2054 return can_add_hw;
2055}
2056
2057static void add_event_to_ctx(struct perf_event *event,
2058 struct perf_event_context *ctx)
2059{
Stephane Eranian41587552011-01-03 18:20:01 +02002060 u64 tstamp = perf_event_time(event);
2061
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002062 list_add_event(event, ctx);
Peter Zijlstra8a495422010-05-27 15:47:49 +02002063 perf_group_attach(event);
Stephane Eranian41587552011-01-03 18:20:01 +02002064 event->tstamp_enabled = tstamp;
2065 event->tstamp_running = tstamp;
2066 event->tstamp_stopped = tstamp;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002067}
2068
Peter Zijlstra3e349502016-01-08 10:01:18 +01002069static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2070 struct perf_event_context *ctx);
Peter Zijlstra2c29ef02011-04-09 21:17:44 +02002071static void
2072ctx_sched_in(struct perf_event_context *ctx,
2073 struct perf_cpu_context *cpuctx,
2074 enum event_type_t event_type,
2075 struct task_struct *task);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002076
Peter Zijlstradce58552011-04-09 21:17:46 +02002077static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2078 struct perf_event_context *ctx,
2079 struct task_struct *task)
2080{
2081 cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2082 if (ctx)
2083 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2084 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2085 if (ctx)
2086 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2087}
2088
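/*
 * (Re)schedule everything from scratch: tear down the task context (if
 * any) and the CPU context, then schedule back in with pinned groups
 * ahead of flexible ones via perf_event_sched_in().
 */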
Peter Zijlstra3e349502016-01-08 10:01:18 +01002089static void ctx_resched(struct perf_cpu_context *cpuctx,
2090 struct perf_event_context *task_ctx)
Peter Zijlstra00179602015-11-30 16:26:35 +01002091{
Peter Zijlstra3e349502016-01-08 10:01:18 +01002092 perf_pmu_disable(cpuctx->ctx.pmu);
2093 if (task_ctx)
2094 task_ctx_sched_out(cpuctx, task_ctx);
2095 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
2096 perf_event_sched_in(cpuctx, task_ctx, current);
2097 perf_pmu_enable(cpuctx->ctx.pmu);
Peter Zijlstra00179602015-11-30 16:26:35 +01002098}
2099
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002100/*
2101 * Cross CPU call to install and enable a performance event
2102 *
2103 * Must be called with ctx->mutex held
2104 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002105static int __perf_install_in_context(void *info)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002106{
Peter Zijlstra39a43642016-01-11 12:46:35 +01002107 struct perf_event_context *ctx = info;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02002108 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
Peter Zijlstra2c29ef02011-04-09 21:17:44 +02002109 struct perf_event_context *task_ctx = cpuctx->task_ctx;
Peter Zijlstra2c29ef02011-04-09 21:17:44 +02002110
Peter Zijlstra63b6da32016-01-14 16:05:37 +01002111 raw_spin_lock(&cpuctx->ctx.lock);
Peter Zijlstra39a43642016-01-11 12:46:35 +01002112 if (ctx->task) {
Peter Zijlstrab58f6b02011-06-07 00:23:28 +02002113 raw_spin_lock(&ctx->lock);
Peter Zijlstra39a43642016-01-11 12:46:35 +01002114 /*
2115 * If we hit the 'wrong' task, we've since scheduled and
2116 * everything should be sorted; nothing to do!
2117 */
Peter Zijlstrab58f6b02011-06-07 00:23:28 +02002118 task_ctx = ctx;
Peter Zijlstra39a43642016-01-11 12:46:35 +01002119 if (ctx->task != current)
Peter Zijlstra63b6da32016-01-14 16:05:37 +01002120 goto unlock;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002121
Peter Zijlstra39a43642016-01-11 12:46:35 +01002122 /*
2123 * If task_ctx is set, it had better be to us.
2124 */
2125 WARN_ON_ONCE(cpuctx->task_ctx != ctx && cpuctx->task_ctx);
Peter Zijlstra63b6da32016-01-14 16:05:37 +01002126 } else if (task_ctx) {
2127 raw_spin_lock(&task_ctx->lock);
Peter Zijlstrab58f6b02011-06-07 00:23:28 +02002128 }
2129
Peter Zijlstra39a43642016-01-11 12:46:35 +01002130 ctx_resched(cpuctx, task_ctx);
Peter Zijlstra63b6da32016-01-14 16:05:37 +01002131unlock:
Peter Zijlstra2c29ef02011-04-09 21:17:44 +02002132 perf_ctx_unlock(cpuctx, task_ctx);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002133
2134 return 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002135}
2136
2137/*
2138 * Attach a performance event to a context
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002139 */
2140static void
2141perf_install_in_context(struct perf_event_context *ctx,
2142 struct perf_event *event,
2143 int cpu)
2144{
Peter Zijlstra39a43642016-01-11 12:46:35 +01002145 struct task_struct *task = NULL;
2146
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002147 lockdep_assert_held(&ctx->mutex);
2148
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02002149 event->ctx = ctx;
Yan, Zheng0cda4c02012-06-15 14:31:33 +08002150 if (event->cpu != -1)
2151 event->cpu = cpu;
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02002152
Peter Zijlstra39a43642016-01-11 12:46:35 +01002153 /*
2154 * Installing events is tricky because we cannot rely on ctx->is_active
2155 * to be set in case this is the nr_events 0 -> 1 transition.
2156 *
2157 * So we add the event to the list here, which will allow a future
2158 * context switch to do the right thing (DTRT), and then send a racy IPI. If the IPI
2159 * fails to hit the right task, this means a context switch must have
2160 * happened and that will have taken care of business.
2161 */
2162 raw_spin_lock_irq(&ctx->lock);
Peter Zijlstra63b6da32016-01-14 16:05:37 +01002163 task = ctx->task;
Peter Zijlstra84c4e622016-02-24 18:45:40 +01002164
Peter Zijlstra63b6da32016-01-14 16:05:37 +01002165 /*
Peter Zijlstra84c4e622016-02-24 18:45:40 +01002166 * If between ctx = find_get_context() and mutex_lock(&ctx->mutex) the
2167 * ctx gets destroyed, we must not install an event into it.
2168 *
2169 * This is normally tested for after we acquire the mutex, so this is
2170 * a sanity check.
Peter Zijlstra63b6da32016-01-14 16:05:37 +01002171 */
Peter Zijlstra84c4e622016-02-24 18:45:40 +01002172 if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
Peter Zijlstra63b6da32016-01-14 16:05:37 +01002173 raw_spin_unlock_irq(&ctx->lock);
2174 return;
2175 }
Peter Zijlstra6f932e52016-02-24 18:45:43 +01002176
2177 if (ctx->is_active) {
2178 update_context_time(ctx);
2179 update_cgrp_time_from_event(event);
2180 }
2181
Peter Zijlstra39a43642016-01-11 12:46:35 +01002182 add_event_to_ctx(event, ctx);
Peter Zijlstra39a43642016-01-11 12:46:35 +01002183 raw_spin_unlock_irq(&ctx->lock);
2184
2185 if (task)
2186 task_function_call(task, __perf_install_in_context, ctx);
2187 else
2188 cpu_function_call(cpu, __perf_install_in_context, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002189}
2190
2191/*
2192 * Put an event into inactive state and update time fields.
2193 * Enabling the leader of a group effectively enables all
2194 * the group members that aren't explicitly disabled, so we
2195 * have to update their ->tstamp_enabled also.
2196 * Note: this works for group members as well as group leaders
2197 * since the non-leader members' sibling_lists will be empty.
2198 */
Peter Zijlstra1d9b4822011-11-23 12:34:20 +01002199static void __perf_event_mark_enabled(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002200{
2201 struct perf_event *sub;
Stephane Eranian41587552011-01-03 18:20:01 +02002202 u64 tstamp = perf_event_time(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002203
2204 event->state = PERF_EVENT_STATE_INACTIVE;
Stephane Eranian41587552011-01-03 18:20:01 +02002205 event->tstamp_enabled = tstamp - event->total_time_enabled;
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002206 list_for_each_entry(sub, &event->sibling_list, group_entry) {
Stephane Eranian41587552011-01-03 18:20:01 +02002207 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
2208 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002209 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002210}
2211
2212/*
2213 * Cross CPU call to enable a performance event
2214 */
Peter Zijlstrafae3fde2016-01-11 15:00:50 +01002215static void __perf_event_enable(struct perf_event *event,
2216 struct perf_cpu_context *cpuctx,
2217 struct perf_event_context *ctx,
2218 void *info)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002219{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002220 struct perf_event *leader = event->group_leader;
Peter Zijlstrafae3fde2016-01-11 15:00:50 +01002221 struct perf_event_context *task_ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002222
Peter Zijlstra6e801e012016-01-26 12:17:08 +01002223 if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2224 event->state <= PERF_EVENT_STATE_ERROR)
Peter Zijlstrafae3fde2016-01-11 15:00:50 +01002225 return;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002226
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002227 update_context_time(ctx);
Peter Zijlstra1d9b4822011-11-23 12:34:20 +01002228 __perf_event_mark_enabled(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002229
Peter Zijlstrafae3fde2016-01-11 15:00:50 +01002230 if (!ctx->is_active)
2231 return;
2232
Stephane Eraniane5d13672011-02-14 11:20:01 +02002233 if (!event_filter_match(event)) {
Peter Zijlstrafae3fde2016-01-11 15:00:50 +01002234 if (is_cgroup_event(event)) {
2235 perf_cgroup_set_timestamp(current, ctx); // XXX ?
Stephane Eraniane5d13672011-02-14 11:20:01 +02002236 perf_cgroup_defer_enabled(event);
Peter Zijlstrafae3fde2016-01-11 15:00:50 +01002237 }
2238 return;
Stephane Eraniane5d13672011-02-14 11:20:01 +02002239 }
Peter Zijlstraf4c41762009-12-16 17:55:54 +01002240
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002241 /*
2242 * If the event is in a group and isn't the group leader,
2243 * then don't put it on unless the group is on.
2244 */
2245 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
Peter Zijlstrafae3fde2016-01-11 15:00:50 +01002246 return;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002247
Peter Zijlstrafae3fde2016-01-11 15:00:50 +01002248 task_ctx = cpuctx->task_ctx;
2249 if (ctx->task)
2250 WARN_ON_ONCE(task_ctx != ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002251
Peter Zijlstraaee7dbc2016-01-08 10:45:11 +01002252 ctx_resched(cpuctx, task_ctx);
Peter Zijlstra7b648012015-12-03 18:35:21 +01002253}
2254
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002255/*
2256 * Enable an event.
2257 *
2258 * If event->ctx is a cloned context, callers must make sure that
2259 * every task struct that event->ctx->task could possibly point to
2260 * remains valid. This condition is satisfied when called through
2261 * perf_event_for_each_child or perf_event_for_each as described
2262 * for perf_event_disable.
2263 */
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01002264static void _perf_event_enable(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002265{
2266 struct perf_event_context *ctx = event->ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002267
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002268 raw_spin_lock_irq(&ctx->lock);
Peter Zijlstra6e801e012016-01-26 12:17:08 +01002269 if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2270 event->state < PERF_EVENT_STATE_ERROR) {
Peter Zijlstra7b648012015-12-03 18:35:21 +01002271 raw_spin_unlock_irq(&ctx->lock);
2272 return;
2273 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002274
2275 /*
2276 * If the event is in error state, clear that first.
Peter Zijlstra7b648012015-12-03 18:35:21 +01002277 *
2278 * That way, if we see the event in error state below, we know that it
2279 * has gone back into error state, as distinct from the task having
2280 * been scheduled away before the cross-call arrived.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002281 */
2282 if (event->state == PERF_EVENT_STATE_ERROR)
2283 event->state = PERF_EVENT_STATE_OFF;
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002284 raw_spin_unlock_irq(&ctx->lock);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002285
Peter Zijlstrafae3fde2016-01-11 15:00:50 +01002286 event_function_call(event, __perf_event_enable, NULL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002287}
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01002288
2289/*
2290 * See perf_event_disable();
2291 */
2292void perf_event_enable(struct perf_event *event)
2293{
2294 struct perf_event_context *ctx;
2295
2296 ctx = perf_event_ctx_lock(event);
2297 _perf_event_enable(event);
2298 perf_event_ctx_unlock(event, ctx);
2299}
Robert Richterdcfce4a2011-10-11 17:11:08 +02002300EXPORT_SYMBOL_GPL(perf_event_enable);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002301
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01002302static int _perf_event_refresh(struct perf_event *event, int refresh)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002303{
2304 /*
2305 * not supported on inherited events
2306 */
Franck Bui-Huu2e939d12010-11-23 16:21:44 +01002307 if (event->attr.inherit || !is_sampling_event(event))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002308 return -EINVAL;
2309
2310 atomic_add(refresh, &event->event_limit);
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01002311 _perf_event_enable(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002312
2313 return 0;
2314}
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01002315
2316/*
2317 * See perf_event_disable()
2318 */
2319int perf_event_refresh(struct perf_event *event, int refresh)
2320{
2321 struct perf_event_context *ctx;
2322 int ret;
2323
2324 ctx = perf_event_ctx_lock(event);
2325 ret = _perf_event_refresh(event, refresh);
2326 perf_event_ctx_unlock(event, ctx);
2327
2328 return ret;
2329}
Avi Kivity26ca5c12011-06-29 18:42:37 +03002330EXPORT_SYMBOL_GPL(perf_event_refresh);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002331
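/*
 * Schedule out the requested class of events (pinned, flexible or both)
 * from a context, updating the context and cgroup clocks on the way and
 * dropping cpuctx->task_ctx once a task context goes fully inactive.
 */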
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002332static void ctx_sched_out(struct perf_event_context *ctx,
2333 struct perf_cpu_context *cpuctx,
2334 enum event_type_t event_type)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002335{
Peter Zijlstradb24d332011-04-09 21:17:45 +02002336 int is_active = ctx->is_active;
Peter Zijlstrac994d612016-01-08 09:20:23 +01002337 struct perf_event *event;
2338
2339 lockdep_assert_held(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002340
Peter Zijlstra39a43642016-01-11 12:46:35 +01002341 if (likely(!ctx->nr_events)) {
2342 /*
2343 * See __perf_remove_from_context().
2344 */
2345 WARN_ON_ONCE(ctx->is_active);
2346 if (ctx->task)
2347 WARN_ON_ONCE(cpuctx->task_ctx);
2348 return;
2349 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002350
Peter Zijlstradb24d332011-04-09 21:17:45 +02002351 ctx->is_active &= ~event_type;
Peter Zijlstra63e30d32016-01-08 11:39:10 +01002352 if (ctx->task) {
2353 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2354 if (!ctx->is_active)
2355 cpuctx->task_ctx = NULL;
2356 }
Peter Zijlstrafacc4302011-04-09 21:17:42 +02002357
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002358 update_context_time(ctx);
Stephane Eraniane5d13672011-02-14 11:20:01 +02002359 update_cgrp_time_from_cpuctx(cpuctx);
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002360 if (!ctx->nr_active)
Peter Zijlstrafacc4302011-04-09 21:17:42 +02002361 return;
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002362
Peter Zijlstra075e0b02011-04-09 21:17:40 +02002363 perf_pmu_disable(ctx->pmu);
Peter Zijlstradb24d332011-04-09 21:17:45 +02002364 if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002365 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
2366 group_sched_out(event, cpuctx, ctx);
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002367 }
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002368
Peter Zijlstradb24d332011-04-09 21:17:45 +02002369 if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002370 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
Xiao Guangrong8c9ed8e2009-09-25 13:51:17 +08002371 group_sched_out(event, cpuctx, ctx);
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002372 }
Peter Zijlstra1b9a6442010-09-07 18:32:22 +02002373 perf_pmu_enable(ctx->pmu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002374}
2375
2376/*
Peter Zijlstra5a3126d2013-10-07 17:12:48 +02002377 * Test whether two contexts are equivalent, i.e. whether they have both been
2378 * cloned from the same version of the same context.
2379 *
2380 * Equivalence is measured using a generation number in the context that is
2381 * incremented on each modification to it; see unclone_ctx(), list_add_event()
2382 * and list_del_event().
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002383 */
2384static int context_equiv(struct perf_event_context *ctx1,
2385 struct perf_event_context *ctx2)
2386{
Peter Zijlstra211de6e2014-09-30 19:23:08 +02002387 lockdep_assert_held(&ctx1->lock);
2388 lockdep_assert_held(&ctx2->lock);
2389
Peter Zijlstra5a3126d2013-10-07 17:12:48 +02002390 /* Pinning disables the swap optimization */
2391 if (ctx1->pin_count || ctx2->pin_count)
2392 return 0;
2393
2394 /* If ctx1 is the parent of ctx2 */
2395 if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2396 return 1;
2397
2398 /* If ctx2 is the parent of ctx1 */
2399 if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2400 return 1;
2401
2402 /*
2403 * If ctx1 and ctx2 have the same parent; we flatten the parent
2404 * hierarchy, see perf_event_init_context().
2405 */
2406 if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2407 ctx1->parent_gen == ctx2->parent_gen)
2408 return 1;
2409
2410 /* Unmatched */
2411 return 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002412}
2413
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002414static void __perf_event_sync_stat(struct perf_event *event,
2415 struct perf_event *next_event)
2416{
2417 u64 value;
2418
2419 if (!event->attr.inherit_stat)
2420 return;
2421
2422 /*
2423 * Update the event value; we cannot use perf_event_read()
2424 * because we're in the middle of a context switch and have IRQs
2425 * disabled, which upsets smp_call_function_single(). However,
2426 * we know the event must be on the current CPU, therefore we
2427 * don't need to use it.
2428 */
2429 switch (event->state) {
2430 case PERF_EVENT_STATE_ACTIVE:
Peter Zijlstra3dbebf12009-11-20 22:19:52 +01002431 event->pmu->read(event);
2432 /* fall-through */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002433
2434 case PERF_EVENT_STATE_INACTIVE:
2435 update_event_times(event);
2436 break;
2437
2438 default:
2439 break;
2440 }
2441
2442 /*
2443 * In order to keep per-task stats reliable we need to flip the event
2444 * values when we flip the contexts.
2445 */
Peter Zijlstrae7850592010-05-21 14:43:08 +02002446 value = local64_read(&next_event->count);
2447 value = local64_xchg(&event->count, value);
2448 local64_set(&next_event->count, value);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002449
2450 swap(event->total_time_enabled, next_event->total_time_enabled);
2451 swap(event->total_time_running, next_event->total_time_running);
2452
2453 /*
2454 * Since we swizzled the values, update the user visible data too.
2455 */
2456 perf_event_update_userpage(event);
2457 perf_event_update_userpage(next_event);
2458}
2459
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002460static void perf_event_sync_stat(struct perf_event_context *ctx,
2461 struct perf_event_context *next_ctx)
2462{
2463 struct perf_event *event, *next_event;
2464
2465 if (!ctx->nr_stat)
2466 return;
2467
Peter Zijlstra02ffdbc2009-11-20 22:19:50 +01002468 update_context_time(ctx);
2469
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002470 event = list_first_entry(&ctx->event_list,
2471 struct perf_event, event_entry);
2472
2473 next_event = list_first_entry(&next_ctx->event_list,
2474 struct perf_event, event_entry);
2475
2476 while (&event->event_entry != &ctx->event_list &&
2477 &next_event->event_entry != &next_ctx->event_list) {
2478
2479 __perf_event_sync_stat(event, next_event);
2480
2481 event = list_next_entry(event, event_entry);
2482 next_event = list_next_entry(next_event, event_entry);
2483 }
2484}
2485
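/*
 * Context-switch handling for one task context.  If the outgoing and
 * incoming tasks have cloned (parent/child) contexts we simply swap the
 * context pointers between the tasks instead of scheduling all events
 * out and back in; otherwise the outgoing task's events are scheduled
 * out the slow way.
 */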
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002486static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2487 struct task_struct *next)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002488{
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002489 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002490 struct perf_event_context *next_ctx;
Peter Zijlstra5a3126d2013-10-07 17:12:48 +02002491 struct perf_event_context *parent, *next_parent;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02002492 struct perf_cpu_context *cpuctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002493 int do_switch = 1;
2494
Peter Zijlstra108b02c2010-09-06 14:32:03 +02002495 if (likely(!ctx))
2496 return;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002497
Peter Zijlstra108b02c2010-09-06 14:32:03 +02002498 cpuctx = __get_cpu_context(ctx);
2499 if (!cpuctx->task_ctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002500 return;
2501
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002502 rcu_read_lock();
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002503 next_ctx = next->perf_event_ctxp[ctxn];
Peter Zijlstra5a3126d2013-10-07 17:12:48 +02002504 if (!next_ctx)
2505 goto unlock;
2506
2507 parent = rcu_dereference(ctx->parent_ctx);
2508 next_parent = rcu_dereference(next_ctx->parent_ctx);
2509
2510 /* If neither context has a parent context, they cannot be clones. */
Jiri Olsa802c8a62014-09-12 13:18:28 +02002511 if (!parent && !next_parent)
Peter Zijlstra5a3126d2013-10-07 17:12:48 +02002512 goto unlock;
2513
2514 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002515 /*
2516 * Looks like the two contexts are clones, so we might be
2517 * able to optimize the context switch. We lock both
2518 * contexts and check that they are clones under the
2519 * lock (including re-checking that neither has been
2520 * uncloned in the meantime). It doesn't matter which
2521 * order we take the locks because no other cpu could
2522 * be trying to lock both of these tasks.
2523 */
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002524 raw_spin_lock(&ctx->lock);
2525 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002526 if (context_equiv(ctx, next_ctx)) {
Peter Zijlstra63b6da32016-01-14 16:05:37 +01002527 WRITE_ONCE(ctx->task, next);
2528 WRITE_ONCE(next_ctx->task, task);
Yan, Zheng5a158c32014-11-04 21:56:02 -05002529
2530 swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
2531
Peter Zijlstra63b6da32016-01-14 16:05:37 +01002532 /*
2533 * RCU_INIT_POINTER here is safe because we've not
2534 * modified the ctx and the above modification of
2535 * ctx->task and ctx->task_ctx_data are immaterial
2536 * since those values are always verified under
2537 * ctx->lock which we're now holding.
2538 */
2539 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
2540 RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
2541
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002542 do_switch = 0;
2543
2544 perf_event_sync_stat(ctx, next_ctx);
2545 }
Thomas Gleixnere625cce12009-11-17 18:02:06 +01002546 raw_spin_unlock(&next_ctx->lock);
2547 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002548 }
Peter Zijlstra5a3126d2013-10-07 17:12:48 +02002549unlock:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002550 rcu_read_unlock();
2551
2552 if (do_switch) {
Peter Zijlstrafacc4302011-04-09 21:17:42 +02002553 raw_spin_lock(&ctx->lock);
Peter Zijlstra8833d0e2016-01-08 10:02:37 +01002554 task_ctx_sched_out(cpuctx, ctx);
Peter Zijlstrafacc4302011-04-09 21:17:42 +02002555 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002556 }
2557}
2558
Yan, Zhengba532502014-11-04 21:55:58 -05002559void perf_sched_cb_dec(struct pmu *pmu)
2560{
2561 this_cpu_dec(perf_sched_cb_usages);
2562}
2563
2564void perf_sched_cb_inc(struct pmu *pmu)
2565{
2566 this_cpu_inc(perf_sched_cb_usages);
2567}
2568
2569/*
2570 * This function provides the context switch callback to the lower code
2571 * layer. It is invoked ONLY when the context switch callback is enabled.
2572 */
2573static void perf_pmu_sched_task(struct task_struct *prev,
2574 struct task_struct *next,
2575 bool sched_in)
2576{
2577 struct perf_cpu_context *cpuctx;
2578 struct pmu *pmu;
2579 unsigned long flags;
2580
2581 if (prev == next)
2582 return;
2583
2584 local_irq_save(flags);
2585
2586 rcu_read_lock();
2587
2588 list_for_each_entry_rcu(pmu, &pmus, entry) {
2589 if (pmu->sched_task) {
2590 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2591
2592 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2593
2594 perf_pmu_disable(pmu);
2595
2596 pmu->sched_task(cpuctx->task_ctx, sched_in);
2597
2598 perf_pmu_enable(pmu);
2599
2600 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2601 }
2602 }
2603
2604 rcu_read_unlock();
2605
2606 local_irq_restore(flags);
2607}
2608
Adrian Hunter45ac1402015-07-21 12:44:02 +03002609static void perf_event_switch(struct task_struct *task,
2610 struct task_struct *next_prev, bool sched_in);
2611
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002612#define for_each_task_context_nr(ctxn) \
2613 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
2614
2615/*
2616 * Called from scheduler to remove the events of the current task,
2617 * with interrupts disabled.
2618 *
2619 * We stop each event and update the event value in event->count.
2620 *
2621 * This does not protect us against NMI, but disable()
2622 * sets the disabled bit in the control field of the event _before_
2623 * accessing the event control register. If an NMI hits, then it will
2624 * not restart the event.
2625 */
Jiri Olsaab0cce52012-05-23 13:13:02 +02002626void __perf_event_task_sched_out(struct task_struct *task,
2627 struct task_struct *next)
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002628{
2629 int ctxn;
2630
Yan, Zhengba532502014-11-04 21:55:58 -05002631 if (__this_cpu_read(perf_sched_cb_usages))
2632 perf_pmu_sched_task(task, next, false);
2633
Adrian Hunter45ac1402015-07-21 12:44:02 +03002634 if (atomic_read(&nr_switch_events))
2635 perf_event_switch(task, next, false);
2636
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002637 for_each_task_context_nr(ctxn)
2638 perf_event_context_sched_out(task, ctxn, next);
Stephane Eraniane5d13672011-02-14 11:20:01 +02002639
2640 /*
2641 * If cgroup events exist on this CPU, then we need
2642 * to check if we have to switch out PMU state;
2643 * cgroup events are system-wide mode only.
2644 */
Christoph Lameter4a32fea2014-08-17 12:30:27 -05002645 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
Stephane Eraniana8d757e2011-08-25 15:58:03 +02002646 perf_cgroup_sched_out(task, next);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002647}
2648
Peter Zijlstra3e349502016-01-08 10:01:18 +01002649static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2650 struct perf_event_context *ctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002651{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002652 if (!cpuctx->task_ctx)
2653 return;
2654
2655 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2656 return;
2657
Peter Zijlstra04dc2db2011-04-09 21:17:43 +02002658 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002659}
2660
2661/*
2662 * Called with IRQs disabled
2663 */
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002664static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
2665 enum event_type_t event_type)
2666{
2667 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002668}
2669
2670static void
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002671ctx_pinned_sched_in(struct perf_event_context *ctx,
Peter Zijlstra6e377382010-02-11 13:21:58 +01002672 struct perf_cpu_context *cpuctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002673{
2674 struct perf_event *event;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002675
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002676 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2677 if (event->state <= PERF_EVENT_STATE_OFF)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002678 continue;
Stephane Eranian5632ab12011-01-03 18:20:01 +02002679 if (!event_filter_match(event))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002680 continue;
2681
Stephane Eraniane5d13672011-02-14 11:20:01 +02002682 /* may need to reset tstamp_enabled */
2683 if (is_cgroup_event(event))
2684 perf_cgroup_mark_enabled(event, ctx);
2685
Xiao Guangrong8c9ed8e2009-09-25 13:51:17 +08002686 if (group_can_go_on(event, cpuctx, 1))
Peter Zijlstra6e377382010-02-11 13:21:58 +01002687 group_sched_in(event, cpuctx, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002688
2689 /*
2690 * If this pinned group hasn't been scheduled,
2691 * put it in error state.
2692 */
2693 if (event->state == PERF_EVENT_STATE_INACTIVE) {
2694 update_group_times(event);
2695 event->state = PERF_EVENT_STATE_ERROR;
2696 }
2697 }
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002698}
2699
2700static void
2701ctx_flexible_sched_in(struct perf_event_context *ctx,
Peter Zijlstra6e377382010-02-11 13:21:58 +01002702 struct perf_cpu_context *cpuctx)
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002703{
2704 struct perf_event *event;
2705 int can_add_hw = 1;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002706
Frederic Weisbecker889ff012010-01-09 20:04:47 +01002707 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2708 /* Ignore events in OFF or ERROR state */
2709 if (event->state <= PERF_EVENT_STATE_OFF)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002710 continue;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002711 /*
2712 * Listen to the 'cpu' scheduling filter constraint
2713 * of events:
2714 */
Stephane Eranian5632ab12011-01-03 18:20:01 +02002715 if (!event_filter_match(event))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002716 continue;
2717
Stephane Eraniane5d13672011-02-14 11:20:01 +02002718 /* may need to reset tstamp_enabled */
2719 if (is_cgroup_event(event))
2720 perf_cgroup_mark_enabled(event, ctx);
2721
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002722 if (group_can_go_on(event, cpuctx, can_add_hw)) {
Peter Zijlstra6e377382010-02-11 13:21:58 +01002723 if (group_sched_in(event, cpuctx, ctx))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002724 can_add_hw = 0;
Peter Zijlstra9ed60602010-06-11 17:36:35 +02002725 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002726 }
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002727}
2728
2729static void
2730ctx_sched_in(struct perf_event_context *ctx,
2731 struct perf_cpu_context *cpuctx,
Stephane Eraniane5d13672011-02-14 11:20:01 +02002732 enum event_type_t event_type,
2733 struct task_struct *task)
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002734{
Peter Zijlstradb24d332011-04-09 21:17:45 +02002735 int is_active = ctx->is_active;
Peter Zijlstrac994d612016-01-08 09:20:23 +01002736 u64 now;
Stephane Eraniane5d13672011-02-14 11:20:01 +02002737
Peter Zijlstrac994d612016-01-08 09:20:23 +01002738 lockdep_assert_held(&ctx->lock);
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002739
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002740 if (likely(!ctx->nr_events))
Peter Zijlstrafacc4302011-04-09 21:17:42 +02002741 return;
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002742
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002743 ctx->is_active |= event_type;
Peter Zijlstra63e30d32016-01-08 11:39:10 +01002744 if (ctx->task) {
2745 if (!is_active)
2746 cpuctx->task_ctx = ctx;
2747 else
2748 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2749 }
2750
Stephane Eraniane5d13672011-02-14 11:20:01 +02002751 now = perf_clock();
2752 ctx->timestamp = now;
Stephane Eranian3f7cce32011-02-18 14:40:01 +02002753 perf_cgroup_set_timestamp(task, ctx);
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002754 /*
2755 * First go through the list and put on any pinned groups
2756 * in order to give them the best chance of going on.
2757 */
Peter Zijlstradb24d332011-04-09 21:17:45 +02002758 if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
Peter Zijlstra6e377382010-02-11 13:21:58 +01002759 ctx_pinned_sched_in(ctx, cpuctx);
Frederic Weisbecker5b0311e2010-01-17 11:59:13 +01002760
2761 /* Then walk through the lower prio flexible groups */
Peter Zijlstradb24d332011-04-09 21:17:45 +02002762 if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
Peter Zijlstra6e377382010-02-11 13:21:58 +01002763 ctx_flexible_sched_in(ctx, cpuctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002764}
2765
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002766static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
Stephane Eraniane5d13672011-02-14 11:20:01 +02002767 enum event_type_t event_type,
2768 struct task_struct *task)
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002769{
2770 struct perf_event_context *ctx = &cpuctx->ctx;
2771
Stephane Eraniane5d13672011-02-14 11:20:01 +02002772 ctx_sched_in(ctx, cpuctx, event_type, task);
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002773}
2774
Stephane Eraniane5d13672011-02-14 11:20:01 +02002775static void perf_event_context_sched_in(struct perf_event_context *ctx,
2776 struct task_struct *task)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002777{
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002778 struct perf_cpu_context *cpuctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002779
Peter Zijlstra108b02c2010-09-06 14:32:03 +02002780 cpuctx = __get_cpu_context(ctx);
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002781 if (cpuctx->task_ctx == ctx)
2782 return;
2783
Peter Zijlstrafacc4302011-04-09 21:17:42 +02002784 perf_ctx_lock(cpuctx, ctx);
Peter Zijlstra1b9a6442010-09-07 18:32:22 +02002785 perf_pmu_disable(ctx->pmu);
Frederic Weisbecker329c0e02010-01-17 12:56:05 +01002786 /*
2787 * We want to keep the following priority order:
2788 * cpu pinned (that don't need to move), task pinned,
2789 * cpu flexible, task flexible.
2790 */
2791 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
Peter Zijlstra63e30d32016-01-08 11:39:10 +01002792 perf_event_sched_in(cpuctx, ctx, task);
Peter Zijlstrafacc4302011-04-09 21:17:42 +02002793 perf_pmu_enable(ctx->pmu);
2794 perf_ctx_unlock(cpuctx, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002795}
2796
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002797/*
2798 * Called from scheduler to add the events of the current task
2799 * with interrupts disabled.
2800 *
2801 * We restore the event value and then enable it.
2802 *
2803 * This does not protect us against NMI, but enable()
2804 * sets the enabled bit in the control field of event _before_
2805 * accessing the event control register. If an NMI hits, then it will
2806 * keep the event running.
2807 */
Jiri Olsaab0cce52012-05-23 13:13:02 +02002808void __perf_event_task_sched_in(struct task_struct *prev,
2809 struct task_struct *task)
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002810{
2811 struct perf_event_context *ctx;
2812 int ctxn;
2813
Peter Zijlstra7e41d172016-01-08 09:21:40 +01002814 /*
2815 * If cgroup events exist on this CPU, then we need to check if we have
2816 * to switch in PMU state; cgroup event are system-wide mode only.
2817 *
2818 * Since cgroup events are CPU events, we must schedule these in before
2819 * we schedule in the task events.
2820 */
2821 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2822 perf_cgroup_sched_in(prev, task);
2823
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002824 for_each_task_context_nr(ctxn) {
2825 ctx = task->perf_event_ctxp[ctxn];
2826 if (likely(!ctx))
2827 continue;
2828
Stephane Eraniane5d13672011-02-14 11:20:01 +02002829 perf_event_context_sched_in(ctx, task);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02002830 }
Stephane Eraniand010b332012-02-09 23:21:00 +01002831
Adrian Hunter45ac1402015-07-21 12:44:02 +03002832 if (atomic_read(&nr_switch_events))
2833 perf_event_switch(task, prev, true);
2834
Yan, Zhengba532502014-11-04 21:55:58 -05002835 if (__this_cpu_read(perf_sched_cb_usages))
2836 perf_pmu_sched_task(prev, task, true);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002837}
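/*
 * Note that the sched-in path above deliberately mirrors
 * __perf_event_task_sched_out(): cgroup state is switched in first and the
 * pmu::sched_task() callbacks run last, the reverse of the sched-out order.
 */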
2838
Peter Zijlstraabd50712010-01-26 18:50:16 +01002839static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
2840{
2841 u64 frequency = event->attr.sample_freq;
2842 u64 sec = NSEC_PER_SEC;
2843 u64 divisor, dividend;
2844
2845 int count_fls, nsec_fls, frequency_fls, sec_fls;
2846
2847 count_fls = fls64(count);
2848 nsec_fls = fls64(nsec);
2849 frequency_fls = fls64(frequency);
2850 sec_fls = 30;
2851
2852 /*
2853 * We got @count in @nsec, with a target of sample_freq HZ
2854 * the target period becomes:
2855 *
2856 * @count * 10^9
2857 * period = -------------------
2858 * @nsec * sample_freq
2859 *
2860 */
2861
2862 /*
2863 * Reduce accuracy by one bit such that @a and @b converge
2864 * to a similar magnitude.
2865 */
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002866#define REDUCE_FLS(a, b) \
Peter Zijlstraabd50712010-01-26 18:50:16 +01002867do { \
2868 if (a##_fls > b##_fls) { \
2869 a >>= 1; \
2870 a##_fls--; \
2871 } else { \
2872 b >>= 1; \
2873 b##_fls--; \
2874 } \
2875} while (0)
2876
2877 /*
2878 * Reduce accuracy until either term fits in a u64, then proceed with
2879 * the other, so that finally we can do a u64/u64 division.
2880 */
2881 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
2882 REDUCE_FLS(nsec, frequency);
2883 REDUCE_FLS(sec, count);
2884 }
2885
2886 if (count_fls + sec_fls > 64) {
2887 divisor = nsec * frequency;
2888
2889 while (count_fls + sec_fls > 64) {
2890 REDUCE_FLS(count, sec);
2891 divisor >>= 1;
2892 }
2893
2894 dividend = count * sec;
2895 } else {
2896 dividend = count * sec;
2897
2898 while (nsec_fls + frequency_fls > 64) {
2899 REDUCE_FLS(nsec, frequency);
2900 dividend >>= 1;
2901 }
2902
2903 divisor = nsec * frequency;
2904 }
2905
Peter Zijlstraf6ab91a2010-06-04 15:18:01 +02002906 if (!divisor)
2907 return dividend;
2908
Peter Zijlstraabd50712010-01-26 18:50:16 +01002909 return div64_u64(dividend, divisor);
2910}
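/*
 * Worked example: with sample_freq = 1000 Hz and @count = 1,000,000 events
 * counted in @nsec = 10,000,000 ns (10 ms), the event fires at roughly
 * 10^8 events/sec, so the target period becomes
 * 1e6 * 1e9 / (1e7 * 1e3) = 100,000 events per sample.
 */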
2911
Stephane Eraniane050e3f2012-01-26 17:03:19 +01002912static DEFINE_PER_CPU(int, perf_throttled_count);
2913static DEFINE_PER_CPU(u64, perf_throttled_seq);
2914
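/*
 * Nudge hwc->sample_period towards the period that would yield the
 * requested sample_freq.  Only about 1/8th of the error is applied per
 * call (a simple low-pass filter), so the period adapts smoothly instead
 * of oscillating on short-term bursts.
 */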
Stephane Eranianf39d47f2012-02-07 14:39:57 +01002915static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002916{
2917 struct hw_perf_event *hwc = &event->hw;
Peter Zijlstraf6ab91a2010-06-04 15:18:01 +02002918 s64 period, sample_period;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002919 s64 delta;
2920
Peter Zijlstraabd50712010-01-26 18:50:16 +01002921 period = perf_calculate_period(event, nsec, count);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002922
2923 delta = (s64)(period - hwc->sample_period);
2924 delta = (delta + 7) / 8; /* low pass filter */
2925
2926 sample_period = hwc->sample_period + delta;
2927
2928 if (!sample_period)
2929 sample_period = 1;
2930
2931 hwc->sample_period = sample_period;
Peter Zijlstraabd50712010-01-26 18:50:16 +01002932
Peter Zijlstrae7850592010-05-21 14:43:08 +02002933 if (local64_read(&hwc->period_left) > 8*sample_period) {
Stephane Eranianf39d47f2012-02-07 14:39:57 +01002934 if (disable)
2935 event->pmu->stop(event, PERF_EF_UPDATE);
2936
Peter Zijlstrae7850592010-05-21 14:43:08 +02002937 local64_set(&hwc->period_left, 0);
Stephane Eranianf39d47f2012-02-07 14:39:57 +01002938
2939 if (disable)
2940 event->pmu->start(event, PERF_EF_RELOAD);
Peter Zijlstraabd50712010-01-26 18:50:16 +01002941 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002942}
2943
Stephane Eraniane050e3f2012-01-26 17:03:19 +01002944/*
2945 * combine freq adjustment with unthrottling to avoid two passes over the
2946 * events. At the same time, make sure that having freq events does not change
2947 * the rate of unthrottling as that would introduce bias.
2948 */
2949static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2950 int needs_unthr)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002951{
2952 struct perf_event *event;
2953 struct hw_perf_event *hwc;
Stephane Eraniane050e3f2012-01-26 17:03:19 +01002954 u64 now, period = TICK_NSEC;
Peter Zijlstraabd50712010-01-26 18:50:16 +01002955 s64 delta;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002956
Stephane Eraniane050e3f2012-01-26 17:03:19 +01002957 /*
2958 * only need to iterate over all events iff:
2959 * - the context has events in frequency mode (needs freq adjust)
2960 * - there are events to unthrottle on this cpu
2961 */
2962 if (!(ctx->nr_freq || needs_unthr))
Peter Zijlstra0f5a2602011-11-16 14:38:16 +01002963 return;
2964
Stephane Eraniane050e3f2012-01-26 17:03:19 +01002965 raw_spin_lock(&ctx->lock);
Stephane Eranianf39d47f2012-02-07 14:39:57 +01002966 perf_pmu_disable(ctx->pmu);
Stephane Eraniane050e3f2012-01-26 17:03:19 +01002967
Paul Mackerras03541f82009-10-14 16:58:03 +11002968 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002969 if (event->state != PERF_EVENT_STATE_ACTIVE)
2970 continue;
2971
Stephane Eranian5632ab12011-01-03 18:20:01 +02002972 if (!event_filter_match(event))
Peter Zijlstra5d27c232009-12-17 13:16:32 +01002973 continue;
2974
Alexander Shishkin44377272013-12-16 14:17:36 +02002975 perf_pmu_disable(event->pmu);
2976
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002977 hwc = &event->hw;
2978
Jiri Olsaae23bff2013-08-24 16:45:54 +02002979 if (hwc->interrupts == MAX_INTERRUPTS) {
Stephane Eraniane050e3f2012-01-26 17:03:19 +01002980 hwc->interrupts = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002981 perf_log_throttle(event, 1);
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02002982 event->pmu->start(event, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002983 }
2984
2985 if (!event->attr.freq || !event->attr.sample_freq)
Alexander Shishkin44377272013-12-16 14:17:36 +02002986 goto next;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002987
Stephane Eraniane050e3f2012-01-26 17:03:19 +01002988 /*
2989 * stop the event and update event->count
2990 */
2991 event->pmu->stop(event, PERF_EF_UPDATE);
2992
Peter Zijlstrae7850592010-05-21 14:43:08 +02002993 now = local64_read(&event->count);
Peter Zijlstraabd50712010-01-26 18:50:16 +01002994 delta = now - hwc->freq_count_stamp;
2995 hwc->freq_count_stamp = now;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02002996
Stephane Eraniane050e3f2012-01-26 17:03:19 +01002997 /*
2998 * Restart the event;
2999 * reload only if the value has changed.
Stephane Eranianf39d47f2012-02-07 14:39:57 +01003000 * We have already stopped the event, so tell
3001 * perf_adjust_period() to avoid stopping it
3002 * a second time.
Stephane Eraniane050e3f2012-01-26 17:03:19 +01003003 */
Peter Zijlstraabd50712010-01-26 18:50:16 +01003004 if (delta > 0)
Stephane Eranianf39d47f2012-02-07 14:39:57 +01003005 perf_adjust_period(event, period, delta, false);
Stephane Eraniane050e3f2012-01-26 17:03:19 +01003006
3007 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
Alexander Shishkin44377272013-12-16 14:17:36 +02003008 next:
3009 perf_pmu_enable(event->pmu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003010 }
Stephane Eraniane050e3f2012-01-26 17:03:19 +01003011
Stephane Eranianf39d47f2012-02-07 14:39:57 +01003012 perf_pmu_enable(ctx->pmu);
Stephane Eraniane050e3f2012-01-26 17:03:19 +01003013 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003014}
3015
3016/*
3017 * Round-robin a context's events:
3018 */
3019static void rotate_ctx(struct perf_event_context *ctx)
3020{
Thomas Gleixnerdddd3372010-11-24 10:05:55 +01003021 /*
3022 * Rotate the first non-pinned group to the end of the list. Rotation might be
3023 * disabled by the inheritance code.
3024 */
3025 if (!ctx->rotate_disable)
3026 list_rotate_left(&ctx->flexible_groups);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003027}
3028
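/*
 * Multiplexing: when a context has more events than the hardware can
 * count at once (nr_events != nr_active), rotate the flexible groups so
 * that every group gets a turn.  The return value tells the caller (the
 * rotation hrtimer) whether a rotation was performed and hence whether
 * the timer needs to keep firing.
 */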
Stephane Eranian9e630202013-04-03 14:21:33 +02003029static int perf_rotate_context(struct perf_cpu_context *cpuctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003030{
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02003031 struct perf_event_context *ctx = NULL;
Mark Rutland2fde4f92015-01-07 15:01:54 +00003032 int rotate = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003033
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02003034 if (cpuctx->ctx.nr_events) {
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02003035 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
3036 rotate = 1;
3037 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003038
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02003039 ctx = cpuctx->task_ctx;
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02003040 if (ctx && ctx->nr_events) {
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02003041 if (ctx->nr_events != ctx->nr_active)
3042 rotate = 1;
3043 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003044
Stephane Eraniane050e3f2012-01-26 17:03:19 +01003045 if (!rotate)
Peter Zijlstra0f5a2602011-11-16 14:38:16 +01003046 goto done;
3047
Peter Zijlstrafacc4302011-04-09 21:17:42 +02003048 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
Peter Zijlstra1b9a6442010-09-07 18:32:22 +02003049 perf_pmu_disable(cpuctx->ctx.pmu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003050
Stephane Eraniane050e3f2012-01-26 17:03:19 +01003051 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3052 if (ctx)
3053 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
Peter Zijlstrad4944a02010-03-08 13:51:20 +01003054
Stephane Eraniane050e3f2012-01-26 17:03:19 +01003055 rotate_ctx(&cpuctx->ctx);
3056 if (ctx)
3057 rotate_ctx(ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003058
Stephane Eraniane050e3f2012-01-26 17:03:19 +01003059 perf_event_sched_in(cpuctx, ctx, current);
Peter Zijlstra0f5a2602011-11-16 14:38:16 +01003060
3061 perf_pmu_enable(cpuctx->ctx.pmu);
3062 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
Peter Zijlstrab5ab4cd2010-09-06 16:32:21 +02003063done:
Stephane Eranian9e630202013-04-03 14:21:33 +02003064
3065 return rotate;
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02003066}
3067
Frederic Weisbecker026249e2013-04-20 15:58:34 +02003068#ifdef CONFIG_NO_HZ_FULL
3069bool perf_event_can_stop_tick(void)
3070{
Frederic Weisbecker948b26b2013-08-02 18:29:55 +02003071 if (atomic_read(&nr_freq_events) ||
Frederic Weisbeckerd84153d2013-07-23 02:31:05 +02003072 __this_cpu_read(perf_throttled_count))
Frederic Weisbecker026249e2013-04-20 15:58:34 +02003073 return false;
Frederic Weisbeckerd84153d2013-07-23 02:31:05 +02003074 else
3075 return true;
Frederic Weisbecker026249e2013-04-20 15:58:34 +02003076}
3077#endif
3078
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02003079void perf_event_task_tick(void)
3080{
Mark Rutland2fde4f92015-01-07 15:01:54 +00003081 struct list_head *head = this_cpu_ptr(&active_ctx_list);
3082 struct perf_event_context *ctx, *tmp;
Stephane Eraniane050e3f2012-01-26 17:03:19 +01003083 int throttled;
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02003084
3085 WARN_ON(!irqs_disabled());
3086
Stephane Eraniane050e3f2012-01-26 17:03:19 +01003087 __this_cpu_inc(perf_throttled_seq);
3088 throttled = __this_cpu_xchg(perf_throttled_count, 0);
3089
Mark Rutland2fde4f92015-01-07 15:01:54 +00003090 list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
Stephane Eraniane050e3f2012-01-26 17:03:19 +01003091 perf_adjust_freq_unthr_context(ctx, throttled);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003092}
3093
Frederic Weisbecker889ff012010-01-09 20:04:47 +01003094static int event_enable_on_exec(struct perf_event *event,
3095 struct perf_event_context *ctx)
3096{
3097 if (!event->attr.enable_on_exec)
3098 return 0;
3099
3100 event->attr.enable_on_exec = 0;
3101 if (event->state >= PERF_EVENT_STATE_INACTIVE)
3102 return 0;
3103
Peter Zijlstra1d9b4822011-11-23 12:34:20 +01003104 __perf_event_mark_enabled(event);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01003105
3106 return 1;
3107}
3108
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003109/*
3110 * Enable all of a task's events that have been marked enable-on-exec.
3111 * This expects task == current.
3112 */
Peter Zijlstrac1274492015-12-10 20:57:40 +01003113static void perf_event_enable_on_exec(int ctxn)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003114{
Peter Zijlstrac1274492015-12-10 20:57:40 +01003115 struct perf_event_context *ctx, *clone_ctx = NULL;
Peter Zijlstra3e349502016-01-08 10:01:18 +01003116 struct perf_cpu_context *cpuctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003117 struct perf_event *event;
3118 unsigned long flags;
3119 int enabled = 0;
3120
3121 local_irq_save(flags);
Peter Zijlstrac1274492015-12-10 20:57:40 +01003122 ctx = current->perf_event_ctxp[ctxn];
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003123 if (!ctx || !ctx->nr_events)
3124 goto out;
3125
Peter Zijlstra3e349502016-01-08 10:01:18 +01003126 cpuctx = __get_cpu_context(ctx);
3127 perf_ctx_lock(cpuctx, ctx);
3128 list_for_each_entry(event, &ctx->event_list, event_entry)
3129 enabled |= event_enable_on_exec(event, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003130
3131 /*
Peter Zijlstra3e349502016-01-08 10:01:18 +01003132 * Unclone and reschedule this context if we enabled any event.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003133 */
Peter Zijlstra3e349502016-01-08 10:01:18 +01003134 if (enabled) {
Peter Zijlstra211de6e2014-09-30 19:23:08 +02003135 clone_ctx = unclone_ctx(ctx);
Peter Zijlstra3e349502016-01-08 10:01:18 +01003136 ctx_resched(cpuctx, ctx);
3137 }
3138 perf_ctx_unlock(cpuctx, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003139
Peter Zijlstra9ed60602010-06-11 17:36:35 +02003140out:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003141 local_irq_restore(flags);
Peter Zijlstra211de6e2014-09-30 19:23:08 +02003142
3143 if (clone_ctx)
3144 put_ctx(clone_ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003145}
3146
Peter Zijlstrae041e322014-05-21 17:32:19 +02003147void perf_event_exec(void)
3148{
Peter Zijlstrae041e322014-05-21 17:32:19 +02003149 int ctxn;
3150
3151 rcu_read_lock();
Peter Zijlstrac1274492015-12-10 20:57:40 +01003152 for_each_task_context_nr(ctxn)
3153 perf_event_enable_on_exec(ctxn);
Peter Zijlstrae041e322014-05-21 17:32:19 +02003154 rcu_read_unlock();
3155}
3156
Peter Zijlstra0492d4c2015-09-03 20:07:48 -07003157struct perf_read_data {
3158 struct perf_event *event;
3159 bool group;
Sukadev Bhattiprolu7d889622015-09-03 20:07:50 -07003160 int ret;
Peter Zijlstra0492d4c2015-09-03 20:07:48 -07003161};
3162
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003163/*
3164 * Cross CPU call to read the hardware event
3165 */
3166static void __perf_event_read(void *info)
3167{
Peter Zijlstra0492d4c2015-09-03 20:07:48 -07003168 struct perf_read_data *data = info;
3169 struct perf_event *sub, *event = data->event;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003170 struct perf_event_context *ctx = event->ctx;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02003171 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
Sukadev Bhattiprolu4a00c162015-09-03 20:07:51 -07003172 struct pmu *pmu = event->pmu;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003173
3174 /*
3175 * If this is a task context, we need to check whether it is
3176 * the current task context of this CPU. If not, it has been
3177 * scheduled out before the smp call arrived. In that case
3178 * event->count would have been updated to a recent sample
3179 * when the event was scheduled out.
3180 */
3181 if (ctx->task && cpuctx->task_ctx != ctx)
3182 return;
3183
Thomas Gleixnere625cce12009-11-17 18:02:06 +01003184 raw_spin_lock(&ctx->lock);
Stephane Eraniane5d13672011-02-14 11:20:01 +02003185 if (ctx->is_active) {
Peter Zijlstra542e72f2011-01-26 15:38:35 +01003186 update_context_time(ctx);
Stephane Eraniane5d13672011-02-14 11:20:01 +02003187 update_cgrp_time_from_event(event);
3188 }
Peter Zijlstra0492d4c2015-09-03 20:07:48 -07003189
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003190 update_event_times(event);
Sukadev Bhattiprolu4a00c162015-09-03 20:07:51 -07003191 if (event->state != PERF_EVENT_STATE_ACTIVE)
Peter Zijlstra0492d4c2015-09-03 20:07:48 -07003192 goto unlock;
3193
Sukadev Bhattiprolu4a00c162015-09-03 20:07:51 -07003194 if (!data->group) {
3195 pmu->read(event);
3196 data->ret = 0;
3197 goto unlock;
3198 }
3199
3200 pmu->start_txn(pmu, PERF_PMU_TXN_READ);
3201
3202 pmu->read(event);
3203
Peter Zijlstra0492d4c2015-09-03 20:07:48 -07003204 list_for_each_entry(sub, &event->sibling_list, group_entry) {
3205 update_event_times(sub);
Sukadev Bhattiprolu4a00c162015-09-03 20:07:51 -07003206 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
3207 /*
3208 * Use sibling's PMU rather than @event's since
3209 * the sibling could be on a different (e.g. software) PMU.
3210 */
Peter Zijlstra0492d4c2015-09-03 20:07:48 -07003211 sub->pmu->read(sub);
Sukadev Bhattiprolu4a00c162015-09-03 20:07:51 -07003212 }
Peter Zijlstra0492d4c2015-09-03 20:07:48 -07003213 }
Sukadev Bhattiprolu4a00c162015-09-03 20:07:51 -07003214
3215 data->ret = pmu->commit_txn(pmu);
Peter Zijlstra0492d4c2015-09-03 20:07:48 -07003216
3217unlock:
Thomas Gleixnere625cce12009-11-17 18:02:06 +01003218 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003219}
3220
Peter Zijlstrab5e58792010-05-21 14:43:12 +02003221static inline u64 perf_event_count(struct perf_event *event)
3222{
Matt Flemingeacd3ec2015-01-23 18:45:41 +00003223 if (event->pmu->count)
3224 return event->pmu->count(event);
3225
3226 return __perf_event_count(event);
Peter Zijlstrab5e58792010-05-21 14:43:12 +02003227}
3228
Kaixu Xiaffe86902015-08-06 07:02:32 +00003229/*
3230 * NMI-safe method to read a local event; that is, an event that:
3231 * - is either for the current task or for this CPU,
3232 * - does not have inherit set (inherited task events are not
3233 * local and cannot be read atomically), and
3234 * - does not have a pmu::count method (those are not
3235 * NMI safe).
3236 */
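/*
 * This is what allows callers such as bpf_perf_event_read() to sample
 * counters from contexts where taking locks or sending IPIs is not
 * possible.
 */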
3237u64 perf_event_read_local(struct perf_event *event)
3238{
3239 unsigned long flags;
3240 u64 val;
3241
3242 /*
3243 * Disabling interrupts avoids all counter scheduling (context
3244 * switches, timer based rotation and IPIs).
3245 */
3246 local_irq_save(flags);
3247
3248 /* If this is a per-task event, it must be for current */
3249 WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
3250 event->hw.target != current);
3251
3252 /* If this is a per-CPU event, it must be for this CPU */
3253 WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
3254 event->cpu != smp_processor_id());
3255
3256 /*
3257 * It must not be an event with inherit set, we cannot read
3258 * all child counters from atomic context.
3259 */
3260 WARN_ON_ONCE(event->attr.inherit);
3261
3262 /*
3263 * It must not have a pmu::count method, those are not
3264 * NMI safe.
3265 */
3266 WARN_ON_ONCE(event->pmu->count);
3267
3268 /*
3269 * If the event is currently on this CPU, it's either a per-task event,
3270 * or local to this CPU. Furthermore it means it's ACTIVE (otherwise
3271 * oncpu == -1).
3272 */
3273 if (event->oncpu == smp_processor_id())
3274 event->pmu->read(event);
3275
3276 val = local64_read(&event->count);
3277 local_irq_restore(flags);
3278
3279 return val;
3280}
3281
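/*
 * Read an event's count.  If the event is currently ACTIVE, only the CPU
 * it runs on can read the hardware counter, so we IPI that CPU and do the
 * read in __perf_event_read().  Otherwise event->count is already up to
 * date (it was updated when the event was last scheduled out) and we only
 * refresh the accumulated times under ctx->lock.
 */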
Sukadev Bhattiprolu7d889622015-09-03 20:07:50 -07003282static int perf_event_read(struct perf_event *event, bool group)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003283{
Sukadev Bhattiprolu7d889622015-09-03 20:07:50 -07003284 int ret = 0;
3285
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003286 /*
3287 * If event is enabled and currently active on a CPU, update the
3288 * value in the event structure:
3289 */
3290 if (event->state == PERF_EVENT_STATE_ACTIVE) {
Peter Zijlstra0492d4c2015-09-03 20:07:48 -07003291 struct perf_read_data data = {
3292 .event = event,
3293 .group = group,
Sukadev Bhattiprolu7d889622015-09-03 20:07:50 -07003294 .ret = 0,
Peter Zijlstra0492d4c2015-09-03 20:07:48 -07003295 };
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003296 smp_call_function_single(event->oncpu,
Peter Zijlstra0492d4c2015-09-03 20:07:48 -07003297 __perf_event_read, &data, 1);
Sukadev Bhattiprolu7d889622015-09-03 20:07:50 -07003298 ret = data.ret;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003299 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
Peter Zijlstra2b8988c2009-11-20 22:19:54 +01003300 struct perf_event_context *ctx = event->ctx;
3301 unsigned long flags;
3302
Thomas Gleixnere625cce12009-11-17 18:02:06 +01003303 raw_spin_lock_irqsave(&ctx->lock, flags);
Stephane Eranianc530ccd2010-10-15 15:26:01 +02003304 /*
3305 * may read while context is not active
3306 * (e.g., thread is blocked), in that case
3307 * we cannot update context time
3308 */
Stephane Eraniane5d13672011-02-14 11:20:01 +02003309 if (ctx->is_active) {
Stephane Eranianc530ccd2010-10-15 15:26:01 +02003310 update_context_time(ctx);
Stephane Eraniane5d13672011-02-14 11:20:01 +02003311 update_cgrp_time_from_event(event);
3312 }
Peter Zijlstra0492d4c2015-09-03 20:07:48 -07003313 if (group)
3314 update_group_times(event);
3315 else
3316 update_event_times(event);
Thomas Gleixnere625cce12009-11-17 18:02:06 +01003317 raw_spin_unlock_irqrestore(&ctx->lock, flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003318 }
Sukadev Bhattiprolu7d889622015-09-03 20:07:50 -07003319
3320 return ret;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003321}
3322
3323/*
3324 * Initialize the perf_event context in a task_struct:
3325 */
Peter Zijlstraeb184472010-09-07 15:55:13 +02003326static void __perf_event_init_context(struct perf_event_context *ctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003327{
Thomas Gleixnere625cce12009-11-17 18:02:06 +01003328 raw_spin_lock_init(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003329 mutex_init(&ctx->mutex);
Mark Rutland2fde4f92015-01-07 15:01:54 +00003330 INIT_LIST_HEAD(&ctx->active_ctx_list);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01003331 INIT_LIST_HEAD(&ctx->pinned_groups);
3332 INIT_LIST_HEAD(&ctx->flexible_groups);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003333 INIT_LIST_HEAD(&ctx->event_list);
3334 atomic_set(&ctx->refcount, 1);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003335}
3336
Peter Zijlstraeb184472010-09-07 15:55:13 +02003337static struct perf_event_context *
3338alloc_perf_context(struct pmu *pmu, struct task_struct *task)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003339{
3340 struct perf_event_context *ctx;
Peter Zijlstraeb184472010-09-07 15:55:13 +02003341
3342 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
3343 if (!ctx)
3344 return NULL;
3345
3346 __perf_event_init_context(ctx);
3347 if (task) {
3348 ctx->task = task;
3349 get_task_struct(task);
3350 }
3351 ctx->pmu = pmu;
3352
3353 return ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003354}
3355
Matt Helsley2ebd4ff2010-09-13 13:01:19 -07003356static struct task_struct *
3357find_lively_task_by_vpid(pid_t vpid)
3358{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003359 struct task_struct *task;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003360 int err;
3361
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003362 rcu_read_lock();
Matt Helsley2ebd4ff2010-09-13 13:01:19 -07003363 if (!vpid)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003364 task = current;
3365 else
Matt Helsley2ebd4ff2010-09-13 13:01:19 -07003366 task = find_task_by_vpid(vpid);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003367 if (task)
3368 get_task_struct(task);
3369 rcu_read_unlock();
3370
3371 if (!task)
3372 return ERR_PTR(-ESRCH);
3373
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003374 /* Reuse ptrace permission checks for now. */
3375 err = -EACCES;
Jann Horncaaee622016-01-20 15:00:04 -08003376 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003377 goto errout;
3378
Matt Helsley2ebd4ff2010-09-13 13:01:19 -07003379 return task;
3380errout:
3381 put_task_struct(task);
3382 return ERR_PTR(err);
3383
3384}
3385
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01003386/*
3387 * Returns a matching context with refcount and pincount.
3388 */
Peter Zijlstra108b02c2010-09-06 14:32:03 +02003389static struct perf_event_context *
Yan, Zheng4af57ef282014-11-04 21:56:01 -05003390find_get_context(struct pmu *pmu, struct task_struct *task,
3391 struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003392{
Peter Zijlstra211de6e2014-09-30 19:23:08 +02003393 struct perf_event_context *ctx, *clone_ctx = NULL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003394 struct perf_cpu_context *cpuctx;
Yan, Zheng4af57ef282014-11-04 21:56:01 -05003395 void *task_ctx_data = NULL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003396 unsigned long flags;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02003397 int ctxn, err;
Yan, Zheng4af57ef282014-11-04 21:56:01 -05003398 int cpu = event->cpu;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003399
Oleg Nesterov22a4ec72011-01-18 17:10:08 +01003400 if (!task) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003401 /* Must be root to operate on a CPU event: */
3402 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
3403 return ERR_PTR(-EACCES);
3404
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003405 /*
3406 * We could be clever and allow attaching an event to an
3407 * offline CPU and activate it when the CPU comes up, but
3408 * that's for later.
3409 */
3410 if (!cpu_online(cpu))
3411 return ERR_PTR(-ENODEV);
3412
Peter Zijlstra108b02c2010-09-06 14:32:03 +02003413 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003414 ctx = &cpuctx->ctx;
3415 get_ctx(ctx);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01003416 ++ctx->pin_count;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003417
3418 return ctx;
3419 }
3420
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02003421 err = -EINVAL;
3422 ctxn = pmu->task_ctx_nr;
3423 if (ctxn < 0)
3424 goto errout;
3425
Yan, Zheng4af57ef282014-11-04 21:56:01 -05003426 if (event->attach_state & PERF_ATTACH_TASK_DATA) {
3427 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
3428 if (!task_ctx_data) {
3429 err = -ENOMEM;
3430 goto errout;
3431 }
3432 }
3433
Peter Zijlstra9ed60602010-06-11 17:36:35 +02003434retry:
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02003435 ctx = perf_lock_task_context(task, ctxn, &flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003436 if (ctx) {
Peter Zijlstra211de6e2014-09-30 19:23:08 +02003437 clone_ctx = unclone_ctx(ctx);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01003438 ++ctx->pin_count;
Yan, Zheng4af57ef282014-11-04 21:56:01 -05003439
3440 if (task_ctx_data && !ctx->task_ctx_data) {
3441 ctx->task_ctx_data = task_ctx_data;
3442 task_ctx_data = NULL;
3443 }
Thomas Gleixnere625cce12009-11-17 18:02:06 +01003444 raw_spin_unlock_irqrestore(&ctx->lock, flags);
Peter Zijlstra211de6e2014-09-30 19:23:08 +02003445
3446 if (clone_ctx)
3447 put_ctx(clone_ctx);
Peter Zijlstra9137fb22011-04-09 21:17:41 +02003448 } else {
Peter Zijlstraeb184472010-09-07 15:55:13 +02003449 ctx = alloc_perf_context(pmu, task);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003450 err = -ENOMEM;
3451 if (!ctx)
3452 goto errout;
Peter Zijlstraeb184472010-09-07 15:55:13 +02003453
Yan, Zheng4af57ef282014-11-04 21:56:01 -05003454 if (task_ctx_data) {
3455 ctx->task_ctx_data = task_ctx_data;
3456 task_ctx_data = NULL;
3457 }
3458
Oleg Nesterovdbe08d82011-01-19 19:22:07 +01003459 err = 0;
3460 mutex_lock(&task->perf_event_mutex);
3461 /*
3462 * If it has already passed perf_event_exit_task(),
3463 * we must see PF_EXITING; it takes this mutex too.
3464 */
3465 if (task->flags & PF_EXITING)
3466 err = -ESRCH;
3467 else if (task->perf_event_ctxp[ctxn])
3468 err = -EAGAIN;
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01003469 else {
Peter Zijlstra9137fb22011-04-09 21:17:41 +02003470 get_ctx(ctx);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01003471 ++ctx->pin_count;
Oleg Nesterovdbe08d82011-01-19 19:22:07 +01003472 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01003473 }
Oleg Nesterovdbe08d82011-01-19 19:22:07 +01003474 mutex_unlock(&task->perf_event_mutex);
3475
3476 if (unlikely(err)) {
Peter Zijlstra9137fb22011-04-09 21:17:41 +02003477 put_ctx(ctx);
Oleg Nesterovdbe08d82011-01-19 19:22:07 +01003478
3479 if (err == -EAGAIN)
3480 goto retry;
3481 goto errout;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003482 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003483 }
3484
Yan, Zheng4af57ef282014-11-04 21:56:01 -05003485 kfree(task_ctx_data);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003486 return ctx;
3487
Peter Zijlstra9ed60602010-06-11 17:36:35 +02003488errout:
Yan, Zheng4af57ef282014-11-04 21:56:01 -05003489 kfree(task_ctx_data);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003490 return ERR_PTR(err);
3491}
3492
Li Zefan6fb29152009-10-15 11:21:42 +08003493static void perf_event_free_filter(struct perf_event *event);
Alexei Starovoitov25415172015-03-25 12:49:20 -07003494static void perf_event_free_bpf_prog(struct perf_event *event);
Li Zefan6fb29152009-10-15 11:21:42 +08003495
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003496static void free_event_rcu(struct rcu_head *head)
3497{
3498 struct perf_event *event;
3499
3500 event = container_of(head, struct perf_event, rcu_head);
3501 if (event->ns)
3502 put_pid_ns(event->ns);
Li Zefan6fb29152009-10-15 11:21:42 +08003503 perf_event_free_filter(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003504 kfree(event);
3505}
3506
Peter Zijlstrab69cf532014-03-14 10:50:33 +01003507static void ring_buffer_attach(struct perf_event *event,
3508 struct ring_buffer *rb);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003509
Frederic Weisbecker4beb31f2013-07-23 02:31:02 +02003510static void unaccount_event_cpu(struct perf_event *event, int cpu)
3511{
3512 if (event->parent)
3513 return;
3514
Frederic Weisbecker4beb31f2013-07-23 02:31:02 +02003515 if (is_cgroup_event(event))
3516 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
3517}
3518
3519static void unaccount_event(struct perf_event *event)
3520{
Peter Zijlstra25432ae2016-01-08 11:05:09 +01003521 bool dec = false;
3522
Frederic Weisbecker4beb31f2013-07-23 02:31:02 +02003523 if (event->parent)
3524 return;
3525
3526 if (event->attach_state & PERF_ATTACH_TASK)
Peter Zijlstra25432ae2016-01-08 11:05:09 +01003527 dec = true;
Frederic Weisbecker4beb31f2013-07-23 02:31:02 +02003528 if (event->attr.mmap || event->attr.mmap_data)
3529 atomic_dec(&nr_mmap_events);
3530 if (event->attr.comm)
3531 atomic_dec(&nr_comm_events);
3532 if (event->attr.task)
3533 atomic_dec(&nr_task_events);
Frederic Weisbecker948b26b2013-08-02 18:29:55 +02003534 if (event->attr.freq)
3535 atomic_dec(&nr_freq_events);
Adrian Hunter45ac1402015-07-21 12:44:02 +03003536 if (event->attr.context_switch) {
Peter Zijlstra25432ae2016-01-08 11:05:09 +01003537 dec = true;
Adrian Hunter45ac1402015-07-21 12:44:02 +03003538 atomic_dec(&nr_switch_events);
3539 }
Frederic Weisbecker4beb31f2013-07-23 02:31:02 +02003540 if (is_cgroup_event(event))
Peter Zijlstra25432ae2016-01-08 11:05:09 +01003541 dec = true;
Frederic Weisbecker4beb31f2013-07-23 02:31:02 +02003542 if (has_branch_stack(event))
Peter Zijlstra25432ae2016-01-08 11:05:09 +01003543 dec = true;
3544
Peter Zijlstra9107c892016-02-24 18:45:45 +01003545 if (dec) {
3546 if (!atomic_add_unless(&perf_sched_count, -1, 1))
3547 schedule_delayed_work(&perf_sched_work, HZ);
3548 }
Frederic Weisbecker4beb31f2013-07-23 02:31:02 +02003549
3550 unaccount_event_cpu(event, event->cpu);
3551}
3552
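/*
 * perf_sched_events is a static key; toggling it patches the scheduler
 * hot paths.  Dropping the last reference therefore goes through delayed
 * work rather than disabling the key immediately, so that short-lived
 * events do not flip (and re-patch) it back and forth.
 */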
Peter Zijlstra9107c892016-02-24 18:45:45 +01003553static void perf_sched_delayed(struct work_struct *work)
3554{
3555 mutex_lock(&perf_sched_mutex);
3556 if (atomic_dec_and_test(&perf_sched_count))
3557 static_branch_disable(&perf_sched_events);
3558 mutex_unlock(&perf_sched_mutex);
3559}
3560
Alexander Shishkinbed5b252015-01-30 12:31:06 +02003561/*
3562 * The following implement mutual exclusion of events on "exclusive" pmus
3563 * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
3564 * at a time, so we disallow creating events that might conflict, namely:
3565 *
3566 * 1) cpu-wide events in the presence of per-task events,
3567 * 2) per-task events in the presence of cpu-wide events,
3568 * 3) two matching events on the same context.
3569 *
3570 * The former two cases are handled in the allocation path (perf_event_alloc(),
Peter Zijlstraa0733e62016-01-26 12:14:40 +01003571 * _free_event()), the latter -- before the first perf_install_in_context().
Alexander Shishkinbed5b252015-01-30 12:31:06 +02003572 */
3573static int exclusive_event_init(struct perf_event *event)
3574{
3575 struct pmu *pmu = event->pmu;
3576
3577 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3578 return 0;
3579
3580 /*
3581 * Prevent co-existence of per-task and cpu-wide events on the
3582 * same exclusive pmu.
3583 *
3584 * Negative pmu::exclusive_cnt means there are cpu-wide
3585 * events on this "exclusive" pmu, positive means there are
3586 * per-task events.
3587 *
3588 * Since this is called in perf_event_alloc() path, event::ctx
3589 * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
3590 * to mean "per-task event", because unlike other attach states it
3591 * never gets cleared.
3592 */
3593 if (event->attach_state & PERF_ATTACH_TASK) {
3594 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
3595 return -EBUSY;
3596 } else {
3597 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
3598 return -EBUSY;
3599 }
3600
3601 return 0;
3602}
3603
3604static void exclusive_event_destroy(struct perf_event *event)
3605{
3606 struct pmu *pmu = event->pmu;
3607
3608 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3609 return;
3610
3611 /* see comment in exclusive_event_init() */
3612 if (event->attach_state & PERF_ATTACH_TASK)
3613 atomic_dec(&pmu->exclusive_cnt);
3614 else
3615 atomic_inc(&pmu->exclusive_cnt);
3616}
3617
3618static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
3619{
3620 if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) &&
3621 (e1->cpu == e2->cpu ||
3622 e1->cpu == -1 ||
3623 e2->cpu == -1))
3624 return true;
3625 return false;
3626}
3627
3628/* Called under the same ctx::mutex as perf_install_in_context() */
3629static bool exclusive_event_installable(struct perf_event *event,
3630 struct perf_event_context *ctx)
3631{
3632 struct perf_event *iter_event;
3633 struct pmu *pmu = event->pmu;
3634
3635 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3636 return true;
3637
3638 list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
3639 if (exclusive_event_match(iter_event, event))
3640 return false;
3641 }
3642
3643 return true;
3644}
3645
Peter Zijlstra683ede42014-05-05 12:11:24 +02003646static void _free_event(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003647{
Peter Zijlstrae360adb2010-10-14 14:01:34 +08003648 irq_work_sync(&event->pending);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003649
Frederic Weisbecker4beb31f2013-07-23 02:31:02 +02003650 unaccount_event(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003651
Frederic Weisbecker76369132011-05-19 19:55:04 +02003652 if (event->rb) {
Peter Zijlstra9bb5d402013-06-04 10:44:21 +02003653 /*
3654 * Can happen when we close an event with re-directed output.
3655 *
3656 * Since we have a 0 refcount, perf_mmap_close() will skip
3657 * over us; possibly making our ring_buffer_put() the last.
3658 */
3659 mutex_lock(&event->mmap_mutex);
Peter Zijlstrab69cf532014-03-14 10:50:33 +01003660 ring_buffer_attach(event, NULL);
Peter Zijlstra9bb5d402013-06-04 10:44:21 +02003661 mutex_unlock(&event->mmap_mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003662 }
3663
Stephane Eraniane5d13672011-02-14 11:20:01 +02003664 if (is_cgroup_event(event))
3665 perf_detach_cgroup(event);
3666
Peter Zijlstraa0733e62016-01-26 12:14:40 +01003667 if (!event->parent) {
3668 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
3669 put_callchain_buffers();
3670 }
3671
3672 perf_event_free_bpf_prog(event);
3673
3674 if (event->destroy)
3675 event->destroy(event);
3676
3677 if (event->ctx)
3678 put_ctx(event->ctx);
3679
3680 if (event->pmu) {
3681 exclusive_event_destroy(event);
3682 module_put(event->pmu->module);
3683 }
3684
3685 call_rcu(&event->rcu_head, free_event_rcu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003686}
3687
Peter Zijlstra683ede42014-05-05 12:11:24 +02003688/*
3689 * Used to free events which have a known refcount of 1, such as in error paths
3690 * where the event isn't exposed yet and inherited events.
3691 */
3692static void free_event(struct perf_event *event)
Arjan van de Venfb0459d2009-09-25 12:25:56 +02003693{
Peter Zijlstra683ede42014-05-05 12:11:24 +02003694 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
3695 "unexpected event refcount: %ld; ptr=%p\n",
3696 atomic_long_read(&event->refcount), event)) {
3697 /* leak to avoid use-after-free */
3698 return;
3699 }
Arjan van de Venfb0459d2009-09-25 12:25:56 +02003700
Peter Zijlstra683ede42014-05-05 12:11:24 +02003701 _free_event(event);
Arjan van de Venfb0459d2009-09-25 12:25:56 +02003702}
Arjan van de Venfb0459d2009-09-25 12:25:56 +02003703
Peter Zijlstraa66a3052009-11-23 11:37:23 +01003704/*
Jiri Olsaf8697762014-08-01 14:33:01 +02003705 * Remove user event from the owner task.
Peter Zijlstraa66a3052009-11-23 11:37:23 +01003706 */
Jiri Olsaf8697762014-08-01 14:33:01 +02003707static void perf_remove_from_owner(struct perf_event *event)
Peter Zijlstraa66a3052009-11-23 11:37:23 +01003708{
Peter Zijlstra88821352010-11-09 19:01:43 +01003709 struct task_struct *owner;
Peter Zijlstraa66a3052009-11-23 11:37:23 +01003710
Peter Zijlstra88821352010-11-09 19:01:43 +01003711 rcu_read_lock();
Peter Zijlstra88821352010-11-09 19:01:43 +01003712 /*
Peter Zijlstraf47c02c2016-01-26 12:30:14 +01003713 * Matches the smp_store_release() in perf_event_exit_task(). If we
3714 * observe !owner it means the list deletion is complete and we can
3715 * indeed free this event, otherwise we need to serialize on
Peter Zijlstra88821352010-11-09 19:01:43 +01003716 * owner->perf_event_mutex.
3717 */
Peter Zijlstraf47c02c2016-01-26 12:30:14 +01003718 owner = lockless_dereference(event->owner);
Peter Zijlstra88821352010-11-09 19:01:43 +01003719 if (owner) {
3720 /*
3721 * Since delayed_put_task_struct() also drops the last
3722 * task reference we can safely take a new reference
3723 * while holding the rcu_read_lock().
3724 */
3725 get_task_struct(owner);
3726 }
3727 rcu_read_unlock();
3728
3729 if (owner) {
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01003730 /*
3731 * If we're here through perf_event_exit_task() we're already
3732 * holding ctx->mutex which would be an inversion wrt. the
3733 * normal lock order.
3734 *
3735 * However we can safely take this lock because its the child
3736 * ctx->mutex.
3737 */
3738 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
3739
Peter Zijlstra88821352010-11-09 19:01:43 +01003740 /*
3741 * We have to re-check the event->owner field, if it is cleared
3742 * we raced with perf_event_exit_task(), acquiring the mutex
3743 * ensured they're done, and we can proceed with freeing the
3744 * event.
3745 */
Peter Zijlstraf47c02c2016-01-26 12:30:14 +01003746 if (event->owner) {
Peter Zijlstra88821352010-11-09 19:01:43 +01003747 list_del_init(&event->owner_entry);
Peter Zijlstraf47c02c2016-01-26 12:30:14 +01003748 smp_store_release(&event->owner, NULL);
3749 }
Peter Zijlstra88821352010-11-09 19:01:43 +01003750 mutex_unlock(&owner->perf_event_mutex);
3751 put_task_struct(owner);
3752 }
Jiri Olsaf8697762014-08-01 14:33:01 +02003753}
3754
Jiri Olsaf8697762014-08-01 14:33:01 +02003755static void put_event(struct perf_event *event)
3756{
Jiri Olsaf8697762014-08-01 14:33:01 +02003757 if (!atomic_long_dec_and_test(&event->refcount))
3758 return;
3759
Peter Zijlstra683ede42014-05-05 12:11:24 +02003760 _free_event(event);
Al Viroa6fa9412012-08-20 14:59:25 +01003761}
3762
Peter Zijlstrac6e5b732016-01-15 16:07:41 +02003763/*
3764 * Kill an event dead; while event::refcount will preserve the event
3765 * object, it will not preserve its functionality. Once the last 'user'
3766 * gives up the object, we'll destroy the thing.
3767 */
Peter Zijlstra683ede42014-05-05 12:11:24 +02003768int perf_event_release_kernel(struct perf_event *event)
3769{
Peter Zijlstraa4f4bb62016-02-24 18:45:42 +01003770 struct perf_event_context *ctx = event->ctx;
Peter Zijlstrac6e5b732016-01-15 16:07:41 +02003771 struct perf_event *child, *tmp;
3772
Peter Zijlstraa4f4bb62016-02-24 18:45:42 +01003773 /*
3774 * If we got here through err_file: fput(event_file); we will not have
3775 * attached to a context yet.
3776 */
3777 if (!ctx) {
3778 WARN_ON_ONCE(event->attach_state &
3779 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
3780 goto no_ctx;
3781 }
3782
Peter Zijlstra88821352010-11-09 19:01:43 +01003783 if (!is_kernel_event(event))
3784 perf_remove_from_owner(event);
3785
Peter Zijlstra5fa7c8e2016-01-26 15:25:15 +01003786 ctx = perf_event_ctx_lock(event);
Peter Zijlstra683ede42014-05-05 12:11:24 +02003787 WARN_ON_ONCE(ctx->parent_ctx);
Peter Zijlstraa69b0ca2016-02-24 18:45:44 +01003788 perf_remove_from_context(event, DETACH_GROUP);
Peter Zijlstra88821352010-11-09 19:01:43 +01003789
Peter Zijlstraa69b0ca2016-02-24 18:45:44 +01003790 raw_spin_lock_irq(&ctx->lock);
Peter Zijlstra60beda82016-01-26 14:55:02 +01003791 /*
Peter Zijlstraa69b0ca2016-02-24 18:45:44 +01003792 * Mark this event as STATE_DEAD; there is no external reference to it
3793 * anymore.
Peter Zijlstrac6e5b732016-01-15 16:07:41 +02003794 *
Peter Zijlstraa69b0ca2016-02-24 18:45:44 +01003795 * Anybody acquiring event->child_mutex after the below loop _must_
3796 * also see this, most importantly inherit_event() which will avoid
3797 * placing more children on the list.
Peter Zijlstrac6e5b732016-01-15 16:07:41 +02003798 *
3799 * Thus this guarantees that we will in fact observe and kill _ALL_
3800 * child events.
Peter Zijlstra60beda82016-01-26 14:55:02 +01003801 */
Peter Zijlstraa69b0ca2016-02-24 18:45:44 +01003802 event->state = PERF_EVENT_STATE_DEAD;
3803 raw_spin_unlock_irq(&ctx->lock);
3804
3805 perf_event_ctx_unlock(event, ctx);
Peter Zijlstra60beda82016-01-26 14:55:02 +01003806
Peter Zijlstrac6e5b732016-01-15 16:07:41 +02003807again:
3808 mutex_lock(&event->child_mutex);
3809 list_for_each_entry(child, &event->child_list, child_list) {
Al Viroa6fa9412012-08-20 14:59:25 +01003810
Peter Zijlstrac6e5b732016-01-15 16:07:41 +02003811 /*
3812 * Cannot change, child events are not migrated, see the
3813 * comment with perf_event_ctx_lock_nested().
3814 */
3815 ctx = lockless_dereference(child->ctx);
3816 /*
3817 * Since child_mutex nests inside ctx::mutex, we must jump
3818 * through hoops. We start by grabbing a reference on the ctx.
3819 *
3820 * Since the event cannot get freed while we hold the
3821 * child_mutex, the context must also exist and have a !0
3822 * reference count.
3823 */
3824 get_ctx(ctx);
3825
3826 /*
3827 * Now that we have a ctx ref, we can drop child_mutex, and
3828 * acquire ctx::mutex without fear of it going away. Then we
3829 * can re-acquire child_mutex.
3830 */
3831 mutex_unlock(&event->child_mutex);
3832 mutex_lock(&ctx->mutex);
3833 mutex_lock(&event->child_mutex);
3834
3835 /*
3836 * Now that we hold ctx::mutex and child_mutex, revalidate our
3837 * state, if child is still the first entry, it didn't get freed
3838 * and we can continue doing so.
3839 */
3840 tmp = list_first_entry_or_null(&event->child_list,
3841 struct perf_event, child_list);
3842 if (tmp == child) {
3843 perf_remove_from_context(child, DETACH_GROUP);
3844 list_del(&child->child_list);
3845 free_event(child);
3846 /*
3847 * This matches the refcount bump in inherit_event();
3848 * this can't be the last reference.
3849 */
3850 put_event(event);
3851 }
3852
3853 mutex_unlock(&event->child_mutex);
3854 mutex_unlock(&ctx->mutex);
3855 put_ctx(ctx);
3856 goto again;
3857 }
3858 mutex_unlock(&event->child_mutex);
3859
Peter Zijlstraa4f4bb62016-02-24 18:45:42 +01003860no_ctx:
3861 put_event(event); /* Must be the 'last' reference */
Peter Zijlstra683ede42014-05-05 12:11:24 +02003862 return 0;
3863}
3864EXPORT_SYMBOL_GPL(perf_event_release_kernel);
3865
Peter Zijlstra8b10c5e2015-05-01 16:08:46 +02003866/*
3867 * Called when the last reference to the file is gone.
3868 */
Al Viroa6fa9412012-08-20 14:59:25 +01003869static int perf_release(struct inode *inode, struct file *file)
3870{
Peter Zijlstrac6e5b732016-01-15 16:07:41 +02003871 perf_event_release_kernel(file->private_data);
Al Viroa6fa9412012-08-20 14:59:25 +01003872 return 0;
Peter Zijlstraa66a3052009-11-23 11:37:23 +01003873}
3874
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003875u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003876{
3877 struct perf_event *child;
3878 u64 total = 0;
3879
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003880 *enabled = 0;
3881 *running = 0;
3882
Peter Zijlstra6f105812009-11-20 22:19:56 +01003883 mutex_lock(&event->child_mutex);
Sukadev Bhattiprolu01add3e2015-09-03 20:07:46 -07003884
Sukadev Bhattiprolu7d889622015-09-03 20:07:50 -07003885 (void)perf_event_read(event, false);
Sukadev Bhattiprolu01add3e2015-09-03 20:07:46 -07003886 total += perf_event_count(event);
3887
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003888 *enabled += event->total_time_enabled +
3889 atomic64_read(&event->child_total_time_enabled);
3890 *running += event->total_time_running +
3891 atomic64_read(&event->child_total_time_running);
3892
3893 list_for_each_entry(child, &event->child_list, child_list) {
Sukadev Bhattiprolu7d889622015-09-03 20:07:50 -07003894 (void)perf_event_read(child, false);
Sukadev Bhattiprolu01add3e2015-09-03 20:07:46 -07003895 total += perf_event_count(child);
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003896 *enabled += child->total_time_enabled;
3897 *running += child->total_time_running;
3898 }
Peter Zijlstra6f105812009-11-20 22:19:56 +01003899 mutex_unlock(&event->child_mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003900
3901 return total;
3902}
Arjan van de Venfb0459d2009-09-25 12:25:56 +02003903EXPORT_SYMBOL_GPL(perf_event_read_value);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003904
Sukadev Bhattiprolu7d889622015-09-03 20:07:50 -07003905static int __perf_read_group_add(struct perf_event *leader,
Peter Zijlstrafa8c2692015-09-03 20:07:49 -07003906 u64 read_format, u64 *values)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003907{
Peter Zijlstrafa8c2692015-09-03 20:07:49 -07003908 struct perf_event *sub;
3909 int n = 1; /* skip @nr */
Sukadev Bhattiprolu7d889622015-09-03 20:07:50 -07003910 int ret;
Peter Zijlstraabf48682009-11-20 22:19:49 +01003911
Sukadev Bhattiprolu7d889622015-09-03 20:07:50 -07003912 ret = perf_event_read(leader, true);
3913 if (ret)
3914 return ret;
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01003915
Peter Zijlstrafa8c2692015-09-03 20:07:49 -07003916 /*
3917 * Since we co-schedule groups, {enabled,running} times of siblings
3918 * will be identical to those of the leader, so we only publish one
3919 * set.
3920 */
3921 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3922 values[n++] += leader->total_time_enabled +
3923 atomic64_read(&leader->child_total_time_enabled);
3924 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003925
Peter Zijlstrafa8c2692015-09-03 20:07:49 -07003926 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3927 values[n++] += leader->total_time_running +
3928 atomic64_read(&leader->child_total_time_running);
3929 }
3930
3931 /*
3932 * Write {count,id} tuples for every sibling.
3933 */
3934 values[n++] += perf_event_count(leader);
Peter Zijlstraabf48682009-11-20 22:19:49 +01003935 if (read_format & PERF_FORMAT_ID)
3936 values[n++] = primary_event_id(leader);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003937
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003938 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
Peter Zijlstrafa8c2692015-09-03 20:07:49 -07003939 values[n++] += perf_event_count(sub);
Peter Zijlstraabf48682009-11-20 22:19:49 +01003940 if (read_format & PERF_FORMAT_ID)
3941 values[n++] = primary_event_id(sub);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003942 }
Sukadev Bhattiprolu7d889622015-09-03 20:07:50 -07003943
3944 return 0;
Peter Zijlstrafa8c2692015-09-03 20:07:49 -07003945}
3946
3947static int perf_read_group(struct perf_event *event,
3948 u64 read_format, char __user *buf)
3949{
3950 struct perf_event *leader = event->group_leader, *child;
3951 struct perf_event_context *ctx = leader->ctx;
Sukadev Bhattiprolu7d889622015-09-03 20:07:50 -07003952 int ret;
Peter Zijlstrafa8c2692015-09-03 20:07:49 -07003953 u64 *values;
3954
3955 lockdep_assert_held(&ctx->mutex);
3956
3957 values = kzalloc(event->read_size, GFP_KERNEL);
3958 if (!values)
3959 return -ENOMEM;
3960
3961 values[0] = 1 + leader->nr_siblings;
3962
3963 /*
3964 * By locking the child_mutex of the leader we effectively
3965	 * lock the child list of all siblings. XXX explain how.
3966 */
3967 mutex_lock(&leader->child_mutex);
3968
Sukadev Bhattiprolu7d889622015-09-03 20:07:50 -07003969 ret = __perf_read_group_add(leader, read_format, values);
3970 if (ret)
3971 goto unlock;
3972
3973 list_for_each_entry(child, &leader->child_list, child_list) {
3974 ret = __perf_read_group_add(child, read_format, values);
3975 if (ret)
3976 goto unlock;
3977 }
Peter Zijlstrafa8c2692015-09-03 20:07:49 -07003978
3979 mutex_unlock(&leader->child_mutex);
3980
Sukadev Bhattiprolu7d889622015-09-03 20:07:50 -07003981 ret = event->read_size;
Peter Zijlstrafa8c2692015-09-03 20:07:49 -07003982 if (copy_to_user(buf, values, event->read_size))
3983 ret = -EFAULT;
Sukadev Bhattiprolu7d889622015-09-03 20:07:50 -07003984 goto out;
Peter Zijlstrafa8c2692015-09-03 20:07:49 -07003985
Sukadev Bhattiprolu7d889622015-09-03 20:07:50 -07003986unlock:
3987 mutex_unlock(&leader->child_mutex);
3988out:
Peter Zijlstrafa8c2692015-09-03 20:07:49 -07003989 kfree(values);
Peter Zijlstraabf48682009-11-20 22:19:49 +01003990 return ret;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003991}
3992
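/*
 * For reference, a sketch of the buffer layout perf_read_group() fills in
 * when PERF_FORMAT_GROUP is set (in the order emitted above; the optional
 * fields depend on the other read_format bits):
 *
 *	struct read_format {
 *		u64 nr;				// 1 + nr_siblings
 *		{ u64 time_enabled; }		// PERF_FORMAT_TOTAL_TIME_ENABLED
 *		{ u64 time_running; }		// PERF_FORMAT_TOTAL_TIME_RUNNING
 *		{ u64 value;
 *		  { u64 id; }			// PERF_FORMAT_ID
 *		} cntr[nr];			// leader first, then each sibling
 *	};
 */
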
Peter Zijlstra (Intel)b15f4952015-09-03 20:07:47 -07003993static int perf_read_one(struct perf_event *event,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003994 u64 read_format, char __user *buf)
3995{
Peter Zijlstra59ed4462009-11-20 22:19:55 +01003996 u64 enabled, running;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02003997 u64 values[4];
3998 int n = 0;
3999
Peter Zijlstra59ed4462009-11-20 22:19:55 +01004000 values[n++] = perf_event_read_value(event, &enabled, &running);
4001 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
4002 values[n++] = enabled;
4003 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
4004 values[n++] = running;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004005 if (read_format & PERF_FORMAT_ID)
4006 values[n++] = primary_event_id(event);
4007
4008 if (copy_to_user(buf, values, n * sizeof(u64)))
4009 return -EFAULT;
4010
4011 return n * sizeof(u64);
4012}
4013
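/*
 * The userspace side of perf_read_one(), as a sketch (assuming fd is the
 * event descriptor): with read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
 * PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID, a single read() returns
 * four u64s in the order emitted above, which allows scaling the count of
 * a multiplexed event:
 *
 *	u64 buf[4], scaled;	// value, time_enabled, time_running, id
 *
 *	if (read(fd, buf, sizeof(buf)) == sizeof(buf))
 *		scaled = buf[2] ? buf[0] * buf[1] / buf[2] : buf[0];
 */
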
Jiri Olsadc633982014-09-12 13:18:26 +02004014static bool is_event_hup(struct perf_event *event)
4015{
4016 bool no_children;
4017
Peter Zijlstraa69b0ca2016-02-24 18:45:44 +01004018 if (event->state > PERF_EVENT_STATE_EXIT)
Jiri Olsadc633982014-09-12 13:18:26 +02004019 return false;
4020
4021 mutex_lock(&event->child_mutex);
4022 no_children = list_empty(&event->child_list);
4023 mutex_unlock(&event->child_mutex);
4024 return no_children;
4025}
4026
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004027/*
4028	 * Read the performance event - a simple non-blocking version for now
4029 */
4030static ssize_t
Peter Zijlstra (Intel)b15f4952015-09-03 20:07:47 -07004031__perf_read(struct perf_event *event, char __user *buf, size_t count)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004032{
4033 u64 read_format = event->attr.read_format;
4034 int ret;
4035
4036 /*
4037	 * Return end-of-file for a read on an event that is in
4038	 * error state (i.e. because it was pinned but it couldn't be
4039	 * scheduled onto the CPU at some point).
4040 */
4041 if (event->state == PERF_EVENT_STATE_ERROR)
4042 return 0;
4043
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02004044 if (count < event->read_size)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004045 return -ENOSPC;
4046
4047 WARN_ON_ONCE(event->ctx->parent_ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004048 if (read_format & PERF_FORMAT_GROUP)
Peter Zijlstra (Intel)b15f4952015-09-03 20:07:47 -07004049 ret = perf_read_group(event, read_format, buf);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004050 else
Peter Zijlstra (Intel)b15f4952015-09-03 20:07:47 -07004051 ret = perf_read_one(event, read_format, buf);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004052
4053 return ret;
4054}
4055
4056static ssize_t
4057perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
4058{
4059 struct perf_event *event = file->private_data;
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01004060 struct perf_event_context *ctx;
4061 int ret;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004062
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01004063 ctx = perf_event_ctx_lock(event);
Peter Zijlstra (Intel)b15f4952015-09-03 20:07:47 -07004064 ret = __perf_read(event, buf, count);
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01004065 perf_event_ctx_unlock(event, ctx);
4066
4067 return ret;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004068}
4069
4070static unsigned int perf_poll(struct file *file, poll_table *wait)
4071{
4072 struct perf_event *event = file->private_data;
Frederic Weisbecker76369132011-05-19 19:55:04 +02004073 struct ring_buffer *rb;
Jiri Olsa61b67682014-08-13 19:39:56 +02004074 unsigned int events = POLLHUP;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004075
Sebastian Andrzej Siewiore708d7a2014-08-04 15:31:08 +02004076 poll_wait(file, &event->waitq, wait);
Jiri Olsa179033b2014-08-07 11:48:26 -04004077
Jiri Olsadc633982014-09-12 13:18:26 +02004078 if (is_event_hup(event))
Jiri Olsa179033b2014-08-07 11:48:26 -04004079 return events;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004080
Peter Zijlstra10c6db12011-11-26 02:47:31 +01004081 /*
Peter Zijlstra9bb5d402013-06-04 10:44:21 +02004082 * Pin the event->rb by taking event->mmap_mutex; otherwise
4083 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
Peter Zijlstra10c6db12011-11-26 02:47:31 +01004084 */
4085 mutex_lock(&event->mmap_mutex);
Peter Zijlstra9bb5d402013-06-04 10:44:21 +02004086 rb = event->rb;
4087 if (rb)
Frederic Weisbecker76369132011-05-19 19:55:04 +02004088 events = atomic_xchg(&rb->poll, 0);
Peter Zijlstra10c6db12011-11-26 02:47:31 +01004089 mutex_unlock(&event->mmap_mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004090 return events;
4091}
4092
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01004093static void _perf_event_reset(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004094{
Sukadev Bhattiprolu7d889622015-09-03 20:07:50 -07004095 (void)perf_event_read(event, false);
Peter Zijlstrae7850592010-05-21 14:43:08 +02004096 local64_set(&event->count, 0);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004097 perf_event_update_userpage(event);
4098}
4099
4100/*
4101 * Holding the top-level event's child_mutex means that any
4102 * descendant process that has inherited this event will block
Peter Zijlstra8ba289b2016-01-26 13:06:56 +01004103 * in perf_event_exit_event() if it goes to exit, thus satisfying the
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004104 * task existence requirements of perf_event_enable/disable.
4105 */
4106static void perf_event_for_each_child(struct perf_event *event,
4107 void (*func)(struct perf_event *))
4108{
4109 struct perf_event *child;
4110
4111 WARN_ON_ONCE(event->ctx->parent_ctx);
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01004112
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004113 mutex_lock(&event->child_mutex);
4114 func(event);
4115 list_for_each_entry(child, &event->child_list, child_list)
4116 func(child);
4117 mutex_unlock(&event->child_mutex);
4118}
4119
4120static void perf_event_for_each(struct perf_event *event,
4121 void (*func)(struct perf_event *))
4122{
4123 struct perf_event_context *ctx = event->ctx;
4124 struct perf_event *sibling;
4125
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01004126 lockdep_assert_held(&ctx->mutex);
4127
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004128 event = event->group_leader;
4129
4130 perf_event_for_each_child(event, func);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004131 list_for_each_entry(sibling, &event->sibling_list, group_entry)
Michael Ellerman724b6da2012-04-11 11:54:13 +10004132 perf_event_for_each_child(sibling, func);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004133}
4134
Peter Zijlstrafae3fde2016-01-11 15:00:50 +01004135static void __perf_event_period(struct perf_event *event,
4136 struct perf_cpu_context *cpuctx,
4137 struct perf_event_context *ctx,
4138 void *info)
Peter Zijlstra00179602015-11-30 16:26:35 +01004139{
Peter Zijlstrafae3fde2016-01-11 15:00:50 +01004140 u64 value = *((u64 *)info);
Peter Zijlstrac7999c62015-08-04 19:22:49 +02004141 bool active;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004142
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004143 if (event->attr.freq) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004144 event->attr.sample_freq = value;
4145 } else {
4146 event->attr.sample_period = value;
4147 event->hw.sample_period = value;
4148 }
Peter Zijlstrabad71922013-11-27 13:54:38 +00004149
4150 active = (event->state == PERF_EVENT_STATE_ACTIVE);
4151 if (active) {
4152 perf_pmu_disable(ctx->pmu);
4153 event->pmu->stop(event, PERF_EF_UPDATE);
4154 }
4155
4156 local64_set(&event->hw.period_left, 0);
4157
4158 if (active) {
4159 event->pmu->start(event, PERF_EF_RELOAD);
4160 perf_pmu_enable(ctx->pmu);
4161 }
Peter Zijlstrac7999c62015-08-04 19:22:49 +02004162}
4163
4164static int perf_event_period(struct perf_event *event, u64 __user *arg)
4165{
Peter Zijlstrac7999c62015-08-04 19:22:49 +02004166 u64 value;
4167
4168 if (!is_sampling_event(event))
4169 return -EINVAL;
4170
4171 if (copy_from_user(&value, arg, sizeof(value)))
4172 return -EFAULT;
4173
4174 if (!value)
4175 return -EINVAL;
4176
4177 if (event->attr.freq && value > sysctl_perf_event_sample_rate)
4178 return -EINVAL;
4179
Peter Zijlstrafae3fde2016-01-11 15:00:50 +01004180 event_function_call(event, __perf_event_period, &value);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004181
Peter Zijlstrac7999c62015-08-04 19:22:49 +02004182 return 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004183}
4184
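/*
 * Userspace sketch of PERF_EVENT_IOC_PERIOD as handled above: the ioctl
 * argument is a pointer to the new period (or frequency, when attr.freq
 * is set):
 *
 *	u64 period = 100000;
 *
 *	if (ioctl(fd, PERF_EVENT_IOC_PERIOD, &period))
 *		perror("PERF_EVENT_IOC_PERIOD");
 */
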
Peter Zijlstraac9721f2010-05-27 12:54:41 +02004185static const struct file_operations perf_fops;
4186
Al Viro2903ff02012-08-28 12:52:22 -04004187static inline int perf_fget_light(int fd, struct fd *p)
Peter Zijlstraac9721f2010-05-27 12:54:41 +02004188{
Al Viro2903ff02012-08-28 12:52:22 -04004189 struct fd f = fdget(fd);
4190 if (!f.file)
4191 return -EBADF;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02004192
Al Viro2903ff02012-08-28 12:52:22 -04004193 if (f.file->f_op != &perf_fops) {
4194 fdput(f);
4195 return -EBADF;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02004196 }
Al Viro2903ff02012-08-28 12:52:22 -04004197 *p = f;
4198 return 0;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02004199}
4200
4201static int perf_event_set_output(struct perf_event *event,
4202 struct perf_event *output_event);
Li Zefan6fb29152009-10-15 11:21:42 +08004203static int perf_event_set_filter(struct perf_event *event, void __user *arg);
Alexei Starovoitov25415172015-03-25 12:49:20 -07004204static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004205
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01004206static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004207{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004208 void (*func)(struct perf_event *);
4209 u32 flags = arg;
4210
4211 switch (cmd) {
4212 case PERF_EVENT_IOC_ENABLE:
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01004213 func = _perf_event_enable;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004214 break;
4215 case PERF_EVENT_IOC_DISABLE:
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01004216 func = _perf_event_disable;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004217 break;
4218 case PERF_EVENT_IOC_RESET:
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01004219 func = _perf_event_reset;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004220 break;
4221
4222 case PERF_EVENT_IOC_REFRESH:
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01004223 return _perf_event_refresh(event, arg);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004224
4225 case PERF_EVENT_IOC_PERIOD:
4226 return perf_event_period(event, (u64 __user *)arg);
4227
Jiri Olsacf4957f2012-10-24 13:37:58 +02004228 case PERF_EVENT_IOC_ID:
4229 {
4230 u64 id = primary_event_id(event);
4231
4232 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
4233 return -EFAULT;
4234 return 0;
4235 }
4236
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004237 case PERF_EVENT_IOC_SET_OUTPUT:
Peter Zijlstraac9721f2010-05-27 12:54:41 +02004238 {
Peter Zijlstraac9721f2010-05-27 12:54:41 +02004239 int ret;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02004240 if (arg != -1) {
Al Viro2903ff02012-08-28 12:52:22 -04004241 struct perf_event *output_event;
4242 struct fd output;
4243 ret = perf_fget_light(arg, &output);
4244 if (ret)
4245 return ret;
4246 output_event = output.file->private_data;
4247 ret = perf_event_set_output(event, output_event);
4248 fdput(output);
4249 } else {
4250 ret = perf_event_set_output(event, NULL);
Peter Zijlstraac9721f2010-05-27 12:54:41 +02004251 }
Peter Zijlstraac9721f2010-05-27 12:54:41 +02004252 return ret;
4253 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004254
Li Zefan6fb29152009-10-15 11:21:42 +08004255 case PERF_EVENT_IOC_SET_FILTER:
4256 return perf_event_set_filter(event, (void __user *)arg);
4257
Alexei Starovoitov25415172015-03-25 12:49:20 -07004258 case PERF_EVENT_IOC_SET_BPF:
4259 return perf_event_set_bpf_prog(event, arg);
4260
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004261 default:
4262 return -ENOTTY;
4263 }
4264
4265 if (flags & PERF_IOC_FLAG_GROUP)
4266 perf_event_for_each(event, func);
4267 else
4268 perf_event_for_each_child(event, func);
4269
4270 return 0;
4271}
4272
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01004273static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
4274{
4275 struct perf_event *event = file->private_data;
4276 struct perf_event_context *ctx;
4277 long ret;
4278
4279 ctx = perf_event_ctx_lock(event);
4280 ret = _perf_ioctl(event, cmd, arg);
4281 perf_event_ctx_unlock(event, ctx);
4282
4283 return ret;
4284}
4285
Pawel Mollb3f20782014-06-13 16:03:32 +01004286#ifdef CONFIG_COMPAT
4287static long perf_compat_ioctl(struct file *file, unsigned int cmd,
4288 unsigned long arg)
4289{
4290 switch (_IOC_NR(cmd)) {
4291 case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
4292 case _IOC_NR(PERF_EVENT_IOC_ID):
4293	 /* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case) */
4294 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
4295 cmd &= ~IOCSIZE_MASK;
4296 cmd |= sizeof(void *) << IOCSIZE_SHIFT;
4297 }
4298 break;
4299 }
4300 return perf_ioctl(file, cmd, arg);
4301}
4302#else
4303# define perf_compat_ioctl NULL
4304#endif
4305
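/*
 * Userspace sketch of the most common ioctls dispatched by _perf_ioctl()
 * above, assuming fd is a perf event descriptor and group_fd its group
 * leader:
 *
 *	u64 id;
 *
 *	ioctl(fd, PERF_EVENT_IOC_RESET, 0);
 *	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 *	ioctl(fd, PERF_EVENT_IOC_ID, &id);	// pairs with PERF_FORMAT_ID
 *	ioctl(group_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
 */
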
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004306int perf_event_task_enable(void)
4307{
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01004308 struct perf_event_context *ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004309 struct perf_event *event;
4310
4311 mutex_lock(&current->perf_event_mutex);
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01004312 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4313 ctx = perf_event_ctx_lock(event);
4314 perf_event_for_each_child(event, _perf_event_enable);
4315 perf_event_ctx_unlock(event, ctx);
4316 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004317 mutex_unlock(&current->perf_event_mutex);
4318
4319 return 0;
4320}
4321
4322int perf_event_task_disable(void)
4323{
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01004324 struct perf_event_context *ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004325 struct perf_event *event;
4326
4327 mutex_lock(&current->perf_event_mutex);
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01004328 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4329 ctx = perf_event_ctx_lock(event);
4330 perf_event_for_each_child(event, _perf_event_disable);
4331 perf_event_ctx_unlock(event, ctx);
4332 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004333 mutex_unlock(&current->perf_event_mutex);
4334
4335 return 0;
4336}
4337
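/*
 * perf_event_task_enable()/perf_event_task_disable() are reached via
 * prctl(); a userspace sketch that mutes all counters attached to the
 * calling task around an uninstrumented region:
 *
 *	prctl(PR_TASK_PERF_EVENTS_DISABLE);
 *	do_uninstrumented_work();		// hypothetical helper
 *	prctl(PR_TASK_PERF_EVENTS_ENABLE);
 */
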
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004338static int perf_event_index(struct perf_event *event)
4339{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02004340 if (event->hw.state & PERF_HES_STOPPED)
4341 return 0;
4342
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004343 if (event->state != PERF_EVENT_STATE_ACTIVE)
4344 return 0;
4345
Peter Zijlstra35edc2a2011-11-20 20:36:02 +01004346 return event->pmu->event_idx(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004347}
4348
Eric B Munsonc4794292011-06-23 16:34:38 -04004349static void calc_timer_values(struct perf_event *event,
Peter Zijlstrae3f35412011-11-21 11:43:53 +01004350 u64 *now,
Eric B Munson7f310a52011-06-23 16:34:38 -04004351 u64 *enabled,
4352 u64 *running)
Eric B Munsonc4794292011-06-23 16:34:38 -04004353{
Peter Zijlstrae3f35412011-11-21 11:43:53 +01004354 u64 ctx_time;
Eric B Munsonc4794292011-06-23 16:34:38 -04004355
Peter Zijlstrae3f35412011-11-21 11:43:53 +01004356 *now = perf_clock();
4357 ctx_time = event->shadow_ctx_time + *now;
Eric B Munsonc4794292011-06-23 16:34:38 -04004358 *enabled = ctx_time - event->tstamp_enabled;
4359 *running = ctx_time - event->tstamp_running;
4360}
4361
Peter Zijlstrafa731582013-09-19 10:16:42 +02004362static void perf_event_init_userpage(struct perf_event *event)
4363{
4364 struct perf_event_mmap_page *userpg;
4365 struct ring_buffer *rb;
4366
4367 rcu_read_lock();
4368 rb = rcu_dereference(event->rb);
4369 if (!rb)
4370 goto unlock;
4371
4372 userpg = rb->user_page;
4373
4374 /* Allow new userspace to detect that bit 0 is deprecated */
4375 userpg->cap_bit0_is_deprecated = 1;
4376 userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
Alexander Shishkine8c6dea2015-01-14 14:18:10 +02004377 userpg->data_offset = PAGE_SIZE;
4378 userpg->data_size = perf_data_size(rb);
Peter Zijlstrafa731582013-09-19 10:16:42 +02004379
4380unlock:
4381 rcu_read_unlock();
4382}
4383
Andy Lutomirskic1317ec2014-10-24 15:58:11 -07004384void __weak arch_perf_update_userpage(
4385 struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
Peter Zijlstrae3f35412011-11-21 11:43:53 +01004386{
4387}
4388
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004389/*
4390 * Callers need to ensure there can be no nesting of this function, otherwise
4391	 * the seqlock logic goes bad. We cannot serialize this because the arch
4392 * code calls this from NMI context.
4393 */
4394void perf_event_update_userpage(struct perf_event *event)
4395{
4396 struct perf_event_mmap_page *userpg;
Frederic Weisbecker76369132011-05-19 19:55:04 +02004397 struct ring_buffer *rb;
Peter Zijlstrae3f35412011-11-21 11:43:53 +01004398 u64 enabled, running, now;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004399
4400 rcu_read_lock();
Peter Zijlstra5ec4c592013-08-02 21:16:30 +02004401 rb = rcu_dereference(event->rb);
4402 if (!rb)
4403 goto unlock;
4404
Eric B Munson0d641202011-06-24 12:26:26 -04004405 /*
4406 * compute total_time_enabled, total_time_running
4407 * based on snapshot values taken when the event
4408 * was last scheduled in.
4409 *
4410	 * we cannot simply call update_context_time()
4411	 * because of a locking issue, as we can be called in
4412 * NMI context
4413 */
Peter Zijlstrae3f35412011-11-21 11:43:53 +01004414 calc_timer_values(event, &now, &enabled, &running);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004415
Frederic Weisbecker76369132011-05-19 19:55:04 +02004416 userpg = rb->user_page;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004417 /*
4418 * Disable preemption so as to not let the corresponding user-space
4419 * spin too long if we get preempted.
4420 */
4421 preempt_disable();
4422 ++userpg->lock;
4423 barrier();
4424 userpg->index = perf_event_index(event);
Peter Zijlstrab5e58792010-05-21 14:43:12 +02004425 userpg->offset = perf_event_count(event);
Peter Zijlstra365a4032011-11-21 20:58:59 +01004426 if (userpg->index)
Peter Zijlstrae7850592010-05-21 14:43:08 +02004427 userpg->offset -= local64_read(&event->hw.prev_count);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004428
Eric B Munson0d641202011-06-24 12:26:26 -04004429 userpg->time_enabled = enabled +
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004430 atomic64_read(&event->child_total_time_enabled);
4431
Eric B Munson0d641202011-06-24 12:26:26 -04004432 userpg->time_running = running +
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004433 atomic64_read(&event->child_total_time_running);
4434
Andy Lutomirskic1317ec2014-10-24 15:58:11 -07004435 arch_perf_update_userpage(event, userpg, now);
Peter Zijlstrae3f35412011-11-21 11:43:53 +01004436
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004437 barrier();
4438 ++userpg->lock;
4439 preempt_enable();
4440unlock:
4441 rcu_read_unlock();
4442}
4443
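/*
 * Userspace counterpart of the lock/index/offset dance in
 * perf_event_update_userpage(), roughly as documented in the uapi header
 * (rdpmc() stands in for the x86 instruction and is only valid when
 * cap_user_rdpmc is set):
 *
 *	struct perf_event_mmap_page *pc = base;	// page 0 of the mmap()
 *	u32 seq, idx;
 *	u64 count;
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *		idx = pc->index;
 *		count = pc->offset;
 *		if (pc->cap_user_rdpmc && idx)
 *			count += rdpmc(idx - 1);
 *		barrier();
 *	} while (pc->lock != seq);
 */
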
Peter Zijlstra906010b2009-09-21 16:08:49 +02004444static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
4445{
4446 struct perf_event *event = vma->vm_file->private_data;
Frederic Weisbecker76369132011-05-19 19:55:04 +02004447 struct ring_buffer *rb;
Peter Zijlstra906010b2009-09-21 16:08:49 +02004448 int ret = VM_FAULT_SIGBUS;
4449
4450 if (vmf->flags & FAULT_FLAG_MKWRITE) {
4451 if (vmf->pgoff == 0)
4452 ret = 0;
4453 return ret;
4454 }
4455
4456 rcu_read_lock();
Frederic Weisbecker76369132011-05-19 19:55:04 +02004457 rb = rcu_dereference(event->rb);
4458 if (!rb)
Peter Zijlstra906010b2009-09-21 16:08:49 +02004459 goto unlock;
4460
4461 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
4462 goto unlock;
4463
Frederic Weisbecker76369132011-05-19 19:55:04 +02004464 vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
Peter Zijlstra906010b2009-09-21 16:08:49 +02004465 if (!vmf->page)
4466 goto unlock;
4467
4468 get_page(vmf->page);
4469 vmf->page->mapping = vma->vm_file->f_mapping;
4470 vmf->page->index = vmf->pgoff;
4471
4472 ret = 0;
4473unlock:
4474 rcu_read_unlock();
4475
4476 return ret;
4477}
4478
Peter Zijlstra10c6db12011-11-26 02:47:31 +01004479static void ring_buffer_attach(struct perf_event *event,
4480 struct ring_buffer *rb)
4481{
Peter Zijlstrab69cf532014-03-14 10:50:33 +01004482 struct ring_buffer *old_rb = NULL;
Peter Zijlstra10c6db12011-11-26 02:47:31 +01004483 unsigned long flags;
4484
Peter Zijlstrab69cf532014-03-14 10:50:33 +01004485 if (event->rb) {
4486 /*
4487	 * Should be impossible: we set this when removing
4488 * event->rb_entry and wait/clear when adding event->rb_entry.
4489 */
4490 WARN_ON_ONCE(event->rcu_pending);
Peter Zijlstra10c6db12011-11-26 02:47:31 +01004491
Peter Zijlstrab69cf532014-03-14 10:50:33 +01004492 old_rb = event->rb;
Peter Zijlstrab69cf532014-03-14 10:50:33 +01004493 spin_lock_irqsave(&old_rb->event_lock, flags);
4494 list_del_rcu(&event->rb_entry);
4495 spin_unlock_irqrestore(&old_rb->event_lock, flags);
Peter Zijlstra10c6db12011-11-26 02:47:31 +01004496
Oleg Nesterov2f993cf2015-05-30 22:04:25 +02004497 event->rcu_batches = get_state_synchronize_rcu();
4498 event->rcu_pending = 1;
Peter Zijlstrab69cf532014-03-14 10:50:33 +01004499 }
Peter Zijlstra10c6db12011-11-26 02:47:31 +01004500
Peter Zijlstrab69cf532014-03-14 10:50:33 +01004501 if (rb) {
Oleg Nesterov2f993cf2015-05-30 22:04:25 +02004502 if (event->rcu_pending) {
4503 cond_synchronize_rcu(event->rcu_batches);
4504 event->rcu_pending = 0;
4505 }
4506
Peter Zijlstrab69cf532014-03-14 10:50:33 +01004507 spin_lock_irqsave(&rb->event_lock, flags);
4508 list_add_rcu(&event->rb_entry, &rb->event_list);
4509 spin_unlock_irqrestore(&rb->event_lock, flags);
4510 }
4511
4512 rcu_assign_pointer(event->rb, rb);
4513
4514 if (old_rb) {
4515 ring_buffer_put(old_rb);
4516 /*
4517	 * Since we detached before setting the new rb (so that we
4518	 * could attach the new rb), we could have missed a wakeup.
4519 * Provide it now.
4520 */
4521 wake_up_all(&event->waitq);
4522 }
Peter Zijlstra10c6db12011-11-26 02:47:31 +01004523}
4524
4525static void ring_buffer_wakeup(struct perf_event *event)
4526{
4527 struct ring_buffer *rb;
4528
4529 rcu_read_lock();
4530 rb = rcu_dereference(event->rb);
Peter Zijlstra9bb5d402013-06-04 10:44:21 +02004531 if (rb) {
4532 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
4533 wake_up_all(&event->waitq);
4534 }
Peter Zijlstra10c6db12011-11-26 02:47:31 +01004535 rcu_read_unlock();
4536}
4537
Alexander Shishkinfdc26702015-01-14 14:18:16 +02004538struct ring_buffer *ring_buffer_get(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004539{
Frederic Weisbecker76369132011-05-19 19:55:04 +02004540 struct ring_buffer *rb;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004541
Peter Zijlstraac9721f2010-05-27 12:54:41 +02004542 rcu_read_lock();
Frederic Weisbecker76369132011-05-19 19:55:04 +02004543 rb = rcu_dereference(event->rb);
4544 if (rb) {
4545 if (!atomic_inc_not_zero(&rb->refcount))
4546 rb = NULL;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02004547 }
4548 rcu_read_unlock();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004549
Frederic Weisbecker76369132011-05-19 19:55:04 +02004550 return rb;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02004551}
4552
Alexander Shishkinfdc26702015-01-14 14:18:16 +02004553void ring_buffer_put(struct ring_buffer *rb)
Peter Zijlstraac9721f2010-05-27 12:54:41 +02004554{
Frederic Weisbecker76369132011-05-19 19:55:04 +02004555 if (!atomic_dec_and_test(&rb->refcount))
Peter Zijlstraac9721f2010-05-27 12:54:41 +02004556 return;
4557
Peter Zijlstra9bb5d402013-06-04 10:44:21 +02004558 WARN_ON_ONCE(!list_empty(&rb->event_list));
Peter Zijlstra10c6db12011-11-26 02:47:31 +01004559
Frederic Weisbecker76369132011-05-19 19:55:04 +02004560 call_rcu(&rb->rcu_head, rb_free_rcu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004561}
4562
4563static void perf_mmap_open(struct vm_area_struct *vma)
4564{
4565 struct perf_event *event = vma->vm_file->private_data;
4566
4567 atomic_inc(&event->mmap_count);
Peter Zijlstra9bb5d402013-06-04 10:44:21 +02004568 atomic_inc(&event->rb->mmap_count);
Andy Lutomirski1e0fb9e2014-10-24 15:58:10 -07004569
Peter Zijlstra45bfb2e2015-01-14 14:18:11 +02004570 if (vma->vm_pgoff)
4571 atomic_inc(&event->rb->aux_mmap_count);
4572
Andy Lutomirski1e0fb9e2014-10-24 15:58:10 -07004573 if (event->pmu->event_mapped)
4574 event->pmu->event_mapped(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004575}
4576
Peter Zijlstra9bb5d402013-06-04 10:44:21 +02004577/*
4578 * A buffer can be mmap()ed multiple times; either directly through the same
4579 * event, or through other events by use of perf_event_set_output().
4580 *
4581 * In order to undo the VM accounting done by perf_mmap() we need to destroy
4582 * the buffer here, where we still have a VM context. This means we need
4583 * to detach all events redirecting to us.
4584 */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004585static void perf_mmap_close(struct vm_area_struct *vma)
4586{
4587 struct perf_event *event = vma->vm_file->private_data;
4588
Peter Zijlstrab69cf532014-03-14 10:50:33 +01004589 struct ring_buffer *rb = ring_buffer_get(event);
Peter Zijlstra9bb5d402013-06-04 10:44:21 +02004590 struct user_struct *mmap_user = rb->mmap_user;
4591 int mmap_locked = rb->mmap_locked;
4592 unsigned long size = perf_data_size(rb);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004593
Andy Lutomirski1e0fb9e2014-10-24 15:58:10 -07004594 if (event->pmu->event_unmapped)
4595 event->pmu->event_unmapped(event);
4596
Peter Zijlstra45bfb2e2015-01-14 14:18:11 +02004597 /*
4598 * rb->aux_mmap_count will always drop before rb->mmap_count and
4599 * event->mmap_count, so it is ok to use event->mmap_mutex to
4600 * serialize with perf_mmap here.
4601 */
4602 if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
4603 atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
4604 atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
4605 vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
4606
4607 rb_free_aux(rb);
4608 mutex_unlock(&event->mmap_mutex);
4609 }
4610
Peter Zijlstra9bb5d402013-06-04 10:44:21 +02004611 atomic_dec(&rb->mmap_count);
Peter Zijlstraac9721f2010-05-27 12:54:41 +02004612
Peter Zijlstra9bb5d402013-06-04 10:44:21 +02004613 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
Peter Zijlstrab69cf532014-03-14 10:50:33 +01004614 goto out_put;
Peter Zijlstra9bb5d402013-06-04 10:44:21 +02004615
Peter Zijlstrab69cf532014-03-14 10:50:33 +01004616 ring_buffer_attach(event, NULL);
Peter Zijlstra9bb5d402013-06-04 10:44:21 +02004617 mutex_unlock(&event->mmap_mutex);
4618
4619 /* If there's still other mmap()s of this buffer, we're done. */
Peter Zijlstrab69cf532014-03-14 10:50:33 +01004620 if (atomic_read(&rb->mmap_count))
4621 goto out_put;
Peter Zijlstra9bb5d402013-06-04 10:44:21 +02004622
4623 /*
4624 * No other mmap()s, detach from all other events that might redirect
4625 * into the now unreachable buffer. Somewhat complicated by the
4626 * fact that rb::event_lock otherwise nests inside mmap_mutex.
4627 */
4628again:
4629 rcu_read_lock();
4630 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
4631 if (!atomic_long_inc_not_zero(&event->refcount)) {
4632 /*
4633 * This event is en-route to free_event() which will
4634 * detach it and remove it from the list.
4635 */
4636 continue;
4637 }
4638 rcu_read_unlock();
4639
4640 mutex_lock(&event->mmap_mutex);
4641 /*
4642 * Check we didn't race with perf_event_set_output() which can
4643 * swizzle the rb from under us while we were waiting to
4644 * acquire mmap_mutex.
4645 *
4646	 * If we find a different rb, ignore this event; the next
4647	 * iteration will no longer find it on the list. We have to
4648 * still restart the iteration to make sure we're not now
4649 * iterating the wrong list.
4650 */
Peter Zijlstrab69cf532014-03-14 10:50:33 +01004651 if (event->rb == rb)
4652 ring_buffer_attach(event, NULL);
4653
Peter Zijlstra9bb5d402013-06-04 10:44:21 +02004654 mutex_unlock(&event->mmap_mutex);
4655 put_event(event);
4656
4657 /*
4658 * Restart the iteration; either we're on the wrong list or
4659	 * we have destroyed its integrity by doing a deletion.
4660 */
4661 goto again;
4662 }
4663 rcu_read_unlock();
4664
4665 /*
4666	 * There could still be a few 0-ref events on the list; they'll
4667 * get cleaned up by free_event() -- they'll also still have their
4668 * ref on the rb and will free it whenever they are done with it.
4669 *
4670 * Aside from that, this buffer is 'fully' detached and unmapped,
4671 * undo the VM accounting.
4672 */
4673
4674 atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
4675 vma->vm_mm->pinned_vm -= mmap_locked;
4676 free_uid(mmap_user);
4677
Peter Zijlstrab69cf532014-03-14 10:50:33 +01004678out_put:
Peter Zijlstra9bb5d402013-06-04 10:44:21 +02004679 ring_buffer_put(rb); /* could be last */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004680}
4681
Alexey Dobriyanf0f37e22009-09-27 22:29:37 +04004682static const struct vm_operations_struct perf_mmap_vmops = {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004683 .open = perf_mmap_open,
Peter Zijlstra45bfb2e2015-01-14 14:18:11 +02004684	.close		= perf_mmap_close, /* non mergeable */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004685 .fault = perf_mmap_fault,
4686 .page_mkwrite = perf_mmap_fault,
4687};
4688
4689static int perf_mmap(struct file *file, struct vm_area_struct *vma)
4690{
4691 struct perf_event *event = file->private_data;
4692 unsigned long user_locked, user_lock_limit;
4693 struct user_struct *user = current_user();
4694 unsigned long locked, lock_limit;
Peter Zijlstra45bfb2e2015-01-14 14:18:11 +02004695 struct ring_buffer *rb = NULL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004696 unsigned long vma_size;
4697 unsigned long nr_pages;
Peter Zijlstra45bfb2e2015-01-14 14:18:11 +02004698 long user_extra = 0, extra = 0;
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02004699 int ret = 0, flags = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004700
Peter Zijlstrac7920612010-05-18 10:33:24 +02004701 /*
4702 * Don't allow mmap() of inherited per-task counters. This would
4703 * create a performance issue due to all children writing to the
Frederic Weisbecker76369132011-05-19 19:55:04 +02004704 * same rb.
Peter Zijlstrac7920612010-05-18 10:33:24 +02004705 */
4706 if (event->cpu == -1 && event->attr.inherit)
4707 return -EINVAL;
4708
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004709 if (!(vma->vm_flags & VM_SHARED))
4710 return -EINVAL;
4711
4712 vma_size = vma->vm_end - vma->vm_start;
Peter Zijlstra45bfb2e2015-01-14 14:18:11 +02004713
4714 if (vma->vm_pgoff == 0) {
4715 nr_pages = (vma_size / PAGE_SIZE) - 1;
4716 } else {
4717 /*
4718 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
4719	 * mapped; all subsequent mappings must have the same size
4720	 * and offset, and must lie above the normal perf buffer.
4721 */
4722 u64 aux_offset, aux_size;
4723
4724 if (!event->rb)
4725 return -EINVAL;
4726
4727 nr_pages = vma_size / PAGE_SIZE;
4728
4729 mutex_lock(&event->mmap_mutex);
4730 ret = -EINVAL;
4731
4732 rb = event->rb;
4733 if (!rb)
4734 goto aux_unlock;
4735
4736 aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
4737 aux_size = ACCESS_ONCE(rb->user_page->aux_size);
4738
4739 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
4740 goto aux_unlock;
4741
4742 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
4743 goto aux_unlock;
4744
4745 /* already mapped with a different offset */
4746 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
4747 goto aux_unlock;
4748
4749 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
4750 goto aux_unlock;
4751
4752 /* already mapped with a different size */
4753 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
4754 goto aux_unlock;
4755
4756 if (!is_power_of_2(nr_pages))
4757 goto aux_unlock;
4758
4759 if (!atomic_inc_not_zero(&rb->mmap_count))
4760 goto aux_unlock;
4761
4762 if (rb_has_aux(rb)) {
4763 atomic_inc(&rb->aux_mmap_count);
4764 ret = 0;
4765 goto unlock;
4766 }
4767
4768 atomic_set(&rb->aux_mmap_count, 1);
4769 user_extra = nr_pages;
4770
4771 goto accounting;
4772 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004773
4774 /*
Frederic Weisbecker76369132011-05-19 19:55:04 +02004775	 * If we have rb pages, ensure they're a power-of-two number, so we
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004776 * can do bitmasks instead of modulo.
4777 */
Kan Liang2ed11312015-03-02 02:14:26 -05004778 if (nr_pages != 0 && !is_power_of_2(nr_pages))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004779 return -EINVAL;
4780
4781 if (vma_size != PAGE_SIZE * (1 + nr_pages))
4782 return -EINVAL;
4783
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004784 WARN_ON_ONCE(event->ctx->parent_ctx);
Peter Zijlstra9bb5d402013-06-04 10:44:21 +02004785again:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004786 mutex_lock(&event->mmap_mutex);
Frederic Weisbecker76369132011-05-19 19:55:04 +02004787 if (event->rb) {
Peter Zijlstra9bb5d402013-06-04 10:44:21 +02004788 if (event->rb->nr_pages != nr_pages) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004789 ret = -EINVAL;
Peter Zijlstra9bb5d402013-06-04 10:44:21 +02004790 goto unlock;
4791 }
4792
4793 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
4794 /*
4795 * Raced against perf_mmap_close() through
4796 * perf_event_set_output(). Try again, hope for better
4797 * luck.
4798 */
4799 mutex_unlock(&event->mmap_mutex);
4800 goto again;
4801 }
4802
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004803 goto unlock;
4804 }
4805
4806 user_extra = nr_pages + 1;
Peter Zijlstra45bfb2e2015-01-14 14:18:11 +02004807
4808accounting:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004809 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
4810
4811 /*
4812 * Increase the limit linearly with more CPUs:
4813 */
4814 user_lock_limit *= num_online_cpus();
4815
4816 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
4817
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004818 if (user_locked > user_lock_limit)
4819 extra = user_locked - user_lock_limit;
4820
Jiri Slaby78d7d402010-03-05 13:42:54 -08004821 lock_limit = rlimit(RLIMIT_MEMLOCK);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004822 lock_limit >>= PAGE_SHIFT;
Christoph Lameterbc3e53f2011-10-31 17:07:30 -07004823 locked = vma->vm_mm->pinned_vm + extra;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004824
4825 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
4826 !capable(CAP_IPC_LOCK)) {
4827 ret = -EPERM;
4828 goto unlock;
4829 }
4830
Peter Zijlstra45bfb2e2015-01-14 14:18:11 +02004831 WARN_ON(!rb && event->rb);
Peter Zijlstra906010b2009-09-21 16:08:49 +02004832
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02004833 if (vma->vm_flags & VM_WRITE)
Frederic Weisbecker76369132011-05-19 19:55:04 +02004834 flags |= RING_BUFFER_WRITABLE;
Peter Zijlstrad57e34f2010-05-28 19:41:35 +02004835
Frederic Weisbecker76369132011-05-19 19:55:04 +02004836 if (!rb) {
Peter Zijlstra45bfb2e2015-01-14 14:18:11 +02004837 rb = rb_alloc(nr_pages,
4838 event->attr.watermark ? event->attr.wakeup_watermark : 0,
4839 event->cpu, flags);
4840
4841 if (!rb) {
4842 ret = -ENOMEM;
4843 goto unlock;
4844 }
4845
4846 atomic_set(&rb->mmap_count, 1);
4847 rb->mmap_user = get_current_user();
4848 rb->mmap_locked = extra;
4849
4850 ring_buffer_attach(event, rb);
4851
4852 perf_event_init_userpage(event);
4853 perf_event_update_userpage(event);
4854 } else {
Alexander Shishkin1a594132015-01-14 14:18:18 +02004855 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
4856 event->attr.aux_watermark, flags);
Peter Zijlstra45bfb2e2015-01-14 14:18:11 +02004857 if (!ret)
4858 rb->aux_mmap_locked = extra;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02004859 }
Peter Zijlstra26cb63a2013-05-28 10:55:48 +02004860
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004861unlock:
Peter Zijlstra45bfb2e2015-01-14 14:18:11 +02004862 if (!ret) {
4863 atomic_long_add(user_extra, &user->locked_vm);
4864 vma->vm_mm->pinned_vm += extra;
4865
Peter Zijlstraac9721f2010-05-27 12:54:41 +02004866 atomic_inc(&event->mmap_count);
Peter Zijlstra45bfb2e2015-01-14 14:18:11 +02004867 } else if (rb) {
4868 atomic_dec(&rb->mmap_count);
4869 }
4870aux_unlock:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004871 mutex_unlock(&event->mmap_mutex);
4872
Peter Zijlstra9bb5d402013-06-04 10:44:21 +02004873 /*
4874 * Since pinned accounting is per vm we cannot allow fork() to copy our
4875 * vma.
4876 */
Peter Zijlstra26cb63a2013-05-28 10:55:48 +02004877 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004878 vma->vm_ops = &perf_mmap_vmops;
4879
Andy Lutomirski1e0fb9e2014-10-24 15:58:10 -07004880 if (event->pmu->event_mapped)
4881 event->pmu->event_mapped(event);
4882
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004883 return ret;
4884}
4885
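/*
 * Userspace sketch of the two mappings perf_mmap() accepts: the normal
 * buffer (user page plus 2^n data pages) at offset 0, and an optional AUX
 * area whose offset and size are announced through the user page first
 * (n and m are placeholder page-order choices):
 *
 *	size_t page = sysconf(_SC_PAGESIZE);
 *	size_t len  = (1 + (1 << n)) * page;
 *	struct perf_event_mmap_page *pc;
 *	void *aux;
 *
 *	pc = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 *	pc->aux_offset = len;			// directly above the data pages
 *	pc->aux_size   = (1 << m) * page;	// power-of-two number of pages
 *	aux = mmap(NULL, pc->aux_size, PROT_READ | PROT_WRITE,
 *		   MAP_SHARED, fd, pc->aux_offset);
 */
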
4886static int perf_fasync(int fd, struct file *filp, int on)
4887{
Al Viro496ad9a2013-01-23 17:07:38 -05004888 struct inode *inode = file_inode(filp);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004889 struct perf_event *event = filp->private_data;
4890 int retval;
4891
Al Viro59551022016-01-22 15:40:57 -05004892 inode_lock(inode);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004893 retval = fasync_helper(fd, filp, on, &event->fasync);
Al Viro59551022016-01-22 15:40:57 -05004894 inode_unlock(inode);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004895
4896 if (retval < 0)
4897 return retval;
4898
4899 return 0;
4900}
4901
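/*
 * Userspace sketch for the fasync support above: request SIGIO on
 * counter wakeups instead of poll()ing the descriptor:
 *
 *	fcntl(fd, F_SETOWN, getpid());
 *	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC);
 *	fcntl(fd, F_SETSIG, SIGRTMIN + 1);	// optional: queued RT signal
 */
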
4902static const struct file_operations perf_fops = {
Arnd Bergmann3326c1c2010-03-23 19:09:33 +01004903 .llseek = no_llseek,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004904 .release = perf_release,
4905 .read = perf_read,
4906 .poll = perf_poll,
4907 .unlocked_ioctl = perf_ioctl,
Pawel Mollb3f20782014-06-13 16:03:32 +01004908 .compat_ioctl = perf_compat_ioctl,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004909 .mmap = perf_mmap,
4910 .fasync = perf_fasync,
4911};
4912
4913/*
4914 * Perf event wakeup
4915 *
4916 * If there's data, ensure we set the poll() state and publish everything
4917 * to user-space before waking everybody up.
4918 */
4919
Peter Zijlstrafed66e2cd2015-06-11 10:32:01 +02004920static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
4921{
4922 /* only the parent has fasync state */
4923 if (event->parent)
4924 event = event->parent;
4925 return &event->fasync;
4926}
4927
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004928void perf_event_wakeup(struct perf_event *event)
4929{
Peter Zijlstra10c6db12011-11-26 02:47:31 +01004930 ring_buffer_wakeup(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004931
4932 if (event->pending_kill) {
Peter Zijlstrafed66e2cd2015-06-11 10:32:01 +02004933 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004934 event->pending_kill = 0;
4935 }
4936}
4937
Peter Zijlstrae360adb2010-10-14 14:01:34 +08004938static void perf_pending_event(struct irq_work *entry)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004939{
4940 struct perf_event *event = container_of(entry,
4941 struct perf_event, pending);
Peter Zijlstrad5252112015-02-19 18:03:11 +01004942 int rctx;
4943
4944 rctx = perf_swevent_get_recursion_context();
4945 /*
4946 * If we 'fail' here, that's OK, it means recursion is already disabled
4947 * and we won't recurse 'further'.
4948 */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004949
4950 if (event->pending_disable) {
4951 event->pending_disable = 0;
Peter Zijlstrafae3fde2016-01-11 15:00:50 +01004952 perf_event_disable_local(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004953 }
4954
4955 if (event->pending_wakeup) {
4956 event->pending_wakeup = 0;
4957 perf_event_wakeup(event);
4958 }
Peter Zijlstrad5252112015-02-19 18:03:11 +01004959
4960 if (rctx >= 0)
4961 perf_swevent_put_recursion_context(rctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004962}
4963
Ingo Molnarcdd6c482009-09-21 12:02:48 +02004964/*
Zhang, Yanmin39447b32010-04-19 13:32:41 +08004965 * We assume that only KVM supports these callbacks.
4966 * Later on, we might change it to a list if there is
4967 * another virtualization implementation supporting the callbacks.
4968 */
4969struct perf_guest_info_callbacks *perf_guest_cbs;
4970
4971int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
4972{
4973 perf_guest_cbs = cbs;
4974 return 0;
4975}
4976EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
4977
4978int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
4979{
4980 perf_guest_cbs = NULL;
4981 return 0;
4982}
4983EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
4984
Jiri Olsa40189942012-08-07 15:20:37 +02004985static void
4986perf_output_sample_regs(struct perf_output_handle *handle,
4987 struct pt_regs *regs, u64 mask)
4988{
4989 int bit;
4990
4991 for_each_set_bit(bit, (const unsigned long *) &mask,
4992 sizeof(mask) * BITS_PER_BYTE) {
4993 u64 val;
4994
4995 val = perf_reg_value(regs, bit);
4996 perf_output_put(handle, val);
4997 }
4998}
4999
Stephane Eranian60e23642014-09-24 13:48:37 +02005000static void perf_sample_regs_user(struct perf_regs *regs_user,
Andy Lutomirski88a7c262015-01-04 10:36:19 -08005001 struct pt_regs *regs,
5002 struct pt_regs *regs_user_copy)
Jiri Olsa40189942012-08-07 15:20:37 +02005003{
Andy Lutomirski88a7c262015-01-04 10:36:19 -08005004 if (user_mode(regs)) {
5005 regs_user->abi = perf_reg_abi(current);
Peter Zijlstra25657112014-09-24 13:48:42 +02005006 regs_user->regs = regs;
Andy Lutomirski88a7c262015-01-04 10:36:19 -08005007 } else if (current->mm) {
5008 perf_get_regs_user(regs_user, regs, regs_user_copy);
Peter Zijlstra25657112014-09-24 13:48:42 +02005009 } else {
5010 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
5011 regs_user->regs = NULL;
Jiri Olsa40189942012-08-07 15:20:37 +02005012 }
5013}
5014
Stephane Eranian60e23642014-09-24 13:48:37 +02005015static void perf_sample_regs_intr(struct perf_regs *regs_intr,
5016 struct pt_regs *regs)
5017{
5018 regs_intr->regs = regs;
5019 regs_intr->abi = perf_reg_abi(current);
5020}
5021
5022
Jiri Olsac5ebced2012-08-07 15:20:40 +02005023/*
5024 * Get remaining task size from user stack pointer.
5025 *
5026	 * It'd be better to take the stack vma map and limit this more
5027	 * precisely, but there's no way to get it safely under interrupt,
5028	 * so we use TASK_SIZE as the limit.
5029 */
5030static u64 perf_ustack_task_size(struct pt_regs *regs)
5031{
5032 unsigned long addr = perf_user_stack_pointer(regs);
5033
5034 if (!addr || addr >= TASK_SIZE)
5035 return 0;
5036
5037 return TASK_SIZE - addr;
5038}
5039
5040static u16
5041perf_sample_ustack_size(u16 stack_size, u16 header_size,
5042 struct pt_regs *regs)
5043{
5044 u64 task_size;
5045
5046 /* No regs, no stack pointer, no dump. */
5047 if (!regs)
5048 return 0;
5049
5050 /*
5051	 * Check that the requested stack size fits into:
5052	 * - TASK_SIZE
5053	 * If it doesn't, we limit the size to TASK_SIZE.
5054	 *
5055	 * - the remaining sample size
5056	 * If it doesn't, we shrink the stack size so that it
5057	 * fits into the remaining sample size.
5058 */
5059
5060 task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
5061 stack_size = min(stack_size, (u16) task_size);
5062
5063 /* Current header size plus static size and dynamic size. */
5064 header_size += 2 * sizeof(u64);
5065
5066	 /* Does the stack dump still fit within the u16 sample size limit? */
5067 if ((u16) (header_size + stack_size) < header_size) {
5068 /*
5069 * If we overflow the maximum size for the sample,
5070	 * we shrink the stack dump size so that it fits.
5071 */
5072 stack_size = USHRT_MAX - header_size - sizeof(u64);
5073 stack_size = round_up(stack_size, sizeof(u64));
5074 }
5075
5076 return stack_size;
5077}
5078
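/*
 * Worked example of the clamping in perf_sample_ustack_size(), with
 * hypothetical numbers: header_size 8000 (after the two extra u64s) and
 * a requested stack_size of 60000 make (u16)(8000 + 60000) wrap, so
 * stack_size becomes USHRT_MAX - 8000 - 8 = 57527, rounded up to 57528,
 * and the total of 8000 + 57528 = 65528 still fits in the u16 size.
 */
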
5079static void
5080perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
5081 struct pt_regs *regs)
5082{
5083 /* Case of a kernel thread, nothing to dump */
5084 if (!regs) {
5085 u64 size = 0;
5086 perf_output_put(handle, size);
5087 } else {
5088 unsigned long sp;
5089 unsigned int rem;
5090 u64 dyn_size;
5091
5092 /*
5093 * We dump:
5094 * static size
5095	 * - the size requested by the user, or the best one we can fit
5096	 * into the sample max size
5097 * data
5098 * - user stack dump data
5099 * dynamic size
5100 * - the actual dumped size
5101 */
5102
5103 /* Static size. */
5104 perf_output_put(handle, dump_size);
5105
5106 /* Data. */
5107 sp = perf_user_stack_pointer(regs);
5108 rem = __output_copy_user(handle, (void *) sp, dump_size);
5109 dyn_size = dump_size - rem;
5110
5111 perf_output_skip(handle, rem);
5112
5113 /* Dynamic size. */
5114 perf_output_put(handle, dyn_size);
5115 }
5116}
5117
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02005118static void __perf_event_header__init_id(struct perf_event_header *header,
5119 struct perf_sample_data *data,
5120 struct perf_event *event)
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -02005121{
5122 u64 sample_type = event->attr.sample_type;
5123
5124 data->type = sample_type;
5125 header->size += event->id_header_size;
5126
5127 if (sample_type & PERF_SAMPLE_TID) {
5128 /* namespace issues */
5129 data->tid_entry.pid = perf_event_pid(event, current);
5130 data->tid_entry.tid = perf_event_tid(event, current);
5131 }
5132
5133 if (sample_type & PERF_SAMPLE_TIME)
Peter Zijlstra34f43922015-02-20 14:05:38 +01005134 data->time = perf_event_clock(event);
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -02005135
Adrian Hunterff3d5272013-08-27 11:23:07 +03005136 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -02005137 data->id = primary_event_id(event);
5138
5139 if (sample_type & PERF_SAMPLE_STREAM_ID)
5140 data->stream_id = event->id;
5141
5142 if (sample_type & PERF_SAMPLE_CPU) {
5143 data->cpu_entry.cpu = raw_smp_processor_id();
5144 data->cpu_entry.reserved = 0;
5145 }
5146}
5147
Frederic Weisbecker76369132011-05-19 19:55:04 +02005148void perf_event_header__init_id(struct perf_event_header *header,
5149 struct perf_sample_data *data,
5150 struct perf_event *event)
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02005151{
5152 if (event->attr.sample_id_all)
5153 __perf_event_header__init_id(header, data, event);
5154}
5155
5156static void __perf_event__output_id_sample(struct perf_output_handle *handle,
5157 struct perf_sample_data *data)
5158{
5159 u64 sample_type = data->type;
5160
5161 if (sample_type & PERF_SAMPLE_TID)
5162 perf_output_put(handle, data->tid_entry);
5163
5164 if (sample_type & PERF_SAMPLE_TIME)
5165 perf_output_put(handle, data->time);
5166
5167 if (sample_type & PERF_SAMPLE_ID)
5168 perf_output_put(handle, data->id);
5169
5170 if (sample_type & PERF_SAMPLE_STREAM_ID)
5171 perf_output_put(handle, data->stream_id);
5172
5173 if (sample_type & PERF_SAMPLE_CPU)
5174 perf_output_put(handle, data->cpu_entry);
Adrian Hunterff3d5272013-08-27 11:23:07 +03005175
5176 if (sample_type & PERF_SAMPLE_IDENTIFIER)
5177 perf_output_put(handle, data->id);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02005178}
5179
Frederic Weisbecker76369132011-05-19 19:55:04 +02005180void perf_event__output_id_sample(struct perf_event *event,
5181 struct perf_output_handle *handle,
5182 struct perf_sample_data *sample)
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02005183{
5184 if (event->attr.sample_id_all)
5185 __perf_event__output_id_sample(handle, sample);
5186}
5187
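/*
 * Sketch of the trailer that perf_event__output_id_sample() appends to
 * non-sample records when attr.sample_id_all is set; each field is
 * present only if the corresponding bit is in sample_type, in the order
 * emitted above:
 *
 *	{ u32 pid, tid;  }	// PERF_SAMPLE_TID
 *	{ u64 time;      }	// PERF_SAMPLE_TIME
 *	{ u64 id;        }	// PERF_SAMPLE_ID
 *	{ u64 stream_id; }	// PERF_SAMPLE_STREAM_ID
 *	{ u32 cpu, res;  }	// PERF_SAMPLE_CPU
 *	{ u64 id;        }	// PERF_SAMPLE_IDENTIFIER
 */
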
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005188static void perf_output_read_one(struct perf_output_handle *handle,
Stephane Eranianeed01522010-10-26 16:08:01 +02005189 struct perf_event *event,
5190 u64 enabled, u64 running)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005191{
5192 u64 read_format = event->attr.read_format;
5193 u64 values[4];
5194 int n = 0;
5195
Peter Zijlstrab5e58792010-05-21 14:43:12 +02005196 values[n++] = perf_event_count(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005197 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
Stephane Eranianeed01522010-10-26 16:08:01 +02005198 values[n++] = enabled +
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005199 atomic64_read(&event->child_total_time_enabled);
5200 }
5201 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
Stephane Eranianeed01522010-10-26 16:08:01 +02005202 values[n++] = running +
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005203 atomic64_read(&event->child_total_time_running);
5204 }
5205 if (read_format & PERF_FORMAT_ID)
5206 values[n++] = primary_event_id(event);
5207
Frederic Weisbecker76369132011-05-19 19:55:04 +02005208 __output_copy(handle, values, n * sizeof(u64));
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005209}
5210
5211/*
5212 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
5213 */
5214static void perf_output_read_group(struct perf_output_handle *handle,
Stephane Eranianeed01522010-10-26 16:08:01 +02005215 struct perf_event *event,
5216 u64 enabled, u64 running)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005217{
5218 struct perf_event *leader = event->group_leader, *sub;
5219 u64 read_format = event->attr.read_format;
5220 u64 values[5];
5221 int n = 0;
5222
5223 values[n++] = 1 + leader->nr_siblings;
5224
5225 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
Stephane Eranianeed01522010-10-26 16:08:01 +02005226 values[n++] = enabled;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005227
5228 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
Stephane Eranianeed01522010-10-26 16:08:01 +02005229 values[n++] = running;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005230
5231 if (leader != event)
5232 leader->pmu->read(leader);
5233
Peter Zijlstrab5e58792010-05-21 14:43:12 +02005234 values[n++] = perf_event_count(leader);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005235 if (read_format & PERF_FORMAT_ID)
5236 values[n++] = primary_event_id(leader);
5237
Frederic Weisbecker76369132011-05-19 19:55:04 +02005238 __output_copy(handle, values, n * sizeof(u64));
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005239
5240 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
5241 n = 0;
5242
Jiri Olsa6f5ab002012-10-15 20:13:45 +02005243 if ((sub != event) &&
5244 (sub->state == PERF_EVENT_STATE_ACTIVE))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005245 sub->pmu->read(sub);
5246
Peter Zijlstrab5e58792010-05-21 14:43:12 +02005247 values[n++] = perf_event_count(sub);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005248 if (read_format & PERF_FORMAT_ID)
5249 values[n++] = primary_event_id(sub);
5250
Frederic Weisbecker76369132011-05-19 19:55:04 +02005251 __output_copy(handle, values, n * sizeof(u64));
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005252 }
5253}
5254
Stephane Eranianeed01522010-10-26 16:08:01 +02005255#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
5256 PERF_FORMAT_TOTAL_TIME_RUNNING)
5257
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005258static void perf_output_read(struct perf_output_handle *handle,
5259 struct perf_event *event)
5260{
Peter Zijlstrae3f35412011-11-21 11:43:53 +01005261 u64 enabled = 0, running = 0, now;
Stephane Eranianeed01522010-10-26 16:08:01 +02005262 u64 read_format = event->attr.read_format;
5263
5264 /*
5265 * compute total_time_enabled, total_time_running
5266 * based on snapshot values taken when the event
5267 * was last scheduled in.
5268 *
5269	 * we cannot simply call update_context_time()
5270	 * because of a locking issue, as we are called in
5271 * NMI context
5272 */
Eric B Munsonc4794292011-06-23 16:34:38 -04005273 if (read_format & PERF_FORMAT_TOTAL_TIMES)
Peter Zijlstrae3f35412011-11-21 11:43:53 +01005274 calc_timer_values(event, &now, &enabled, &running);
Stephane Eranianeed01522010-10-26 16:08:01 +02005275
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005276 if (event->attr.read_format & PERF_FORMAT_GROUP)
Stephane Eranianeed01522010-10-26 16:08:01 +02005277 perf_output_read_group(handle, event, enabled, running);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005278 else
Stephane Eranianeed01522010-10-26 16:08:01 +02005279 perf_output_read_one(handle, event, enabled, running);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005280}
5281
5282void perf_output_sample(struct perf_output_handle *handle,
5283 struct perf_event_header *header,
5284 struct perf_sample_data *data,
5285 struct perf_event *event)
5286{
5287 u64 sample_type = data->type;
5288
5289 perf_output_put(handle, *header);
5290
Adrian Hunterff3d5272013-08-27 11:23:07 +03005291 if (sample_type & PERF_SAMPLE_IDENTIFIER)
5292 perf_output_put(handle, data->id);
5293
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005294 if (sample_type & PERF_SAMPLE_IP)
5295 perf_output_put(handle, data->ip);
5296
5297 if (sample_type & PERF_SAMPLE_TID)
5298 perf_output_put(handle, data->tid_entry);
5299
5300 if (sample_type & PERF_SAMPLE_TIME)
5301 perf_output_put(handle, data->time);
5302
5303 if (sample_type & PERF_SAMPLE_ADDR)
5304 perf_output_put(handle, data->addr);
5305
5306 if (sample_type & PERF_SAMPLE_ID)
5307 perf_output_put(handle, data->id);
5308
5309 if (sample_type & PERF_SAMPLE_STREAM_ID)
5310 perf_output_put(handle, data->stream_id);
5311
5312 if (sample_type & PERF_SAMPLE_CPU)
5313 perf_output_put(handle, data->cpu_entry);
5314
5315 if (sample_type & PERF_SAMPLE_PERIOD)
5316 perf_output_put(handle, data->period);
5317
5318 if (sample_type & PERF_SAMPLE_READ)
5319 perf_output_read(handle, event);
5320
5321 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5322 if (data->callchain) {
5323 int size = 1;
5324
5325 if (data->callchain)
5326 size += data->callchain->nr;
5327
5328 size *= sizeof(u64);
5329
Frederic Weisbecker76369132011-05-19 19:55:04 +02005330 __output_copy(handle, data->callchain, size);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005331 } else {
5332 u64 nr = 0;
5333 perf_output_put(handle, nr);
5334 }
5335 }
5336
5337 if (sample_type & PERF_SAMPLE_RAW) {
5338 if (data->raw) {
Alexei Starovoitovfa128e62015-10-20 20:02:33 -07005339 u32 raw_size = data->raw->size;
5340 u32 real_size = round_up(raw_size + sizeof(u32),
5341 sizeof(u64)) - sizeof(u32);
5342 u64 zero = 0;
5343
5344 perf_output_put(handle, real_size);
5345 __output_copy(handle, data->raw->data, raw_size);
5346 if (real_size - raw_size)
5347 __output_copy(handle, &zero, real_size - raw_size);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005348 } else {
5349 struct {
5350 u32 size;
5351 u32 data;
5352 } raw = {
5353 .size = sizeof(u32),
5354 .data = 0,
5355 };
5356 perf_output_put(handle, raw);
5357 }
5358 }
Peter Zijlstraa7ac67e2011-06-27 16:47:16 +02005359
Stephane Eranianbce38cd2012-02-09 23:20:51 +01005360 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5361 if (data->br_stack) {
5362 size_t size;
5363
5364 size = data->br_stack->nr
5365 * sizeof(struct perf_branch_entry);
5366
5367 perf_output_put(handle, data->br_stack->nr);
5368 perf_output_copy(handle, data->br_stack->entries, size);
5369 } else {
5370 /*
5371 * we always store at least the value of nr
5372 */
5373 u64 nr = 0;
5374 perf_output_put(handle, nr);
5375 }
5376 }
Jiri Olsa40189942012-08-07 15:20:37 +02005377
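	/*
	 * User register dump layout: one u64 ABI word followed by one u64
	 * for each bit set in attr.sample_regs_user.
	 */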
5378 if (sample_type & PERF_SAMPLE_REGS_USER) {
5379 u64 abi = data->regs_user.abi;
5380
5381 /*
5382	 * If there are no regs to dump, signal it by making the
5383	 * first u64 zero (PERF_SAMPLE_REGS_ABI_NONE).
5384 */
5385 perf_output_put(handle, abi);
5386
5387 if (abi) {
5388 u64 mask = event->attr.sample_regs_user;
5389 perf_output_sample_regs(handle,
5390 data->regs_user.regs,
5391 mask);
5392 }
5393 }
Jiri Olsac5ebced2012-08-07 15:20:40 +02005394
Peter Zijlstraa5cdd402013-07-16 17:09:07 +02005395 if (sample_type & PERF_SAMPLE_STACK_USER) {
Jiri Olsac5ebced2012-08-07 15:20:40 +02005396 perf_output_sample_ustack(handle,
5397 data->stack_user_size,
5398 data->regs_user.regs);
Peter Zijlstraa5cdd402013-07-16 17:09:07 +02005399 }
Andi Kleenc3feedf2013-01-24 16:10:28 +01005400
5401 if (sample_type & PERF_SAMPLE_WEIGHT)
5402 perf_output_put(handle, data->weight);
Stephane Eraniand6be9ad2013-01-24 16:10:31 +01005403
5404 if (sample_type & PERF_SAMPLE_DATA_SRC)
5405 perf_output_put(handle, data->data_src.val);
Peter Zijlstraa5cdd402013-07-16 17:09:07 +02005406
Andi Kleenfdfbbd02013-09-20 07:40:39 -07005407 if (sample_type & PERF_SAMPLE_TRANSACTION)
5408 perf_output_put(handle, data->txn);
5409
Stephane Eranian60e23642014-09-24 13:48:37 +02005410 if (sample_type & PERF_SAMPLE_REGS_INTR) {
5411 u64 abi = data->regs_intr.abi;
5412 /*
5413	 * If there are no regs to dump, signal it by making the
5414	 * first u64 zero (PERF_SAMPLE_REGS_ABI_NONE).
5415 */
5416 perf_output_put(handle, abi);
5417
5418 if (abi) {
5419 u64 mask = event->attr.sample_regs_intr;
5420
5421 perf_output_sample_regs(handle,
5422 data->regs_intr.regs,
5423 mask);
5424 }
5425 }
5426
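	/*
	 * Count based wakeups: rb->events accumulates the records written
	 * here; once attr.wakeup_events of them have gone out, rb->wakeup
	 * is bumped so that closing the output handle wakes the consumer.
	 */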
Peter Zijlstraa5cdd402013-07-16 17:09:07 +02005427 if (!event->attr.watermark) {
5428 int wakeup_events = event->attr.wakeup_events;
5429
5430 if (wakeup_events) {
5431 struct ring_buffer *rb = handle->rb;
5432 int events = local_inc_return(&rb->events);
5433
5434 if (events >= wakeup_events) {
5435 local_sub(wakeup_events, &rb->events);
5436 local_inc(&rb->wakeup);
5437 }
5438 }
5439 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005440}
5441
5442void perf_prepare_sample(struct perf_event_header *header,
5443 struct perf_sample_data *data,
5444 struct perf_event *event,
5445 struct pt_regs *regs)
5446{
5447 u64 sample_type = event->attr.sample_type;
5448
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005449 header->type = PERF_RECORD_SAMPLE;
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02005450 header->size = sizeof(*header) + event->header_size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005451
5452 header->misc = 0;
5453 header->misc |= perf_misc_flags(regs);
5454
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02005455 __perf_event_header__init_id(header, data, event);
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -02005456
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02005457 if (sample_type & PERF_SAMPLE_IP)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005458 data->ip = perf_instruction_pointer(regs);
5459
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005460 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5461 int size = 1;
5462
Andrew Vagine6dab5f2012-07-11 18:14:58 +04005463 data->callchain = perf_callchain(event, regs);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005464
5465 if (data->callchain)
5466 size += data->callchain->nr;
5467
5468 header->size += size * sizeof(u64);
5469 }
5470
5471 if (sample_type & PERF_SAMPLE_RAW) {
5472 int size = sizeof(u32);
5473
5474 if (data->raw)
5475 size += data->raw->size;
5476 else
5477 size += sizeof(u32);
5478
Alexei Starovoitovfa128e62015-10-20 20:02:33 -07005479 header->size += round_up(size, sizeof(u64));
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005480 }
Stephane Eranianbce38cd2012-02-09 23:20:51 +01005481
5482 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5483 int size = sizeof(u64); /* nr */
5484 if (data->br_stack) {
5485 size += data->br_stack->nr
5486 * sizeof(struct perf_branch_entry);
5487 }
5488 header->size += size;
5489 }
Jiri Olsa40189942012-08-07 15:20:37 +02005490
Peter Zijlstra25657112014-09-24 13:48:42 +02005491 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
Andy Lutomirski88a7c262015-01-04 10:36:19 -08005492 perf_sample_regs_user(&data->regs_user, regs,
5493 &data->regs_user_copy);
Peter Zijlstra25657112014-09-24 13:48:42 +02005494
Jiri Olsa40189942012-08-07 15:20:37 +02005495 if (sample_type & PERF_SAMPLE_REGS_USER) {
5496 /* regs dump ABI info */
5497 int size = sizeof(u64);
5498
Jiri Olsa40189942012-08-07 15:20:37 +02005499 if (data->regs_user.regs) {
5500 u64 mask = event->attr.sample_regs_user;
5501 size += hweight64(mask) * sizeof(u64);
5502 }
5503
5504 header->size += size;
5505 }
Jiri Olsac5ebced2012-08-07 15:20:40 +02005506
5507 if (sample_type & PERF_SAMPLE_STACK_USER) {
5508 /*
5509	 * Either the PERF_SAMPLE_STACK_USER bit needs to always be
5510	 * processed last, or an additional check must be added when a
5511	 * new sample type is introduced, because we could eat up the
5512	 * rest of the sample size.
5513 */
Jiri Olsac5ebced2012-08-07 15:20:40 +02005514 u16 stack_size = event->attr.sample_stack_user;
5515 u16 size = sizeof(u64);
5516
Jiri Olsac5ebced2012-08-07 15:20:40 +02005517 stack_size = perf_sample_ustack_size(stack_size, header->size,
Peter Zijlstra25657112014-09-24 13:48:42 +02005518 data->regs_user.regs);
Jiri Olsac5ebced2012-08-07 15:20:40 +02005519
5520 /*
5521 * If there is something to dump, add space for the dump
5522 * itself and for the field that tells the dynamic size,
5523	 * which is how many bytes were actually dumped.
5524 */
5525 if (stack_size)
5526 size += sizeof(u64) + stack_size;
5527
5528 data->stack_user_size = stack_size;
5529 header->size += size;
5530 }
Stephane Eranian60e23642014-09-24 13:48:37 +02005531
5532 if (sample_type & PERF_SAMPLE_REGS_INTR) {
5533 /* regs dump ABI info */
5534 int size = sizeof(u64);
5535
5536 perf_sample_regs_intr(&data->regs_intr, regs);
5537
5538 if (data->regs_intr.regs) {
5539 u64 mask = event->attr.sample_regs_intr;
5540
5541 size += hweight64(mask) * sizeof(u64);
5542 }
5543
5544 header->size += size;
5545 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005546}
5547
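/*
 * perf_prepare_sample() computes the record size, perf_output_begin()
 * reserves that many bytes, and perf_output_sample() fills them in; the
 * sizing logic and the output logic above must agree field for field.
 */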
Yan, Zheng21509082015-05-06 15:33:49 -04005548void perf_event_output(struct perf_event *event,
5549 struct perf_sample_data *data,
5550 struct pt_regs *regs)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005551{
5552 struct perf_output_handle handle;
5553 struct perf_event_header header;
5554
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02005555 /* protect the callchain buffers */
5556 rcu_read_lock();
5557
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005558 perf_prepare_sample(&header, data, event, regs);
5559
Peter Zijlstraa7ac67e2011-06-27 16:47:16 +02005560 if (perf_output_begin(&handle, event, header.size))
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02005561 goto exit;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005562
5563 perf_output_sample(&handle, &header, data, event);
5564
5565 perf_output_end(&handle);
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02005566
5567exit:
5568 rcu_read_unlock();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005569}
5570
5571/*
5572 * read event_id
5573 */
5574
5575struct perf_read_event {
5576 struct perf_event_header header;
5577
5578 u32 pid;
5579 u32 tid;
5580};
5581
5582static void
5583perf_event_read_event(struct perf_event *event,
5584 struct task_struct *task)
5585{
5586 struct perf_output_handle handle;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02005587 struct perf_sample_data sample;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005588 struct perf_read_event read_event = {
5589 .header = {
5590 .type = PERF_RECORD_READ,
5591 .misc = 0,
Arnaldo Carvalho de Meloc320c7b2010-10-20 12:50:11 -02005592 .size = sizeof(read_event) + event->read_size,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005593 },
5594 .pid = perf_event_pid(event, task),
5595 .tid = perf_event_tid(event, task),
5596 };
5597 int ret;
5598
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02005599 perf_event_header__init_id(&read_event.header, &sample, event);
Peter Zijlstraa7ac67e2011-06-27 16:47:16 +02005600 ret = perf_output_begin(&handle, event, read_event.header.size);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005601 if (ret)
5602 return;
5603
5604 perf_output_put(&handle, read_event);
5605 perf_output_read(&handle, event);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02005606 perf_event__output_id_sample(event, &handle, &sample);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005607
5608 perf_output_end(&handle);
5609}
5610
Jiri Olsa52d857a2013-05-06 18:27:18 +02005611typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
5612
5613static void
5614perf_event_aux_ctx(struct perf_event_context *ctx,
Jiri Olsa52d857a2013-05-06 18:27:18 +02005615 perf_event_aux_output_cb output,
5616 void *data)
5617{
5618 struct perf_event *event;
5619
5620 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
5621 if (event->state < PERF_EVENT_STATE_INACTIVE)
5622 continue;
5623 if (!event_filter_match(event))
5624 continue;
Jiri Olsa67516842013-07-09 18:56:31 +02005625 output(event, data);
Jiri Olsa52d857a2013-05-06 18:27:18 +02005626 }
5627}
5628
5629static void
Jiri Olsa4e93ad62015-11-04 16:00:05 +01005630perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data,
5631 struct perf_event_context *task_ctx)
5632{
5633 rcu_read_lock();
5634 preempt_disable();
5635 perf_event_aux_ctx(task_ctx, output, data);
5636 preempt_enable();
5637 rcu_read_unlock();
5638}
5639
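/*
 * Fan a side-band record (task, comm, mmap, switch, ...) out to every
 * context that might carry an interested event: either the given
 * task_ctx alone, or each PMU's per-CPU context plus the current
 * task's context for that PMU.
 */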
5640static void
Jiri Olsa67516842013-07-09 18:56:31 +02005641perf_event_aux(perf_event_aux_output_cb output, void *data,
Jiri Olsa52d857a2013-05-06 18:27:18 +02005642 struct perf_event_context *task_ctx)
5643{
5644 struct perf_cpu_context *cpuctx;
5645 struct perf_event_context *ctx;
5646 struct pmu *pmu;
5647 int ctxn;
5648
Jiri Olsa4e93ad62015-11-04 16:00:05 +01005649 /*
5650 * If we have task_ctx != NULL we only notify
5651 * the task context itself. The task_ctx is set
5652 * only for EXIT events before releasing task
5653 * context.
5654 */
5655 if (task_ctx) {
5656 perf_event_aux_task_ctx(output, data, task_ctx);
5657 return;
5658 }
5659
Jiri Olsa52d857a2013-05-06 18:27:18 +02005660 rcu_read_lock();
5661 list_for_each_entry_rcu(pmu, &pmus, entry) {
5662 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
5663 if (cpuctx->unique_pmu != pmu)
5664 goto next;
Jiri Olsa67516842013-07-09 18:56:31 +02005665 perf_event_aux_ctx(&cpuctx->ctx, output, data);
Jiri Olsa52d857a2013-05-06 18:27:18 +02005666 ctxn = pmu->task_ctx_nr;
5667 if (ctxn < 0)
5668 goto next;
5669 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
5670 if (ctx)
Jiri Olsa67516842013-07-09 18:56:31 +02005671 perf_event_aux_ctx(ctx, output, data);
Jiri Olsa52d857a2013-05-06 18:27:18 +02005672next:
5673 put_cpu_ptr(pmu->pmu_cpu_context);
5674 }
Jiri Olsa52d857a2013-05-06 18:27:18 +02005675 rcu_read_unlock();
5676}
5677
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005678/*
5679 * task tracking -- fork/exit
5680 *
Stephane Eranian13d7a242013-08-21 12:10:24 +02005681 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005682 */
5683
5684struct perf_task_event {
5685 struct task_struct *task;
5686 struct perf_event_context *task_ctx;
5687
5688 struct {
5689 struct perf_event_header header;
5690
5691 u32 pid;
5692 u32 ppid;
5693 u32 tid;
5694 u32 ptid;
5695 u64 time;
5696 } event_id;
5697};
5698
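/*
 * Fork/exit records are delivered to any event that asked for comm,
 * mmap or task side-band data, so tools tracking a task's address
 * space also learn when tasks come and go.
 */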
Jiri Olsa67516842013-07-09 18:56:31 +02005699static int perf_event_task_match(struct perf_event *event)
5700{
Stephane Eranian13d7a242013-08-21 12:10:24 +02005701 return event->attr.comm || event->attr.mmap ||
5702 event->attr.mmap2 || event->attr.mmap_data ||
5703 event->attr.task;
Jiri Olsa67516842013-07-09 18:56:31 +02005704}
5705
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005706static void perf_event_task_output(struct perf_event *event,
Jiri Olsa52d857a2013-05-06 18:27:18 +02005707 void *data)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005708{
Jiri Olsa52d857a2013-05-06 18:27:18 +02005709 struct perf_task_event *task_event = data;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005710 struct perf_output_handle handle;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02005711 struct perf_sample_data sample;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005712 struct task_struct *task = task_event->task;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02005713 int ret, size = task_event->event_id.header.size;
Mike Galbraith8bb39f92010-03-26 11:11:33 +01005714
Jiri Olsa67516842013-07-09 18:56:31 +02005715 if (!perf_event_task_match(event))
5716 return;
5717
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02005718 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005719
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02005720 ret = perf_output_begin(&handle, event,
Peter Zijlstraa7ac67e2011-06-27 16:47:16 +02005721 task_event->event_id.header.size);
Peter Zijlstraef607772010-05-18 10:50:41 +02005722 if (ret)
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02005723 goto out;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005724
5725 task_event->event_id.pid = perf_event_pid(event, task);
5726 task_event->event_id.ppid = perf_event_pid(event, current);
5727
5728 task_event->event_id.tid = perf_event_tid(event, task);
5729 task_event->event_id.ptid = perf_event_tid(event, current);
5730
Peter Zijlstra34f43922015-02-20 14:05:38 +01005731 task_event->event_id.time = perf_event_clock(event);
5732
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005733 perf_output_put(&handle, task_event->event_id);
5734
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02005735 perf_event__output_id_sample(event, &handle, &sample);
5736
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005737 perf_output_end(&handle);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02005738out:
5739 task_event->event_id.header.size = size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005740}
5741
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005742static void perf_event_task(struct task_struct *task,
5743 struct perf_event_context *task_ctx,
5744 int new)
5745{
5746 struct perf_task_event task_event;
5747
5748 if (!atomic_read(&nr_comm_events) &&
5749 !atomic_read(&nr_mmap_events) &&
5750 !atomic_read(&nr_task_events))
5751 return;
5752
5753 task_event = (struct perf_task_event){
5754 .task = task,
5755 .task_ctx = task_ctx,
5756 .event_id = {
5757 .header = {
5758 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
5759 .misc = 0,
5760 .size = sizeof(task_event.event_id),
5761 },
5762 /* .pid */
5763 /* .ppid */
5764 /* .tid */
5765 /* .ptid */
Peter Zijlstra34f43922015-02-20 14:05:38 +01005766 /* .time */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005767 },
5768 };
5769
Jiri Olsa67516842013-07-09 18:56:31 +02005770 perf_event_aux(perf_event_task_output,
Jiri Olsa52d857a2013-05-06 18:27:18 +02005771 &task_event,
5772 task_ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005773}
5774
5775void perf_event_fork(struct task_struct *task)
5776{
5777 perf_event_task(task, NULL, 1);
5778}
5779
5780/*
5781 * comm tracking
5782 */
5783
5784struct perf_comm_event {
5785 struct task_struct *task;
5786 char *comm;
5787 int comm_size;
5788
5789 struct {
5790 struct perf_event_header header;
5791
5792 u32 pid;
5793 u32 tid;
5794 } event_id;
5795};
5796
Jiri Olsa67516842013-07-09 18:56:31 +02005797static int perf_event_comm_match(struct perf_event *event)
5798{
5799 return event->attr.comm;
5800}
5801
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005802static void perf_event_comm_output(struct perf_event *event,
Jiri Olsa52d857a2013-05-06 18:27:18 +02005803 void *data)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005804{
Jiri Olsa52d857a2013-05-06 18:27:18 +02005805 struct perf_comm_event *comm_event = data;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005806 struct perf_output_handle handle;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02005807 struct perf_sample_data sample;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005808 int size = comm_event->event_id.header.size;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02005809 int ret;
5810
Jiri Olsa67516842013-07-09 18:56:31 +02005811 if (!perf_event_comm_match(event))
5812 return;
5813
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02005814 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
5815 ret = perf_output_begin(&handle, event,
Peter Zijlstraa7ac67e2011-06-27 16:47:16 +02005816 comm_event->event_id.header.size);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005817
5818 if (ret)
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02005819 goto out;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005820
5821 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
5822 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
5823
5824 perf_output_put(&handle, comm_event->event_id);
Frederic Weisbecker76369132011-05-19 19:55:04 +02005825 __output_copy(&handle, comm_event->comm,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005826 comm_event->comm_size);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02005827
5828 perf_event__output_id_sample(event, &handle, &sample);
5829
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005830 perf_output_end(&handle);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02005831out:
5832 comm_event->event_id.header.size = size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005833}
5834
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005835static void perf_event_comm_event(struct perf_comm_event *comm_event)
5836{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005837 char comm[TASK_COMM_LEN];
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005838 unsigned int size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005839
5840 memset(comm, 0, sizeof(comm));
Márton Németh96b02d72009-11-21 23:10:15 +01005841 strlcpy(comm, comm_event->task->comm, sizeof(comm));
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005842 size = ALIGN(strlen(comm)+1, sizeof(u64));
5843
5844 comm_event->comm = comm;
5845 comm_event->comm_size = size;
5846
5847 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02005848
Jiri Olsa67516842013-07-09 18:56:31 +02005849 perf_event_aux(perf_event_comm_output,
Jiri Olsa52d857a2013-05-06 18:27:18 +02005850 comm_event,
5851 NULL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005852}
5853
Adrian Hunter82b89772014-05-28 11:45:04 +03005854void perf_event_comm(struct task_struct *task, bool exec)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005855{
5856 struct perf_comm_event comm_event;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005857
5858 if (!atomic_read(&nr_comm_events))
5859 return;
5860
5861 comm_event = (struct perf_comm_event){
5862 .task = task,
5863 /* .comm */
5864 /* .comm_size */
5865 .event_id = {
5866 .header = {
5867 .type = PERF_RECORD_COMM,
Adrian Hunter82b89772014-05-28 11:45:04 +03005868 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005869 /* .size */
5870 },
5871 /* .pid */
5872 /* .tid */
5873 },
5874 };
5875
5876 perf_event_comm_event(&comm_event);
5877}
5878
5879/*
5880 * mmap tracking
5881 */
5882
5883struct perf_mmap_event {
5884 struct vm_area_struct *vma;
5885
5886 const char *file_name;
5887 int file_size;
Stephane Eranian13d7a242013-08-21 12:10:24 +02005888 int maj, min;
5889 u64 ino;
5890 u64 ino_generation;
Peter Zijlstraf972eb62014-05-19 15:13:47 -04005891 u32 prot, flags;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005892
5893 struct {
5894 struct perf_event_header header;
5895
5896 u32 pid;
5897 u32 tid;
5898 u64 start;
5899 u64 len;
5900 u64 pgoff;
5901 } event_id;
5902};
5903
Jiri Olsa67516842013-07-09 18:56:31 +02005904static int perf_event_mmap_match(struct perf_event *event,
5905 void *data)
5906{
5907 struct perf_mmap_event *mmap_event = data;
5908 struct vm_area_struct *vma = mmap_event->vma;
5909 int executable = vma->vm_flags & VM_EXEC;
5910
5911 return (!executable && event->attr.mmap_data) ||
Stephane Eranian13d7a242013-08-21 12:10:24 +02005912 (executable && (event->attr.mmap || event->attr.mmap2));
Jiri Olsa67516842013-07-09 18:56:31 +02005913}
5914
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005915static void perf_event_mmap_output(struct perf_event *event,
Jiri Olsa52d857a2013-05-06 18:27:18 +02005916 void *data)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005917{
Jiri Olsa52d857a2013-05-06 18:27:18 +02005918 struct perf_mmap_event *mmap_event = data;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005919 struct perf_output_handle handle;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02005920 struct perf_sample_data sample;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005921 int size = mmap_event->event_id.header.size;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02005922 int ret;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005923
Jiri Olsa67516842013-07-09 18:56:31 +02005924 if (!perf_event_mmap_match(event, data))
5925 return;
5926
Stephane Eranian13d7a242013-08-21 12:10:24 +02005927 if (event->attr.mmap2) {
5928 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
5929 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
5930 mmap_event->event_id.header.size += sizeof(mmap_event->min);
5931 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
Arnaldo Carvalho de Melod008d522013-09-10 10:24:05 -03005932 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
Peter Zijlstraf972eb62014-05-19 15:13:47 -04005933 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
5934 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
Stephane Eranian13d7a242013-08-21 12:10:24 +02005935 }
5936
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02005937 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
5938 ret = perf_output_begin(&handle, event,
Peter Zijlstraa7ac67e2011-06-27 16:47:16 +02005939 mmap_event->event_id.header.size);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005940 if (ret)
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02005941 goto out;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005942
5943 mmap_event->event_id.pid = perf_event_pid(event, current);
5944 mmap_event->event_id.tid = perf_event_tid(event, current);
5945
5946 perf_output_put(&handle, mmap_event->event_id);
Stephane Eranian13d7a242013-08-21 12:10:24 +02005947
5948 if (event->attr.mmap2) {
5949 perf_output_put(&handle, mmap_event->maj);
5950 perf_output_put(&handle, mmap_event->min);
5951 perf_output_put(&handle, mmap_event->ino);
5952 perf_output_put(&handle, mmap_event->ino_generation);
Peter Zijlstraf972eb62014-05-19 15:13:47 -04005953 perf_output_put(&handle, mmap_event->prot);
5954 perf_output_put(&handle, mmap_event->flags);
Stephane Eranian13d7a242013-08-21 12:10:24 +02005955 }
5956
Frederic Weisbecker76369132011-05-19 19:55:04 +02005957 __output_copy(&handle, mmap_event->file_name,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005958 mmap_event->file_size);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02005959
5960 perf_event__output_id_sample(event, &handle, &sample);
5961
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005962 perf_output_end(&handle);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02005963out:
5964 mmap_event->event_id.header.size = size;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005965}
5966
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005967static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5968{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005969 struct vm_area_struct *vma = mmap_event->vma;
5970 struct file *file = vma->vm_file;
Stephane Eranian13d7a242013-08-21 12:10:24 +02005971 int maj = 0, min = 0;
5972 u64 ino = 0, gen = 0;
Peter Zijlstraf972eb62014-05-19 15:13:47 -04005973 u32 prot = 0, flags = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005974 unsigned int size;
5975 char tmp[16];
5976 char *buf = NULL;
Peter Zijlstra2c42cfb2013-10-17 00:06:46 +02005977 char *name;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005978
5979 if (file) {
Stephane Eranian13d7a242013-08-21 12:10:24 +02005980 struct inode *inode;
5981 dev_t dev;
Oleg Nesterov3ea2f2b2013-10-16 22:10:04 +02005982
Peter Zijlstra2c42cfb2013-10-17 00:06:46 +02005983 buf = kmalloc(PATH_MAX, GFP_KERNEL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005984 if (!buf) {
Oleg Nesterovc7e548b2013-10-17 20:24:17 +02005985 name = "//enomem";
5986 goto cpy_name;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005987 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005988 /*
Oleg Nesterov3ea2f2b2013-10-16 22:10:04 +02005989 * d_path() works from the end of the rb backwards, so we
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005990 * need to add enough zero bytes after the string to handle
5991 * the 64bit alignment we do later.
5992 */
Miklos Szeredi9bf39ab2015-06-19 10:29:13 +02005993 name = file_path(file, buf, PATH_MAX - sizeof(u64));
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005994 if (IS_ERR(name)) {
Oleg Nesterovc7e548b2013-10-17 20:24:17 +02005995 name = "//toolong";
5996 goto cpy_name;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02005997 }
Stephane Eranian13d7a242013-08-21 12:10:24 +02005998 inode = file_inode(vma->vm_file);
5999 dev = inode->i_sb->s_dev;
6000 ino = inode->i_ino;
6001 gen = inode->i_generation;
6002 maj = MAJOR(dev);
6003 min = MINOR(dev);
Peter Zijlstraf972eb62014-05-19 15:13:47 -04006004
6005 if (vma->vm_flags & VM_READ)
6006 prot |= PROT_READ;
6007 if (vma->vm_flags & VM_WRITE)
6008 prot |= PROT_WRITE;
6009 if (vma->vm_flags & VM_EXEC)
6010 prot |= PROT_EXEC;
6011
6012 if (vma->vm_flags & VM_MAYSHARE)
6013 flags = MAP_SHARED;
6014 else
6015 flags = MAP_PRIVATE;
6016
6017 if (vma->vm_flags & VM_DENYWRITE)
6018 flags |= MAP_DENYWRITE;
6019 if (vma->vm_flags & VM_MAYEXEC)
6020 flags |= MAP_EXECUTABLE;
6021 if (vma->vm_flags & VM_LOCKED)
6022 flags |= MAP_LOCKED;
6023 if (vma->vm_flags & VM_HUGETLB)
6024 flags |= MAP_HUGETLB;
6025
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006026 goto got_name;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006027 } else {
Jiri Olsafbe26ab2014-07-14 17:57:19 +02006028 if (vma->vm_ops && vma->vm_ops->name) {
6029 name = (char *) vma->vm_ops->name(vma);
6030 if (name)
6031 goto cpy_name;
6032 }
6033
Peter Zijlstra2c42cfb2013-10-17 00:06:46 +02006034 name = (char *)arch_vma_name(vma);
Oleg Nesterovc7e548b2013-10-17 20:24:17 +02006035 if (name)
6036 goto cpy_name;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006037
Oleg Nesterov32c5fb72013-10-16 22:09:45 +02006038 if (vma->vm_start <= vma->vm_mm->start_brk &&
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006039 vma->vm_end >= vma->vm_mm->brk) {
Oleg Nesterovc7e548b2013-10-17 20:24:17 +02006040 name = "[heap]";
6041 goto cpy_name;
Oleg Nesterov32c5fb72013-10-16 22:09:45 +02006042 }
6043 if (vma->vm_start <= vma->vm_mm->start_stack &&
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006044 vma->vm_end >= vma->vm_mm->start_stack) {
Oleg Nesterovc7e548b2013-10-17 20:24:17 +02006045 name = "[stack]";
6046 goto cpy_name;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006047 }
6048
Oleg Nesterovc7e548b2013-10-17 20:24:17 +02006049 name = "//anon";
6050 goto cpy_name;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006051 }
6052
Oleg Nesterovc7e548b2013-10-17 20:24:17 +02006053cpy_name:
6054 strlcpy(tmp, name, sizeof(tmp));
6055 name = tmp;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006056got_name:
Peter Zijlstra2c42cfb2013-10-17 00:06:46 +02006057 /*
6058 * Since our buffer works in 8 byte units we need to align our string
6059 * size to a multiple of 8. However, we must guarantee the tail end is
6060 * zero'd out to avoid leaking random bits to userspace.
6061 */
6062 size = strlen(name)+1;
6063 while (!IS_ALIGNED(size, sizeof(u64)))
6064 name[size++] = '\0';
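	/*
	 * e.g. "[stack]" is 7 chars + NUL = 8 and needs no padding, while
	 * "//anon" is 6 + 1 = 7 and gets one extra NUL to reach 8 bytes.
	 */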
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006065
6066 mmap_event->file_name = name;
6067 mmap_event->file_size = size;
Stephane Eranian13d7a242013-08-21 12:10:24 +02006068 mmap_event->maj = maj;
6069 mmap_event->min = min;
6070 mmap_event->ino = ino;
6071 mmap_event->ino_generation = gen;
Peter Zijlstraf972eb62014-05-19 15:13:47 -04006072 mmap_event->prot = prot;
6073 mmap_event->flags = flags;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006074
Stephane Eranian2fe85422013-01-24 16:10:39 +01006075 if (!(vma->vm_flags & VM_EXEC))
6076 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
6077
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006078 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
6079
Jiri Olsa67516842013-07-09 18:56:31 +02006080 perf_event_aux(perf_event_mmap_output,
Jiri Olsa52d857a2013-05-06 18:27:18 +02006081 mmap_event,
6082 NULL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006083
6084 kfree(buf);
6085}
6086
Eric B Munson3af9e852010-05-18 15:30:49 +01006087void perf_event_mmap(struct vm_area_struct *vma)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006088{
6089 struct perf_mmap_event mmap_event;
6090
6091 if (!atomic_read(&nr_mmap_events))
6092 return;
6093
6094 mmap_event = (struct perf_mmap_event){
6095 .vma = vma,
6096 /* .file_name */
6097 /* .file_size */
6098 .event_id = {
6099 .header = {
6100 .type = PERF_RECORD_MMAP,
Zhang, Yanmin39447b32010-04-19 13:32:41 +08006101 .misc = PERF_RECORD_MISC_USER,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006102 /* .size */
6103 },
6104 /* .pid */
6105 /* .tid */
6106 .start = vma->vm_start,
6107 .len = vma->vm_end - vma->vm_start,
Peter Zijlstra3a0304e2010-02-26 10:33:41 +01006108 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006109 },
Stephane Eranian13d7a242013-08-21 12:10:24 +02006110 /* .maj (attr_mmap2 only) */
6111 /* .min (attr_mmap2 only) */
6112 /* .ino (attr_mmap2 only) */
6113 /* .ino_generation (attr_mmap2 only) */
Peter Zijlstraf972eb62014-05-19 15:13:47 -04006114 /* .prot (attr_mmap2 only) */
6115 /* .flags (attr_mmap2 only) */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006116 };
6117
6118 perf_event_mmap_event(&mmap_event);
6119}
6120
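/*
 * Emit a PERF_RECORD_AUX record: @size bytes of AUX data were written at
 * offset @head in the AUX area, with @flags describing conditions such
 * as truncation or overwrite.
 */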
Alexander Shishkin68db7e92015-01-14 14:18:15 +02006121void perf_event_aux_event(struct perf_event *event, unsigned long head,
6122 unsigned long size, u64 flags)
6123{
6124 struct perf_output_handle handle;
6125 struct perf_sample_data sample;
6126 struct perf_aux_event {
6127 struct perf_event_header header;
6128 u64 offset;
6129 u64 size;
6130 u64 flags;
6131 } rec = {
6132 .header = {
6133 .type = PERF_RECORD_AUX,
6134 .misc = 0,
6135 .size = sizeof(rec),
6136 },
6137 .offset = head,
6138 .size = size,
6139 .flags = flags,
6140 };
6141 int ret;
6142
6143 perf_event_header__init_id(&rec.header, &sample, event);
6144 ret = perf_output_begin(&handle, event, rec.header.size);
6145
6146 if (ret)
6147 return;
6148
6149 perf_output_put(&handle, rec);
6150 perf_event__output_id_sample(event, &handle, &sample);
6151
6152 perf_output_end(&handle);
6153}
6154
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006155/*
Kan Liangf38b0db2015-05-10 15:13:14 -04006156 * Lost/dropped samples logging
6157 */
6158void perf_log_lost_samples(struct perf_event *event, u64 lost)
6159{
6160 struct perf_output_handle handle;
6161 struct perf_sample_data sample;
6162 int ret;
6163
6164 struct {
6165 struct perf_event_header header;
6166 u64 lost;
6167 } lost_samples_event = {
6168 .header = {
6169 .type = PERF_RECORD_LOST_SAMPLES,
6170 .misc = 0,
6171 .size = sizeof(lost_samples_event),
6172 },
6173 .lost = lost,
6174 };
6175
6176 perf_event_header__init_id(&lost_samples_event.header, &sample, event);
6177
6178 ret = perf_output_begin(&handle, event,
6179 lost_samples_event.header.size);
6180 if (ret)
6181 return;
6182
6183 perf_output_put(&handle, lost_samples_event);
6184 perf_event__output_id_sample(event, &handle, &sample);
6185 perf_output_end(&handle);
6186}
6187
6188/*
Adrian Hunter45ac1402015-07-21 12:44:02 +03006189 * context_switch tracking
6190 */
6191
6192struct perf_switch_event {
6193 struct task_struct *task;
6194 struct task_struct *next_prev;
6195
6196 struct {
6197 struct perf_event_header header;
6198 u32 next_prev_pid;
6199 u32 next_prev_tid;
6200 } event_id;
6201};
6202
6203static int perf_event_switch_match(struct perf_event *event)
6204{
6205 return event->attr.context_switch;
6206}
6207
6208static void perf_event_switch_output(struct perf_event *event, void *data)
6209{
6210 struct perf_switch_event *se = data;
6211 struct perf_output_handle handle;
6212 struct perf_sample_data sample;
6213 int ret;
6214
6215 if (!perf_event_switch_match(event))
6216 return;
6217
6218 /* Only CPU-wide events are allowed to see next/prev pid/tid */
6219 if (event->ctx->task) {
6220 se->event_id.header.type = PERF_RECORD_SWITCH;
6221 se->event_id.header.size = sizeof(se->event_id.header);
6222 } else {
6223 se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
6224 se->event_id.header.size = sizeof(se->event_id);
6225 se->event_id.next_prev_pid =
6226 perf_event_pid(event, se->next_prev);
6227 se->event_id.next_prev_tid =
6228 perf_event_tid(event, se->next_prev);
6229 }
6230
6231 perf_event_header__init_id(&se->event_id.header, &sample, event);
6232
6233 ret = perf_output_begin(&handle, event, se->event_id.header.size);
6234 if (ret)
6235 return;
6236
6237 if (event->ctx->task)
6238 perf_output_put(&handle, se->event_id.header);
6239 else
6240 perf_output_put(&handle, se->event_id);
6241
6242 perf_event__output_id_sample(event, &handle, &sample);
6243
6244 perf_output_end(&handle);
6245}
6246
6247static void perf_event_switch(struct task_struct *task,
6248 struct task_struct *next_prev, bool sched_in)
6249{
6250 struct perf_switch_event switch_event;
6251
6252 /* N.B. caller checks nr_switch_events != 0 */
6253
6254 switch_event = (struct perf_switch_event){
6255 .task = task,
6256 .next_prev = next_prev,
6257 .event_id = {
6258 .header = {
6259 /* .type */
6260 .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
6261 /* .size */
6262 },
6263 /* .next_prev_pid */
6264 /* .next_prev_tid */
6265 },
6266 };
6267
6268 perf_event_aux(perf_event_switch_output,
6269 &switch_event,
6270 NULL);
6271}
6272
6273/*
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006274 * IRQ throttle logging
6275 */
6276
6277static void perf_log_throttle(struct perf_event *event, int enable)
6278{
6279 struct perf_output_handle handle;
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02006280 struct perf_sample_data sample;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006281 int ret;
6282
6283 struct {
6284 struct perf_event_header header;
6285 u64 time;
6286 u64 id;
6287 u64 stream_id;
6288 } throttle_event = {
6289 .header = {
6290 .type = PERF_RECORD_THROTTLE,
6291 .misc = 0,
6292 .size = sizeof(throttle_event),
6293 },
Peter Zijlstra34f43922015-02-20 14:05:38 +01006294 .time = perf_event_clock(event),
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006295 .id = primary_event_id(event),
6296 .stream_id = event->id,
6297 };
6298
6299 if (enable)
6300 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
6301
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02006302 perf_event_header__init_id(&throttle_event.header, &sample, event);
6303
6304 ret = perf_output_begin(&handle, event,
Peter Zijlstraa7ac67e2011-06-27 16:47:16 +02006305 throttle_event.header.size);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006306 if (ret)
6307 return;
6308
6309 perf_output_put(&handle, throttle_event);
Arnaldo Carvalho de Meloc980d102010-12-04 23:02:20 -02006310 perf_event__output_id_sample(event, &handle, &sample);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006311 perf_output_end(&handle);
6312}
6313
Alexander Shishkinec0d7722015-01-14 14:18:23 +02006314static void perf_log_itrace_start(struct perf_event *event)
6315{
6316 struct perf_output_handle handle;
6317 struct perf_sample_data sample;
6318 struct perf_aux_event {
6319 struct perf_event_header header;
6320 u32 pid;
6321 u32 tid;
6322 } rec;
6323 int ret;
6324
6325 if (event->parent)
6326 event = event->parent;
6327
6328 if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
6329 event->hw.itrace_started)
6330 return;
6331
Alexander Shishkinec0d7722015-01-14 14:18:23 +02006332 rec.header.type = PERF_RECORD_ITRACE_START;
6333 rec.header.misc = 0;
6334 rec.header.size = sizeof(rec);
6335 rec.pid = perf_event_pid(event, current);
6336 rec.tid = perf_event_tid(event, current);
6337
6338 perf_event_header__init_id(&rec.header, &sample, event);
6339 ret = perf_output_begin(&handle, event, rec.header.size);
6340
6341 if (ret)
6342 return;
6343
6344 perf_output_put(&handle, rec);
6345 perf_event__output_id_sample(event, &handle, &sample);
6346
6347 perf_output_end(&handle);
6348}
6349
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006350/*
6351 * Generic event overflow handling, sampling.
6352 */
6353
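/*
 * Throttling: perf_throttled_seq advances from the timer tick, so
 * hwc->interrupts effectively counts overflow interrupts within the
 * current tick.  Crossing max_samples_per_tick marks the event with
 * MAX_INTERRUPTS and logs a throttle record; the tick path unthrottles
 * it again later.
 */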
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02006354static int __perf_event_overflow(struct perf_event *event,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006355 int throttle, struct perf_sample_data *data,
6356 struct pt_regs *regs)
6357{
6358 int events = atomic_read(&event->event_limit);
6359 struct hw_perf_event *hwc = &event->hw;
Stephane Eraniane050e3f2012-01-26 17:03:19 +01006360 u64 seq;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006361 int ret = 0;
6362
Peter Zijlstra96398822010-11-24 18:55:29 +01006363 /*
6364 * Non-sampling counters might still use the PMI to fold short
6365	 * hardware counters; ignore those.
6366 */
6367 if (unlikely(!is_sampling_event(event)))
6368 return 0;
6369
Stephane Eraniane050e3f2012-01-26 17:03:19 +01006370 seq = __this_cpu_read(perf_throttled_seq);
6371 if (seq != hwc->interrupts_seq) {
6372 hwc->interrupts_seq = seq;
6373 hwc->interrupts = 1;
6374 } else {
6375 hwc->interrupts++;
6376 if (unlikely(throttle
6377 && hwc->interrupts >= max_samples_per_tick)) {
6378 __this_cpu_inc(perf_throttled_count);
Peter Zijlstra163ec432011-02-16 11:22:34 +01006379 hwc->interrupts = MAX_INTERRUPTS;
6380 perf_log_throttle(event, 0);
Frederic Weisbeckerd84153d2013-07-23 02:31:05 +02006381 tick_nohz_full_kick();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006382 ret = 1;
6383 }
Stephane Eraniane050e3f2012-01-26 17:03:19 +01006384 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006385
6386 if (event->attr.freq) {
6387 u64 now = perf_clock();
Peter Zijlstraabd50712010-01-26 18:50:16 +01006388 s64 delta = now - hwc->freq_time_stamp;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006389
Peter Zijlstraabd50712010-01-26 18:50:16 +01006390 hwc->freq_time_stamp = now;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006391
Peter Zijlstraabd50712010-01-26 18:50:16 +01006392 if (delta > 0 && delta < 2*TICK_NSEC)
Stephane Eranianf39d47f2012-02-07 14:39:57 +01006393 perf_adjust_period(event, delta, hwc->last_period, true);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006394 }
6395
6396 /*
6397 * XXX event_limit might not quite work as expected on inherited
6398 * events
6399 */
6400
6401 event->pending_kill = POLL_IN;
6402 if (events && atomic_dec_and_test(&event->event_limit)) {
6403 ret = 1;
6404 event->pending_kill = POLL_HUP;
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02006405 event->pending_disable = 1;
6406 irq_work_queue(&event->pending);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006407 }
6408
Peter Zijlstra453f19e2009-11-20 22:19:43 +01006409 if (event->overflow_handler)
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02006410 event->overflow_handler(event, data, regs);
Peter Zijlstra453f19e2009-11-20 22:19:43 +01006411 else
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02006412 perf_event_output(event, data, regs);
Peter Zijlstra453f19e2009-11-20 22:19:43 +01006413
Peter Zijlstrafed66e2cd2015-06-11 10:32:01 +02006414 if (*perf_event_fasync(event) && event->pending_kill) {
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02006415 event->pending_wakeup = 1;
6416 irq_work_queue(&event->pending);
Peter Zijlstraf506b3d2011-05-26 17:02:53 +02006417 }
6418
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006419 return ret;
6420}
6421
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02006422int perf_event_overflow(struct perf_event *event,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006423 struct perf_sample_data *data,
6424 struct pt_regs *regs)
6425{
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02006426 return __perf_event_overflow(event, 1, data, regs);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006427}
6428
6429/*
6430 * Generic software event infrastructure
6431 */
6432
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006433struct swevent_htable {
6434 struct swevent_hlist *swevent_hlist;
6435 struct mutex hlist_mutex;
6436 int hlist_refcount;
6437
6438 /* Recursion avoidance in each contexts */
6439 int recursion[PERF_NR_CONTEXTS];
6440};
6441
6442static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
6443
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006444/*
6445 * We directly increment event->count and keep a second value in
6446 * event->hw.period_left to count intervals. This period event
6447 * is kept in the range [-sample_period, 0] so that we can use the
6448 * sign as a trigger.
6449 */
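/*
 * For example, with sample_period == 100 and period_left driven up to
 * 220 by perf_swevent_event(): nr = (100 + 220) / 100 = 3 overflows are
 * reported and period_left becomes 220 - 300 = -80, i.e. 80 more events
 * until the next overflow.
 */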
6450
Jiri Olsaab573842013-05-01 17:25:44 +02006451u64 perf_swevent_set_period(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006452{
6453 struct hw_perf_event *hwc = &event->hw;
6454 u64 period = hwc->last_period;
6455 u64 nr, offset;
6456 s64 old, val;
6457
6458 hwc->last_period = hwc->sample_period;
6459
6460again:
Peter Zijlstrae7850592010-05-21 14:43:08 +02006461 old = val = local64_read(&hwc->period_left);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006462 if (val < 0)
6463 return 0;
6464
6465 nr = div64_u64(period + val, period);
6466 offset = nr * period;
6467 val -= offset;
Peter Zijlstrae7850592010-05-21 14:43:08 +02006468 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006469 goto again;
6470
6471 return nr;
6472}
6473
Peter Zijlstra0cff7842009-11-20 22:19:44 +01006474static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02006475 struct perf_sample_data *data,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006476 struct pt_regs *regs)
6477{
6478 struct hw_perf_event *hwc = &event->hw;
6479 int throttle = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006480
Peter Zijlstra0cff7842009-11-20 22:19:44 +01006481 if (!overflow)
6482 overflow = perf_swevent_set_period(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006483
6484 if (hwc->interrupts == MAX_INTERRUPTS)
6485 return;
6486
6487 for (; overflow; overflow--) {
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02006488 if (__perf_event_overflow(event, throttle,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006489 data, regs)) {
6490 /*
6491 * We inhibit the overflow from happening when
6492 * hwc->interrupts == MAX_INTERRUPTS.
6493 */
6494 break;
6495 }
6496 throttle = 1;
6497 }
6498}
6499
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02006500static void perf_swevent_event(struct perf_event *event, u64 nr,
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02006501 struct perf_sample_data *data,
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006502 struct pt_regs *regs)
6503{
6504 struct hw_perf_event *hwc = &event->hw;
6505
Peter Zijlstrae7850592010-05-21 14:43:08 +02006506 local64_add(nr, &event->count);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006507
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006508 if (!regs)
6509 return;
6510
Franck Bui-Huu6c7e5502010-11-23 16:21:43 +01006511 if (!is_sampling_event(event))
Peter Zijlstra0cff7842009-11-20 22:19:44 +01006512 return;
6513
Andrew Vagin5d81e5c2011-11-07 15:54:12 +03006514 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
6515 data->period = nr;
6516 return perf_swevent_overflow(event, 1, data, regs);
6517 } else
6518 data->period = event->hw.last_period;
6519
Peter Zijlstra0cff7842009-11-20 22:19:44 +01006520 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02006521 return perf_swevent_overflow(event, 1, data, regs);
Peter Zijlstra0cff7842009-11-20 22:19:44 +01006522
Peter Zijlstrae7850592010-05-21 14:43:08 +02006523 if (local64_add_negative(nr, &hwc->period_left))
Peter Zijlstra0cff7842009-11-20 22:19:44 +01006524 return;
6525
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02006526 perf_swevent_overflow(event, 0, data, regs);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006527}
6528
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01006529static int perf_exclude_event(struct perf_event *event,
6530 struct pt_regs *regs)
6531{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02006532 if (event->hw.state & PERF_HES_STOPPED)
Frederic Weisbecker91b2f482011-03-07 21:27:08 +01006533 return 1;
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02006534
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01006535 if (regs) {
6536 if (event->attr.exclude_user && user_mode(regs))
6537 return 1;
6538
6539 if (event->attr.exclude_kernel && !user_mode(regs))
6540 return 1;
6541 }
6542
6543 return 0;
6544}
6545
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006546static int perf_swevent_match(struct perf_event *event,
6547 enum perf_type_id type,
Li Zefan6fb29152009-10-15 11:21:42 +08006548 u32 event_id,
6549 struct perf_sample_data *data,
6550 struct pt_regs *regs)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006551{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006552 if (event->attr.type != type)
6553 return 0;
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01006554
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006555 if (event->attr.config != event_id)
6556 return 0;
6557
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01006558 if (perf_exclude_event(event, regs))
6559 return 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006560
6561 return 1;
6562}
6563
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006564static inline u64 swevent_hash(u64 type, u32 event_id)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006565{
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006566 u64 val = event_id | (type << 32);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006567
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006568 return hash_64(val, SWEVENT_HLIST_BITS);
6569}
6570
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02006571static inline struct hlist_head *
6572__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006573{
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02006574 u64 hash = swevent_hash(type, event_id);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006575
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02006576 return &hlist->heads[hash];
6577}
6578
6579/* For the read side: events when they trigger */
6580static inline struct hlist_head *
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006581find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02006582{
6583 struct swevent_hlist *hlist;
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006584
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006585 hlist = rcu_dereference(swhash->swevent_hlist);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006586 if (!hlist)
6587 return NULL;
6588
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02006589 return __find_swevent_head(hlist, type, event_id);
6590}
6591
6592/* For the event head insertion and removal in the hlist */
6593static inline struct hlist_head *
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006594find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02006595{
6596 struct swevent_hlist *hlist;
6597 u32 event_id = event->attr.config;
6598 u64 type = event->attr.type;
6599
6600 /*
6601 * Event scheduling is always serialized against hlist allocation
6602	 * and release, which makes the protected version suitable here.
6603 * The context lock guarantees that.
6604 */
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006605 hlist = rcu_dereference_protected(swhash->swevent_hlist,
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02006606 lockdep_is_held(&event->ctx->lock));
6607 if (!hlist)
6608 return NULL;
6609
6610 return __find_swevent_head(hlist, type, event_id);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006611}
6612
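/*
 * Deliver one software event occurrence: hash (type, event_id) into the
 * per-CPU swevent hlist and feed every matching event on that bucket.
 */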
6613static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02006614 u64 nr,
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006615 struct perf_sample_data *data,
6616 struct pt_regs *regs)
6617{
Christoph Lameter4a32fea2014-08-17 12:30:27 -05006618 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006619 struct perf_event *event;
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006620 struct hlist_head *head;
6621
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006622 rcu_read_lock();
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006623 head = find_swevent_head_rcu(swhash, type, event_id);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006624 if (!head)
6625 goto end;
6626
Sasha Levinb67bfe02013-02-27 17:06:00 -08006627 hlist_for_each_entry_rcu(event, head, hlist_entry) {
Li Zefan6fb29152009-10-15 11:21:42 +08006628 if (perf_swevent_match(event, type, event_id, data, regs))
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02006629 perf_swevent_event(event, nr, data, regs);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006630 }
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006631end:
6632 rcu_read_unlock();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006633}
6634
Peter Zijlstra (Intel)86038c52014-12-16 12:47:34 +01006635DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
6636
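/*
 * Software events can fire from task, softirq, hardirq and NMI context.
 * The per-CPU recursion[] flags, one per context level, ensure that an
 * event raised while we are already handling one at the same level is
 * dropped instead of recursing.
 */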
Peter Zijlstra4ed7c922009-11-23 11:37:29 +01006637int perf_swevent_get_recursion_context(void)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006638{
Christoph Lameter4a32fea2014-08-17 12:30:27 -05006639 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01006640
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006641 return get_recursion_context(swhash->recursion);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006642}
Ingo Molnar645e8cc2009-11-22 12:20:19 +01006643EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006644
Jesper Juhlfa9f90b2010-11-28 21:39:34 +01006645inline void perf_swevent_put_recursion_context(int rctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006646{
Christoph Lameter4a32fea2014-08-17 12:30:27 -05006647 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02006648
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006649 put_recursion_context(swhash->recursion, rctx);
Frederic Weisbeckerce71b9d2009-11-22 05:26:55 +01006650}
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006651
Peter Zijlstra (Intel)86038c52014-12-16 12:47:34 +01006652void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006653{
Ingo Molnara4234bf2009-11-23 10:57:59 +01006654 struct perf_sample_data data;
Peter Zijlstra (Intel)86038c52014-12-16 12:47:34 +01006655
6656 if (WARN_ON_ONCE(!regs))
6657 return;
6658
6659 perf_sample_data_init(&data, addr, 0);
6660 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
6661}
6662
6663void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
6664{
Peter Zijlstra4ed7c922009-11-23 11:37:29 +01006665 int rctx;
6666
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02006667 preempt_disable_notrace();
Peter Zijlstra4ed7c922009-11-23 11:37:29 +01006668 rctx = perf_swevent_get_recursion_context();
Peter Zijlstra (Intel)86038c52014-12-16 12:47:34 +01006669 if (unlikely(rctx < 0))
6670 goto fail;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006671
Peter Zijlstra (Intel)86038c52014-12-16 12:47:34 +01006672 ___perf_sw_event(event_id, nr, regs, addr);
Peter Zijlstra4ed7c922009-11-23 11:37:29 +01006673
6674 perf_swevent_put_recursion_context(rctx);
Peter Zijlstra (Intel)86038c52014-12-16 12:47:34 +01006675fail:
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02006676 preempt_enable_notrace();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006677}
6678
6679static void perf_swevent_read(struct perf_event *event)
6680{
6681}
6682
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02006683static int perf_swevent_add(struct perf_event *event, int flags)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006684{
Christoph Lameter4a32fea2014-08-17 12:30:27 -05006685 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006686 struct hw_perf_event *hwc = &event->hw;
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006687 struct hlist_head *head;
6688
Franck Bui-Huu6c7e5502010-11-23 16:21:43 +01006689 if (is_sampling_event(event)) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006690 hwc->last_period = hwc->sample_period;
6691 perf_swevent_set_period(event);
6692 }
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006693
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02006694 hwc->state = !(flags & PERF_EF_START);
6695
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006696 head = find_swevent_head(swhash, event);
Peter Zijlstra12ca6ad2015-12-15 13:49:05 +01006697 if (WARN_ON_ONCE(!head))
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006698 return -EINVAL;
6699
6700 hlist_add_head_rcu(&event->hlist_entry, head);
Shaohua Li6a694a62015-02-05 15:55:32 -08006701 perf_event_update_userpage(event);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006702
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006703 return 0;
6704}
6705
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02006706static void perf_swevent_del(struct perf_event *event, int flags)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006707{
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006708 hlist_del_rcu(&event->hlist_entry);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006709}
6710
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02006711static void perf_swevent_start(struct perf_event *event, int flags)
Peter Zijlstrac6df8d52010-06-03 11:21:20 +02006712{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02006713 event->hw.state = 0;
Peter Zijlstrac6df8d52010-06-03 11:21:20 +02006714}
6715
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02006716static void perf_swevent_stop(struct perf_event *event, int flags)
Peter Zijlstrac6df8d52010-06-03 11:21:20 +02006717{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02006718 event->hw.state = PERF_HES_STOPPED;
Peter Zijlstrac6df8d52010-06-03 11:21:20 +02006719}
6720
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02006721/* Deref the hlist from the update side */
6722static inline struct swevent_hlist *
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006723swevent_hlist_deref(struct swevent_htable *swhash)
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02006724{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006725 return rcu_dereference_protected(swhash->swevent_hlist,
6726 lockdep_is_held(&swhash->hlist_mutex));
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02006727}
6728
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006729static void swevent_hlist_release(struct swevent_htable *swhash)
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006730{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006731 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006732
Frederic Weisbecker49f135e2010-05-20 10:17:46 +02006733 if (!hlist)
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006734 return;
6735
Andreea-Cristina Bernat70691d42014-08-22 16:26:05 +03006736 RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
Lai Jiangshanfa4bbc42011-03-18 12:08:29 +08006737 kfree_rcu(hlist, rcu_head);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006738}
6739
6740static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
6741{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006742 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006743
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006744 mutex_lock(&swhash->hlist_mutex);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006745
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006746 if (!--swhash->hlist_refcount)
6747 swevent_hlist_release(swhash);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006748
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006749 mutex_unlock(&swhash->hlist_mutex);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006750}
6751
6752static void swevent_hlist_put(struct perf_event *event)
6753{
6754 int cpu;
6755
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006756 for_each_possible_cpu(cpu)
6757 swevent_hlist_put_cpu(event, cpu);
6758}
6759
6760static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
6761{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006762 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006763 int err = 0;
6764
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006765 mutex_lock(&swhash->hlist_mutex);
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006766 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006767 struct swevent_hlist *hlist;
6768
6769 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
6770 if (!hlist) {
6771 err = -ENOMEM;
6772 goto exit;
6773 }
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006774 rcu_assign_pointer(swhash->swevent_hlist, hlist);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006775 }
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006776 swhash->hlist_refcount++;
Peter Zijlstra9ed60602010-06-11 17:36:35 +02006777exit:
Peter Zijlstrab28ab832010-09-06 14:48:15 +02006778 mutex_unlock(&swhash->hlist_mutex);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006779
6780 return err;
6781}
6782
6783static int swevent_hlist_get(struct perf_event *event)
6784{
6785 int err;
6786 int cpu, failed_cpu;
6787
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006788 get_online_cpus();
6789 for_each_possible_cpu(cpu) {
6790 err = swevent_hlist_get_cpu(event, cpu);
6791 if (err) {
6792 failed_cpu = cpu;
6793 goto fail;
6794 }
6795 }
6796 put_online_cpus();
6797
6798 return 0;
Peter Zijlstra9ed60602010-06-11 17:36:35 +02006799fail:
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006800 for_each_possible_cpu(cpu) {
6801 if (cpu == failed_cpu)
6802 break;
6803 swevent_hlist_put_cpu(event, cpu);
6804 }
6805
6806 put_online_cpus();
6807 return err;
6808}
6809
Ingo Molnarc5905af2012-02-24 08:31:31 +01006810struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
Frederic Weisbecker95476b62010-04-14 23:42:18 +02006811
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006812static void sw_perf_event_destroy(struct perf_event *event)
6813{
6814 u64 event_id = event->attr.config;
6815
6816 WARN_ON(event->parent);
6817
Ingo Molnarc5905af2012-02-24 08:31:31 +01006818 static_key_slow_dec(&perf_swevent_enabled[event_id]);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006819 swevent_hlist_put(event);
6820}
6821
6822static int perf_swevent_init(struct perf_event *event)
6823{
Tommi Rantala8176cce2013-04-13 22:49:14 +03006824 u64 event_id = event->attr.config;
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006825
6826 if (event->attr.type != PERF_TYPE_SOFTWARE)
6827 return -ENOENT;
6828
Stephane Eranian2481c5f2012-02-09 23:20:59 +01006829 /*
6830 * no branch sampling for software events
6831 */
6832 if (has_branch_stack(event))
6833 return -EOPNOTSUPP;
6834
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006835 switch (event_id) {
6836 case PERF_COUNT_SW_CPU_CLOCK:
6837 case PERF_COUNT_SW_TASK_CLOCK:
6838 return -ENOENT;
6839
6840 default:
6841 break;
6842 }
6843
Dan Carpenterce677832010-10-24 21:50:42 +02006844 if (event_id >= PERF_COUNT_SW_MAX)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006845 return -ENOENT;
6846
6847 if (!event->parent) {
6848 int err;
6849
6850 err = swevent_hlist_get(event);
6851 if (err)
6852 return err;
6853
Ingo Molnarc5905af2012-02-24 08:31:31 +01006854 static_key_slow_inc(&perf_swevent_enabled[event_id]);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006855 event->destroy = sw_perf_event_destroy;
6856 }
6857
6858 return 0;
6859}
6860
6861static struct pmu perf_swevent = {
Peter Zijlstra89a1e182010-09-07 17:34:50 +02006862 .task_ctx_nr = perf_sw_context,
6863
Peter Zijlstra34f43922015-02-20 14:05:38 +01006864 .capabilities = PERF_PMU_CAP_NO_NMI,
6865
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006866 .event_init = perf_swevent_init,
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02006867 .add = perf_swevent_add,
6868 .del = perf_swevent_del,
6869 .start = perf_swevent_start,
6870 .stop = perf_swevent_stop,
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02006871 .read = perf_swevent_read,
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02006872};
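/*
 * Example (user space, illustrative only): a minimal sketch of how the
 * software PMU above is reached through sys_perf_event_open(). Only the
 * uapi definitions in <linux/perf_event.h> are assumed; error handling is
 * kept to the bare minimum.
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	int fd;

	memset(&attr, 0, sizeof(attr));		/* a short copy must stay harmless */
	attr.size           = sizeof(attr);
	attr.type           = PERF_TYPE_SOFTWARE;
	attr.config         = PERF_COUNT_SW_PAGE_FAULTS;
	attr.disabled       = 1;
	attr.exclude_kernel = 1;

	/* count for the calling thread, on any CPU */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		return 1;

	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	/* ... workload under measurement ... */
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

	if (read(fd, &count, sizeof(count)) == (ssize_t)sizeof(count))
		printf("page faults: %llu\n", (unsigned long long)count);

	close(fd);
	return 0;
}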
Frederic Weisbecker95476b62010-04-14 23:42:18 +02006873
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006874#ifdef CONFIG_EVENT_TRACING
6875
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02006876static int perf_tp_filter_match(struct perf_event *event,
Frederic Weisbecker95476b62010-04-14 23:42:18 +02006877 struct perf_sample_data *data)
6878{
6879 void *record = data->raw->data;
6880
Peter Zijlstrab71b4372015-11-02 10:50:51 +01006881 /* only top level events have filters set */
6882 if (event->parent)
6883 event = event->parent;
6884
Frederic Weisbecker95476b62010-04-14 23:42:18 +02006885 if (likely(!event->filter) || filter_match_preds(event->filter, record))
6886 return 1;
6887 return 0;
6888}
6889
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02006890static int perf_tp_event_match(struct perf_event *event,
6891 struct perf_sample_data *data,
6892 struct pt_regs *regs)
6893{
Frederic Weisbeckera0f7d0f2011-03-07 21:27:09 +01006894 if (event->hw.state & PERF_HES_STOPPED)
6895 return 0;
Peter Zijlstra580d6072010-05-20 20:54:31 +02006896 /*
6897 * All tracepoints are from kernel-space.
6898 */
6899 if (event->attr.exclude_kernel)
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02006900 return 0;
6901
6902 if (!perf_tp_filter_match(event, data))
6903 return 0;
6904
6905 return 1;
6906}
6907
6908void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
Andrew Vagine6dab5f2012-07-11 18:14:58 +04006909 struct pt_regs *regs, struct hlist_head *head, int rctx,
6910 struct task_struct *task)
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02006911{
6912 struct perf_sample_data data;
6913 struct perf_event *event;
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02006914
6915 struct perf_raw_record raw = {
6916 .size = entry_size,
6917 .data = record,
6918 };
6919
Robert Richterfd0d0002012-04-02 20:19:08 +02006920 perf_sample_data_init(&data, addr, 0);
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02006921 data.raw = &raw;
6922
Sasha Levinb67bfe02013-02-27 17:06:00 -08006923 hlist_for_each_entry_rcu(event, head, hlist_entry) {
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02006924 if (perf_tp_event_match(event, &data, regs))
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02006925 perf_swevent_event(event, count, &data, regs);
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02006926 }
Peter Zijlstraecc55f82010-05-21 15:11:34 +02006927
Andrew Vagine6dab5f2012-07-11 18:14:58 +04006928 /*
6929 * If we got specified a target task, also iterate its context and
6930 * deliver this event there too.
6931 */
6932 if (task && task != current) {
6933 struct perf_event_context *ctx;
6934 struct trace_entry *entry = record;
6935
6936 rcu_read_lock();
6937 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
6938 if (!ctx)
6939 goto unlock;
6940
6941 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
6942 if (event->attr.type != PERF_TYPE_TRACEPOINT)
6943 continue;
6944 if (event->attr.config != entry->type)
6945 continue;
6946 if (perf_tp_event_match(event, &data, regs))
6947 perf_swevent_event(event, count, &data, regs);
6948 }
6949unlock:
6950 rcu_read_unlock();
6951 }
6952
Peter Zijlstraecc55f82010-05-21 15:11:34 +02006953 perf_swevent_put_recursion_context(rctx);
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02006954}
6955EXPORT_SYMBOL_GPL(perf_tp_event);
6956
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006957static void tp_perf_event_destroy(struct perf_event *event)
6958{
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02006959 perf_trace_destroy(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006960}
6961
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006962static int perf_tp_event_init(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006963{
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02006964 int err;
6965
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006966 if (event->attr.type != PERF_TYPE_TRACEPOINT)
6967 return -ENOENT;
6968
Stephane Eranian2481c5f2012-02-09 23:20:59 +01006969 /*
6970 * no branch sampling for tracepoint events
6971 */
6972 if (has_branch_stack(event))
6973 return -EOPNOTSUPP;
6974
Peter Zijlstra1c024eca2010-05-19 14:02:22 +02006975 err = perf_trace_init(event);
6976 if (err)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006977 return err;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006978
6979 event->destroy = tp_perf_event_destroy;
6980
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006981 return 0;
6982}
6983
6984static struct pmu perf_tracepoint = {
Peter Zijlstra89a1e182010-09-07 17:34:50 +02006985 .task_ctx_nr = perf_sw_context,
6986
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006987 .event_init = perf_tp_event_init,
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02006988 .add = perf_trace_add,
6989 .del = perf_trace_del,
6990 .start = perf_swevent_start,
6991 .stop = perf_swevent_stop,
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006992 .read = perf_swevent_read,
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02006993};
6994
6995static inline void perf_tp_register(void)
6996{
Peter Zijlstra2e80a822010-11-17 23:17:36 +01006997 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006998}
Li Zefan6fb29152009-10-15 11:21:42 +08006999
7000static int perf_event_set_filter(struct perf_event *event, void __user *arg)
7001{
7002 char *filter_str;
7003 int ret;
7004
7005 if (event->attr.type != PERF_TYPE_TRACEPOINT)
7006 return -EINVAL;
7007
7008 filter_str = strndup_user(arg, PAGE_SIZE);
7009 if (IS_ERR(filter_str))
7010 return PTR_ERR(filter_str);
7011
7012 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
7013
7014 kfree(filter_str);
7015 return ret;
7016}
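/*
 * Example (user space, illustrative only): a sketch of how a tracepoint event
 * reaches perf_tp_event_init() and how a filter string is handed to
 * perf_event_set_filter() above. It assumes tracefs is mounted under
 * /sys/kernel/debug/tracing and typically needs root; the sched_switch
 * tracepoint and the filter expression are just examples.
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
	struct perf_event_attr attr;
	long id = -1;
	FILE *f;
	int fd;

	/* numeric event id exported by ftrace */
	f = fopen("/sys/kernel/debug/tracing/events/sched/sched_switch/id", "r");
	if (!f)
		return 1;
	if (fscanf(f, "%ld", &id) != 1) {
		fclose(f);
		return 1;
	}
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.size          = sizeof(attr);
	attr.type          = PERF_TYPE_TRACEPOINT;
	attr.config        = id;
	attr.sample_period = 1;

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		return 1;

	/* forwarded to ftrace_profile_set_filter() for this tracepoint */
	if (ioctl(fd, PERF_EVENT_IOC_SET_FILTER, "prev_pid != 0"))
		perror("PERF_EVENT_IOC_SET_FILTER");

	close(fd);
	return 0;
}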
7017
7018static void perf_event_free_filter(struct perf_event *event)
7019{
7020 ftrace_profile_free_filter(event);
7021}
7022
Alexei Starovoitov25415172015-03-25 12:49:20 -07007023static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
7024{
7025 struct bpf_prog *prog;
7026
7027 if (event->attr.type != PERF_TYPE_TRACEPOINT)
7028 return -EINVAL;
7029
7030 if (event->tp_event->prog)
7031 return -EEXIST;
7032
Wang Nan04a22fa2015-07-01 02:13:50 +00007033 if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE))
7034 /* bpf programs can only be attached to u/kprobes */
Alexei Starovoitov25415172015-03-25 12:49:20 -07007035 return -EINVAL;
7036
7037 prog = bpf_prog_get(prog_fd);
7038 if (IS_ERR(prog))
7039 return PTR_ERR(prog);
7040
Linus Torvalds6c373ca2015-04-15 09:00:47 -07007041 if (prog->type != BPF_PROG_TYPE_KPROBE) {
Alexei Starovoitov25415172015-03-25 12:49:20 -07007042 /* valid fd, but invalid bpf program type */
7043 bpf_prog_put(prog);
7044 return -EINVAL;
7045 }
7046
7047 event->tp_event->prog = prog;
7048
7049 return 0;
7050}
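/*
 * Example (user space, illustrative only): once an event fd backed by a
 * kprobe/uprobe tracepoint exists (see the TRACE_EVENT_FL_UKPROBE check above)
 * and a BPF_PROG_TYPE_KPROBE program has been loaded elsewhere (prog_fd), the
 * attach path above is driven by a single ioctl. Both fds are assumed to be
 * obtained by the caller.
 */
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <stdio.h>

static int attach_bpf_to_event(int event_fd, int prog_fd)
{
	if (ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd)) {
		perror("PERF_EVENT_IOC_SET_BPF");
		return -1;
	}
	return 0;
}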
7051
7052static void perf_event_free_bpf_prog(struct perf_event *event)
7053{
7054 struct bpf_prog *prog;
7055
7056 if (!event->tp_event)
7057 return;
7058
7059 prog = event->tp_event->prog;
7060 if (prog) {
7061 event->tp_event->prog = NULL;
7062 bpf_prog_put(prog);
7063 }
7064}
7065
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007066#else
Li Zefan6fb29152009-10-15 11:21:42 +08007067
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007068static inline void perf_tp_register(void)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007069{
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007070}
Li Zefan6fb29152009-10-15 11:21:42 +08007071
7072static int perf_event_set_filter(struct perf_event *event, void __user *arg)
7073{
7074 return -ENOENT;
7075}
7076
7077static void perf_event_free_filter(struct perf_event *event)
7078{
7079}
7080
Alexei Starovoitov25415172015-03-25 12:49:20 -07007081static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
7082{
7083 return -ENOENT;
7084}
7085
7086static void perf_event_free_bpf_prog(struct perf_event *event)
7087{
7088}
Li Zefan07b139c2009-12-21 14:27:35 +08007089#endif /* CONFIG_EVENT_TRACING */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007090
Frederic Weisbecker24f1e32c2009-09-09 19:22:48 +02007091#ifdef CONFIG_HAVE_HW_BREAKPOINT
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01007092void perf_bp_event(struct perf_event *bp, void *data)
Frederic Weisbecker24f1e32c2009-09-09 19:22:48 +02007093{
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01007094 struct perf_sample_data sample;
7095 struct pt_regs *regs = data;
7096
Robert Richterfd0d0002012-04-02 20:19:08 +02007097 perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
Frederic Weisbeckerf5ffe022009-11-23 15:42:34 +01007098
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02007099 if (!bp->hw.state && !perf_exclude_event(bp, regs))
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02007100 perf_swevent_event(bp, 1, &sample, regs);
Frederic Weisbecker24f1e32c2009-09-09 19:22:48 +02007101}
7102#endif
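/*
 * Example (user space, illustrative only): the path above is hit by events of
 * type PERF_TYPE_BREAKPOINT. A data-write watchpoint on a 4-byte variable can
 * be requested roughly like this (constants from <linux/hw_breakpoint.h>;
 * watched_var is a placeholder):
 *
 *	attr.type    = PERF_TYPE_BREAKPOINT;
 *	attr.bp_type = HW_BREAKPOINT_W;
 *	attr.bp_addr = (unsigned long)&watched_var;
 *	attr.bp_len  = HW_BREAKPOINT_LEN_4;
 *	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 */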
7103
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007104/*
7105 * hrtimer based swevent callback
7106 */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007107
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007108static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007109{
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007110 enum hrtimer_restart ret = HRTIMER_RESTART;
7111 struct perf_sample_data data;
7112 struct pt_regs *regs;
7113 struct perf_event *event;
7114 u64 period;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007115
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007116 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
Peter Zijlstraba3dd362011-02-15 12:41:46 +01007117
7118 if (event->state != PERF_EVENT_STATE_ACTIVE)
7119 return HRTIMER_NORESTART;
7120
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007121 event->pmu->read(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007122
Robert Richterfd0d0002012-04-02 20:19:08 +02007123 perf_sample_data_init(&data, 0, event->hw.last_period);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007124 regs = get_irq_regs();
7125
7126 if (regs && !perf_exclude_event(event, regs)) {
Paul E. McKenney77aeeeb2011-11-10 16:02:52 -08007127 if (!(event->attr.exclude_idle && is_idle_task(current)))
Robert Richter33b07b82012-04-05 18:24:43 +02007128 if (__perf_event_overflow(event, 1, &data, regs))
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007129 ret = HRTIMER_NORESTART;
7130 }
7131
7132 period = max_t(u64, 10000, event->hw.sample_period);
7133 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
7134
7135 return ret;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007136}
7137
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007138static void perf_swevent_start_hrtimer(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007139{
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007140 struct hw_perf_event *hwc = &event->hw;
Franck Bui-Huu5d508e82010-11-23 16:21:45 +01007141 s64 period;
7142
7143 if (!is_sampling_event(event))
7144 return;
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007145
Franck Bui-Huu5d508e82010-11-23 16:21:45 +01007146 period = local64_read(&hwc->period_left);
7147 if (period) {
7148 if (period < 0)
7149 period = 10000;
Peter Zijlstrafa407f32010-06-24 12:35:12 +02007150
Franck Bui-Huu5d508e82010-11-23 16:21:45 +01007151 local64_set(&hwc->period_left, 0);
7152 } else {
7153 period = max_t(u64, 10000, hwc->sample_period);
7154 }
Thomas Gleixner3497d202015-04-14 21:09:03 +00007155 hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
7156 HRTIMER_MODE_REL_PINNED);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007157}
7158
7159static void perf_swevent_cancel_hrtimer(struct perf_event *event)
7160{
7161 struct hw_perf_event *hwc = &event->hw;
7162
Franck Bui-Huu6c7e5502010-11-23 16:21:43 +01007163 if (is_sampling_event(event)) {
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007164 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
Peter Zijlstrafa407f32010-06-24 12:35:12 +02007165 local64_set(&hwc->period_left, ktime_to_ns(remaining));
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007166
7167 hrtimer_cancel(&hwc->hrtimer);
7168 }
7169}
7170
Peter Zijlstraba3dd362011-02-15 12:41:46 +01007171static void perf_swevent_init_hrtimer(struct perf_event *event)
7172{
7173 struct hw_perf_event *hwc = &event->hw;
7174
7175 if (!is_sampling_event(event))
7176 return;
7177
7178 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
7179 hwc->hrtimer.function = perf_swevent_hrtimer;
7180
7181 /*
7182 * Since hrtimers have a fixed rate, we can do a static freq->period
7183 * mapping and avoid the whole period adjust feedback stuff.
7184 */
7185 if (event->attr.freq) {
7186 long freq = event->attr.sample_freq;
7187
7188 event->attr.sample_period = NSEC_PER_SEC / freq;
7189 hwc->sample_period = event->attr.sample_period;
7190 local64_set(&hwc->period_left, hwc->sample_period);
Namhyung Kim778141e2013-03-18 11:41:46 +09007191 hwc->last_period = hwc->sample_period;
Peter Zijlstraba3dd362011-02-15 12:41:46 +01007192 event->attr.freq = 0;
7193 }
7194}
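/*
 * Worked example of the static mapping above: with attr.sample_freq = 4000
 * (4 kHz), sample_period becomes NSEC_PER_SEC / 4000 = 250000 ns, i.e. the
 * hrtimer fires every 250 us and no runtime period adjustment is needed.
 */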
7195
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007196/*
7197 * Software event: cpu wall time clock
7198 */
7199
7200static void cpu_clock_event_update(struct perf_event *event)
7201{
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007202 s64 prev;
7203 u64 now;
7204
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02007205 now = local_clock();
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007206 prev = local64_xchg(&event->hw.prev_count, now);
7207 local64_add(now - prev, &event->count);
7208}
7209
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02007210static void cpu_clock_event_start(struct perf_event *event, int flags)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007211{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02007212 local64_set(&event->hw.prev_count, local_clock());
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007213 perf_swevent_start_hrtimer(event);
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02007214}
7215
7216static void cpu_clock_event_stop(struct perf_event *event, int flags)
7217{
7218 perf_swevent_cancel_hrtimer(event);
7219 cpu_clock_event_update(event);
7220}
7221
7222static int cpu_clock_event_add(struct perf_event *event, int flags)
7223{
7224 if (flags & PERF_EF_START)
7225 cpu_clock_event_start(event, flags);
Shaohua Li6a694a62015-02-05 15:55:32 -08007226 perf_event_update_userpage(event);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007227
7228 return 0;
7229}
7230
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02007231static void cpu_clock_event_del(struct perf_event *event, int flags)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007232{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02007233 cpu_clock_event_stop(event, flags);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007234}
7235
7236static void cpu_clock_event_read(struct perf_event *event)
7237{
7238 cpu_clock_event_update(event);
7239}
7240
7241static int cpu_clock_event_init(struct perf_event *event)
7242{
7243 if (event->attr.type != PERF_TYPE_SOFTWARE)
7244 return -ENOENT;
7245
7246 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
7247 return -ENOENT;
7248
Stephane Eranian2481c5f2012-02-09 23:20:59 +01007249 /*
7250 * no branch sampling for software events
7251 */
7252 if (has_branch_stack(event))
7253 return -EOPNOTSUPP;
7254
Peter Zijlstraba3dd362011-02-15 12:41:46 +01007255 perf_swevent_init_hrtimer(event);
7256
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007257 return 0;
7258}
7259
7260static struct pmu perf_cpu_clock = {
Peter Zijlstra89a1e182010-09-07 17:34:50 +02007261 .task_ctx_nr = perf_sw_context,
7262
Peter Zijlstra34f43922015-02-20 14:05:38 +01007263 .capabilities = PERF_PMU_CAP_NO_NMI,
7264
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007265 .event_init = cpu_clock_event_init,
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02007266 .add = cpu_clock_event_add,
7267 .del = cpu_clock_event_del,
7268 .start = cpu_clock_event_start,
7269 .stop = cpu_clock_event_stop,
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007270 .read = cpu_clock_event_read,
7271};
7272
7273/*
7274 * Software event: task time clock
7275 */
7276
7277static void task_clock_event_update(struct perf_event *event, u64 now)
7278{
7279 u64 prev;
7280 s64 delta;
7281
7282 prev = local64_xchg(&event->hw.prev_count, now);
7283 delta = now - prev;
7284 local64_add(delta, &event->count);
7285}
7286
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02007287static void task_clock_event_start(struct perf_event *event, int flags)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007288{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02007289 local64_set(&event->hw.prev_count, event->ctx->time);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007290 perf_swevent_start_hrtimer(event);
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02007291}
7292
7293static void task_clock_event_stop(struct perf_event *event, int flags)
7294{
7295 perf_swevent_cancel_hrtimer(event);
7296 task_clock_event_update(event, event->ctx->time);
7297}
7298
7299static int task_clock_event_add(struct perf_event *event, int flags)
7300{
7301 if (flags & PERF_EF_START)
7302 task_clock_event_start(event, flags);
Shaohua Li6a694a62015-02-05 15:55:32 -08007303 perf_event_update_userpage(event);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007304
7305 return 0;
7306}
7307
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02007308static void task_clock_event_del(struct perf_event *event, int flags)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007309{
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02007310 task_clock_event_stop(event, PERF_EF_UPDATE);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007311}
7312
7313static void task_clock_event_read(struct perf_event *event)
7314{
Peter Zijlstra768a06e2011-02-22 16:52:24 +01007315 u64 now = perf_clock();
7316 u64 delta = now - event->ctx->timestamp;
7317 u64 time = event->ctx->time + delta;
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007318
7319 task_clock_event_update(event, time);
7320}
7321
7322static int task_clock_event_init(struct perf_event *event)
7323{
7324 if (event->attr.type != PERF_TYPE_SOFTWARE)
7325 return -ENOENT;
7326
7327 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
7328 return -ENOENT;
7329
Stephane Eranian2481c5f2012-02-09 23:20:59 +01007330 /*
7331 * no branch sampling for software events
7332 */
7333 if (has_branch_stack(event))
7334 return -EOPNOTSUPP;
7335
Peter Zijlstraba3dd362011-02-15 12:41:46 +01007336 perf_swevent_init_hrtimer(event);
7337
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007338 return 0;
7339}
7340
7341static struct pmu perf_task_clock = {
Peter Zijlstra89a1e182010-09-07 17:34:50 +02007342 .task_ctx_nr = perf_sw_context,
7343
Peter Zijlstra34f43922015-02-20 14:05:38 +01007344 .capabilities = PERF_PMU_CAP_NO_NMI,
7345
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007346 .event_init = task_clock_event_init,
Peter Zijlstraa4eaf7f2010-06-16 14:37:10 +02007347 .add = task_clock_event_add,
7348 .del = task_clock_event_del,
7349 .start = task_clock_event_start,
7350 .stop = task_clock_event_stop,
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007351 .read = task_clock_event_read,
7352};
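/*
 * Example (user space, illustrative only): both clock PMUs above are selected
 * through PERF_TYPE_SOFTWARE with the matching config; everything else follows
 * the same perf_event_open() pattern as the software-event example earlier:
 *
 *	attr.type   = PERF_TYPE_SOFTWARE;
 *	attr.config = PERF_COUNT_SW_TASK_CLOCK;	(or PERF_COUNT_SW_CPU_CLOCK)
 *	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 *
 * A read() of the counter then returns elapsed task (or CPU) time in
 * nanoseconds, accumulated by the *_clock_event_update() helpers above.
 */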
7353
Peter Zijlstraad5133b2010-06-15 12:22:39 +02007354static void perf_pmu_nop_void(struct pmu *pmu)
7355{
7356}
7357
Sukadev Bhattiprolufbbe0702015-09-03 20:07:45 -07007358static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
7359{
7360}
7361
Peter Zijlstraad5133b2010-06-15 12:22:39 +02007362static int perf_pmu_nop_int(struct pmu *pmu)
7363{
7364 return 0;
7365}
7366
Geliang Tang18ab2cd2015-09-27 23:25:50 +08007367static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
Sukadev Bhattiprolufbbe0702015-09-03 20:07:45 -07007368
7369static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
Peter Zijlstraad5133b2010-06-15 12:22:39 +02007370{
Sukadev Bhattiprolufbbe0702015-09-03 20:07:45 -07007371 __this_cpu_write(nop_txn_flags, flags);
7372
7373 if (flags & ~PERF_PMU_TXN_ADD)
7374 return;
7375
Peter Zijlstraad5133b2010-06-15 12:22:39 +02007376 perf_pmu_disable(pmu);
7377}
7378
7379static int perf_pmu_commit_txn(struct pmu *pmu)
7380{
Sukadev Bhattiprolufbbe0702015-09-03 20:07:45 -07007381 unsigned int flags = __this_cpu_read(nop_txn_flags);
7382
7383 __this_cpu_write(nop_txn_flags, 0);
7384
7385 if (flags & ~PERF_PMU_TXN_ADD)
7386 return 0;
7387
Peter Zijlstraad5133b2010-06-15 12:22:39 +02007388 perf_pmu_enable(pmu);
7389 return 0;
7390}
7391
7392static void perf_pmu_cancel_txn(struct pmu *pmu)
7393{
Sukadev Bhattiprolufbbe0702015-09-03 20:07:45 -07007394 unsigned int flags = __this_cpu_read(nop_txn_flags);
7395
7396 __this_cpu_write(nop_txn_flags, 0);
7397
7398 if (flags & ~PERF_PMU_TXN_ADD)
7399 return;
7400
Peter Zijlstraad5133b2010-06-15 12:22:39 +02007401 perf_pmu_enable(pmu);
7402}
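/*
 * Illustrative-only sketch of the calling convention behind the stubs above,
 * as a caller scheduling a whole event group would use it (the loop and labels
 * are pseudocode, not taken from this file):
 *
 *	pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
 *	for each event in the group:
 *		if (pmu->add(event, PERF_EF_START))
 *			goto fail;
 *	if (!pmu->commit_txn(pmu))
 *		return 0;		(whole group went in atomically)
 * fail:
 *	pmu->cancel_txn(pmu);		(roll back, nothing was committed)
 */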
7403
Peter Zijlstra35edc2a2011-11-20 20:36:02 +01007404static int perf_event_idx_default(struct perf_event *event)
7405{
Peter Zijlstrac719f562014-10-21 11:10:21 +02007406 return 0;
Peter Zijlstra35edc2a2011-11-20 20:36:02 +01007407}
7408
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007409/*
7410 * Ensures all contexts with the same task_ctx_nr have the same
7411 * pmu_cpu_context too.
7412 */
Mark Rutland9e317042014-02-10 17:44:18 +00007413static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007414{
7415 struct pmu *pmu;
7416
7417 if (ctxn < 0)
7418 return NULL;
7419
7420 list_for_each_entry(pmu, &pmus, entry) {
7421 if (pmu->task_ctx_nr == ctxn)
7422 return pmu->pmu_cpu_context;
7423 }
7424
7425 return NULL;
7426}
7427
Peter Zijlstra51676952010-12-07 14:18:20 +01007428static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007429{
Peter Zijlstra51676952010-12-07 14:18:20 +01007430 int cpu;
7431
7432 for_each_possible_cpu(cpu) {
7433 struct perf_cpu_context *cpuctx;
7434
7435 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
7436
Peter Zijlstra3f1f3322012-10-02 15:38:52 +02007437 if (cpuctx->unique_pmu == old_pmu)
7438 cpuctx->unique_pmu = pmu;
Peter Zijlstra51676952010-12-07 14:18:20 +01007439 }
7440}
7441
7442static void free_pmu_context(struct pmu *pmu)
7443{
7444 struct pmu *i;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007445
7446 mutex_lock(&pmus_lock);
7447 /*
7448 * Like a real lame refcount.
7449 */
Peter Zijlstra51676952010-12-07 14:18:20 +01007450 list_for_each_entry(i, &pmus, entry) {
7451 if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
7452 update_pmu_context(i, pmu);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007453 goto out;
Peter Zijlstra51676952010-12-07 14:18:20 +01007454 }
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007455 }
7456
Peter Zijlstra51676952010-12-07 14:18:20 +01007457 free_percpu(pmu->pmu_cpu_context);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007458out:
7459 mutex_unlock(&pmus_lock);
7460}
Peter Zijlstra2e80a822010-11-17 23:17:36 +01007461static struct idr pmu_idr;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007462
Peter Zijlstraabe43402010-11-17 23:17:37 +01007463static ssize_t
7464type_show(struct device *dev, struct device_attribute *attr, char *page)
7465{
7466 struct pmu *pmu = dev_get_drvdata(dev);
7467
7468 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
7469}
Greg Kroah-Hartman90826ca2013-08-23 14:24:40 -07007470static DEVICE_ATTR_RO(type);
Peter Zijlstraabe43402010-11-17 23:17:37 +01007471
Stephane Eranian62b85632013-04-03 14:21:34 +02007472static ssize_t
7473perf_event_mux_interval_ms_show(struct device *dev,
7474 struct device_attribute *attr,
7475 char *page)
7476{
7477 struct pmu *pmu = dev_get_drvdata(dev);
7478
7479 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
7480}
7481
Peter Zijlstra272325c2015-04-15 11:41:58 +02007482static DEFINE_MUTEX(mux_interval_mutex);
7483
Stephane Eranian62b85632013-04-03 14:21:34 +02007484static ssize_t
7485perf_event_mux_interval_ms_store(struct device *dev,
7486 struct device_attribute *attr,
7487 const char *buf, size_t count)
7488{
7489 struct pmu *pmu = dev_get_drvdata(dev);
7490 int timer, cpu, ret;
7491
7492 ret = kstrtoint(buf, 0, &timer);
7493 if (ret)
7494 return ret;
7495
7496 if (timer < 1)
7497 return -EINVAL;
7498
7499 /* same value, nothing to do */
7500 if (timer == pmu->hrtimer_interval_ms)
7501 return count;
7502
Peter Zijlstra272325c2015-04-15 11:41:58 +02007503 mutex_lock(&mux_interval_mutex);
Stephane Eranian62b85632013-04-03 14:21:34 +02007504 pmu->hrtimer_interval_ms = timer;
7505
7506 /* update all cpuctx for this PMU */
Peter Zijlstra272325c2015-04-15 11:41:58 +02007507 get_online_cpus();
7508 for_each_online_cpu(cpu) {
Stephane Eranian62b85632013-04-03 14:21:34 +02007509 struct perf_cpu_context *cpuctx;
7510 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
7511 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
7512
Peter Zijlstra272325c2015-04-15 11:41:58 +02007513 cpu_function_call(cpu,
7514 (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
Stephane Eranian62b85632013-04-03 14:21:34 +02007515 }
Peter Zijlstra272325c2015-04-15 11:41:58 +02007516 put_online_cpus();
7517 mutex_unlock(&mux_interval_mutex);
Stephane Eranian62b85632013-04-03 14:21:34 +02007518
7519 return count;
7520}
Greg Kroah-Hartman90826ca2013-08-23 14:24:40 -07007521static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
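/*
 * Usage note (illustrative only): via pmu_dev_alloc() below, the attribute
 * pair above appears as
 * /sys/bus/event_source/devices/<pmu>/perf_event_mux_interval_ms. Writing a
 * millisecond value such as "4" there updates hrtimer_interval on every online
 * CPU's context of that PMU; "cpu" is a common <pmu> name, but the available
 * names are system dependent.
 */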
Stephane Eranian62b85632013-04-03 14:21:34 +02007522
Greg Kroah-Hartman90826ca2013-08-23 14:24:40 -07007523static struct attribute *pmu_dev_attrs[] = {
7524 &dev_attr_type.attr,
7525 &dev_attr_perf_event_mux_interval_ms.attr,
7526 NULL,
Peter Zijlstraabe43402010-11-17 23:17:37 +01007527};
Greg Kroah-Hartman90826ca2013-08-23 14:24:40 -07007528ATTRIBUTE_GROUPS(pmu_dev);
Peter Zijlstraabe43402010-11-17 23:17:37 +01007529
7530static int pmu_bus_running;
7531static struct bus_type pmu_bus = {
7532 .name = "event_source",
Greg Kroah-Hartman90826ca2013-08-23 14:24:40 -07007533 .dev_groups = pmu_dev_groups,
Peter Zijlstraabe43402010-11-17 23:17:37 +01007534};
7535
7536static void pmu_dev_release(struct device *dev)
7537{
7538 kfree(dev);
7539}
7540
7541static int pmu_dev_alloc(struct pmu *pmu)
7542{
7543 int ret = -ENOMEM;
7544
7545 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
7546 if (!pmu->dev)
7547 goto out;
7548
Peter Zijlstra0c9d42e2011-11-20 23:30:47 +01007549 pmu->dev->groups = pmu->attr_groups;
Peter Zijlstraabe43402010-11-17 23:17:37 +01007550 device_initialize(pmu->dev);
7551 ret = dev_set_name(pmu->dev, "%s", pmu->name);
7552 if (ret)
7553 goto free_dev;
7554
7555 dev_set_drvdata(pmu->dev, pmu);
7556 pmu->dev->bus = &pmu_bus;
7557 pmu->dev->release = pmu_dev_release;
7558 ret = device_add(pmu->dev);
7559 if (ret)
7560 goto free_dev;
7561
7562out:
7563 return ret;
7564
7565free_dev:
7566 put_device(pmu->dev);
7567 goto out;
7568}
7569
Peter Zijlstra547e9fd2011-01-19 12:51:39 +01007570static struct lock_class_key cpuctx_mutex;
Peter Zijlstrafacc4302011-04-09 21:17:42 +02007571static struct lock_class_key cpuctx_lock;
Peter Zijlstra547e9fd2011-01-19 12:51:39 +01007572
Mischa Jonker03d8e802013-06-04 11:45:48 +02007573int perf_pmu_register(struct pmu *pmu, const char *name, int type)
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007574{
Peter Zijlstra108b02c2010-09-06 14:32:03 +02007575 int cpu, ret;
Peter Zijlstra33696fc2010-06-14 08:49:00 +02007576
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007577 mutex_lock(&pmus_lock);
Peter Zijlstra33696fc2010-06-14 08:49:00 +02007578 ret = -ENOMEM;
7579 pmu->pmu_disable_count = alloc_percpu(int);
7580 if (!pmu->pmu_disable_count)
7581 goto unlock;
Peter Zijlstraad5133b2010-06-15 12:22:39 +02007582
Peter Zijlstra2e80a822010-11-17 23:17:36 +01007583 pmu->type = -1;
7584 if (!name)
7585 goto skip_type;
7586 pmu->name = name;
7587
7588 if (type < 0) {
Tejun Heo0e9c3be2013-02-27 17:04:55 -08007589 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
7590 if (type < 0) {
7591 ret = type;
Peter Zijlstra2e80a822010-11-17 23:17:36 +01007592 goto free_pdc;
7593 }
7594 }
7595 pmu->type = type;
7596
Peter Zijlstraabe43402010-11-17 23:17:37 +01007597 if (pmu_bus_running) {
7598 ret = pmu_dev_alloc(pmu);
7599 if (ret)
7600 goto free_idr;
7601 }
7602
Peter Zijlstra2e80a822010-11-17 23:17:36 +01007603skip_type:
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007604 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
7605 if (pmu->pmu_cpu_context)
7606 goto got_cpu_context;
7607
Wei Yongjunc4814202013-04-12 11:05:54 +08007608 ret = -ENOMEM;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02007609 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
7610 if (!pmu->pmu_cpu_context)
Peter Zijlstraabe43402010-11-17 23:17:37 +01007611 goto free_dev;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02007612
7613 for_each_possible_cpu(cpu) {
7614 struct perf_cpu_context *cpuctx;
7615
7616 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
Peter Zijlstraeb184472010-09-07 15:55:13 +02007617 __perf_event_init_context(&cpuctx->ctx);
Peter Zijlstra547e9fd2011-01-19 12:51:39 +01007618 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
Peter Zijlstrafacc4302011-04-09 21:17:42 +02007619 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
Peter Zijlstra108b02c2010-09-06 14:32:03 +02007620 cpuctx->ctx.pmu = pmu;
Stephane Eranian9e630202013-04-03 14:21:33 +02007621
Peter Zijlstra272325c2015-04-15 11:41:58 +02007622 __perf_mux_hrtimer_init(cpuctx, cpu);
Stephane Eranian9e630202013-04-03 14:21:33 +02007623
Peter Zijlstra3f1f3322012-10-02 15:38:52 +02007624 cpuctx->unique_pmu = pmu;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02007625 }
7626
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02007627got_cpu_context:
Peter Zijlstraad5133b2010-06-15 12:22:39 +02007628 if (!pmu->start_txn) {
7629 if (pmu->pmu_enable) {
7630 /*
7631 * If we have pmu_enable/pmu_disable calls, install
7632 * transaction stubs that use them to try to batch
7633 * hardware accesses.
7634 */
7635 pmu->start_txn = perf_pmu_start_txn;
7636 pmu->commit_txn = perf_pmu_commit_txn;
7637 pmu->cancel_txn = perf_pmu_cancel_txn;
7638 } else {
Sukadev Bhattiprolufbbe0702015-09-03 20:07:45 -07007639 pmu->start_txn = perf_pmu_nop_txn;
Peter Zijlstraad5133b2010-06-15 12:22:39 +02007640 pmu->commit_txn = perf_pmu_nop_int;
7641 pmu->cancel_txn = perf_pmu_nop_void;
7642 }
7643 }
7644
7645 if (!pmu->pmu_enable) {
7646 pmu->pmu_enable = perf_pmu_nop_void;
7647 pmu->pmu_disable = perf_pmu_nop_void;
7648 }
7649
Peter Zijlstra35edc2a2011-11-20 20:36:02 +01007650 if (!pmu->event_idx)
7651 pmu->event_idx = perf_event_idx_default;
7652
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007653 list_add_rcu(&pmu->entry, &pmus);
Alexander Shishkinbed5b252015-01-30 12:31:06 +02007654 atomic_set(&pmu->exclusive_cnt, 0);
Peter Zijlstra33696fc2010-06-14 08:49:00 +02007655 ret = 0;
7656unlock:
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007657 mutex_unlock(&pmus_lock);
7658
Peter Zijlstra33696fc2010-06-14 08:49:00 +02007659 return ret;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02007660
Peter Zijlstraabe43402010-11-17 23:17:37 +01007661free_dev:
7662 device_del(pmu->dev);
7663 put_device(pmu->dev);
7664
Peter Zijlstra2e80a822010-11-17 23:17:36 +01007665free_idr:
7666 if (pmu->type >= PERF_TYPE_MAX)
7667 idr_remove(&pmu_idr, pmu->type);
7668
Peter Zijlstra108b02c2010-09-06 14:32:03 +02007669free_pdc:
7670 free_percpu(pmu->pmu_disable_count);
7671 goto unlock;
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007672}
Yan, Zhengc464c762014-03-18 16:56:41 +08007673EXPORT_SYMBOL_GPL(perf_pmu_register);
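/*
 * Illustrative-only sketch of a minimal perf_pmu_register() client. The "snap"
 * PMU below is hypothetical (it is not part of this file) and only shows the
 * callbacks a software-style PMU provides plus the dynamic type request
 * (type == -1) handled above.
 */
static void snap_event_start(struct perf_event *event, int flags)	{ }
static void snap_event_stop(struct perf_event *event, int flags)	{ }
static void snap_event_read(struct perf_event *event)			{ }

static int snap_event_add(struct perf_event *event, int flags)
{
	if (flags & PERF_EF_START)
		snap_event_start(event, flags);
	return 0;
}

static void snap_event_del(struct perf_event *event, int flags)
{
	snap_event_stop(event, PERF_EF_UPDATE);
}

static int snap_event_init(struct perf_event *event)
{
	/* only claim events opened with our dynamically assigned type */
	if (event->attr.type != event->pmu->type)
		return -ENOENT;
	return 0;
}

static struct pmu snap_pmu = {
	.task_ctx_nr	= perf_sw_context,
	.event_init	= snap_event_init,
	.add		= snap_event_add,
	.del		= snap_event_del,
	.start		= snap_event_start,
	.stop		= snap_event_stop,
	.read		= snap_event_read,
};

/* typically called from a driver's init path: */
/*	ret = perf_pmu_register(&snap_pmu, "snap", -1); */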
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007674
7675void perf_pmu_unregister(struct pmu *pmu)
7676{
7677 mutex_lock(&pmus_lock);
7678 list_del_rcu(&pmu->entry);
7679 mutex_unlock(&pmus_lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007680
7681 /*
Peter Zijlstracde8e882010-09-13 11:06:55 +02007682 * We dereference the pmu list under both SRCU and regular RCU, so
7683 * synchronize against both of those.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007684 */
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007685 synchronize_srcu(&pmus_srcu);
Peter Zijlstracde8e882010-09-13 11:06:55 +02007686 synchronize_rcu();
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007687
Peter Zijlstra33696fc2010-06-14 08:49:00 +02007688 free_percpu(pmu->pmu_disable_count);
Peter Zijlstra2e80a822010-11-17 23:17:36 +01007689 if (pmu->type >= PERF_TYPE_MAX)
7690 idr_remove(&pmu_idr, pmu->type);
Peter Zijlstraabe43402010-11-17 23:17:37 +01007691 device_del(pmu->dev);
7692 put_device(pmu->dev);
Peter Zijlstra51676952010-12-07 14:18:20 +01007693 free_pmu_context(pmu);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007694}
Yan, Zhengc464c762014-03-18 16:56:41 +08007695EXPORT_SYMBOL_GPL(perf_pmu_unregister);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007696
Mark Rutlandcc34b982015-01-07 14:56:51 +00007697static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
7698{
Peter Zijlstraccd41c82015-02-25 15:56:04 +01007699 struct perf_event_context *ctx = NULL;
Mark Rutlandcc34b982015-01-07 14:56:51 +00007700 int ret;
7701
7702 if (!try_module_get(pmu->module))
7703 return -ENODEV;
Peter Zijlstraccd41c82015-02-25 15:56:04 +01007704
7705 if (event->group_leader != event) {
Peter Zijlstra8b10c5e2015-05-01 16:08:46 +02007706 /*
7707 * This ctx->mutex can nest when we're called through
7708 * inheritance. See the perf_event_ctx_lock_nested() comment.
7709 */
7710 ctx = perf_event_ctx_lock_nested(event->group_leader,
7711 SINGLE_DEPTH_NESTING);
Peter Zijlstraccd41c82015-02-25 15:56:04 +01007712 BUG_ON(!ctx);
7713 }
7714
Mark Rutlandcc34b982015-01-07 14:56:51 +00007715 event->pmu = pmu;
7716 ret = pmu->event_init(event);
Peter Zijlstraccd41c82015-02-25 15:56:04 +01007717
7718 if (ctx)
7719 perf_event_ctx_unlock(event->group_leader, ctx);
7720
Mark Rutlandcc34b982015-01-07 14:56:51 +00007721 if (ret)
7722 module_put(pmu->module);
7723
7724 return ret;
7725}
7726
Geliang Tang18ab2cd2015-09-27 23:25:50 +08007727static struct pmu *perf_init_event(struct perf_event *event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007728{
Peter Zijlstra51b0fe32010-06-11 13:35:57 +02007729 struct pmu *pmu = NULL;
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007730 int idx;
Lin Ming940c5b22011-02-27 21:13:31 +08007731 int ret;
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02007732
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007733 idx = srcu_read_lock(&pmus_srcu);
Peter Zijlstra2e80a822010-11-17 23:17:36 +01007734
7735 rcu_read_lock();
7736 pmu = idr_find(&pmu_idr, event->attr.type);
7737 rcu_read_unlock();
Lin Ming940c5b22011-02-27 21:13:31 +08007738 if (pmu) {
Mark Rutlandcc34b982015-01-07 14:56:51 +00007739 ret = perf_try_init_event(pmu, event);
Lin Ming940c5b22011-02-27 21:13:31 +08007740 if (ret)
7741 pmu = ERR_PTR(ret);
Peter Zijlstra2e80a822010-11-17 23:17:36 +01007742 goto unlock;
Lin Ming940c5b22011-02-27 21:13:31 +08007743 }
Peter Zijlstra2e80a822010-11-17 23:17:36 +01007744
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007745 list_for_each_entry_rcu(pmu, &pmus, entry) {
Mark Rutlandcc34b982015-01-07 14:56:51 +00007746 ret = perf_try_init_event(pmu, event);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007747 if (!ret)
Peter Zijlstrae5f4d332010-09-10 17:38:06 +02007748 goto unlock;
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02007749
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007750 if (ret != -ENOENT) {
7751 pmu = ERR_PTR(ret);
Peter Zijlstrae5f4d332010-09-10 17:38:06 +02007752 goto unlock;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007753 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007754 }
Peter Zijlstrae5f4d332010-09-10 17:38:06 +02007755 pmu = ERR_PTR(-ENOENT);
7756unlock:
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007757 srcu_read_unlock(&pmus_srcu, idx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007758
7759 return pmu;
7760}
7761
Frederic Weisbecker4beb31f2013-07-23 02:31:02 +02007762static void account_event_cpu(struct perf_event *event, int cpu)
7763{
7764 if (event->parent)
7765 return;
7766
Frederic Weisbecker4beb31f2013-07-23 02:31:02 +02007767 if (is_cgroup_event(event))
7768 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
7769}
7770
Frederic Weisbecker766d6c02013-07-23 02:31:01 +02007771static void account_event(struct perf_event *event)
7772{
Peter Zijlstra25432ae2016-01-08 11:05:09 +01007773 bool inc = false;
7774
Frederic Weisbecker4beb31f2013-07-23 02:31:02 +02007775 if (event->parent)
7776 return;
7777
Frederic Weisbecker766d6c02013-07-23 02:31:01 +02007778 if (event->attach_state & PERF_ATTACH_TASK)
Peter Zijlstra25432ae2016-01-08 11:05:09 +01007779 inc = true;
Frederic Weisbecker766d6c02013-07-23 02:31:01 +02007780 if (event->attr.mmap || event->attr.mmap_data)
7781 atomic_inc(&nr_mmap_events);
7782 if (event->attr.comm)
7783 atomic_inc(&nr_comm_events);
7784 if (event->attr.task)
7785 atomic_inc(&nr_task_events);
Frederic Weisbecker948b26b2013-08-02 18:29:55 +02007786 if (event->attr.freq) {
7787 if (atomic_inc_return(&nr_freq_events) == 1)
7788 tick_nohz_full_kick_all();
7789 }
Adrian Hunter45ac1402015-07-21 12:44:02 +03007790 if (event->attr.context_switch) {
7791 atomic_inc(&nr_switch_events);
Peter Zijlstra25432ae2016-01-08 11:05:09 +01007792 inc = true;
Adrian Hunter45ac1402015-07-21 12:44:02 +03007793 }
Frederic Weisbecker4beb31f2013-07-23 02:31:02 +02007794 if (has_branch_stack(event))
Peter Zijlstra25432ae2016-01-08 11:05:09 +01007795 inc = true;
Frederic Weisbecker4beb31f2013-07-23 02:31:02 +02007796 if (is_cgroup_event(event))
Peter Zijlstra25432ae2016-01-08 11:05:09 +01007797 inc = true;
7798
Peter Zijlstra9107c892016-02-24 18:45:45 +01007799 if (inc) {
7800 if (atomic_inc_not_zero(&perf_sched_count))
7801 goto enabled;
7802
7803 mutex_lock(&perf_sched_mutex);
7804 if (!atomic_read(&perf_sched_count)) {
7805 static_branch_enable(&perf_sched_events);
7806 /*
7807 * Guarantee that all CPUs observe the key change and
7808 * call the perf scheduling hooks before proceeding to
7809 * install events that need them.
7810 */
7811 synchronize_sched();
7812 }
7813 /*
7814 * Now that we have waited for the sync_sched(), allow further
7815 * increments to by-pass the mutex.
7816 */
7817 atomic_inc(&perf_sched_count);
7818 mutex_unlock(&perf_sched_mutex);
7819 }
7820enabled:
Frederic Weisbecker766d6c02013-07-23 02:31:01 +02007821
Frederic Weisbecker4beb31f2013-07-23 02:31:02 +02007822 account_event_cpu(event, event->cpu);
Frederic Weisbecker766d6c02013-07-23 02:31:01 +02007823}
7824
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007825/*
7826 * Allocate and initialize an event structure
7827 */
7828static struct perf_event *
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02007829perf_event_alloc(struct perf_event_attr *attr, int cpu,
Peter Zijlstrad580ff82010-10-14 17:43:23 +02007830 struct task_struct *task,
7831 struct perf_event *group_leader,
7832 struct perf_event *parent_event,
Avi Kivity4dc0da82011-06-29 18:42:35 +03007833 perf_overflow_handler_t overflow_handler,
Matt Fleming79dff512015-01-23 18:45:42 +00007834 void *context, int cgroup_fd)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007835{
Peter Zijlstra51b0fe32010-06-11 13:35:57 +02007836 struct pmu *pmu;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007837 struct perf_event *event;
7838 struct hw_perf_event *hwc;
Frederic Weisbecker90983b12013-07-23 02:31:00 +02007839 long err = -EINVAL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007840
Oleg Nesterov66832eb2011-01-18 17:10:32 +01007841 if ((unsigned)cpu >= nr_cpu_ids) {
7842 if (!task || cpu != -1)
7843 return ERR_PTR(-EINVAL);
7844 }
7845
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02007846 event = kzalloc(sizeof(*event), GFP_KERNEL);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007847 if (!event)
7848 return ERR_PTR(-ENOMEM);
7849
7850 /*
7851 * Single events are their own group leaders, with an
7852 * empty sibling list:
7853 */
7854 if (!group_leader)
7855 group_leader = event;
7856
7857 mutex_init(&event->child_mutex);
7858 INIT_LIST_HEAD(&event->child_list);
7859
7860 INIT_LIST_HEAD(&event->group_entry);
7861 INIT_LIST_HEAD(&event->event_entry);
7862 INIT_LIST_HEAD(&event->sibling_list);
Peter Zijlstra10c6db12011-11-26 02:47:31 +01007863 INIT_LIST_HEAD(&event->rb_entry);
Stephane Eranian71ad88e2013-11-12 17:58:48 +01007864 INIT_LIST_HEAD(&event->active_entry);
Stephane Eranianf3ae75d2014-01-08 11:15:52 +01007865 INIT_HLIST_NODE(&event->hlist_entry);
7866
Peter Zijlstra10c6db12011-11-26 02:47:31 +01007867
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007868 init_waitqueue_head(&event->waitq);
Peter Zijlstrae360adb2010-10-14 14:01:34 +08007869 init_irq_work(&event->pending, perf_pending_event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007870
7871 mutex_init(&event->mmap_mutex);
7872
Al Viroa6fa9412012-08-20 14:59:25 +01007873 atomic_long_set(&event->refcount, 1);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007874 event->cpu = cpu;
7875 event->attr = *attr;
7876 event->group_leader = group_leader;
7877 event->pmu = NULL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007878 event->oncpu = -1;
7879
7880 event->parent = parent_event;
7881
Eric W. Biederman17cf22c2010-03-02 14:51:53 -08007882 event->ns = get_pid_ns(task_active_pid_ns(current));
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007883 event->id = atomic64_inc_return(&perf_event_id);
7884
7885 event->state = PERF_EVENT_STATE_INACTIVE;
7886
Peter Zijlstrad580ff82010-10-14 17:43:23 +02007887 if (task) {
7888 event->attach_state = PERF_ATTACH_TASK;
Peter Zijlstrad580ff82010-10-14 17:43:23 +02007889 /*
Peter Zijlstra50f16a82015-03-05 22:10:19 +01007890 * XXX pmu::event_init needs to know what task to account to
7891 * and we cannot use the ctx information because we need the
7892 * pmu before we get a ctx.
Peter Zijlstrad580ff82010-10-14 17:43:23 +02007893 */
Peter Zijlstra50f16a82015-03-05 22:10:19 +01007894 event->hw.target = task;
Peter Zijlstrad580ff82010-10-14 17:43:23 +02007895 }
7896
Peter Zijlstra34f43922015-02-20 14:05:38 +01007897 event->clock = &local_clock;
7898 if (parent_event)
7899 event->clock = parent_event->clock;
7900
Avi Kivity4dc0da82011-06-29 18:42:35 +03007901 if (!overflow_handler && parent_event) {
Frederic Weisbeckerb326e952009-12-05 09:44:31 +01007902 overflow_handler = parent_event->overflow_handler;
Avi Kivity4dc0da82011-06-29 18:42:35 +03007903 context = parent_event->overflow_handler_context;
7904 }
Oleg Nesterov66832eb2011-01-18 17:10:32 +01007905
Frederic Weisbeckerb326e952009-12-05 09:44:31 +01007906 event->overflow_handler = overflow_handler;
Avi Kivity4dc0da82011-06-29 18:42:35 +03007907 event->overflow_handler_context = context;
Frederic Weisbecker97eaf532009-10-18 15:33:50 +02007908
Jiri Olsa0231bb52013-02-01 11:23:45 +01007909 perf_event__state_init(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007910
7911 pmu = NULL;
7912
7913 hwc = &event->hw;
7914 hwc->sample_period = attr->sample_period;
7915 if (attr->freq && attr->sample_freq)
7916 hwc->sample_period = 1;
7917 hwc->last_period = hwc->sample_period;
7918
Peter Zijlstrae7850592010-05-21 14:43:08 +02007919 local64_set(&hwc->period_left, hwc->sample_period);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007920
7921 /*
7922 * we currently do not support PERF_FORMAT_GROUP on inherited events
7923 */
7924 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
Frederic Weisbecker90983b12013-07-23 02:31:00 +02007925 goto err_ns;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007926
Yan, Zhenga46a2302014-11-04 21:56:06 -05007927 if (!has_branch_stack(event))
7928 event->attr.branch_sample_type = 0;
7929
Matt Fleming79dff512015-01-23 18:45:42 +00007930 if (cgroup_fd != -1) {
7931 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
7932 if (err)
7933 goto err_ns;
7934 }
7935
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02007936 pmu = perf_init_event(event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007937 if (!pmu)
Frederic Weisbecker90983b12013-07-23 02:31:00 +02007938 goto err_ns;
7939 else if (IS_ERR(pmu)) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007940 err = PTR_ERR(pmu);
Frederic Weisbecker90983b12013-07-23 02:31:00 +02007941 goto err_ns;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007942 }
7943
Alexander Shishkinbed5b252015-01-30 12:31:06 +02007944 err = exclusive_event_init(event);
7945 if (err)
7946 goto err_pmu;
7947
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007948 if (!event->parent) {
Frederic Weisbecker927c7a92010-07-01 16:20:36 +02007949 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
7950 err = get_callchain_buffers();
Frederic Weisbecker90983b12013-07-23 02:31:00 +02007951 if (err)
Alexander Shishkinbed5b252015-01-30 12:31:06 +02007952 goto err_per_task;
Stephane Eraniand010b332012-02-09 23:21:00 +01007953 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007954 }
7955
7956 return event;
Frederic Weisbecker90983b12013-07-23 02:31:00 +02007957
Alexander Shishkinbed5b252015-01-30 12:31:06 +02007958err_per_task:
7959 exclusive_event_destroy(event);
7960
Frederic Weisbecker90983b12013-07-23 02:31:00 +02007961err_pmu:
7962 if (event->destroy)
7963 event->destroy(event);
Yan, Zhengc464c762014-03-18 16:56:41 +08007964 module_put(pmu->module);
Frederic Weisbecker90983b12013-07-23 02:31:00 +02007965err_ns:
Matt Fleming79dff512015-01-23 18:45:42 +00007966 if (is_cgroup_event(event))
7967 perf_detach_cgroup(event);
Frederic Weisbecker90983b12013-07-23 02:31:00 +02007968 if (event->ns)
7969 put_pid_ns(event->ns);
7970 kfree(event);
7971
7972 return ERR_PTR(err);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02007973}
7974
7975static int perf_copy_attr(struct perf_event_attr __user *uattr,
7976 struct perf_event_attr *attr)
7977{
7978 u32 size;
7979 int ret;
7980
7981 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
7982 return -EFAULT;
7983
7984 /*
7985 * zero the full structure, so that a short copy will be nice.
7986 */
7987 memset(attr, 0, sizeof(*attr));
7988
7989 ret = get_user(size, &uattr->size);
7990 if (ret)
7991 return ret;
7992
7993 if (size > PAGE_SIZE) /* silly large */
7994 goto err_size;
7995
7996 if (!size) /* abi compat */
7997 size = PERF_ATTR_SIZE_VER0;
7998
7999 if (size < PERF_ATTR_SIZE_VER0)
8000 goto err_size;
8001
8002 /*
8003 * If we're handed a bigger struct than we know of,
8004 * ensure all the unknown bits are 0 - i.e. new
8005 * user-space does not rely on any kernel feature
8006 * extensions we dont know about yet.
8007 */
8008 if (size > sizeof(*attr)) {
8009 unsigned char __user *addr;
8010 unsigned char __user *end;
8011 unsigned char val;
8012
8013 addr = (void __user *)uattr + sizeof(*attr);
8014 end = (void __user *)uattr + size;
8015
8016 for (; addr < end; addr++) {
8017 ret = get_user(val, addr);
8018 if (ret)
8019 return ret;
8020 if (val)
8021 goto err_size;
8022 }
8023 size = sizeof(*attr);
8024 }
8025
8026 ret = copy_from_user(attr, uattr, size);
8027 if (ret)
8028 return -EFAULT;
8029
Mahesh Salgaonkarcd757642010-01-30 10:25:18 +05308030 if (attr->__reserved_1)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008031 return -EINVAL;
8032
8033 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
8034 return -EINVAL;
8035
8036 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
8037 return -EINVAL;
8038
Stephane Eranianbce38cd2012-02-09 23:20:51 +01008039 if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
8040 u64 mask = attr->branch_sample_type;
8041
8042 /* only using defined bits */
8043 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
8044 return -EINVAL;
8045
8046 /* at least one branch bit must be set */
8047 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
8048 return -EINVAL;
8049
Stephane Eranianbce38cd2012-02-09 23:20:51 +01008050 /* propagate priv level, when not set for branch */
8051 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
8052
8053 /* exclude_kernel checked on syscall entry */
8054 if (!attr->exclude_kernel)
8055 mask |= PERF_SAMPLE_BRANCH_KERNEL;
8056
8057 if (!attr->exclude_user)
8058 mask |= PERF_SAMPLE_BRANCH_USER;
8059
8060 if (!attr->exclude_hv)
8061 mask |= PERF_SAMPLE_BRANCH_HV;
8062 /*
8063 * adjust user setting (for HW filter setup)
8064 */
8065 attr->branch_sample_type = mask;
8066 }
Stephane Eraniane7122092013-06-06 11:02:04 +02008067 /* privileged levels capture (kernel, hv): check permissions */
8068 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
Stephane Eranian2b923c82013-05-21 12:53:37 +02008069 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
8070 return -EACCES;
Stephane Eranianbce38cd2012-02-09 23:20:51 +01008071 }
Jiri Olsa40189942012-08-07 15:20:37 +02008072
Jiri Olsac5ebced2012-08-07 15:20:40 +02008073 if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
Jiri Olsa40189942012-08-07 15:20:37 +02008074 ret = perf_reg_validate(attr->sample_regs_user);
Jiri Olsac5ebced2012-08-07 15:20:40 +02008075 if (ret)
8076 return ret;
8077 }
8078
8079 if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
8080 if (!arch_perf_have_user_stack_dump())
8081 return -ENOSYS;
8082
8083 /*
8084 * We have __u32 type for the size, but so far
8085 * we can only use __u16 as maximum due to the
8086 * __u16 sample size limit.
8087 */
8088 if (attr->sample_stack_user >= USHRT_MAX)
8089 ret = -EINVAL;
8090 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
8091 ret = -EINVAL;
8092 }
Jiri Olsa40189942012-08-07 15:20:37 +02008093
Stephane Eranian60e23642014-09-24 13:48:37 +02008094 if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
8095 ret = perf_reg_validate(attr->sample_regs_intr);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008096out:
8097 return ret;
8098
8099err_size:
8100 put_user(sizeof(*attr), &uattr->size);
8101 ret = -E2BIG;
8102 goto out;
8103}
8104
Peter Zijlstraac9721f2010-05-27 12:54:41 +02008105static int
8106perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008107{
Peter Zijlstrab69cf532014-03-14 10:50:33 +01008108 struct ring_buffer *rb = NULL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008109 int ret = -EINVAL;
8110
Peter Zijlstraac9721f2010-05-27 12:54:41 +02008111 if (!output_event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008112 goto set;
8113
Peter Zijlstraac9721f2010-05-27 12:54:41 +02008114 /* don't allow circular references */
8115 if (event == output_event)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008116 goto out;
8117
Peter Zijlstra0f139302010-05-20 14:35:15 +02008118 /*
8119 * Don't allow cross-cpu buffers
8120 */
8121 if (output_event->cpu != event->cpu)
8122 goto out;
8123
8124 /*
Frederic Weisbecker76369132011-05-19 19:55:04 +02008125 * If it's not a per-cpu rb, it must be the same task.
Peter Zijlstra0f139302010-05-20 14:35:15 +02008126 */
8127 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
8128 goto out;
8129
Peter Zijlstra34f43922015-02-20 14:05:38 +01008130 /*
8131 * Mixing clocks in the same buffer is trouble you don't need.
8132 */
8133 if (output_event->clock != event->clock)
8134 goto out;
8135
Peter Zijlstra45bfb2e2015-01-14 14:18:11 +02008136 /*
8137 * If both events generate aux data, they must be on the same PMU
8138 */
8139 if (has_aux(event) && has_aux(output_event) &&
8140 event->pmu != output_event->pmu)
8141 goto out;
8142
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008143set:
8144 mutex_lock(&event->mmap_mutex);
Peter Zijlstraac9721f2010-05-27 12:54:41 +02008145 /* Can't redirect output if we've got an active mmap() */
8146 if (atomic_read(&event->mmap_count))
8147 goto unlock;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008148
Peter Zijlstraac9721f2010-05-27 12:54:41 +02008149 if (output_event) {
Frederic Weisbecker76369132011-05-19 19:55:04 +02008150 /* get the rb we want to redirect to */
8151 rb = ring_buffer_get(output_event);
8152 if (!rb)
Peter Zijlstraac9721f2010-05-27 12:54:41 +02008153 goto unlock;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008154 }
8155
Peter Zijlstrab69cf532014-03-14 10:50:33 +01008156 ring_buffer_attach(event, rb);
Peter Zijlstra9bb5d402013-06-04 10:44:21 +02008157
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008158 ret = 0;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02008159unlock:
8160 mutex_unlock(&event->mmap_mutex);
8161
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008162out:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008163 return ret;
8164}
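/*
 * Example (user space, illustrative only): the checks above back the
 * PERF_EVENT_IOC_SET_OUTPUT ioctl, which lets several events share one ring
 * buffer. Assuming two event fds on the same CPU/task and with the same clock,
 * where fd_leader has already been mmap()ed and fd_other has not, redirection
 * is a single call:
 *
 *	ioctl(fd_other, PERF_EVENT_IOC_SET_OUTPUT, fd_leader);
 *
 * (passing -1 instead of fd_leader detaches fd_other from any buffer again.)
 */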
8165
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01008166static void mutex_lock_double(struct mutex *a, struct mutex *b)
8167{
8168 if (b < a)
8169 swap(a, b);
8170
8171 mutex_lock(a);
8172 mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
8173}
8174
Peter Zijlstra34f43922015-02-20 14:05:38 +01008175static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
8176{
8177 bool nmi_safe = false;
8178
8179 switch (clk_id) {
8180 case CLOCK_MONOTONIC:
8181 event->clock = &ktime_get_mono_fast_ns;
8182 nmi_safe = true;
8183 break;
8184
8185 case CLOCK_MONOTONIC_RAW:
8186 event->clock = &ktime_get_raw_fast_ns;
8187 nmi_safe = true;
8188 break;
8189
8190 case CLOCK_REALTIME:
8191 event->clock = &ktime_get_real_ns;
8192 break;
8193
8194 case CLOCK_BOOTTIME:
8195 event->clock = &ktime_get_boot_ns;
8196 break;
8197
8198 case CLOCK_TAI:
8199 event->clock = &ktime_get_tai_ns;
8200 break;
8201
8202 default:
8203 return -EINVAL;
8204 }
8205
8206 if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
8207 return -EINVAL;
8208
8209 return 0;
8210}
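/*
 * Example (user space, illustrative only): the switch above runs when an event
 * is opened with a caller-selected clock; the attr fields are assumed to match
 * the uapi shipped with this kernel:
 *
 *	attr.use_clockid = 1;
 *	attr.clockid     = CLOCK_MONOTONIC_RAW;	(NMI-safe, accepted by any PMU)
 *
 * CLOCK_REALTIME/BOOTTIME/TAI requests are rejected with -EINVAL unless the
 * PMU advertises PERF_PMU_CAP_NO_NMI, as enforced at the end of the function.
 */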
8211
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008212/**
8213 * sys_perf_event_open - open a performance event, associate it to a task/cpu
8214 *
8215 * @attr_uptr: event_id type attributes for monitoring/sampling
8216 * @pid: target pid
8217 * @cpu: target cpu
8218 * @group_fd: group leader event fd
8219 */
8220SYSCALL_DEFINE5(perf_event_open,
8221 struct perf_event_attr __user *, attr_uptr,
8222 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
8223{
Peter Zijlstrab04243e2010-09-17 11:28:48 +02008224 struct perf_event *group_leader = NULL, *output_event = NULL;
8225 struct perf_event *event, *sibling;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008226 struct perf_event_attr attr;
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01008227 struct perf_event_context *ctx, *uninitialized_var(gctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008228 struct file *event_file = NULL;
Al Viro2903ff02012-08-28 12:52:22 -04008229 struct fd group = {NULL, 0};
Matt Helsley38a81da2010-09-13 13:01:20 -07008230 struct task_struct *task = NULL;
Peter Zijlstra89a1e182010-09-07 17:34:50 +02008231 struct pmu *pmu;
Al Viroea635c62010-05-26 17:40:29 -04008232 int event_fd;
Peter Zijlstrab04243e2010-09-17 11:28:48 +02008233 int move_group = 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008234 int err;
Yann Droneauda21b0b32014-01-05 21:36:33 +01008235 int f_flags = O_RDWR;
Matt Fleming79dff512015-01-23 18:45:42 +00008236 int cgroup_fd = -1;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008237
8238 /* for future expandability... */
Stephane Eraniane5d13672011-02-14 11:20:01 +02008239 if (flags & ~PERF_FLAG_ALL)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008240 return -EINVAL;
8241
8242 err = perf_copy_attr(attr_uptr, &attr);
8243 if (err)
8244 return err;
8245
8246 if (!attr.exclude_kernel) {
8247 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
8248 return -EACCES;
8249 }
8250
8251 if (attr.freq) {
8252 if (attr.sample_freq > sysctl_perf_event_sample_rate)
8253 return -EINVAL;
Peter Zijlstra0819b2e2014-05-15 20:23:48 +02008254 } else {
8255 if (attr.sample_period & (1ULL << 63))
8256 return -EINVAL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008257 }
8258
Stephane Eraniane5d13672011-02-14 11:20:01 +02008259 /*
8260 * In cgroup mode, the pid argument is used to pass the fd
8261 * opened to the cgroup directory in cgroupfs. The cpu argument
8262 * designates the cpu on which to monitor threads from that
8263 * cgroup.
8264 */
8265 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
8266 return -EINVAL;
8267
Yann Droneauda21b0b32014-01-05 21:36:33 +01008268 if (flags & PERF_FLAG_FD_CLOEXEC)
8269 f_flags |= O_CLOEXEC;
8270
8271 event_fd = get_unused_fd_flags(f_flags);
Al Viroea635c62010-05-26 17:40:29 -04008272 if (event_fd < 0)
8273 return event_fd;
8274
Peter Zijlstraac9721f2010-05-27 12:54:41 +02008275 if (group_fd != -1) {
Al Viro2903ff02012-08-28 12:52:22 -04008276 err = perf_fget_light(group_fd, &group);
8277 if (err)
Stephane Eraniand14b12d2010-09-17 11:28:47 +02008278 goto err_fd;
Al Viro2903ff02012-08-28 12:52:22 -04008279 group_leader = group.file->private_data;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02008280 if (flags & PERF_FLAG_FD_OUTPUT)
8281 output_event = group_leader;
8282 if (flags & PERF_FLAG_FD_NO_GROUP)
8283 group_leader = NULL;
8284 }
8285
Stephane Eraniane5d13672011-02-14 11:20:01 +02008286 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
Peter Zijlstrac6be5a52010-10-14 16:59:46 +02008287 task = find_lively_task_by_vpid(pid);
8288 if (IS_ERR(task)) {
8289 err = PTR_ERR(task);
8290 goto err_group_fd;
8291 }
8292 }
8293
Peter Zijlstra1f4ee502014-05-06 09:59:34 +02008294 if (task && group_leader &&
8295 group_leader->attr.inherit != attr.inherit) {
8296 err = -EINVAL;
8297 goto err_task;
8298 }
8299
Yan, Zhengfbfc6232012-06-15 14:31:31 +08008300 get_online_cpus();
8301
Matt Fleming79dff512015-01-23 18:45:42 +00008302 if (flags & PERF_FLAG_PID_CGROUP)
8303 cgroup_fd = pid;
8304
Avi Kivity4dc0da82011-06-29 18:42:35 +03008305 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
Matt Fleming79dff512015-01-23 18:45:42 +00008306 NULL, NULL, cgroup_fd);
Stephane Eraniand14b12d2010-09-17 11:28:47 +02008307 if (IS_ERR(event)) {
8308 err = PTR_ERR(event);
Peter Zijlstra1f4ee502014-05-06 09:59:34 +02008309 goto err_cpus;
Stephane Eraniand14b12d2010-09-17 11:28:47 +02008310 }
8311
Vince Weaver53b25332014-05-16 17:12:12 -04008312 if (is_sampling_event(event)) {
8313 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
8314 err = -ENOTSUPP;
8315 goto err_alloc;
8316 }
8317 }
8318
Frederic Weisbecker766d6c02013-07-23 02:31:01 +02008319 account_event(event);
8320
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008321 /*
Peter Zijlstra89a1e182010-09-07 17:34:50 +02008322 * Special case software events and allow them to be part of
8323 * any hardware group.
8324 */
8325 pmu = event->pmu;
Peter Zijlstrab04243e2010-09-17 11:28:48 +02008326
Peter Zijlstra34f43922015-02-20 14:05:38 +01008327 if (attr.use_clockid) {
8328 err = perf_event_set_clock(event, attr.clockid);
8329 if (err)
8330 goto err_alloc;
8331 }
8332
Peter Zijlstrab04243e2010-09-17 11:28:48 +02008333 if (group_leader &&
8334 (is_software_event(event) != is_software_event(group_leader))) {
8335 if (is_software_event(event)) {
8336 /*
8337 * If event and group_leader are not both a software
8338 * event, and event is, then group leader is not.
8339 *
8340 * Allow the addition of software events to !software
8341 * groups; this is safe because software events never
8342 * fail to schedule.
8343 */
8344 pmu = group_leader->pmu;
8345 } else if (is_software_event(group_leader) &&
8346 (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
8347 /*
8348 * In case the group is a pure software group, and we
8349 * try to add a hardware event, move the whole group to
8350 * the hardware context.
8351 */
8352 move_group = 1;
8353 }
8354 }
Peter Zijlstra89a1e182010-09-07 17:34:50 +02008355
8356 /*
8357 * Get the target context (task or percpu):
8358 */
Yan, Zheng4af57ef282014-11-04 21:56:01 -05008359 ctx = find_get_context(pmu, task, event);
Peter Zijlstra89a1e182010-09-07 17:34:50 +02008360 if (IS_ERR(ctx)) {
8361 err = PTR_ERR(ctx);
Peter Zijlstrac6be5a52010-10-14 16:59:46 +02008362 goto err_alloc;
Peter Zijlstra89a1e182010-09-07 17:34:50 +02008363 }
8364
Alexander Shishkinbed5b252015-01-30 12:31:06 +02008365 if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
8366 err = -EBUSY;
8367 goto err_context;
8368 }
8369
Peter Zijlstrafd1edb32011-03-28 13:13:56 +02008370 if (task) {
8371 put_task_struct(task);
8372 task = NULL;
8373 }
8374
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008375 /*
8376 * Look up the group leader (we will attach this event to it):
8377 */
Peter Zijlstraac9721f2010-05-27 12:54:41 +02008378 if (group_leader) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008379 err = -EINVAL;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008380
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008381 /*
8382 * Do not allow a recursive hierarchy (this new sibling
8383 * becoming part of another group-sibling):
8384 */
8385 if (group_leader->group_leader != group_leader)
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02008386 goto err_context;
Peter Zijlstra34f43922015-02-20 14:05:38 +01008387
8388 /* All events in a group should have the same clock */
8389 if (group_leader->clock != event->clock)
8390 goto err_context;
8391
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008392 /*
8393 * Do not allow to attach to a group in a different
8394 * task or CPU context:
8395 */
Peter Zijlstrab04243e2010-09-17 11:28:48 +02008396 if (move_group) {
Peter Zijlstrac3c87e72015-01-23 11:19:48 +01008397 /*
8398 * Make sure we're both on the same task, or both
8399 * per-cpu events.
8400 */
8401 if (group_leader->ctx->task != ctx->task)
8402 goto err_context;
8403
8404 /*
8405 * Make sure we're both events for the same CPU;
8406 * grouping events for different CPUs is broken, since
8407 * you can never concurrently schedule them anyhow.
8408 */
8409 if (group_leader->cpu != event->cpu)
Peter Zijlstrab04243e2010-09-17 11:28:48 +02008410 goto err_context;
8411 } else {
8412 if (group_leader->ctx != ctx)
8413 goto err_context;
8414 }
8415
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008416 /*
8417 * Only a group leader can be exclusive or pinned
8418 */
8419 if (attr.exclusive || attr.pinned)
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02008420 goto err_context;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02008421 }
8422
8423 if (output_event) {
8424 err = perf_event_set_output(event, output_event);
8425 if (err)
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02008426 goto err_context;
Peter Zijlstraac9721f2010-05-27 12:54:41 +02008427 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008428
Yann Droneauda21b0b32014-01-05 21:36:33 +01008429 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
8430 f_flags);
Al Viroea635c62010-05-26 17:40:29 -04008431 if (IS_ERR(event_file)) {
8432 err = PTR_ERR(event_file);
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02008433 goto err_context;
Al Viroea635c62010-05-26 17:40:29 -04008434 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008435
Peter Zijlstrab04243e2010-09-17 11:28:48 +02008436 if (move_group) {
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01008437 gctx = group_leader->ctx;
Peter Zijlstraf55fc2a2015-09-09 19:06:33 +02008438 mutex_lock_double(&gctx->mutex, &ctx->mutex);
Peter Zijlstra84c4e622016-02-24 18:45:40 +01008439 if (gctx->task == TASK_TOMBSTONE) {
8440 err = -ESRCH;
8441 goto err_locked;
8442 }
Peter Zijlstraf55fc2a2015-09-09 19:06:33 +02008443 } else {
8444 mutex_lock(&ctx->mutex);
8445 }
Peter Zijlstrab04243e2010-09-17 11:28:48 +02008446
Peter Zijlstra84c4e622016-02-24 18:45:40 +01008447 if (ctx->task == TASK_TOMBSTONE) {
8448 err = -ESRCH;
8449 goto err_locked;
8450 }
8451
Peter Zijlstraa7239682015-09-09 19:06:33 +02008452 if (!perf_event_validate_size(event)) {
8453 err = -E2BIG;
8454 goto err_locked;
8455 }
8456
Peter Zijlstraf55fc2a2015-09-09 19:06:33 +02008457 /*
8458 * Must be under the same ctx::mutex as perf_install_in_context(),
8459 * because we need to serialize with concurrent event creation.
8460 */
8461 if (!exclusive_event_installable(event, ctx)) {
8462 /* exclusive and group stuff are assumed mutually exclusive */
8463 WARN_ON_ONCE(move_group);
8464
8465 err = -EBUSY;
8466 goto err_locked;
8467 }
8468
8469 WARN_ON_ONCE(ctx->parent_ctx);
8470
8471 if (move_group) {
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01008472 /*
8473 * See perf_event_ctx_lock() for comments on the details
8474 * of swizzling perf_event::ctx.
8475 */
Peter Zijlstra45a0e072016-01-26 13:09:48 +01008476 perf_remove_from_context(group_leader, 0);
Jiri Olsa0231bb52013-02-01 11:23:45 +01008477
Peter Zijlstrab04243e2010-09-17 11:28:48 +02008478 list_for_each_entry(sibling, &group_leader->sibling_list,
8479 group_entry) {
Peter Zijlstra45a0e072016-01-26 13:09:48 +01008480 perf_remove_from_context(sibling, 0);
Peter Zijlstrab04243e2010-09-17 11:28:48 +02008481 put_ctx(gctx);
8482 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008483
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01008484 /*
8485 * Wait for everybody to stop referencing the events through
8486 * the old lists, before installing them on the new lists.
8487 */
Yan, Zheng0cda4c02012-06-15 14:31:33 +08008488 synchronize_rcu();
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01008489
Peter Zijlstra (Intel)8f95b432015-01-27 11:53:12 +01008490 /*
8491 * Install the group siblings before the group leader.
8492 *
8493 * Because a group leader will try and install the entire group
8494 * (through the sibling list, which is still intact), we can
8495 * end up with siblings installed in the wrong context.
8496 *
8497 * By installing siblings first we NO-OP because they're not
8498 * reachable through the group lists.
8499 */
Peter Zijlstrab04243e2010-09-17 11:28:48 +02008500 list_for_each_entry(sibling, &group_leader->sibling_list,
8501 group_entry) {
Peter Zijlstra (Intel)8f95b432015-01-27 11:53:12 +01008502 perf_event__state_init(sibling);
Jiri Olsa9fc81d82014-12-10 21:23:51 +01008503 perf_install_in_context(ctx, sibling, sibling->cpu);
Peter Zijlstrab04243e2010-09-17 11:28:48 +02008504 get_ctx(ctx);
8505 }
Peter Zijlstra (Intel)8f95b432015-01-27 11:53:12 +01008506
8507 /*
8508 * Removing from the context ends up with a disabled
8509 * event. What we want here is the event in its initial
8510 * startup state, ready to be added into the new context.
8511 */
8512 perf_event__state_init(group_leader);
8513 perf_install_in_context(ctx, group_leader, group_leader->cpu);
8514 get_ctx(ctx);
Peter Zijlstrab04243e2010-09-17 11:28:48 +02008515
Peter Zijlstraf55fc2a2015-09-09 19:06:33 +02008516 /*
8517 * Now that all events are installed in @ctx, nothing
8518 * references @gctx anymore, so drop the last reference we have
8519 * on it.
8520 */
8521 put_ctx(gctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008522 }
8523
Peter Zijlstraf73e22a2015-09-09 20:48:22 +02008524 /*
8525 * Precalculate sample_data sizes; do while holding ctx::mutex such
8526 * that we're serialized against further additions and before
8527 * perf_install_in_context(), which is the point at which the event becomes
8528 * active and can use these values.
8529 */
8530 perf_event__header_size(event);
8531 perf_event__id_header_size(event);
Alexander Shishkinbed5b252015-01-30 12:31:06 +02008532
Peter Zijlstra78cd2c72016-01-25 14:08:45 +01008533 event->owner = current;
8534
Yan, Zhenge2d37cd2012-06-15 14:31:32 +08008535 perf_install_in_context(ctx, event, event->cpu);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01008536 perf_unpin_context(ctx);
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01008537
Peter Zijlstraf55fc2a2015-09-09 19:06:33 +02008538 if (move_group)
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01008539 mutex_unlock(&gctx->mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008540 mutex_unlock(&ctx->mutex);
8541
Yan, Zhengfbfc6232012-06-15 14:31:31 +08008542 put_online_cpus();
8543
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008544 mutex_lock(&current->perf_event_mutex);
8545 list_add_tail(&event->owner_entry, &current->perf_event_list);
8546 mutex_unlock(&current->perf_event_mutex);
8547
Peter Zijlstra8a495422010-05-27 15:47:49 +02008548 /*
8549 * Drop the reference on the group_event after placing the
8550 * new event on the sibling_list. This ensures destruction
8551 * of the group leader will find the pointer to itself in
8552 * perf_group_detach().
8553 */
Al Viro2903ff02012-08-28 12:52:22 -04008554 fdput(group);
Al Viroea635c62010-05-26 17:40:29 -04008555 fd_install(event_fd, event_file);
8556 return event_fd;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008557
Peter Zijlstraf55fc2a2015-09-09 19:06:33 +02008558err_locked:
8559 if (move_group)
8560 mutex_unlock(&gctx->mutex);
8561 mutex_unlock(&ctx->mutex);
8562/* err_file: */
8563 fput(event_file);
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02008564err_context:
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01008565 perf_unpin_context(ctx);
Al Viroea635c62010-05-26 17:40:29 -04008566 put_ctx(ctx);
Peter Zijlstrac6be5a52010-10-14 16:59:46 +02008567err_alloc:
Peter Zijlstra13005622016-02-24 18:45:41 +01008568 /*
8569 * If event_file is set, the fput() above will have called ->release()
8570 * and that will take care of freeing the event.
8571 */
8572 if (!event_file)
8573 free_event(event);
Peter Zijlstra1f4ee502014-05-06 09:59:34 +02008574err_cpus:
Yan, Zhengfbfc6232012-06-15 14:31:31 +08008575 put_online_cpus();
Peter Zijlstra1f4ee502014-05-06 09:59:34 +02008576err_task:
Peter Zijlstrae7d0bc02010-10-14 16:54:51 +02008577 if (task)
8578 put_task_struct(task);
Peter Zijlstra89a1e182010-09-07 17:34:50 +02008579err_group_fd:
Al Viro2903ff02012-08-28 12:52:22 -04008580 fdput(group);
Al Viroea635c62010-05-26 17:40:29 -04008581err_fd:
8582 put_unused_fd(event_fd);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008583 return err;
8584}
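/*
 * Illustrative user-space sketch, not part of the original source: the
 * usual calling convention for the syscall above (there is no glibc
 * wrapper, so callers go through syscall(2)). The second event joins
 * the first one's group via group_fd, exercising the group-leader
 * checks above; the event choices and fd names are hypothetical.
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.config		= PERF_COUNT_HW_CPU_CYCLES,
 *		.size		= sizeof(attr),
 *		.disabled	= 1,
 *	};
 *	int leader = syscall(__NR_perf_event_open, &attr, 0, -1, -1,
 *			     PERF_FLAG_FD_CLOEXEC);
 *
 *	attr.config   = PERF_COUNT_HW_INSTRUCTIONS;
 *	attr.disabled = 0;
 *	int member = syscall(__NR_perf_event_open, &attr, 0, -1, leader,
 *			     PERF_FLAG_FD_CLOEXEC);
 *
 *	ioctl(leader, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
 */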
8585
Arjan van de Venfb0459d2009-09-25 12:25:56 +02008586/**
8587 * perf_event_create_kernel_counter
8588 *
8589 * @attr: attributes of the counter to create
8590 * @cpu: cpu on which the counter is bound
Matt Helsley38a81da2010-09-13 13:01:20 -07008591 * @task: task to profile (NULL for percpu)
Arjan van de Venfb0459d2009-09-25 12:25:56 +02008592 */
8593struct perf_event *
8594perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
Matt Helsley38a81da2010-09-13 13:01:20 -07008595 struct task_struct *task,
Avi Kivity4dc0da82011-06-29 18:42:35 +03008596 perf_overflow_handler_t overflow_handler,
8597 void *context)
Arjan van de Venfb0459d2009-09-25 12:25:56 +02008598{
Arjan van de Venfb0459d2009-09-25 12:25:56 +02008599 struct perf_event_context *ctx;
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02008600 struct perf_event *event;
Arjan van de Venfb0459d2009-09-25 12:25:56 +02008601 int err;
8602
8603 /*
8604 * Get the target context (task or percpu):
8605 */
8606
Avi Kivity4dc0da82011-06-29 18:42:35 +03008607 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
Matt Fleming79dff512015-01-23 18:45:42 +00008608 overflow_handler, context, -1);
Frederic Weisbeckerc6567f62009-11-26 05:35:41 +01008609 if (IS_ERR(event)) {
8610 err = PTR_ERR(event);
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02008611 goto err;
8612 }
8613
Jiri Olsaf8697762014-08-01 14:33:01 +02008614 /* Mark owner so we can distinguish it from user events. */
Peter Zijlstra63b6da32016-01-14 16:05:37 +01008615 event->owner = TASK_TOMBSTONE;
Jiri Olsaf8697762014-08-01 14:33:01 +02008616
Frederic Weisbecker766d6c02013-07-23 02:31:01 +02008617 account_event(event);
8618
Yan, Zheng4af57ef282014-11-04 21:56:01 -05008619 ctx = find_get_context(event->pmu, task, event);
Arjan van de Venfb0459d2009-09-25 12:25:56 +02008620 if (IS_ERR(ctx)) {
8621 err = PTR_ERR(ctx);
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02008622 goto err_free;
Frederic Weisbeckerc6567f62009-11-26 05:35:41 +01008623 }
Arjan van de Venfb0459d2009-09-25 12:25:56 +02008624
Arjan van de Venfb0459d2009-09-25 12:25:56 +02008625 WARN_ON_ONCE(ctx->parent_ctx);
8626 mutex_lock(&ctx->mutex);
Peter Zijlstra84c4e622016-02-24 18:45:40 +01008627 if (ctx->task == TASK_TOMBSTONE) {
8628 err = -ESRCH;
8629 goto err_unlock;
8630 }
8631
Alexander Shishkinbed5b252015-01-30 12:31:06 +02008632 if (!exclusive_event_installable(event, ctx)) {
Alexander Shishkinbed5b252015-01-30 12:31:06 +02008633 err = -EBUSY;
Peter Zijlstra84c4e622016-02-24 18:45:40 +01008634 goto err_unlock;
Alexander Shishkinbed5b252015-01-30 12:31:06 +02008635 }
8636
Arjan van de Venfb0459d2009-09-25 12:25:56 +02008637 perf_install_in_context(ctx, event, cpu);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01008638 perf_unpin_context(ctx);
Arjan van de Venfb0459d2009-09-25 12:25:56 +02008639 mutex_unlock(&ctx->mutex);
8640
Arjan van de Venfb0459d2009-09-25 12:25:56 +02008641 return event;
8642
Peter Zijlstra84c4e622016-02-24 18:45:40 +01008643err_unlock:
8644 mutex_unlock(&ctx->mutex);
8645 perf_unpin_context(ctx);
8646 put_ctx(ctx);
Peter Zijlstrac3f00c72010-08-18 14:37:15 +02008647err_free:
8648 free_event(event);
8649err:
Frederic Weisbeckerc6567f62009-11-26 05:35:41 +01008650 return ERR_PTR(err);
Arjan van de Venfb0459d2009-09-25 12:25:56 +02008651}
8652EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
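/*
 * Illustrative in-kernel sketch, not part of the original source: a
 * minimal user of the export above, creating a per-cpu cycle counter
 * with an overflow callback. The 'example_*' names and the period are
 * hypothetical; the returned event is freed again with
 * perf_event_release_kernel().
 */
static void example_overflow(struct perf_event *event,
			     struct perf_sample_data *data,
			     struct pt_regs *regs)
{
	/* Runs from PMU interrupt/NMI context; keep it short. */
}

static struct perf_event *example_cycles_counter(int cpu)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_HARDWARE,
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.size		= sizeof(attr),
		.sample_period	= 1000000,
	};

	/* NULL task => per-cpu counter bound to @cpu. */
	return perf_event_create_kernel_counter(&attr, cpu, NULL,
						example_overflow, NULL);
}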
8653
Yan, Zheng0cda4c02012-06-15 14:31:33 +08008654void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
8655{
8656 struct perf_event_context *src_ctx;
8657 struct perf_event_context *dst_ctx;
8658 struct perf_event *event, *tmp;
8659 LIST_HEAD(events);
8660
8661 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
8662 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
8663
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01008664 /*
8665 * See perf_event_ctx_lock() for comments on the details
8666 * of swizzling perf_event::ctx.
8667 */
8668 mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
Yan, Zheng0cda4c02012-06-15 14:31:33 +08008669 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
8670 event_entry) {
Peter Zijlstra45a0e072016-01-26 13:09:48 +01008671 perf_remove_from_context(event, 0);
Frederic Weisbecker9a545de2013-07-23 02:31:03 +02008672 unaccount_event_cpu(event, src_cpu);
Yan, Zheng0cda4c02012-06-15 14:31:33 +08008673 put_ctx(src_ctx);
Peter Zijlstra98861672013-10-03 16:02:23 +02008674 list_add(&event->migrate_entry, &events);
Yan, Zheng0cda4c02012-06-15 14:31:33 +08008675 }
Yan, Zheng0cda4c02012-06-15 14:31:33 +08008676
Peter Zijlstra (Intel)8f95b432015-01-27 11:53:12 +01008677 /*
8678 * Wait for the events to quiesce before re-instating them.
8679 */
Yan, Zheng0cda4c02012-06-15 14:31:33 +08008680 synchronize_rcu();
8681
Peter Zijlstra (Intel)8f95b432015-01-27 11:53:12 +01008682 /*
8683 * Re-instate events in 2 passes.
8684 *
8685 * Skip over group leaders and only install siblings on this first
8686 * pass; siblings will not get enabled without a leader. However, a
8687 * leader will enable its siblings, even if those are still on the old
8688 * context.
8689 */
8690 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
8691 if (event->group_leader == event)
8692 continue;
8693
8694 list_del(&event->migrate_entry);
8695 if (event->state >= PERF_EVENT_STATE_OFF)
8696 event->state = PERF_EVENT_STATE_INACTIVE;
8697 account_event_cpu(event, dst_cpu);
8698 perf_install_in_context(dst_ctx, event, dst_cpu);
8699 get_ctx(dst_ctx);
8700 }
8701
8702 /*
8703 * Once all the siblings are setup properly, install the group leaders
8704 * to make it go.
8705 */
Peter Zijlstra98861672013-10-03 16:02:23 +02008706 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
8707 list_del(&event->migrate_entry);
Yan, Zheng0cda4c02012-06-15 14:31:33 +08008708 if (event->state >= PERF_EVENT_STATE_OFF)
8709 event->state = PERF_EVENT_STATE_INACTIVE;
Frederic Weisbecker9a545de2013-07-23 02:31:03 +02008710 account_event_cpu(event, dst_cpu);
Yan, Zheng0cda4c02012-06-15 14:31:33 +08008711 perf_install_in_context(dst_ctx, event, dst_cpu);
8712 get_ctx(dst_ctx);
8713 }
8714 mutex_unlock(&dst_ctx->mutex);
Peter Zijlstraf63a8da2015-01-23 12:24:14 +01008715 mutex_unlock(&src_ctx->mutex);
Yan, Zheng0cda4c02012-06-15 14:31:33 +08008716}
8717EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
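/*
 * Illustrative in-kernel sketch, not part of the original source: the
 * typical caller of the export above is an uncore-style PMU driver that
 * keeps its events on one CPU and moves them from a hotplug callback
 * when that CPU goes down. 'example_pmu' and the callback are
 * hypothetical.
 */
static void example_pmu_cpu_offline(struct pmu *example_pmu, int cpu)
{
	int target = cpumask_any_but(cpu_online_mask, cpu);

	if (target < nr_cpu_ids)
		perf_pmu_migrate_context(example_pmu, cpu, target);
}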
8718
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008719static void sync_child_event(struct perf_event *child_event,
8720 struct task_struct *child)
8721{
8722 struct perf_event *parent_event = child_event->parent;
8723 u64 child_val;
8724
8725 if (child_event->attr.inherit_stat)
8726 perf_event_read_event(child_event, child);
8727
Peter Zijlstrab5e58792010-05-21 14:43:12 +02008728 child_val = perf_event_count(child_event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008729
8730 /*
8731 * Add back the child's count to the parent's count:
8732 */
Peter Zijlstraa6e6dea2010-05-21 14:27:58 +02008733 atomic64_add(child_val, &parent_event->child_count);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008734 atomic64_add(child_event->total_time_enabled,
8735 &parent_event->child_total_time_enabled);
8736 atomic64_add(child_event->total_time_running,
8737 &parent_event->child_total_time_running);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008738}
8739
8740static void
Peter Zijlstra8ba289b2016-01-26 13:06:56 +01008741perf_event_exit_event(struct perf_event *child_event,
8742 struct perf_event_context *child_ctx,
8743 struct task_struct *child)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008744{
Peter Zijlstra8ba289b2016-01-26 13:06:56 +01008745 struct perf_event *parent_event = child_event->parent;
8746
Peter Zijlstra1903d502014-07-15 17:27:27 +02008747 /*
8748 * Do not destroy the 'original' grouping; because of the context
8749 * switch optimization the original events could've ended up in a
8750 * random child task.
8751 *
8752 * If we were to destroy the original group, all group related
8753 * operations would cease to function properly after this random
8754 * child dies.
8755 *
8756 * Do destroy all inherited groups; we don't care about those
8757 * and being thorough is better.
8758 */
Peter Zijlstra32132a32016-01-11 15:40:59 +01008759 raw_spin_lock_irq(&child_ctx->lock);
8760 WARN_ON_ONCE(child_ctx->is_active);
8761
Peter Zijlstra8ba289b2016-01-26 13:06:56 +01008762 if (parent_event)
Peter Zijlstra32132a32016-01-11 15:40:59 +01008763 perf_group_detach(child_event);
8764 list_del_event(child_event, child_ctx);
Peter Zijlstraa69b0ca2016-02-24 18:45:44 +01008765 child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
Peter Zijlstra32132a32016-01-11 15:40:59 +01008766 raw_spin_unlock_irq(&child_ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008767
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008768 /*
Peter Zijlstra8ba289b2016-01-26 13:06:56 +01008769 * Parent events are governed by their filedesc, retain them.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008770 */
Peter Zijlstra8ba289b2016-01-26 13:06:56 +01008771 if (!parent_event) {
Jiri Olsa179033b2014-08-07 11:48:26 -04008772 perf_event_wakeup(child_event);
Peter Zijlstra8ba289b2016-01-26 13:06:56 +01008773 return;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008774 }
Peter Zijlstra8ba289b2016-01-26 13:06:56 +01008775 /*
8776 * Child events can be cleaned up.
8777 */
8778
8779 sync_child_event(child_event, child);
8780
8781 /*
8782 * Remove this event from the parent's list
8783 */
8784 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
8785 mutex_lock(&parent_event->child_mutex);
8786 list_del_init(&child_event->child_list);
8787 mutex_unlock(&parent_event->child_mutex);
8788
8789 /*
8790 * Kick perf_poll() for is_event_hup().
8791 */
8792 perf_event_wakeup(parent_event);
8793 free_event(child_event);
8794 put_event(parent_event);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008795}
8796
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02008797static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008798{
Peter Zijlstra211de6e2014-09-30 19:23:08 +02008799 struct perf_event_context *child_ctx, *clone_ctx = NULL;
Peter Zijlstra63b6da32016-01-14 16:05:37 +01008800 struct perf_event *child_event, *next;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008801
Peter Zijlstra63b6da32016-01-14 16:05:37 +01008802 WARN_ON_ONCE(child != current);
8803
Peter Zijlstra6a3351b2016-01-25 14:09:54 +01008804 child_ctx = perf_pin_task_context(child, ctxn);
Peter Zijlstra63b6da32016-01-14 16:05:37 +01008805 if (!child_ctx)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008806 return;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008807
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008808 /*
Peter Zijlstra6a3351b2016-01-25 14:09:54 +01008809 * In order to reduce the amount of trickery in ctx tear-down, we hold
8810 * ctx::mutex over the entire thing. This serializes against almost
8811 * everything that wants to access the ctx.
8812 *
8813 * The exception is sys_perf_event_open() /
8814 * perf_event_create_kernel_counter(), which does find_get_context()
8815 * without ctx::mutex (it cannot because of the move_group double mutex
8816 * lock thing). See the comments in perf_install_in_context().
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008817 */
Peter Zijlstra6a3351b2016-01-25 14:09:54 +01008818 mutex_lock(&child_ctx->mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008819
8820 /*
Peter Zijlstra6a3351b2016-01-25 14:09:54 +01008821 * In a single ctx::lock section, de-schedule the events and detach the
8822 * context from the task such that we cannot ever get it scheduled back
8823 * in.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008824 */
Peter Zijlstra6a3351b2016-01-25 14:09:54 +01008825 raw_spin_lock_irq(&child_ctx->lock);
Peter Zijlstra63b6da32016-01-14 16:05:37 +01008826 task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
Peter Zijlstra4a1c0f22014-06-23 16:12:42 +02008827
8828 /*
Peter Zijlstra63b6da32016-01-14 16:05:37 +01008829 * Now that the context is inactive, destroy the task <-> ctx relation
8830 * and mark the context dead.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008831 */
Peter Zijlstra63b6da32016-01-14 16:05:37 +01008832 RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
8833 put_ctx(child_ctx); /* cannot be last */
8834 WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
8835 put_task_struct(current); /* cannot be last */
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008836
Peter Zijlstra211de6e2014-09-30 19:23:08 +02008837 clone_ctx = unclone_ctx(child_ctx);
Peter Zijlstra6a3351b2016-01-25 14:09:54 +01008838 raw_spin_unlock_irq(&child_ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008839
Peter Zijlstra211de6e2014-09-30 19:23:08 +02008840 if (clone_ctx)
8841 put_ctx(clone_ctx);
Peter Zijlstra4a1c0f22014-06-23 16:12:42 +02008842
8843 /*
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008844 * Report the task dead after unscheduling the events so that we
8845 * won't get any samples after PERF_RECORD_EXIT. We can however still
8846 * get a few PERF_RECORD_READ events.
8847 */
8848 perf_event_task(child, child_ctx, 0);
8849
Peter Zijlstraebf905f2014-05-29 19:00:24 +02008850 list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
Peter Zijlstra8ba289b2016-01-26 13:06:56 +01008851 perf_event_exit_event(child_event, child_ctx, child);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01008852
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008853 mutex_unlock(&child_ctx->mutex);
8854
8855 put_ctx(child_ctx);
8856}
8857
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02008858/*
8859 * When a child task exits, feed back event values to parent events.
8860 */
8861void perf_event_exit_task(struct task_struct *child)
8862{
Peter Zijlstra88821352010-11-09 19:01:43 +01008863 struct perf_event *event, *tmp;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02008864 int ctxn;
8865
Peter Zijlstra88821352010-11-09 19:01:43 +01008866 mutex_lock(&child->perf_event_mutex);
8867 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
8868 owner_entry) {
8869 list_del_init(&event->owner_entry);
8870
8871 /*
8872 * Ensure the list deletion is visible before we clear
8873 * the owner, closes a race against perf_release() where
8874 * we need to serialize on the owner->perf_event_mutex.
8875 */
Peter Zijlstraf47c02c2016-01-26 12:30:14 +01008876 smp_store_release(&event->owner, NULL);
Peter Zijlstra88821352010-11-09 19:01:43 +01008877 }
8878 mutex_unlock(&child->perf_event_mutex);
8879
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02008880 for_each_task_context_nr(ctxn)
8881 perf_event_exit_task_context(child, ctxn);
Jiri Olsa4e93ad62015-11-04 16:00:05 +01008882
8883 /*
8884 * The perf_event_exit_task_context calls perf_event_task
8885 * with child's task_ctx, which generates EXIT events for
8886 * child contexts and sets child->perf_event_ctxp[] to NULL.
8887 * At this point we need to send EXIT events to cpu contexts.
8888 */
8889 perf_event_task(child, NULL, 0);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02008890}
8891
Frederic Weisbecker889ff012010-01-09 20:04:47 +01008892static void perf_free_event(struct perf_event *event,
8893 struct perf_event_context *ctx)
8894{
8895 struct perf_event *parent = event->parent;
8896
8897 if (WARN_ON_ONCE(!parent))
8898 return;
8899
8900 mutex_lock(&parent->child_mutex);
8901 list_del_init(&event->child_list);
8902 mutex_unlock(&parent->child_mutex);
8903
Al Viroa6fa9412012-08-20 14:59:25 +01008904 put_event(parent);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01008905
Peter Zijlstra652884f2015-01-23 11:20:10 +01008906 raw_spin_lock_irq(&ctx->lock);
Peter Zijlstra8a495422010-05-27 15:47:49 +02008907 perf_group_detach(event);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01008908 list_del_event(event, ctx);
Peter Zijlstra652884f2015-01-23 11:20:10 +01008909 raw_spin_unlock_irq(&ctx->lock);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01008910 free_event(event);
8911}
8912
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008913/*
Peter Zijlstra652884f2015-01-23 11:20:10 +01008914 * Free an unexposed, unused context as created by inheritance by
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02008915 * perf_event_init_task below, used by fork() in case of failure.
Peter Zijlstra652884f2015-01-23 11:20:10 +01008916 *
8917 * Not all locks are strictly required, but take them anyway to be nice and
8918 * help out with the lockdep assertions.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008919 */
8920void perf_event_free_task(struct task_struct *task)
8921{
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02008922 struct perf_event_context *ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008923 struct perf_event *event, *tmp;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02008924 int ctxn;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008925
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02008926 for_each_task_context_nr(ctxn) {
8927 ctx = task->perf_event_ctxp[ctxn];
8928 if (!ctx)
8929 continue;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008930
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02008931 mutex_lock(&ctx->mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008932again:
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02008933 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
8934 group_entry)
8935 perf_free_event(event, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008936
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02008937 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
8938 group_entry)
8939 perf_free_event(event, ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008940
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02008941 if (!list_empty(&ctx->pinned_groups) ||
8942 !list_empty(&ctx->flexible_groups))
8943 goto again;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008944
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02008945 mutex_unlock(&ctx->mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008946
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02008947 put_ctx(ctx);
8948 }
Ingo Molnarcdd6c482009-09-21 12:02:48 +02008949}
8950
Peter Zijlstra4e231c72010-09-09 21:01:59 +02008951void perf_event_delayed_put(struct task_struct *task)
8952{
8953 int ctxn;
8954
8955 for_each_task_context_nr(ctxn)
8956 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
8957}
8958
Alexei Starovoitove03e7ee2016-01-25 20:59:49 -08008959struct file *perf_event_get(unsigned int fd)
Kaixu Xiaffe86902015-08-06 07:02:32 +00008960{
Alexei Starovoitove03e7ee2016-01-25 20:59:49 -08008961 struct file *file;
Kaixu Xiaffe86902015-08-06 07:02:32 +00008962
Alexei Starovoitove03e7ee2016-01-25 20:59:49 -08008963 file = fget_raw(fd);
8964 if (!file)
8965 return ERR_PTR(-EBADF);
Kaixu Xiaffe86902015-08-06 07:02:32 +00008966
Alexei Starovoitove03e7ee2016-01-25 20:59:49 -08008967 if (file->f_op != &perf_fops) {
8968 fput(file);
8969 return ERR_PTR(-EBADF);
8970 }
Kaixu Xiaffe86902015-08-06 07:02:32 +00008971
Alexei Starovoitove03e7ee2016-01-25 20:59:49 -08008972 return file;
Kaixu Xiaffe86902015-08-06 07:02:32 +00008973}
8974
8975const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
8976{
8977 if (!event)
8978 return ERR_PTR(-EINVAL);
8979
8980 return &event->attr;
8981}
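/*
 * Illustrative in-kernel sketch, not part of the original source: how a
 * consumer (e.g. a BPF-style map) might resolve a perf event fd with the
 * two helpers above. The 'example_*' names are hypothetical; the caller
 * keeps the returned file reference and drops it with fput() when done.
 */
static const struct perf_event_attr *example_attrs_from_fd(unsigned int fd,
							   struct file **filep)
{
	struct file *file = perf_event_get(fd);
	struct perf_event *event;

	if (IS_ERR(file))
		return ERR_CAST(file);

	event = file->private_data;
	*filep = file;

	return perf_event_attrs(event);
}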
8982
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02008983/*
8984 * inherit a event from parent task to child task:
8985 */
8986static struct perf_event *
8987inherit_event(struct perf_event *parent_event,
8988 struct task_struct *parent,
8989 struct perf_event_context *parent_ctx,
8990 struct task_struct *child,
8991 struct perf_event *group_leader,
8992 struct perf_event_context *child_ctx)
8993{
Jiri Olsa1929def2014-09-12 13:18:27 +02008994 enum perf_event_active_state parent_state = parent_event->state;
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02008995 struct perf_event *child_event;
Peter Zijlstracee010e2010-09-10 12:51:54 +02008996 unsigned long flags;
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02008997
8998 /*
8999 * Instead of creating recursive hierarchies of events,
9000 * we link inherited events back to the original parent,
9001 * which is sure to have a filp that we use as the reference
9002 * count:
9003 */
9004 if (parent_event->parent)
9005 parent_event = parent_event->parent;
9006
9007 child_event = perf_event_alloc(&parent_event->attr,
9008 parent_event->cpu,
Peter Zijlstrad580ff82010-10-14 17:43:23 +02009009 child,
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02009010 group_leader, parent_event,
Matt Fleming79dff512015-01-23 18:45:42 +00009011 NULL, NULL, -1);
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02009012 if (IS_ERR(child_event))
9013 return child_event;
Al Viroa6fa9412012-08-20 14:59:25 +01009014
Peter Zijlstrac6e5b732016-01-15 16:07:41 +02009015 /*
9016 * is_orphaned_event() and list_add_tail(&parent_event->child_list)
9017 * must be under the same lock in order to serialize against
9018 * perf_event_release_kernel(), such that either we must observe
9019 * is_orphaned_event() or they will observe us on the child_list.
9020 */
9021 mutex_lock(&parent_event->child_mutex);
Jiri Olsafadfe7b2014-08-01 14:33:02 +02009022 if (is_orphaned_event(parent_event) ||
9023 !atomic_long_inc_not_zero(&parent_event->refcount)) {
Peter Zijlstrac6e5b732016-01-15 16:07:41 +02009024 mutex_unlock(&parent_event->child_mutex);
Al Viroa6fa9412012-08-20 14:59:25 +01009025 free_event(child_event);
9026 return NULL;
9027 }
9028
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02009029 get_ctx(child_ctx);
9030
9031 /*
9032 * Make the child state follow the state of the parent event,
9033 * not its attr.disabled bit. We hold the parent's mutex,
9034 * so we won't race with perf_event_{en, dis}able_family.
9035 */
Jiri Olsa1929def2014-09-12 13:18:27 +02009036 if (parent_state >= PERF_EVENT_STATE_INACTIVE)
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02009037 child_event->state = PERF_EVENT_STATE_INACTIVE;
9038 else
9039 child_event->state = PERF_EVENT_STATE_OFF;
9040
9041 if (parent_event->attr.freq) {
9042 u64 sample_period = parent_event->hw.sample_period;
9043 struct hw_perf_event *hwc = &child_event->hw;
9044
9045 hwc->sample_period = sample_period;
9046 hwc->last_period = sample_period;
9047
9048 local64_set(&hwc->period_left, sample_period);
9049 }
9050
9051 child_event->ctx = child_ctx;
9052 child_event->overflow_handler = parent_event->overflow_handler;
Avi Kivity4dc0da82011-06-29 18:42:35 +03009053 child_event->overflow_handler_context
9054 = parent_event->overflow_handler_context;
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02009055
9056 /*
Thomas Gleixner614b6782010-12-03 16:24:32 -02009057 * Precalculate sample_data sizes
9058 */
9059 perf_event__header_size(child_event);
Arnaldo Carvalho de Melo6844c092010-12-03 16:36:35 -02009060 perf_event__id_header_size(child_event);
Thomas Gleixner614b6782010-12-03 16:24:32 -02009061
9062 /*
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02009063 * Link it up in the child's context:
9064 */
Peter Zijlstracee010e2010-09-10 12:51:54 +02009065 raw_spin_lock_irqsave(&child_ctx->lock, flags);
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02009066 add_event_to_ctx(child_event, child_ctx);
Peter Zijlstracee010e2010-09-10 12:51:54 +02009067 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02009068
9069 /*
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02009070 * Link this into the parent event's child list
9071 */
Peter Zijlstra97dee4f2010-09-07 15:35:33 +02009072 list_add_tail(&child_event->child_list, &parent_event->child_list);
9073 mutex_unlock(&parent_event->child_mutex);
9074
9075 return child_event;
9076}
9077
9078static int inherit_group(struct perf_event *parent_event,
9079 struct task_struct *parent,
9080 struct perf_event_context *parent_ctx,
9081 struct task_struct *child,
9082 struct perf_event_context *child_ctx)
9083{
9084 struct perf_event *leader;
9085 struct perf_event *sub;
9086 struct perf_event *child_ctr;
9087
9088 leader = inherit_event(parent_event, parent, parent_ctx,
9089 child, NULL, child_ctx);
9090 if (IS_ERR(leader))
9091 return PTR_ERR(leader);
9092 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
9093 child_ctr = inherit_event(sub, parent, parent_ctx,
9094 child, leader, child_ctx);
9095 if (IS_ERR(child_ctr))
9096 return PTR_ERR(child_ctr);
9097 }
9098 return 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009099}
9100
Frederic Weisbecker889ff012010-01-09 20:04:47 +01009101static int
9102inherit_task_group(struct perf_event *event, struct task_struct *parent,
9103 struct perf_event_context *parent_ctx,
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02009104 struct task_struct *child, int ctxn,
Frederic Weisbecker889ff012010-01-09 20:04:47 +01009105 int *inherited_all)
9106{
9107 int ret;
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02009108 struct perf_event_context *child_ctx;
Frederic Weisbecker889ff012010-01-09 20:04:47 +01009109
9110 if (!event->attr.inherit) {
9111 *inherited_all = 0;
9112 return 0;
9113 }
9114
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01009115 child_ctx = child->perf_event_ctxp[ctxn];
Frederic Weisbecker889ff012010-01-09 20:04:47 +01009116 if (!child_ctx) {
9117 /*
9118 * This is executed from the parent task context, so
9119 * inherit events that have been marked for cloning.
9120 * First allocate and initialize a context for the
9121 * child.
9122 */
9123
Jiri Olsa734df5a2013-07-09 17:44:10 +02009124 child_ctx = alloc_perf_context(parent_ctx->pmu, child);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01009125 if (!child_ctx)
9126 return -ENOMEM;
9127
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02009128 child->perf_event_ctxp[ctxn] = child_ctx;
Frederic Weisbecker889ff012010-01-09 20:04:47 +01009129 }
9130
9131 ret = inherit_group(event, parent, parent_ctx,
9132 child, child_ctx);
9133
9134 if (ret)
9135 *inherited_all = 0;
9136
9137 return ret;
9138}
9139
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009140/*
9141 * Initialize the perf_event context in task_struct
9142 */
Jiri Olsa985c8dc2014-06-24 10:20:24 +02009143static int perf_event_init_context(struct task_struct *child, int ctxn)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009144{
Frederic Weisbecker889ff012010-01-09 20:04:47 +01009145 struct perf_event_context *child_ctx, *parent_ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009146 struct perf_event_context *cloned_ctx;
9147 struct perf_event *event;
9148 struct task_struct *parent = current;
9149 int inherited_all = 1;
Thomas Gleixnerdddd3372010-11-24 10:05:55 +01009150 unsigned long flags;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009151 int ret = 0;
9152
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02009153 if (likely(!parent->perf_event_ctxp[ctxn]))
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009154 return 0;
9155
9156 /*
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009157 * If the parent's context is a clone, pin it so it won't get
9158 * swapped under us.
9159 */
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02009160 parent_ctx = perf_pin_task_context(parent, ctxn);
Peter Zijlstraffb4ef22014-05-05 19:12:20 +02009161 if (!parent_ctx)
9162 return 0;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009163
9164 /*
9165 * No need to check if parent_ctx != NULL here; since we saw
9166 * it non-NULL earlier, the only reason for it to become NULL
9167 * is if we exit, and since we're currently in the middle of
9168 * a fork we can't be exiting at the same time.
9169 */
9170
9171 /*
9172 * Lock the parent list. No need to lock the child - not PID
9173 * hashed yet and not running, so nobody can access it.
9174 */
9175 mutex_lock(&parent_ctx->mutex);
9176
9177 /*
9178 * We don't have to disable NMIs - we are only looking at
9179 * the list, not manipulating it:
9180 */
Frederic Weisbecker889ff012010-01-09 20:04:47 +01009181 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02009182 ret = inherit_task_group(event, parent, parent_ctx,
9183 child, ctxn, &inherited_all);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01009184 if (ret)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009185 break;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009186 }
9187
Thomas Gleixnerdddd3372010-11-24 10:05:55 +01009188 /*
9189 * We can't hold ctx->lock when iterating the ->flexible_groups list due
9190 * to allocations, but we need to prevent rotation because
9191 * rotate_ctx() will change the list from interrupt context.
9192 */
9193 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
9194 parent_ctx->rotate_disable = 1;
9195 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
9196
Frederic Weisbecker889ff012010-01-09 20:04:47 +01009197 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02009198 ret = inherit_task_group(event, parent, parent_ctx,
9199 child, ctxn, &inherited_all);
Frederic Weisbecker889ff012010-01-09 20:04:47 +01009200 if (ret)
9201 break;
9202 }
9203
Thomas Gleixnerdddd3372010-11-24 10:05:55 +01009204 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
9205 parent_ctx->rotate_disable = 0;
Thomas Gleixnerdddd3372010-11-24 10:05:55 +01009206
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02009207 child_ctx = child->perf_event_ctxp[ctxn];
Frederic Weisbecker889ff012010-01-09 20:04:47 +01009208
Peter Zijlstra05cbaa22009-12-30 16:00:35 +01009209 if (child_ctx && inherited_all) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009210 /*
9211 * Mark the child context as a clone of the parent
9212 * context, or of whatever the parent is a clone of.
Peter Zijlstrac5ed5142011-01-17 13:45:37 +01009213 *
9214 * Note that if the parent is a clone, the holding of
9215 * parent_ctx->lock avoids it from being uncloned.
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009216 */
Peter Zijlstrac5ed5142011-01-17 13:45:37 +01009217 cloned_ctx = parent_ctx->parent_ctx;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009218 if (cloned_ctx) {
9219 child_ctx->parent_ctx = cloned_ctx;
9220 child_ctx->parent_gen = parent_ctx->parent_gen;
9221 } else {
9222 child_ctx->parent_ctx = parent_ctx;
9223 child_ctx->parent_gen = parent_ctx->generation;
9224 }
9225 get_ctx(child_ctx->parent_ctx);
9226 }
9227
Peter Zijlstrac5ed5142011-01-17 13:45:37 +01009228 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009229 mutex_unlock(&parent_ctx->mutex);
9230
9231 perf_unpin_context(parent_ctx);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01009232 put_ctx(parent_ctx);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009233
9234 return ret;
9235}
9236
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02009237/*
9238 * Initialize the perf_event context in task_struct
9239 */
9240int perf_event_init_task(struct task_struct *child)
9241{
9242 int ctxn, ret;
9243
Oleg Nesterov8550d7c2011-01-19 19:22:28 +01009244 memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
9245 mutex_init(&child->perf_event_mutex);
9246 INIT_LIST_HEAD(&child->perf_event_list);
9247
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02009248 for_each_task_context_nr(ctxn) {
9249 ret = perf_event_init_context(child, ctxn);
Peter Zijlstra6c72e3502014-10-02 16:17:02 -07009250 if (ret) {
9251 perf_event_free_task(child);
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02009252 return ret;
Peter Zijlstra6c72e3502014-10-02 16:17:02 -07009253 }
Peter Zijlstra8dc85d52010-09-02 16:50:03 +02009254 }
9255
9256 return 0;
9257}
9258
Paul Mackerras220b1402010-03-10 20:45:52 +11009259static void __init perf_event_init_all_cpus(void)
9260{
Peter Zijlstrab28ab832010-09-06 14:48:15 +02009261 struct swevent_htable *swhash;
Paul Mackerras220b1402010-03-10 20:45:52 +11009262 int cpu;
Paul Mackerras220b1402010-03-10 20:45:52 +11009263
9264 for_each_possible_cpu(cpu) {
Peter Zijlstrab28ab832010-09-06 14:48:15 +02009265 swhash = &per_cpu(swevent_htable, cpu);
9266 mutex_init(&swhash->hlist_mutex);
Mark Rutland2fde4f92015-01-07 15:01:54 +00009267 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
Paul Mackerras220b1402010-03-10 20:45:52 +11009268 }
9269}
9270
Paul Gortmaker0db06282013-06-19 14:53:51 -04009271static void perf_event_init_cpu(int cpu)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009272{
Peter Zijlstra108b02c2010-09-06 14:32:03 +02009273 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009274
Peter Zijlstrab28ab832010-09-06 14:48:15 +02009275 mutex_lock(&swhash->hlist_mutex);
Thomas Gleixner059fcd82016-02-09 20:11:34 +00009276 if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02009277 struct swevent_hlist *hlist;
9278
Peter Zijlstrab28ab832010-09-06 14:48:15 +02009279 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
9280 WARN_ON(!hlist);
9281 rcu_assign_pointer(swhash->swevent_hlist, hlist);
Frederic Weisbecker76e1d902010-04-05 15:35:57 +02009282 }
Peter Zijlstrab28ab832010-09-06 14:48:15 +02009283 mutex_unlock(&swhash->hlist_mutex);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009284}
9285
Dave Young2965faa2015-09-09 15:38:55 -07009286#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
Peter Zijlstra108b02c2010-09-06 14:32:03 +02009287static void __perf_event_exit_context(void *__info)
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009288{
Peter Zijlstra108b02c2010-09-06 14:32:03 +02009289 struct perf_event_context *ctx = __info;
Peter Zijlstrafae3fde2016-01-11 15:00:50 +01009290 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
9291 struct perf_event *event;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009292
Peter Zijlstrafae3fde2016-01-11 15:00:50 +01009293 raw_spin_lock(&ctx->lock);
9294 list_for_each_entry(event, &ctx->event_list, event_entry)
Peter Zijlstra45a0e072016-01-26 13:09:48 +01009295 __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
Peter Zijlstrafae3fde2016-01-11 15:00:50 +01009296 raw_spin_unlock(&ctx->lock);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009297}
Peter Zijlstra108b02c2010-09-06 14:32:03 +02009298
9299static void perf_event_exit_cpu_context(int cpu)
9300{
9301 struct perf_event_context *ctx;
9302 struct pmu *pmu;
9303 int idx;
9304
9305 idx = srcu_read_lock(&pmus_srcu);
9306 list_for_each_entry_rcu(pmu, &pmus, entry) {
Peter Zijlstra917bdd12010-09-17 11:28:49 +02009307 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
Peter Zijlstra108b02c2010-09-06 14:32:03 +02009308
9309 mutex_lock(&ctx->mutex);
9310 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
9311 mutex_unlock(&ctx->mutex);
9312 }
9313 srcu_read_unlock(&pmus_srcu, idx);
Peter Zijlstra108b02c2010-09-06 14:32:03 +02009314}
9315
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009316static void perf_event_exit_cpu(int cpu)
9317{
Peter Zijlstrae3703f82014-02-24 12:06:12 +01009318 perf_event_exit_cpu_context(cpu);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009319}
9320#else
9321static inline void perf_event_exit_cpu(int cpu) { }
9322#endif
9323
Peter Zijlstrac2774432010-12-08 15:29:02 +01009324static int
9325perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
9326{
9327 int cpu;
9328
9329 for_each_online_cpu(cpu)
9330 perf_event_exit_cpu(cpu);
9331
9332 return NOTIFY_OK;
9333}
9334
9335/*
9336 * Run the perf reboot notifier at the very last possible moment so that
9337 * the generic watchdog code runs as long as possible.
9338 */
9339static struct notifier_block perf_reboot_notifier = {
9340 .notifier_call = perf_reboot,
9341 .priority = INT_MIN,
9342};
9343
Paul Gortmaker0db06282013-06-19 14:53:51 -04009344static int
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009345perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
9346{
9347 unsigned int cpu = (long)hcpu;
9348
Linus Torvalds4536e4d2011-11-03 07:44:04 -07009349 switch (action & ~CPU_TASKS_FROZEN) {
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009350
9351 case CPU_UP_PREPARE:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009352 perf_event_init_cpu(cpu);
9353 break;
9354
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009355 case CPU_DOWN_PREPARE:
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009356 perf_event_exit_cpu(cpu);
9357 break;
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009358 default:
9359 break;
9360 }
9361
9362 return NOTIFY_OK;
9363}
9364
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009365void __init perf_event_init(void)
9366{
Jason Wessel3c502e72010-11-04 17:33:01 -05009367 int ret;
9368
Peter Zijlstra2e80a822010-11-17 23:17:36 +01009369 idr_init(&pmu_idr);
9370
Paul Mackerras220b1402010-03-10 20:45:52 +11009371 perf_event_init_all_cpus();
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02009372 init_srcu_struct(&pmus_srcu);
Peter Zijlstra2e80a822010-11-17 23:17:36 +01009373 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
9374 perf_pmu_register(&perf_cpu_clock, NULL, -1);
9375 perf_pmu_register(&perf_task_clock, NULL, -1);
Peter Zijlstrab0a873e2010-06-11 13:35:08 +02009376 perf_tp_register();
9377 perf_cpu_notifier(perf_cpu_notify);
Peter Zijlstrac2774432010-12-08 15:29:02 +01009378 register_reboot_notifier(&perf_reboot_notifier);
Jason Wessel3c502e72010-11-04 17:33:01 -05009379
9380 ret = init_hw_breakpoint();
9381 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
Gleb Natapovb2029522011-11-27 17:59:09 +02009382
Jiri Olsab01c3a02012-03-23 15:41:20 +01009383 /*
9384 * Build time assertion that we keep the data_head at the intended
9385 * location. IOW, validation we got the __reserved[] size right.
9386 */
9387 BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
9388 != 1024);
Ingo Molnarcdd6c482009-09-21 12:02:48 +02009389}
Peter Zijlstraabe43402010-11-17 23:17:37 +01009390
Cody P Schaferfd979c02015-01-30 13:45:57 -08009391ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
9392 char *page)
9393{
9394 struct perf_pmu_events_attr *pmu_attr =
9395 container_of(attr, struct perf_pmu_events_attr, attr);
9396
9397 if (pmu_attr->event_str)
9398 return sprintf(page, "%s\n", pmu_attr->event_str);
9399
9400 return 0;
9401}
9402
Peter Zijlstraabe43402010-11-17 23:17:37 +01009403static int __init perf_event_sysfs_init(void)
9404{
9405 struct pmu *pmu;
9406 int ret;
9407
9408 mutex_lock(&pmus_lock);
9409
9410 ret = bus_register(&pmu_bus);
9411 if (ret)
9412 goto unlock;
9413
9414 list_for_each_entry(pmu, &pmus, entry) {
9415 if (!pmu->name || pmu->type < 0)
9416 continue;
9417
9418 ret = pmu_dev_alloc(pmu);
9419 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
9420 }
9421 pmu_bus_running = 1;
9422 ret = 0;
9423
9424unlock:
9425 mutex_unlock(&pmus_lock);
9426
9427 return ret;
9428}
9429device_initcall(perf_event_sysfs_init);
Stephane Eraniane5d13672011-02-14 11:20:01 +02009430
9431#ifdef CONFIG_CGROUP_PERF
Tejun Heoeb954192013-08-08 20:11:23 -04009432static struct cgroup_subsys_state *
9433perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
Stephane Eraniane5d13672011-02-14 11:20:01 +02009434{
9435 struct perf_cgroup *jc;
Stephane Eraniane5d13672011-02-14 11:20:01 +02009436
Li Zefan1b15d052011-03-03 14:26:06 +08009437 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
Stephane Eraniane5d13672011-02-14 11:20:01 +02009438 if (!jc)
9439 return ERR_PTR(-ENOMEM);
9440
Stephane Eraniane5d13672011-02-14 11:20:01 +02009441 jc->info = alloc_percpu(struct perf_cgroup_info);
9442 if (!jc->info) {
9443 kfree(jc);
9444 return ERR_PTR(-ENOMEM);
9445 }
9446
Stephane Eraniane5d13672011-02-14 11:20:01 +02009447 return &jc->css;
9448}
9449
Tejun Heoeb954192013-08-08 20:11:23 -04009450static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
Stephane Eraniane5d13672011-02-14 11:20:01 +02009451{
Tejun Heoeb954192013-08-08 20:11:23 -04009452 struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
9453
Stephane Eraniane5d13672011-02-14 11:20:01 +02009454 free_percpu(jc->info);
9455 kfree(jc);
9456}
9457
9458static int __perf_cgroup_move(void *info)
9459{
9460 struct task_struct *task = info;
Stephane Eranianddaaf4e2015-11-12 11:00:03 +01009461 rcu_read_lock();
Stephane Eraniane5d13672011-02-14 11:20:01 +02009462 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
Stephane Eranianddaaf4e2015-11-12 11:00:03 +01009463 rcu_read_unlock();
Stephane Eraniane5d13672011-02-14 11:20:01 +02009464 return 0;
9465}
9466
Tejun Heo1f7dd3e52015-12-03 10:18:21 -05009467static void perf_cgroup_attach(struct cgroup_taskset *tset)
Stephane Eraniane5d13672011-02-14 11:20:01 +02009468{
Tejun Heobb9d97b2011-12-12 18:12:21 -08009469 struct task_struct *task;
Tejun Heo1f7dd3e52015-12-03 10:18:21 -05009470 struct cgroup_subsys_state *css;
Tejun Heobb9d97b2011-12-12 18:12:21 -08009471
Tejun Heo1f7dd3e52015-12-03 10:18:21 -05009472 cgroup_taskset_for_each(task, css, tset)
Tejun Heobb9d97b2011-12-12 18:12:21 -08009473 task_function_call(task, __perf_cgroup_move, task);
Stephane Eraniane5d13672011-02-14 11:20:01 +02009474}
9475
Tejun Heo073219e2014-02-08 10:36:58 -05009476struct cgroup_subsys perf_event_cgrp_subsys = {
Tejun Heo92fb9742012-11-19 08:13:38 -08009477 .css_alloc = perf_cgroup_css_alloc,
9478 .css_free = perf_cgroup_css_free,
Tejun Heobb9d97b2011-12-12 18:12:21 -08009479 .attach = perf_cgroup_attach,
Stephane Eraniane5d13672011-02-14 11:20:01 +02009480};
9481#endif /* CONFIG_CGROUP_PERF */