perfcounters: implement "counter inheritance"
Impact: implement new performance feature
Counter inheritance can be used to run performance counters in a workload,
transparently - and pipe back the counter results to the parent counter.
Inheritance for performance counters works the following way: when creating
a counter it can be marked with the .inherit=1 flag. Such counters are then
'inherited' by all child tasks (be they fork()-ed or clone()-ed). These
counters get inherited through exec() boundaries as well (except through
setuid boundaries).
The counter values get added back to the parent counter(s) when the child
task(s) exit - much like stime/utime statistics are gathered. So inherited
counters are ideal to gather summary statistics about an application's
behavior via shell commands, without having to modify that application.
The timec.c command utilizes counter inheritance:
http://redhat.com/~mingo/perfcounters/timec.c
Sample output:
$ ./timec -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
Performance counter stats for 'ls':
163516953 instructions
2295 cache-misses
2855182 branch-misses
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 7246028..e5d25bf 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -75,10 +75,11 @@
u64 irq_period;
u32 record_type;
- u32 disabled : 1, /* off by default */
- nmi : 1, /* NMI sampling */
- raw : 1, /* raw event type */
- __reserved_1 : 29;
+ u32 disabled : 1, /* off by default */
+ nmi : 1, /* NMI sampling */
+ raw : 1, /* raw event type */
+ inherit : 1, /* children inherit it */
+ __reserved_1 : 28;
u64 __reserved_2;
};
@@ -138,6 +139,8 @@
PERF_COUNTER_STATE_ACTIVE = 1,
};
+struct file;
+
/**
* struct perf_counter - performance counter kernel representation:
*/
@@ -156,7 +159,10 @@
struct perf_counter_context *ctx;
struct task_struct *task;
+ struct file *filp;
+ unsigned int nr_inherited;
+ struct perf_counter *parent;
/*
* Protect attach/detach:
*/
@@ -210,13 +216,16 @@
extern int perf_max_counters;
#ifdef CONFIG_PERF_COUNTERS
+extern void
+perf_counter_show(struct perf_counter *counter, char *str, int trace);
extern const struct hw_perf_counter_ops *
hw_perf_counter_init(struct perf_counter *counter);
extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
extern void perf_counter_task_tick(struct task_struct *task, int cpu);
-extern void perf_counter_init_task(struct task_struct *task);
+extern void perf_counter_init_task(struct task_struct *child);
+extern void perf_counter_exit_task(struct task_struct *child);
extern void perf_counter_notify(struct pt_regs *regs);
extern void perf_counter_print_debug(void);
extern u64 hw_perf_save_disable(void);
@@ -226,12 +235,15 @@
#else
static inline void
+perf_counter_show(struct perf_counter *counter, char *str, int trace) { }
+static inline void
perf_counter_task_sched_in(struct task_struct *task, int cpu) { }
static inline void
perf_counter_task_sched_out(struct task_struct *task, int cpu) { }
static inline void
perf_counter_task_tick(struct task_struct *task, int cpu) { }
-static inline void perf_counter_init_task(struct task_struct *task) { }
+static inline void perf_counter_init_task(struct task_struct *child) { }
+static inline void perf_counter_exit_task(struct task_struct *child) { }
static inline void perf_counter_notify(struct pt_regs *regs) { }
static inline void perf_counter_print_debug(void) { }
static inline void hw_perf_restore(u64 ctrl) { }
diff --git a/kernel/exit.c b/kernel/exit.c
index 2d8be7e..d336c90 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1093,11 +1093,12 @@
mpol_put(tsk->mempolicy);
tsk->mempolicy = NULL;
#endif
-#ifdef CONFIG_FUTEX
/*
- * This must happen late, after the PID is not
- * hashed anymore:
+ * These must happen late, after the PID is not
+ * hashed anymore, but still at a point that may sleep:
*/
+ perf_counter_exit_task(tsk);
+#ifdef CONFIG_FUTEX
if (unlikely(!list_empty(&tsk->pi_state_list)))
exit_pi_state_list(tsk);
if (unlikely(current->pi_state_cache))
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 416861c..f5e81dd 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -80,8 +80,6 @@
list_del_init(&sibling->list_entry);
list_add_tail(&sibling->list_entry, &ctx->counter_list);
- WARN_ON_ONCE(!sibling->group_leader);
- WARN_ON_ONCE(sibling->group_leader == sibling);
sibling->group_leader = sibling;
}
}
@@ -97,6 +95,7 @@
struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
struct perf_counter *counter = info;
struct perf_counter_context *ctx = counter->ctx;
+ unsigned long flags;
u64 perf_flags;
/*
@@ -107,7 +106,7 @@
if (ctx->task && cpuctx->task_ctx != ctx)
return;
- spin_lock(&ctx->lock);
+ spin_lock_irqsave(&ctx->lock, flags);
if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
counter->hw_ops->hw_perf_counter_disable(counter);
@@ -136,7 +135,7 @@
perf_max_counters - perf_reserved_percpu);
}
- spin_unlock(&ctx->lock);
+ spin_unlock_irqrestore(&ctx->lock, flags);
}
@@ -199,6 +198,7 @@
struct perf_counter *counter = info;
struct perf_counter_context *ctx = counter->ctx;
int cpu = smp_processor_id();
+ unsigned long flags;
u64 perf_flags;
/*
@@ -209,7 +209,7 @@
if (ctx->task && cpuctx->task_ctx != ctx)
return;
- spin_lock(&ctx->lock);
+ spin_lock_irqsave(&ctx->lock, flags);
/*
* Protect the list operation against NMI by disabling the
@@ -232,7 +232,7 @@
if (!ctx->task && cpuctx->max_pertask)
cpuctx->max_pertask--;
- spin_unlock(&ctx->lock);
+ spin_unlock_irqrestore(&ctx->lock, flags);
}
/*
@@ -446,10 +446,9 @@
*/
perf_flags = hw_perf_save_disable();
- list_for_each_entry(counter, &ctx->counter_list, list_entry) {
- WARN_ON_ONCE(counter->state == PERF_COUNTER_STATE_ACTIVE);
+ list_for_each_entry(counter, &ctx->counter_list, list_entry)
counter->state = PERF_COUNTER_STATE_OFF;
- }
+
hw_perf_restore(perf_flags);
spin_unlock(&ctx->lock);
@@ -526,26 +525,6 @@
}
/*
- * Initialize the perf_counter context in a task_struct:
- */
-static void
-__perf_counter_init_context(struct perf_counter_context *ctx,
- struct task_struct *task)
-{
- spin_lock_init(&ctx->lock);
- INIT_LIST_HEAD(&ctx->counter_list);
- ctx->nr_counters = 0;
- ctx->task = task;
-}
-/*
- * Initialize the perf_counter context in task_struct
- */
-void perf_counter_init_task(struct task_struct *task)
-{
- __perf_counter_init_context(&task->perf_counter_ctx, task);
-}
-
-/*
* Cross CPU call to read the hardware counter
*/
static void __hw_perf_counter_read(void *info)
@@ -663,7 +642,6 @@
cpuctx = &per_cpu(perf_cpu_context, cpu);
ctx = &cpuctx->ctx;
- WARN_ON_ONCE(ctx->task);
return ctx;
}
@@ -915,12 +893,13 @@
static struct perf_counter *
perf_counter_alloc(struct perf_counter_hw_event *hw_event,
int cpu,
- struct perf_counter *group_leader)
+ struct perf_counter *group_leader,
+ gfp_t gfpflags)
{
const struct hw_perf_counter_ops *hw_ops;
struct perf_counter *counter;
- counter = kzalloc(sizeof(*counter), GFP_KERNEL);
+ counter = kzalloc(sizeof(*counter), gfpflags);
if (!counter)
return NULL;
@@ -947,9 +926,8 @@
hw_ops = NULL;
if (!hw_event->raw && hw_event->type < 0)
hw_ops = sw_perf_counter_init(counter);
- if (!hw_ops) {
+ if (!hw_ops)
hw_ops = hw_perf_counter_init(counter);
- }
if (!hw_ops) {
kfree(counter);
@@ -975,8 +953,10 @@
struct perf_counter *counter, *group_leader;
struct perf_counter_hw_event hw_event;
struct perf_counter_context *ctx;
+ struct file *counter_file = NULL;
struct file *group_file = NULL;
int fput_needed = 0;
+ int fput_needed2 = 0;
int ret;
if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
@@ -1017,25 +997,29 @@
}
ret = -EINVAL;
- counter = perf_counter_alloc(&hw_event, cpu, group_leader);
+ counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL);
if (!counter)
goto err_put_context;
- perf_install_in_context(ctx, counter, cpu);
-
ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
if (ret < 0)
- goto err_remove_free_put_context;
+ goto err_free_put_context;
+
+ counter_file = fget_light(ret, &fput_needed2);
+ if (!counter_file)
+ goto err_free_put_context;
+
+ counter->filp = counter_file;
+ perf_install_in_context(ctx, counter, cpu);
+
+ fput_light(counter_file, fput_needed2);
out_fput:
fput_light(group_file, fput_needed);
return ret;
-err_remove_free_put_context:
- mutex_lock(&counter->mutex);
- perf_counter_remove_from_context(counter);
- mutex_unlock(&counter->mutex);
+err_free_put_context:
kfree(counter);
err_put_context:
@@ -1044,6 +1028,186 @@
goto out_fput;
}
+/*
+ * Zero @ctx, set up its lock and empty counter list, and bind it to @task:
+ */
+static void
+__perf_counter_init_context(struct perf_counter_context *ctx,
+ struct task_struct *task)
+{
+ memset(ctx, 0, sizeof(*ctx));
+ spin_lock_init(&ctx->lock);
+ INIT_LIST_HEAD(&ctx->counter_list);
+ ctx->task = task;
+}
+
+/*
+ * Clone @parent_counter into @child_ctx; returns 0, or -ENOMEM if alloc fails:
+ */
+static int
+inherit_counter(struct perf_counter *parent_counter,
+ struct task_struct *parent,
+ struct perf_counter_context *parent_ctx,
+ struct task_struct *child,
+ struct perf_counter_context *child_ctx)
+{
+ struct perf_counter *child_counter;
+
+ child_counter = perf_counter_alloc(&parent_counter->hw_event,
+ parent_counter->cpu, NULL,
+ GFP_ATOMIC);
+ if (!child_counter)
+ return -ENOMEM;
+
+ /*
+ * Link the new counter into the child's context and account for it:
+ */
+ child_counter->ctx = child_ctx;
+ child_counter->task = child;
+ list_add_counter(child_counter, child_ctx);
+ child_ctx->nr_counters++;
+
+ child_counter->parent = parent_counter;
+ parent_counter->nr_inherited++;
+ /*
+ * Mark the clone inheritable, so the child's children inherit it too:
+ */
+ child_counter->hw_event.inherit = 1;
+
+ /*
+ * Get a reference to the parent filp - we will fput it
+ * when the child counter exits. This is safe to do because
+ * we are in the parent and we know that the filp still
+ * exists and has a nonzero count:
+ */
+ atomic_long_inc(&parent_counter->filp->f_count);
+
+ return 0;
+}
+
+/* Unlink an exiting task's counter; add its count to the parent's, if any. */
+static void
+__perf_counter_exit_task(struct task_struct *child,
+ struct perf_counter *child_counter,
+ struct perf_counter_context *child_ctx)
+{
+ struct perf_counter *parent_counter;
+ u64 child_val;
+ u64 perf_flags;
+
+ /*
+ * Disable and unlink this counter.
+ *
+ * Be careful about zapping the list - IRQ/NMI context
+ * could still be processing it:
+ */
+ local_irq_disable();
+ perf_flags = hw_perf_save_disable();
+
+ if (child_counter->state == PERF_COUNTER_STATE_ACTIVE)
+ child_counter->hw_ops->hw_perf_counter_disable(child_counter);
+ list_del_init(&child_counter->list_entry);
+
+ hw_perf_restore(perf_flags);
+ local_irq_enable();
+
+ parent_counter = child_counter->parent;
+ /*
+ * It can happen that parent exits first, and has counters
+ * that are still around due to the child reference. These
+ * counters need to be zapped - but otherwise linger.
+ */
+ if (!parent_counter)
+ return;
+
+ child_val = atomic64_read(&child_counter->count);
+
+ /*
+ * Add back the child's count to the parent's count:
+ */
+ atomic64_add(child_val, &parent_counter->count);
+
+ fput(parent_counter->filp);
+
+ kfree(child_counter);
+}
+
+/*
+ * When a child task exits, run the exit handling on each of its counters.
+ *
+ * Note: we are running in child context, but the PID is not hashed
+ * anymore so new counters will not be added.
+ */
+void perf_counter_exit_task(struct task_struct *child)
+{
+ struct perf_counter *child_counter, *tmp;
+ struct perf_counter_context *child_ctx;
+
+ child_ctx = &child->perf_counter_ctx;
+
+ if (likely(!child_ctx->nr_counters))
+ return;
+
+ list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
+ list_entry)
+ __perf_counter_exit_task(child, child_counter, child_ctx);
+}
+
+/*
+ * Initialize the child's perf_counter context and inherit parent counters:
+ */
+void perf_counter_init_task(struct task_struct *child)
+{
+ struct perf_counter_context *child_ctx, *parent_ctx;
+ struct perf_counter *counter, *parent_counter;
+ struct task_struct *parent = current;
+ unsigned long flags;
+
+ child_ctx = &child->perf_counter_ctx;
+ parent_ctx = &parent->perf_counter_ctx;
+
+ __perf_counter_init_context(child_ctx, child);
+
+ /*
+ * This is executed from the parent task context, so inherit
+ * counters that have been marked for cloning:
+ */
+
+ if (likely(!parent_ctx->nr_counters))
+ return;
+
+ /*
+ * Lock the parent list. No need to lock the child - not PID
+ * hashed yet and not running, so nobody can access it.
+ */
+ spin_lock_irqsave(&parent_ctx->lock, flags);
+
+ /*
+ * We don't have to disable NMIs - we are only looking at
+ * the list, not manipulating it:
+ */
+ list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
+ if (!counter->hw_event.inherit || counter->group_leader != counter)
+ continue;
+
+ /*
+ * Instead of creating recursive hierarchies of counters,
+ * we link inherited counters back to the original parent,
+ * which has a filp for sure, which we use as the reference
+ * count:
+ */
+ parent_counter = counter;
+ if (counter->parent)
+ parent_counter = counter->parent;
+
+ if (inherit_counter(parent_counter, parent,
+ parent_ctx, child, child_ctx))
+ break;
+ }
+
+ spin_unlock_irqrestore(&parent_ctx->lock, flags);
+}
+
static void __cpuinit perf_counter_init_cpu(int cpu)
{
struct perf_cpu_context *cpuctx;