tracing: Implement event pid filtering
Add the necessary hooks to use the pids loaded in set_event_pid to filter
all the events enabled in the tracing instance that match the pids listed.
Two probes are added to both sched_switch and sched_wakeup tracepoints to be
called before other probes are called and after the other probes are called.
The first is used to set the necessary flags to let the probes know to test
if they should be traced or not.
The sched_switch pre probe will set the "ignore_pid" flag if neither the
previous or next task has a matching pid.
The sched_switch probe will set the "ignore_pid" flag if the next task
does not match the matching pid.
The pre probe allows for probes tracing sched_switch to be traced if
necessary.
The sched_wakeup pre probe will set the "ignore_pid" flag if neither the
current task nor the wakee task has a matching pid.
The sched_wakeup post probe will set the "ignore_pid" flag if the current
task does not have a matching pid.
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index f85693b..429fdfc 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -328,6 +328,7 @@
EVENT_FILE_FL_SOFT_DISABLED_BIT,
EVENT_FILE_FL_TRIGGER_MODE_BIT,
EVENT_FILE_FL_TRIGGER_COND_BIT,
+ EVENT_FILE_FL_PID_FILTER_BIT,
};
/*
@@ -341,6 +342,7 @@
* tracepoint may be enabled)
* TRIGGER_MODE - When set, invoke the triggers associated with the event
* TRIGGER_COND - When set, one or more triggers has an associated filter
+ * PID_FILTER - When set, the event is filtered based on pid
*/
enum {
EVENT_FILE_FL_ENABLED = (1 << EVENT_FILE_FL_ENABLED_BIT),
@@ -351,6 +353,7 @@
EVENT_FILE_FL_SOFT_DISABLED = (1 << EVENT_FILE_FL_SOFT_DISABLED_BIT),
EVENT_FILE_FL_TRIGGER_MODE = (1 << EVENT_FILE_FL_TRIGGER_MODE_BIT),
EVENT_FILE_FL_TRIGGER_COND = (1 << EVENT_FILE_FL_TRIGGER_COND_BIT),
+ EVENT_FILE_FL_PID_FILTER = (1 << EVENT_FILE_FL_PID_FILTER_BIT),
};
struct trace_event_file {
@@ -429,6 +432,8 @@
extern void event_triggers_post_call(struct trace_event_file *file,
enum event_trigger_type tt);
+bool trace_event_ignore_this_pid(struct trace_event_file *trace_file);
+
/**
* trace_trigger_soft_disabled - do triggers and test if soft disabled
* @file: The file pointer of the event to test
@@ -448,6 +453,8 @@
event_triggers_call(file, NULL);
if (eflags & EVENT_FILE_FL_SOFT_DISABLED)
return true;
+ if (eflags & EVENT_FILE_FL_PID_FILTER)
+ return trace_event_ignore_this_pid(file);
}
return false;
}
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2504810..89ffdaf 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -156,6 +156,8 @@
pid_t pid;
kuid_t uid;
char comm[TASK_COMM_LEN];
+
+ bool ignore_pid;
};
struct tracer;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 2ad7014..ab07058 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -22,6 +22,8 @@
#include <linux/slab.h>
#include <linux/delay.h>
+#include <trace/events/sched.h>
+
#include <asm/setup.h>
#include "trace_output.h"
@@ -212,12 +214,32 @@
}
EXPORT_SYMBOL_GPL(trace_event_raw_init);
+bool trace_event_ignore_this_pid(struct trace_event_file *trace_file)
+{
+ struct trace_array *tr = trace_file->tr;
+ struct trace_array_cpu *data;
+ struct trace_pid_list *pid_list;
+
+ pid_list = rcu_dereference_sched(tr->filtered_pids);
+ if (!pid_list)
+ return false;
+
+ data = this_cpu_ptr(tr->trace_buffer.data);
+
+ return data->ignore_pid;
+}
+EXPORT_SYMBOL_GPL(trace_event_ignore_this_pid);
+
void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
struct trace_event_file *trace_file,
unsigned long len)
{
struct trace_event_call *event_call = trace_file->event_call;
+ if ((trace_file->flags & EVENT_FILE_FL_PID_FILTER) &&
+ trace_event_ignore_this_pid(trace_file))
+ return NULL;
+
local_save_flags(fbuffer->flags);
fbuffer->pc = preempt_count();
fbuffer->trace_file = trace_file;
@@ -459,15 +481,114 @@
return 1;
}
+static bool
+check_ignore_pid(struct trace_pid_list *filtered_pids, struct task_struct *task)
+{
+ pid_t search_pid;
+ pid_t *pid;
+
+ /*
+ * Return false, because if filtered_pids does not exist,
+ * all pids are good to trace.
+ */
+ if (!filtered_pids)
+ return false;
+
+ search_pid = task->pid;
+
+ pid = bsearch(&search_pid, filtered_pids->pids,
+ filtered_pids->nr_pids, sizeof(pid_t),
+ cmp_pid);
+ if (!pid)
+ return true;
+
+ return false;
+}
+
+static void
+event_filter_pid_sched_switch_probe_pre(void *data,
+ struct task_struct *prev, struct task_struct *next)
+{
+ struct trace_array *tr = data;
+ struct trace_pid_list *pid_list;
+
+ pid_list = rcu_dereference_sched(tr->filtered_pids);
+
+ this_cpu_write(tr->trace_buffer.data->ignore_pid,
+ check_ignore_pid(pid_list, prev) &&
+ check_ignore_pid(pid_list, next));
+}
+
+static void
+event_filter_pid_sched_switch_probe_post(void *data,
+ struct task_struct *prev, struct task_struct *next)
+{
+ struct trace_array *tr = data;
+ struct trace_pid_list *pid_list;
+
+ pid_list = rcu_dereference_sched(tr->filtered_pids);
+
+ this_cpu_write(tr->trace_buffer.data->ignore_pid,
+ check_ignore_pid(pid_list, next));
+}
+
+static void
+event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task)
+{
+ struct trace_array *tr = data;
+ struct trace_pid_list *pid_list;
+
+ /* Nothing to do if we are already tracing */
+ if (!this_cpu_read(tr->trace_buffer.data->ignore_pid))
+ return;
+
+ pid_list = rcu_dereference_sched(tr->filtered_pids);
+
+ this_cpu_write(tr->trace_buffer.data->ignore_pid,
+ check_ignore_pid(pid_list, task));
+}
+
+static void
+event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task)
+{
+ struct trace_array *tr = data;
+ struct trace_pid_list *pid_list;
+
+ /* Nothing to do if we are not tracing */
+ if (this_cpu_read(tr->trace_buffer.data->ignore_pid))
+ return;
+
+ pid_list = rcu_dereference_sched(tr->filtered_pids);
+
+ /* Set tracing if current is enabled */
+ this_cpu_write(tr->trace_buffer.data->ignore_pid,
+ check_ignore_pid(pid_list, current));
+}
+
static void __ftrace_clear_event_pids(struct trace_array *tr)
{
struct trace_pid_list *pid_list;
+ struct trace_event_file *file;
+ int cpu;
pid_list = rcu_dereference_protected(tr->filtered_pids,
lockdep_is_held(&event_mutex));
if (!pid_list)
return;
+ unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_pre, tr);
+ unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_post, tr);
+
+ unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre, tr);
+ unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, tr);
+
+ list_for_each_entry(file, &tr->events, list) {
+ clear_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags);
+ }
+
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(tr->trace_buffer.data, cpu)->ignore_pid = false;
+
rcu_assign_pointer(tr->filtered_pids, NULL);
/* Wait till all users are no longer using pid filtering */
@@ -1429,13 +1550,14 @@
}
static ssize_t
-ftrace_event_pid_write(struct file *file, const char __user *ubuf,
+ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- struct seq_file *m = file->private_data;
+ struct seq_file *m = filp->private_data;
struct trace_array *tr = m->private;
struct trace_pid_list *filtered_pids = NULL;
struct trace_pid_list *pid_list = NULL;
+ struct trace_event_file *file;
struct trace_parser parser;
unsigned long val;
loff_t this_pos;
@@ -1564,15 +1686,35 @@
rcu_assign_pointer(tr->filtered_pids, pid_list);
- mutex_unlock(&event_mutex);
+ list_for_each_entry(file, &tr->events, list) {
+ set_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags);
+ }
if (filtered_pids) {
synchronize_sched();
free_pages((unsigned long)filtered_pids->pids, filtered_pids->order);
kfree(filtered_pids);
+ } else {
+ /*
+ * Register a probe that is called before all other probes
+ * to set ignore_pid if next or prev do not match.
+ * Register a probe this is called after all other probes
+ * to only keep ignore_pid set if next pid matches.
+ */
+ register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_pre,
+ tr, INT_MAX);
+ register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_post,
+ tr, 0);
+
+ register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre,
+ tr, INT_MAX);
+ register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_post,
+ tr, 0);
}
+ mutex_unlock(&event_mutex);
+
ret = read;
*ppos += read;