perf sched: Add sched latency profiling

Add the -l --latency option that reports statistics about the
scheduler latencies.

For now, the latencies are measured in the following sequence
scope:

- task A is sleeping (D or S state)
- task B wakes up A
         ^
         |
         |

   latency timeframe

         |
         |
         v
- task A is scheduled in

Start by recording every scheduler events:

	perf record -e sched:*

and then fetch the results:

	perf sched -l

 Tasks                     count          total              avg            max

migration/0                  2             39849            19924           28826
ksoftirqd/0                  7            756383           108054          373014
migration/1                  5             45391             9078           10452
ksoftirqd/1                  2            399055           199527          359130
events/0                     8           4780110           597513         4500250
events/1                     9           6353057           705895         2986012
kblockd/0                   42          37805097           900121         5077684

The snapshot are in nanoseconds.

- Count: number of snapshots taken for the given task
- Total: total latencies in nanosec
- Avg  : average of latency between wake up and sched in
- Max  : max snapshot latency

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 756fe62..4f9e943 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -31,6 +31,7 @@
 static u64			sample_type;
 
 static int			replay_mode;
+static int			lat_mode;
 
 
 /*
@@ -847,6 +848,269 @@
 	.fork_event = replay_fork_event,
 };
 
+#define TASK_STATE_TO_CHAR_STR "RSDTtZX"
+
+enum thread_state {
+	THREAD_SLEEPING,
+	THREAD_WAKED_UP,
+	THREAD_SCHED_IN,
+	THREAD_IGNORE
+};
+
+struct lat_snapshot {
+	struct list_head	list;
+	enum thread_state	state;
+	u64			wake_up_time;
+	u64			sched_in_time;
+};
+
+struct thread_latency {
+	struct list_head	snapshot_list;
+	struct thread		*thread;
+	struct rb_node		node;
+};
+
+static struct rb_root lat_snapshot_root;
+
+static struct thread_latency *
+thread_latency_search(struct rb_root *root, struct thread *thread)
+{
+	struct rb_node *node = root->rb_node;
+
+	while (node) {
+		struct thread_latency *lat;
+
+		lat = container_of(node, struct thread_latency, node);
+		if (thread->pid < lat->thread->pid)
+			node = node->rb_left;
+		else if (thread->pid > lat->thread->pid)
+			node = node->rb_right;
+		else {
+			return lat;
+		}
+	}
+	return NULL;
+}
+
+static void
+__thread_latency_insert(struct rb_root *root, struct thread_latency *data)
+{
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+	while (*new) {
+		struct thread_latency *this;
+
+		this = container_of(*new, struct thread_latency, node);
+		parent = *new;
+		if (data->thread->pid < this->thread->pid)
+			new = &((*new)->rb_left);
+		else if (data->thread->pid > this->thread->pid)
+			new = &((*new)->rb_right);
+		else
+			die("Double thread insertion\n");
+	}
+
+	rb_link_node(&data->node, parent, new);
+	rb_insert_color(&data->node, root);
+}
+
+static void thread_latency_insert(struct thread *thread)
+{
+	struct thread_latency *lat;
+	lat = calloc(sizeof(*lat), 1);
+	if (!lat)
+		die("No memory");
+
+	lat->thread = thread;
+	INIT_LIST_HEAD(&lat->snapshot_list);
+	__thread_latency_insert(&lat_snapshot_root, lat);
+}
+
+static void
+latency_fork_event(struct trace_fork_event *fork_event __used,
+		   struct event *event __used,
+		   int cpu __used,
+		   u64 timestamp __used,
+		   struct thread *thread __used)
+{
+	/* should insert the newcomer */
+}
+
+static char sched_out_state(struct trace_switch_event *switch_event)
+{
+	const char *str = TASK_STATE_TO_CHAR_STR;
+
+	return str[switch_event->prev_state];
+}
+
+static void
+lat_sched_out(struct thread_latency *lat,
+	     struct trace_switch_event *switch_event)
+{
+	struct lat_snapshot *snapshot;
+
+	if (sched_out_state(switch_event) == 'R')
+		return;
+
+	snapshot = calloc(sizeof(*snapshot), 1);
+	if (!snapshot)
+		die("Non memory");
+
+	list_add_tail(&snapshot->list, &lat->snapshot_list);
+}
+
+static void
+lat_sched_in(struct thread_latency *lat, u64 timestamp)
+{
+	struct lat_snapshot *snapshot;
+
+	if (list_empty(&lat->snapshot_list))
+		return;
+
+	snapshot = list_entry(lat->snapshot_list.prev, struct lat_snapshot,
+			      list);
+
+	if (snapshot->state != THREAD_WAKED_UP)
+		return;
+
+	if (timestamp < snapshot->wake_up_time) {
+		snapshot->state = THREAD_IGNORE;
+		return;
+	}
+
+	snapshot->state = THREAD_SCHED_IN;
+	snapshot->sched_in_time = timestamp;
+}
+
+
+static void
+latency_switch_event(struct trace_switch_event *switch_event,
+		     struct event *event __used,
+		     int cpu __used,
+		     u64 timestamp,
+		     struct thread *thread __used)
+{
+	struct thread_latency *out_lat, *in_lat;
+	struct thread *sched_out, *sched_in;
+
+	sched_out = threads__findnew(switch_event->prev_pid, &threads, &last_match);
+	sched_in = threads__findnew(switch_event->next_pid, &threads, &last_match);
+
+	in_lat = thread_latency_search(&lat_snapshot_root, sched_in);
+	if (!in_lat) {
+		thread_latency_insert(sched_in);
+		in_lat = thread_latency_search(&lat_snapshot_root, sched_in);
+		if (!in_lat)
+			die("Internal latency tree error");
+	}
+
+	out_lat = thread_latency_search(&lat_snapshot_root, sched_out);
+	if (!out_lat) {
+		thread_latency_insert(sched_out);
+		out_lat = thread_latency_search(&lat_snapshot_root, sched_out);
+		if (!out_lat)
+			die("Internal latency tree error");
+	}
+
+	lat_sched_in(in_lat, timestamp);
+	lat_sched_out(out_lat, switch_event);
+}
+
+static void
+latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
+		     struct event *event __used,
+		     int cpu __used,
+		     u64 timestamp,
+		     struct thread *thread __used)
+{
+	struct thread_latency *lat;
+	struct lat_snapshot *snapshot;
+	struct thread *wakee;
+
+	/* Note for later, it may be interesting to observe the failing cases */
+	if (!wakeup_event->success)
+		return;
+
+	wakee = threads__findnew(wakeup_event->pid, &threads, &last_match);
+	lat = thread_latency_search(&lat_snapshot_root, wakee);
+	if (!lat) {
+		thread_latency_insert(wakee);
+		return;
+	}
+
+	if (list_empty(&lat->snapshot_list))
+		return;
+
+	snapshot = list_entry(lat->snapshot_list.prev, struct lat_snapshot,
+			      list);
+
+	if (snapshot->state != THREAD_SLEEPING)
+		return;
+
+	snapshot->state = THREAD_WAKED_UP;
+	snapshot->wake_up_time = timestamp;
+}
+
+static struct trace_sched_handler lat_ops  = {
+	.wakeup_event = latency_wakeup_event,
+	.switch_event = latency_switch_event,
+	.fork_event = latency_fork_event,
+};
+
+static void output_lat_thread(struct thread_latency *lat)
+{
+	struct lat_snapshot *shot;
+	int count = 0;
+	int i;
+	int ret;
+	u64 max = 0, avg;
+	u64 total = 0, delta;
+
+	list_for_each_entry(shot, &lat->snapshot_list, list) {
+		if (shot->state != THREAD_SCHED_IN)
+			continue;
+
+		count++;
+
+		delta = shot->sched_in_time - shot->wake_up_time;
+		if (delta > max)
+			max = delta;
+		total += delta;
+	}
+
+	if (!count)
+		return;
+
+	ret = printf("%s", lat->thread->comm);
+
+	for (i = 0; i < 25 - ret; i++)
+		printf(" ");
+
+	avg = total / count;
+
+	printf("%5d        %10llu       %10llu      %10llu\n", count, total, avg, max);
+}
+
+static void output_lat_results(void)
+{
+	struct rb_node *next;
+
+	printf(" Tasks");
+	printf("                     count");
+	printf("          total");
+	printf("              avg");
+	printf("            max\n\n");
+
+	next = rb_first(&lat_snapshot_root);
+
+	while (next) {
+		struct thread_latency *lat;
+
+		lat = rb_entry(next, struct thread_latency, node);
+		output_lat_thread(lat);
+		next = rb_next(next);
+	}
+}
 
 static struct trace_sched_handler *trace_handler;
 
@@ -1154,6 +1418,8 @@
 		    "dump raw trace in ASCII"),
 	OPT_BOOLEAN('r', "replay", &replay_mode,
 		    "replay sched behaviour from traces"),
+	OPT_BOOLEAN('l', "latency", &lat_mode,
+		    "measure various latencies"),
 	OPT_BOOLEAN('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
 	OPT_END()
@@ -1180,22 +1446,30 @@
 
 	if (replay_mode)
 		trace_handler = &replay_ops;
-	else /* We may need a default subcommand */
+	else if (lat_mode)
+		trace_handler = &lat_ops;
+	else /* We may need a default subcommand (perf trace?) */
 		die("Please select a sub command (-r)\n");
 
-	calibrate_run_measurement_overhead();
-	calibrate_sleep_measurement_overhead();
+	if (replay_mode) {
+		calibrate_run_measurement_overhead();
+		calibrate_sleep_measurement_overhead();
 
-	test_calibrations();
+		test_calibrations();
 
-	parse_trace();
-	print_task_traces();
-	add_cross_task_wakeups();
+		parse_trace();
+		print_task_traces();
+		add_cross_task_wakeups();
 
-	create_tasks();
-	printf("------------------------------------------------------------\n");
-	for (i = 0; i < nr_iterations; i++)
-		run_one_test();
+		create_tasks();
+		printf("------------------------------------------------------------\n");
+		for (i = 0; i < nr_iterations; i++)
+			run_one_test();
+	} else if (lat_mode) {
+		setup_pager();
+		__cmd_sched();
+		output_lat_results();
+	}
 
 	return 0;
 }