sched, trace: Fix sched_switch() prev_state argument

For CONFIG_PREEMPT=y kernels the sched_switch(.prev_state) argument isn't
useful because we can get preempted with current->state != TASK_RUNNING
without actually getting removed from the runqueue.

Cure this by treating all preempted tasks as runnable from the tracer's
point of view.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cautiously-acked-by: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1275322715.27810.23323.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 4f733ec..b9e1dd6 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -115,6 +115,23 @@
 	     TP_PROTO(struct task_struct *p, int success),
 	     TP_ARGS(p, success));
 
+#ifdef CREATE_TRACE_POINTS
+static inline long __trace_sched_switch_state(struct task_struct *p)
+{
+	long state = p->state;
+
+#ifdef CONFIG_PREEMPT
+	/*
+	 * PREEMPT_ACTIVE means preempted, not dequeued: report TASK_RUNNING.
+	 */
+	if (task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)
+		state = TASK_RUNNING;
+#endif
+
+	return state;
+}
+#endif
+
 /*
  * Tracepoint for task switches, performed by the scheduler:
  */
@@ -139,7 +156,7 @@
 		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
 		__entry->prev_pid	= prev->pid;
 		__entry->prev_prio	= prev->prio;
-		__entry->prev_state	= prev->state;
+		__entry->prev_state	= __trace_sched_switch_state(prev);
 		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
 		__entry->next_pid	= next->pid;
 		__entry->next_prio	= next->prio;