cpuidle: Move dev->last_residency update to driver enter routine; remove dev->last_state

Cpuidle governor only suggests the state to enter using the
governor->select() interface, but allows the low level driver to
override the recommended state. The actual entered state
may be different because of software or hardware demotion. Software
demotion is done by the back-end cpuidle driver and can be accounted
correctly. Current cpuidle code uses last_state field to capture the
actual state entered and based on that updates the statistics for the
state entered.

Ideally the driver enter routine should update the counters,
and it should return the state actually entered rather than the time
spent there. The generic cpuidle code should simply handle where
the counters live in the sysfs namespace, not updating the counters.

Reference:
https://lkml.org/lkml/2011/3/25/52

Signed-off-by: Deepthi Dharwar <deepthi@linux.vnet.ibm.com>
Signed-off-by: Trinabh Gupta <g.trinabh@gmail.com>
Tested-by: Jean Pihet <j-pihet@ti.com>
Reviewed-by: Kevin Hilman <khilman@ti.com>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Kevin Hilman <khilman@ti.com>
Signed-off-by: Len Brown <len.brown@intel.com>
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 431ab11..9cd08ce 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -741,22 +741,24 @@
 /**
  * acpi_idle_enter_c1 - enters an ACPI C1 state-type
  * @dev: the target CPU
- * @state: the state data
+ * @index: index of target state
  *
  * This is equivalent to the HALT instruction.
  */
 static int acpi_idle_enter_c1(struct cpuidle_device *dev,
-			      struct cpuidle_state *state)
+				int index)
 {
 	ktime_t  kt1, kt2;
 	s64 idle_time;
 	struct acpi_processor *pr;
+	struct cpuidle_state *state = &dev->states[index];
 	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
 
 	pr = __this_cpu_read(processors);
+	dev->last_residency = 0;
 
 	if (unlikely(!pr))
-		return 0;
+		return -EINVAL;
 
 	local_irq_disable();
 
@@ -764,7 +766,7 @@
 	if (acpi_idle_suspend) {
 		local_irq_enable();
 		cpu_relax();
-		return 0;
+		return -EINVAL;
 	}
 
 	lapic_timer_state_broadcast(pr, cx, 1);
@@ -773,37 +775,46 @@
 	kt2 = ktime_get_real();
 	idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
+	/* Update device last_residency*/
+	dev->last_residency = (int)idle_time;
+
 	local_irq_enable();
 	cx->usage++;
 	lapic_timer_state_broadcast(pr, cx, 0);
 
-	return idle_time;
+	return index;
 }
 
 /**
  * acpi_idle_enter_simple - enters an ACPI state without BM handling
  * @dev: the target CPU
- * @state: the state data
+ * @index: the index of suggested state
  */
 static int acpi_idle_enter_simple(struct cpuidle_device *dev,
-				  struct cpuidle_state *state)
+				int index)
 {
 	struct acpi_processor *pr;
+	struct cpuidle_state *state = &dev->states[index];
 	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
 	ktime_t  kt1, kt2;
 	s64 idle_time_ns;
 	s64 idle_time;
 
 	pr = __this_cpu_read(processors);
+	dev->last_residency = 0;
 
 	if (unlikely(!pr))
-		return 0;
-
-	if (acpi_idle_suspend)
-		return(acpi_idle_enter_c1(dev, state));
+		return -EINVAL;
 
 	local_irq_disable();
 
+	if (acpi_idle_suspend) {
+		local_irq_enable();
+		cpu_relax();
+		return -EINVAL;
+	}
+
+
 	if (cx->entry_method != ACPI_CSTATE_FFH) {
 		current_thread_info()->status &= ~TS_POLLING;
 		/*
@@ -815,7 +826,7 @@
 		if (unlikely(need_resched())) {
 			current_thread_info()->status |= TS_POLLING;
 			local_irq_enable();
-			return 0;
+			return -EINVAL;
 		}
 	}
 
@@ -837,6 +848,9 @@
 	idle_time = idle_time_ns;
 	do_div(idle_time, NSEC_PER_USEC);
 
+	/* Update device last_residency*/
+	dev->last_residency = (int)idle_time;
+
 	/* Tell the scheduler how much we idled: */
 	sched_clock_idle_wakeup_event(idle_time_ns);
 
@@ -848,7 +862,7 @@
 
 	lapic_timer_state_broadcast(pr, cx, 0);
 	cx->time += idle_time;
-	return idle_time;
+	return index;
 }
 
 static int c3_cpu_count;
@@ -857,14 +871,15 @@
 /**
  * acpi_idle_enter_bm - enters C3 with proper BM handling
  * @dev: the target CPU
- * @state: the state data
+ * @index: the index of suggested state
  *
  * If BM is detected, the deepest non-C3 idle state is entered instead.
  */
 static int acpi_idle_enter_bm(struct cpuidle_device *dev,
-			      struct cpuidle_state *state)
+				int index)
 {
 	struct acpi_processor *pr;
+	struct cpuidle_state *state = &dev->states[index];
 	struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
 	ktime_t  kt1, kt2;
 	s64 idle_time_ns;
@@ -872,22 +887,26 @@
 
 
 	pr = __this_cpu_read(processors);
+	dev->last_residency = 0;
 
 	if (unlikely(!pr))
-		return 0;
+		return -EINVAL;
 
-	if (acpi_idle_suspend)
-		return(acpi_idle_enter_c1(dev, state));
+
+	if (acpi_idle_suspend) {
+		cpu_relax();
+		return -EINVAL;
+	}
 
 	if (!cx->bm_sts_skip && acpi_idle_bm_check()) {
-		if (dev->safe_state) {
-			dev->last_state = dev->safe_state;
-			return dev->safe_state->enter(dev, dev->safe_state);
+		if (dev->safe_state_index >= 0) {
+			return dev->states[dev->safe_state_index].enter(dev,
+						dev->safe_state_index);
 		} else {
 			local_irq_disable();
 			acpi_safe_halt();
 			local_irq_enable();
-			return 0;
+			return -EINVAL;
 		}
 	}
 
@@ -904,7 +923,7 @@
 		if (unlikely(need_resched())) {
 			current_thread_info()->status |= TS_POLLING;
 			local_irq_enable();
-			return 0;
+			return -EINVAL;
 		}
 	}
 
@@ -954,6 +973,9 @@
 	idle_time = idle_time_ns;
 	do_div(idle_time, NSEC_PER_USEC);
 
+	/* Update device last_residency*/
+	dev->last_residency = (int)idle_time;
+
 	/* Tell the scheduler how much we idled: */
 	sched_clock_idle_wakeup_event(idle_time_ns);
 
@@ -965,7 +987,7 @@
 
 	lapic_timer_state_broadcast(pr, cx, 0);
 	cx->time += idle_time;
-	return idle_time;
+	return index;
 }
 
 struct cpuidle_driver acpi_idle_driver = {
@@ -992,6 +1014,7 @@
 	}
 
 	dev->cpu = pr->id;
+	dev->safe_state_index = -1;
 	for (i = 0; i < CPUIDLE_STATE_MAX; i++) {
 		dev->states[i].name[0] = '\0';
 		dev->states[i].desc[0] = '\0';
@@ -1027,13 +1050,13 @@
 				state->flags |= CPUIDLE_FLAG_TIME_VALID;
 
 			state->enter = acpi_idle_enter_c1;
-			dev->safe_state = state;
+			dev->safe_state_index = count;
 			break;
 
 			case ACPI_STATE_C2:
 			state->flags |= CPUIDLE_FLAG_TIME_VALID;
 			state->enter = acpi_idle_enter_simple;
-			dev->safe_state = state;
+			dev->safe_state_index = count;
 			break;
 
 			case ACPI_STATE_C3:
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index d4c5423..88bd121 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -62,7 +62,7 @@
 {
 	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
 	struct cpuidle_state *target_state;
-	int next_state;
+	int next_state, entered_state;
 
 	if (off)
 		return -ENODEV;
@@ -102,26 +102,27 @@
 
 	target_state = &dev->states[next_state];
 
-	/* enter the state and update stats */
-	dev->last_state = target_state;
-
 	trace_power_start(POWER_CSTATE, next_state, dev->cpu);
 	trace_cpu_idle(next_state, dev->cpu);
 
-	dev->last_residency = target_state->enter(dev, target_state);
+	entered_state = target_state->enter(dev, next_state);
 
 	trace_power_end(dev->cpu);
 	trace_cpu_idle(PWR_EVENT_EXIT, dev->cpu);
 
-	if (dev->last_state)
-		target_state = dev->last_state;
-
-	target_state->time += (unsigned long long)dev->last_residency;
-	target_state->usage++;
+	if (entered_state >= 0) {
+		/* Update cpuidle counters */
+		/* This can be moved to within driver enter routine
+		 * but that results in multiple copies of same code.
+		 */
+		dev->states[entered_state].time +=
+				(unsigned long long)dev->last_residency;
+		dev->states[entered_state].usage++;
+	}
 
 	/* give the governor an opportunity to reflect on the outcome */
 	if (cpuidle_curr_governor->reflect)
-		cpuidle_curr_governor->reflect(dev);
+		cpuidle_curr_governor->reflect(dev, entered_state);
 
 	return 0;
 }
@@ -172,11 +173,10 @@
 EXPORT_SYMBOL_GPL(cpuidle_resume_and_unlock);
 
 #ifdef CONFIG_ARCH_HAS_CPU_RELAX
-static int poll_idle(struct cpuidle_device *dev, struct cpuidle_state *st)
+static int poll_idle(struct cpuidle_device *dev, int index)
 {
 	ktime_t	t1, t2;
 	s64 diff;
-	int ret;
 
 	t1 = ktime_get();
 	local_irq_enable();
@@ -188,8 +188,9 @@
 	if (diff > INT_MAX)
 		diff = INT_MAX;
 
-	ret = (int) diff;
-	return ret;
+	dev->last_residency = (int) diff;
+
+	return index;
 }
 
 static void poll_idle_init(struct cpuidle_device *dev)
@@ -248,7 +249,6 @@
 		dev->states[i].time = 0;
 	}
 	dev->last_residency = 0;
-	dev->last_state = NULL;
 
 	smp_wmb();
 
diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c
index 12c9890..6a686a7 100644
--- a/drivers/cpuidle/governors/ladder.c
+++ b/drivers/cpuidle/governors/ladder.c
@@ -153,11 +153,24 @@
 	return 0;
 }
 
+/**
+ * ladder_reflect - update the correct last_state_idx
+ * @dev: the CPU
+ * @index: the index of actual state entered
+ */
+static void ladder_reflect(struct cpuidle_device *dev, int index)
+{
+	struct ladder_device *ldev = &__get_cpu_var(ladder_devices);
+	if (index > 0)
+		ldev->last_state_idx = index;
+}
+
 static struct cpuidle_governor ladder_governor = {
 	.name =		"ladder",
 	.rating =	10,
 	.enable =	ladder_enable_device,
 	.select =	ladder_select_state,
+	.reflect =	ladder_reflect,
 	.owner =	THIS_MODULE,
 };
 
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index c47f3d0..e4b200c 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -310,14 +310,17 @@
 /**
  * menu_reflect - records that data structures need update
  * @dev: the CPU
+ * @index: the index of actual entered state
  *
  * NOTE: it's important to be fast here because this operation will add to
  *       the overall exit latency.
  */
-static void menu_reflect(struct cpuidle_device *dev)
+static void menu_reflect(struct cpuidle_device *dev, int index)
 {
 	struct menu_device *data = &__get_cpu_var(menu_devices);
-	data->needs_update = 1;
+	data->last_state_idx = index;
+	if (index >= 0)
+		data->needs_update = 1;
 }
 
 /**
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index a46dddf..a1c888d 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -81,7 +81,7 @@
 static unsigned int lapic_timer_reliable_states = (1 << 1);	 /* Default to only C1 */
 
 static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
-static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state);
+static int intel_idle(struct cpuidle_device *dev, int index);
 
 static struct cpuidle_state *cpuidle_state_table;
 
@@ -209,12 +209,13 @@
 /**
  * intel_idle
  * @dev: cpuidle_device
- * @state: cpuidle state
+ * @index: index of cpuidle state
  *
  */
-static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state)
+static int intel_idle(struct cpuidle_device *dev, int index)
 {
 	unsigned long ecx = 1; /* break on interrupt flag */
+	struct cpuidle_state *state = &dev->states[index];
 	unsigned long eax = (unsigned long)cpuidle_get_statedata(state);
 	unsigned int cstate;
 	ktime_t kt_before, kt_after;
@@ -256,7 +257,10 @@
 	if (!(lapic_timer_reliable_states & (1 << (cstate))))
 		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
 
-	return usec_delta;
+	/* Update cpuidle counters */
+	dev->last_residency = (int)usec_delta;
+
+	return index;
 }
 
 static void __setup_broadcast_timer(void *arg)