/*
 * menu.c - the menu idle governor
 *
 * Copyright (C) 2006-2007 Adam Belay <abelay@novell.com>
 * Copyright (C) 2009 Intel Corporation
 * Author:
 *        Arjan van de Ven <arjan@linux.intel.com>
 *
 * This code is licensed under the GPL version 2 as described
 * in the COPYING file that accompanies the Linux Kernel.
 */

#include <linux/kernel.h>
#include <linux/cpuidle.h>
#include <linux/time.h>
#include <linux/ktime.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/sched.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/stat.h>
#include <linux/math64.h>

/*
 * Please note when changing the tuning values:
 * If (MAX_INTERESTING-1) * RESOLUTION > UINT_MAX, the result of
 * a scaling operation multiplication may overflow on 32 bit platforms.
 * In that case, #define RESOLUTION as ULL to get 64 bit result:
 * #define RESOLUTION 1024ULL
 *
 * The default values do not overflow.
 */
#define BUCKETS 12
#define INTERVAL_SHIFT 3
#define INTERVALS (1UL << INTERVAL_SHIFT)
#define RESOLUTION 1024
#define DECAY 8
#define MAX_INTERESTING 50000


/*
 * Concepts and ideas behind the menu governor
 *
 * For the menu governor, there are 3 decision factors for picking a C
 * state:
 * 1) Energy break even point
 * 2) Performance impact
 * 3) Latency tolerance (from pmqos infrastructure)
 * These three factors are treated independently.
 *
 * Energy break even point
 * -----------------------
 * C state entry and exit have an energy cost, and a certain amount of time in
 * the C state is required to actually break even on this cost. CPUIDLE
 * provides us this duration in the "target_residency" field. So all that we
 * need is a good prediction of how long we'll be idle. Like the traditional
 * menu governor, we start with the actual known "next timer event" time.
 *
 * Since there are other sources of wakeups (interrupts for example) than
 * the next timer event, this estimation is rather optimistic. To get a
 * more realistic estimate, a correction factor is applied to the estimate,
 * that is based on historic behavior. For example, if in the past the actual
 * duration always was 50% of the next timer tick, the correction factor will
 * be 0.5.
 *
 * menu uses a running average for this correction factor, however it uses a
 * set of factors, not just a single factor. This stems from the realization
 * that the ratio is dependent on the order of magnitude of the expected
 * duration; if we expect 500 milliseconds of idle time the likelihood of
 * getting an interrupt very early is much higher than if we expect 50
 * microseconds of idle time. A second independent factor that has a big
 * impact on the actual factor is whether there is (disk) IO outstanding
 * or not.
 * (as a special twist, we consider every sleep longer than 50 milliseconds
 * as perfect; there are no power gains for sleeping longer than this)
 *
 * For these two reasons we keep an array of 12 independent factors, that gets
 * indexed based on the magnitude of the expected duration as well as the
 * "is IO outstanding" property.
 *
 * Repeatable-interval-detector
 * ----------------------------
 * There are some cases where "next timer" is a completely unusable predictor:
 * Those cases where the interval is fixed, for example due to hardware
 * interrupt mitigation, but also due to fixed transfer rate devices such as
 * mice.
 * For this, we use a different predictor: We track the duration of the last 8
 * intervals and if the standard deviation of these 8 intervals is below a
 * threshold value, we use the average of these intervals as prediction.
 *
 * Limiting Performance Impact
 * ---------------------------
 * C states, especially those with large exit latencies, can have a real
 * noticeable impact on workloads, which is not acceptable for most sysadmins,
 * and in addition, less performance has a power price of its own.
 *
 * As a general rule of thumb, menu assumes that the following heuristic
 * holds:
 *     The busier the system, the less impact of C states is acceptable
 *
 * This rule-of-thumb is implemented using a performance-multiplier:
 * If the exit latency times the performance multiplier is longer than
 * the predicted duration, the C state is not considered a candidate
 * for selection because its performance impact would be too high. So the
 * higher this multiplier is, the longer we need to be idle to pick a deep C
 * state, and thus the less likely a busy CPU will hit such a deep
 * C state.
 *
 * Two factors are used in determining this multiplier:
 * a value of 10 is added for each point of "per cpu load average" we have.
 * a value of 5 points is added for each process that is waiting for
 * IO on this CPU.
 * (these values are experimentally determined)
 *
 * The load average factor gives a longer term (few seconds) input to the
 * decision, while the iowait value gives a cpu local instantaneous input.
 * The iowait factor may look low, but realize that this is also already
 * represented in the system load average.
 *
 */
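/*
 * Illustrative example of the correction-factor idea above (numbers are
 * made up): if the next timer event is 1000 us away but, historically,
 * wakeups in this bucket arrived after only about half of that time, the
 * stored factor is roughly 0.5 (about RESOLUTION * DECAY / 2 in the
 * fixed-point encoding used below), and the predicted idle duration
 * becomes 1000 us * 0.5 = 500 us.
 */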

struct menu_device {
	int		last_state_idx;
	int		needs_update;
	int		tick_wakeup;

	unsigned int	next_timer_us;
	unsigned int	predicted_us;
	unsigned int	bucket;
	unsigned int	correction_factor[BUCKETS];
	unsigned int	intervals[INTERVALS];
	int		interval_ptr;
};


#define LOAD_INT(x) ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)

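/*
 * Example (illustrative, not part of the original code): with the standard
 * FSHIFT fixed-point encoding, a load average of 2.50 has LOAD_INT() == 2
 * and LOAD_FRAC() == 50, so get_loadavg() returns 2 * 10 + 50 / 10 = 25,
 * i.e. the load expressed in tenths. In this file the helper is only
 * referenced by the commented-out load-average term in
 * performance_multiplier() below.
 */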
static inline int get_loadavg(unsigned long load)
{
	return LOAD_INT(load) * 10 + LOAD_FRAC(load) / 10;
}

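/*
 * Example (illustrative): with IO waiters present the bucket index starts
 * at BUCKETS/2 == 6, so an expected duration of 250 us maps to bucket
 * 6 + 2 == 8; the same duration with no IO outstanding maps to bucket 2.
 */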
static inline int which_bucket(unsigned int duration, unsigned long nr_iowaiters)
{
	int bucket = 0;

	/*
	 * We keep two groups of stats; one with IO pending,
	 * one without.
	 * This allows us to calculate
	 * E(duration)|iowait
	 */
	if (nr_iowaiters)
		bucket = BUCKETS/2;

	if (duration < 10)
		return bucket;
	if (duration < 100)
		return bucket + 1;
	if (duration < 1000)
		return bucket + 2;
	if (duration < 10000)
		return bucket + 3;
	if (duration < 100000)
		return bucket + 4;
	return bucket + 5;
}

/*
 * Return a multiplier for the exit latency that is intended
 * to take performance requirements into account.
 * The more performance critical we estimate the system
 * to be, the higher this multiplier, and thus the higher
 * the barrier to go to an expensive C state.
 */
static inline int performance_multiplier(unsigned long nr_iowaiters, unsigned long load)
{
	int mult = 1;

	/* for higher loadavg, we are more reluctant */

	/*
	 * this doesn't work as intended - it is almost always 0, but can
	 * sometimes, depending on workload, spike very high into the hundreds
	 * even when the average cpu load is under 10%.
	 */
	/* mult += 2 * get_loadavg(); */

	/* for IO wait tasks (per cpu!) we add 10x each */
	mult += 10 * nr_iowaiters;

	return mult;
}
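/*
 * Example (illustrative): with two tasks waiting for IO on this CPU the
 * multiplier is 1 + 10 * 2 = 21, so in menu_select() (when the tick has
 * not been stopped yet) a predicted idle time of 2100 us caps the
 * acceptable exit latency at about 100 us; deeper states with a larger
 * exit latency are then skipped.
 */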

static DEFINE_PER_CPU(struct menu_device, menu_devices);

static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);

/*
 * Try detecting repeating patterns by keeping track of the last 8
 * intervals, and checking if the standard deviation of that set
 * of points is below a threshold. If it is... then use the
 * average of these 8 points as the estimated value.
 */
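/*
 * Example (illustrative): eight recorded intervals of about 4000 us with a
 * standard deviation of 10 us give a variance of 100 us^2, below the
 * 400 us^2 threshold, so the average (~4000 us) is returned as the
 * prediction even if the next timer is much further away.
 */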
static unsigned int get_typical_interval(struct menu_device *data)
{
	int i, divisor;
	unsigned int max, thresh, avg;
	uint64_t sum, variance;

	thresh = UINT_MAX; /* Discard outliers above this value */

again:

	/* First calculate the average of past intervals */
	max = 0;
	sum = 0;
	divisor = 0;
	for (i = 0; i < INTERVALS; i++) {
		unsigned int value = data->intervals[i];
		if (value <= thresh) {
			sum += value;
			divisor++;
			if (value > max)
				max = value;
		}
	}
	if (divisor == INTERVALS)
		avg = sum >> INTERVAL_SHIFT;
	else
		avg = div_u64(sum, divisor);

	/* Then try to determine variance */
	variance = 0;
	for (i = 0; i < INTERVALS; i++) {
		unsigned int value = data->intervals[i];
		if (value <= thresh) {
			int64_t diff = (int64_t)value - avg;
			variance += diff * diff;
		}
	}
	if (divisor == INTERVALS)
		variance >>= INTERVAL_SHIFT;
	else
		do_div(variance, divisor);

	/*
	 * The typical interval is obtained when standard deviation is
	 * small (stddev <= 20 us, variance <= 400 us^2) or standard
	 * deviation is small compared to the average interval (avg >
	 * 6*stddev, avg^2 > 36*variance). The average is smaller than
	 * UINT_MAX aka U32_MAX, so computing its square does not
	 * overflow a u64. We simply reject this candidate average if
	 * the standard deviation is greater than 715 s (which is
	 * rather unlikely).
	 *
	 * Use this result only if there is no timer to wake us up sooner.
	 */
	if (likely(variance <= U64_MAX/36)) {
		if ((((u64)avg*avg > variance*36) && (divisor * 4 >= INTERVALS * 3))
							|| variance <= 400) {
			return avg;
		}
	}

	/*
	 * If we have outliers to the upside in our distribution, discard
	 * those by setting the threshold to exclude these outliers, then
	 * calculate the average and standard deviation again. Once we get
	 * down to the bottom 3/4 of our samples, stop excluding samples.
	 *
	 * This can deal with workloads that have long pauses interspersed
	 * with sporadic activity with a bunch of short pauses.
	 */
	if ((divisor * 4) <= INTERVALS * 3)
		return UINT_MAX;

	thresh = max - 1;
	goto again;
}

/**
 * menu_select - selects the next idle state to enter
 * @drv: cpuidle driver containing state data
 * @dev: the CPU
 * @stop_tick: indication on whether or not to stop the tick
 */
static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
		       bool *stop_tick)
{
	struct menu_device *data = this_cpu_ptr(&menu_devices);
	int latency_req = cpuidle_governor_latency_req(dev->cpu);
	int i;
	int first_idx;
	int idx;
	unsigned int interactivity_req;
	unsigned int expected_interval;
	unsigned long nr_iowaiters, cpu_load;
	ktime_t delta_next;

	if (data->needs_update) {
		menu_update(drv, dev);
		data->needs_update = 0;
	}

	/* Special case when user has set very strict latency requirement */
	if (unlikely(latency_req == 0)) {
		*stop_tick = false;
		return 0;
	}

	/* determine the expected residency time, round up */
	data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length(&delta_next));

	get_iowait_load(&nr_iowaiters, &cpu_load);
	data->bucket = which_bucket(data->next_timer_us, nr_iowaiters);

	/*
	 * Force the result of multiplication to be 64 bits even if both
	 * operands are 32 bits.
	 * Make sure to round up for half microseconds.
	 */
	data->predicted_us = DIV_ROUND_CLOSEST_ULL((uint64_t)data->next_timer_us *
					 data->correction_factor[data->bucket],
					 RESOLUTION * DECAY);
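	/*
	 * Illustrative numbers: with next_timer_us == 10000 and a stored
	 * correction factor of 4096 (i.e. 0.5 in RESOLUTION * DECAY == 8192
	 * units), the rounded division above yields predicted_us == 5000.
	 */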
Arjan van de Ven69d25872009-09-21 17:04:08 -0700326
Rik van Riele132b9b2016-03-16 12:14:00 -0400327 expected_interval = get_typical_interval(data);
328 expected_interval = min(expected_interval, data->next_timer_us);
tuukka.tikkanen@linaro.org96e95182014-02-24 08:29:35 +0200329
Rafael J. Wysockidc2251b2017-08-23 23:19:57 +0200330 first_idx = 0;
331 if (drv->states[0].flags & CPUIDLE_FLAG_POLLING) {
332 struct cpuidle_state *s = &drv->states[1];
Rafael J. Wysocki0c313cb2016-03-20 01:33:35 +0100333 unsigned int polling_threshold;
334
Rafael J. Wysocki9c4b2862016-01-14 23:24:22 +0100335 /*
Rafael J. Wysocki50f7ccc2018-08-16 12:55:01 +0200336 * Default to a physical idle state, not to busy polling, unless
337 * a timer is going to trigger really really soon.
Rafael J. Wysocki9c4b2862016-01-14 23:24:22 +0100338 */
Rafael J. Wysocki0c313cb2016-03-20 01:33:35 +0100339 polling_threshold = max_t(unsigned int, 20, s->target_residency);
340 if (data->next_timer_us > polling_threshold &&
341 latency_req > s->exit_latency && !s->disabled &&
Rafael J. Wysockidc2251b2017-08-23 23:19:57 +0200342 !dev->states_usage[1].disable)
343 first_idx = 1;
Rafael J. Wysocki9c4b2862016-01-14 23:24:22 +0100344 }
Arjan van de Ven69d25872009-09-21 17:04:08 -0700345
Ai Li71abbbf2010-08-09 17:20:13 -0700346 /*
Rik van Riele132b9b2016-03-16 12:14:00 -0400347 * Use the lowest expected idle interval to pick the idle state.
348 */
349 data->predicted_us = min(data->predicted_us, expected_interval);
350
Rafael J. Wysocki87c9fe62018-04-05 19:12:43 +0200351 if (tick_nohz_tick_stopped()) {
352 /*
353 * If the tick is already stopped, the cost of possible short
354 * idle duration misprediction is much higher, because the CPU
355 * may be stuck in a shallow idle state for a long time as a
Rafael J. Wysocki5ef499c2018-08-14 12:34:40 +0200356 * result of it. In that case say we might mispredict and use
357 * the known time till the closest timer event for the idle
358 * state selection.
Rafael J. Wysocki87c9fe62018-04-05 19:12:43 +0200359 */
360 if (data->predicted_us < TICK_USEC)
Rafael J. Wysocki5ef499c2018-08-14 12:34:40 +0200361 data->predicted_us = ktime_to_us(delta_next);
Rafael J. Wysocki87c9fe62018-04-05 19:12:43 +0200362 } else {
363 /*
364 * Use the performance multiplier and the user-configurable
365 * latency_req to determine the maximum exit latency.
366 */
367 interactivity_req = data->predicted_us / performance_multiplier(nr_iowaiters, cpu_load);
368 if (latency_req > interactivity_req)
369 latency_req = interactivity_req;
370 }
Rik van Riele132b9b2016-03-16 12:14:00 -0400371
Rafael J. Wysocki45f1ff52018-03-22 17:50:49 +0100372 expected_interval = data->predicted_us;
Rik van Riele132b9b2016-03-16 12:14:00 -0400373 /*
Ai Li71abbbf2010-08-09 17:20:13 -0700374 * Find the idle state with the lowest power while satisfying
375 * our constraints.
376 */
Nicholas Piggin3ed09c92017-06-26 15:38:15 +1000377 idx = -1;
378 for (i = first_idx; i < drv->state_count; i++) {
Deepthi Dharwar46bcfad2011-10-28 16:20:42 +0530379 struct cpuidle_state *s = &drv->states[i];
ShuoX Liudc7fd272012-07-03 19:05:31 +0200380 struct cpuidle_state_usage *su = &dev->states_usage[i];
Len Brown4f86d3a2007-10-03 18:58:00 -0400381
Rafael J. Wysockicbc9ef02012-07-03 19:07:42 +0200382 if (s->disabled || su->disable)
ShuoX Liu3a533962012-03-28 15:19:11 -0700383 continue;
Nicholas Piggin3ed09c92017-06-26 15:38:15 +1000384 if (idx == -1)
385 idx = i; /* first enabled state */
Rafael J. Wysocki5ef499c2018-08-14 12:34:40 +0200386 if (s->target_residency > data->predicted_us) {
Rafael J. Wysocki757ab152018-08-21 10:44:10 +0200387 if (data->predicted_us < TICK_USEC)
Rafael J. Wysocki5ef499c2018-08-14 12:34:40 +0200388 break;
389
Rafael J. Wysocki757ab152018-08-21 10:44:10 +0200390 if (!tick_nohz_tick_stopped()) {
391 /*
392 * If the state selected so far is shallow,
393 * waking up early won't hurt, so retain the
394 * tick in that case and let the governor run
395 * again in the next iteration of the loop.
396 */
397 expected_interval = drv->states[idx].target_residency;
398 break;
399 }
400
Rafael J. Wysocki5ef499c2018-08-14 12:34:40 +0200401 /*
402 * If the state selected so far is shallow and this
403 * state's target residency matches the time till the
404 * closest timer event, select this one to avoid getting
405 * stuck in the shallow one for too long.
406 */
407 if (drv->states[idx].target_residency < TICK_USEC &&
408 s->target_residency <= ktime_to_us(delta_next))
409 idx = i;
410
411 goto out;
412 }
Rafael J. Wysocki45f1ff52018-03-22 17:50:49 +0100413 if (s->exit_latency > latency_req) {
414 /*
415 * If we break out of the loop for latency reasons, use
416 * the target residency of the selected state as the
417 * expected idle duration so that the tick is retained
418 * as long as that target residency is low enough.
419 */
420 expected_interval = drv->states[idx].target_residency;
Alex Shi8e37e1a2017-01-12 21:27:02 +0800421 break;
Rafael J. Wysocki45f1ff52018-03-22 17:50:49 +0100422 }
Nicholas Piggin3ed09c92017-06-26 15:38:15 +1000423 idx = i;
Len Brown4f86d3a2007-10-03 18:58:00 -0400424 }
425
Nicholas Piggin3ed09c92017-06-26 15:38:15 +1000426 if (idx == -1)
427 idx = 0; /* No states enabled. Must use 0. */
428
Rafael J. Wysocki45f1ff52018-03-22 17:50:49 +0100429 /*
430 * Don't stop the tick if the selected state is a polling one or if the
431 * expected idle duration is shorter than the tick period length.
432 */
Rafael J. Wysocki5ef499c2018-08-14 12:34:40 +0200433 if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
434 expected_interval < TICK_USEC) && !tick_nohz_tick_stopped()) {
Rafael J. Wysocki296bb1e2018-04-05 19:12:34 +0200435 unsigned int delta_next_us = ktime_to_us(delta_next);
436
Rafael J. Wysocki45f1ff52018-03-22 17:50:49 +0100437 *stop_tick = false;
438
Rafael J. Wysocki5ef499c2018-08-14 12:34:40 +0200439 if (idx > 0 && drv->states[idx].target_residency > delta_next_us) {
Rafael J. Wysocki296bb1e2018-04-05 19:12:34 +0200440 /*
441 * The tick is not going to be stopped and the target
442 * residency of the state to be returned is not within
443 * the time until the next timer event including the
444 * tick, so try to correct that.
445 */
446 for (i = idx - 1; i >= 0; i--) {
Rafael J. Wysockif390c5e2018-08-14 12:39:02 +0200447 if (drv->states[i].disabled ||
448 dev->states_usage[i].disable)
Rafael J. Wysocki296bb1e2018-04-05 19:12:34 +0200449 continue;
450
451 idx = i;
452 if (drv->states[i].target_residency <= delta_next_us)
453 break;
454 }
455 }
456 }
457
Rafael J. Wysocki5ef499c2018-08-14 12:34:40 +0200458out:
Nicholas Piggin3ed09c92017-06-26 15:38:15 +1000459 data->last_state_idx = idx;
460
Arjan van de Ven69d25872009-09-21 17:04:08 -0700461 return data->last_state_idx;
Len Brown4f86d3a2007-10-03 18:58:00 -0400462}
463
/**
 * menu_reflect - records that data structures need update
 * @dev: the CPU
 * @index: the index of the entered state
 *
 * NOTE: it's important to be fast here because this operation will add to
 * the overall exit latency.
 */
static void menu_reflect(struct cpuidle_device *dev, int index)
{
	struct menu_device *data = this_cpu_ptr(&menu_devices);

	data->last_state_idx = index;
	data->needs_update = 1;
	data->tick_wakeup = tick_nohz_idle_got_tick();
}

/**
 * menu_update - attempts to guess what happened after entry
 * @drv: cpuidle driver containing state data
 * @dev: the CPU
 */
static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
{
	struct menu_device *data = this_cpu_ptr(&menu_devices);
	int last_idx = data->last_state_idx;
	struct cpuidle_state *target = &drv->states[last_idx];
	unsigned int measured_us;
	unsigned int new_factor;

	/*
	 * Try to figure out how much time passed between entry to low
	 * power state and occurrence of the wakeup event.
	 *
	 * If the entered idle state didn't support residency measurements,
	 * we use the measured value anyway if it is short, and if it is
	 * long, we truncate it to the whole expected time.
	 *
	 * Any measured amount of time will include the exit latency.
	 * Since we are interested in when the wakeup began, not when it
	 * was completed, we must subtract the exit latency. However, if
	 * the measured amount of time is less than the exit latency,
	 * assume the state was never reached and the exit latency is 0.
	 */

	if (data->tick_wakeup && data->next_timer_us > TICK_USEC) {
		/*
		 * The nohz code said that there wouldn't be any events within
		 * the tick boundary (if the tick was stopped), but the idle
		 * duration predictor had a differing opinion. Since the CPU
		 * was woken up by a tick (that wasn't stopped after all), the
		 * predictor was not quite right, so assume that the CPU could
		 * have been idle long (but not forever) to help the idle
		 * duration predictor do a better job next time.
		 */
		measured_us = 9 * MAX_INTERESTING / 10;
	} else {
		/* measured value */
		measured_us = cpuidle_get_last_residency(dev);

		/* Deduct exit latency */
		if (measured_us > 2 * target->exit_latency)
			measured_us -= target->exit_latency;
		else
			measured_us /= 2;
	}

	/* Make sure our coefficients do not exceed unity */
	if (measured_us > data->next_timer_us)
		measured_us = data->next_timer_us;

	/* Update our correction ratio */
	new_factor = data->correction_factor[data->bucket];
	new_factor -= new_factor / DECAY;

	if (data->next_timer_us > 0 && measured_us < MAX_INTERESTING)
		new_factor += RESOLUTION * measured_us / data->next_timer_us;
	else
		/*
		 * we were idle so long that we count it as a perfect
		 * prediction
		 */
		new_factor += RESOLUTION;
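	/*
	 * Illustrative numbers: at the unity factor RESOLUTION * DECAY == 8192,
	 * a wakeup exactly at the predicted timer (measured_us == next_timer_us)
	 * removes 8192 / DECAY == 1024 and adds RESOLUTION == 1024 back, so the
	 * factor stays at unity; consistently waking up at half the timer
	 * distance would instead decay it towards 4096, i.e. 0.5.
	 */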

	/*
	 * We don't want 0 as factor; we always want at least
	 * a tiny bit of estimated time. Fortunately, due to rounding,
	 * new_factor will stay nonzero regardless of measured_us values
	 * and the compiler can eliminate this test as long as DECAY > 1.
	 */
	if (DECAY == 1 && unlikely(new_factor == 0))
		new_factor = 1;

	data->correction_factor[data->bucket] = new_factor;

	/* update the repeating-pattern data */
	data->intervals[data->interval_ptr++] = measured_us;
	if (data->interval_ptr >= INTERVALS)
		data->interval_ptr = 0;
}

/**
 * menu_enable_device - scans a CPU's states and does setup
 * @drv: cpuidle driver
 * @dev: the CPU
 */
static int menu_enable_device(struct cpuidle_driver *drv,
				struct cpuidle_device *dev)
{
	struct menu_device *data = &per_cpu(menu_devices, dev->cpu);
	int i;

	memset(data, 0, sizeof(struct menu_device));

	/*
	 * if the correction factor is 0 (eg first time init or cpu hotplug
	 * etc), we actually want to start out with a unity factor.
	 */
	for (i = 0; i < BUCKETS; i++)
		data->correction_factor[i] = RESOLUTION * DECAY;

	return 0;
}

static struct cpuidle_governor menu_governor = {
	.name =		"menu",
	.rating =	20,
	.enable =	menu_enable_device,
	.select =	menu_select,
	.reflect =	menu_reflect,
};

/**
 * init_menu - initializes the governor
 */
static int __init init_menu(void)
{
	return cpuidle_register_governor(&menu_governor);
}

postcore_initcall(init_menu);