blob: dc669e370ee58a0c2d8ece4670ce36dd7426260b [file] [log] [blame]
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001/*
2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3 *
4 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5 *
6 * Interactivity improvements by Mike Galbraith
7 * (C) 2007 Mike Galbraith <efault@gmx.de>
8 *
9 * Various enhancements by Dmitry Adamushko.
10 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11 *
12 * Group scheduling enhancements by Srivatsa Vaddagiri
13 * Copyright IBM Corporation, 2007
14 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15 *
16 * Scaled math optimizations by Thomas Gleixner
17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
Peter Zijlstra21805082007-08-25 18:41:53 +020018 *
19 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
Peter Zijlstra90eec102015-11-16 11:08:45 +010020 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020021 */
22
Christian Ehrhardt1983a922009-11-30 12:16:47 +010023#include <linux/sched.h>
Mel Gormancb251762016-02-05 09:08:36 +000024#include <linux/latencytop.h>
Sisir Koppaka3436ae12011-03-26 18:22:55 +053025#include <linux/cpumask.h>
Nicolas Pitre83a0a962014-09-04 11:32:10 -040026#include <linux/cpuidle.h>
Peter Zijlstra029632f2011-10-25 10:00:11 +020027#include <linux/slab.h>
28#include <linux/profile.h>
29#include <linux/interrupt.h>
Peter Zijlstracbee9f82012-10-25 14:16:43 +020030#include <linux/mempolicy.h>
Mel Gormane14808b2012-11-19 10:59:15 +000031#include <linux/migrate.h>
Peter Zijlstracbee9f82012-10-25 14:16:43 +020032#include <linux/task_work.h>
Srivatsa Vaddagiri26c21542016-05-31 09:08:38 -070033#include <linux/module.h>
Peter Zijlstra029632f2011-10-25 10:00:11 +020034
Peter Zijlstra029632f2011-10-25 10:00:11 +020035#include "sched.h"
Patrick Bellasiedd28d32015-07-07 15:33:20 +010036#include "tune.h"
Joonwoo Parkf7d6cd42017-01-17 15:19:43 -080037#include "walt.h"
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070038#include <trace/events/sched.h>
Arjan van de Ven97455122008-01-25 21:08:34 +010039
Joonwoo Parkf7d6cd42017-01-17 15:19:43 -080040#ifdef CONFIG_SCHED_WALT
Pavankumar Kondeti39c695e2017-07-20 16:05:51 +053041
Syed Rameez Mustafa20acfe72017-01-30 09:35:46 +053042static inline bool task_fits_max(struct task_struct *p, int cpu);
Pavankumar Kondeti39c695e2017-07-20 16:05:51 +053043static void walt_fixup_sched_stats_fair(struct rq *rq, struct task_struct *p,
44 u32 new_task_load, u32 new_pred_demand);
45static void walt_fixup_nr_big_tasks(struct rq *rq, struct task_struct *p,
46 int delta, bool inc);
47#endif /* CONFIG_SCHED_WALT */
48
49#if defined(CONFIG_SCHED_WALT) && defined(CONFIG_CFS_BANDWIDTH)
50
51static void walt_init_cfs_rq_stats(struct cfs_rq *cfs_rq);
52static void walt_inc_cfs_rq_stats(struct cfs_rq *cfs_rq,
53 struct task_struct *p);
54static void walt_dec_cfs_rq_stats(struct cfs_rq *cfs_rq,
55 struct task_struct *p);
56static void walt_inc_throttled_cfs_rq_stats(struct walt_sched_stats *stats,
57 struct cfs_rq *cfs_rq);
58static void walt_dec_throttled_cfs_rq_stats(struct walt_sched_stats *stats,
59 struct cfs_rq *cfs_rq);
60#else
61static inline void walt_init_cfs_rq_stats(struct cfs_rq *cfs_rq) {}
62static inline void
63walt_inc_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p) {}
64static inline void
65walt_dec_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p) {}
66
67#define walt_inc_throttled_cfs_rq_stats(...)
68#define walt_dec_throttled_cfs_rq_stats(...)
69
Syed Rameez Mustafa20acfe72017-01-30 09:35:46 +053070#endif
71
Pavankumar Kondeti39c695e2017-07-20 16:05:51 +053072
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020073/*
Peter Zijlstra21805082007-08-25 18:41:53 +020074 * Targeted preemption latency for CPU-bound tasks:
Takuya Yoshikawa864616e2010-10-14 16:09:13 +090075 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020076 *
Peter Zijlstra21805082007-08-25 18:41:53 +020077 * NOTE: this latency value is not the same as the concept of
Ingo Molnard274a4c2007-10-15 17:00:14 +020078 * 'timeslice length' - timeslices in CFS are of variable length
79 * and have no persistent notion like in traditional, time-slice
80 * based scheduling concepts.
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020081 *
Ingo Molnard274a4c2007-10-15 17:00:14 +020082 * (to see the precise effective timeslice length of your workload,
83 * run vmstat and monitor the context-switches (cs) field)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020084 */
Mike Galbraith21406922010-03-11 17:17:15 +010085unsigned int sysctl_sched_latency = 6000000ULL;
86unsigned int normalized_sysctl_sched_latency = 6000000ULL;
Ingo Molnar2bd8e6d2007-10-15 17:00:02 +020087
Joonwoo Park2a308ae2017-01-04 15:13:08 -080088unsigned int sysctl_sched_is_big_little = 1;
Juri Lelli1931b932016-07-29 14:04:11 +010089unsigned int sysctl_sched_sync_hint_enable = 1;
Srinath Sridharanbf47bdd2016-07-14 09:57:29 +010090unsigned int sysctl_sched_cstate_aware = 1;
Joonwoo Park858d5752017-08-21 12:09:49 -070091DEFINE_PER_CPU_READ_MOSTLY(int, sched_load_boost);
Juri Lelli1931b932016-07-29 14:04:11 +010092
Srivatsa Vaddagiri26c21542016-05-31 09:08:38 -070093#ifdef CONFIG_SCHED_WALT
94unsigned int sysctl_sched_use_walt_cpu_util = 1;
95unsigned int sysctl_sched_use_walt_task_util = 1;
96#endif
Ingo Molnar2bd8e6d2007-10-15 17:00:02 +020097/*
Christian Ehrhardt1983a922009-11-30 12:16:47 +010098 * The initial- and re-scaling of tunables is configurable
99 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
100 *
101 * Options are:
102 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
103 * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
104 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
105 */
106enum sched_tunable_scaling sysctl_sched_tunable_scaling
107 = SCHED_TUNABLESCALING_LOG;
108
109/*
Peter Zijlstrab2be5e92007-11-09 22:39:37 +0100110 * Minimal preemption granularity for CPU-bound tasks:
Takuya Yoshikawa864616e2010-10-14 16:09:13 +0900111 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
Peter Zijlstrab2be5e92007-11-09 22:39:37 +0100112 */
Ingo Molnar0bf377b2010-09-12 08:14:52 +0200113unsigned int sysctl_sched_min_granularity = 750000ULL;
114unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
Peter Zijlstrab2be5e92007-11-09 22:39:37 +0100115
116/*
117 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
118 */
Ingo Molnar0bf377b2010-09-12 08:14:52 +0200119static unsigned int sched_nr_latency = 8;
Peter Zijlstrab2be5e92007-11-09 22:39:37 +0100120
121/*
Mike Galbraith2bba22c2009-09-09 15:41:37 +0200122 * After fork, child runs first. If set to 0 (default) then
Ingo Molnar2bd8e6d2007-10-15 17:00:02 +0200123 * parent will (try to) run first.
124 */
Mike Galbraith2bba22c2009-09-09 15:41:37 +0200125unsigned int sysctl_sched_child_runs_first __read_mostly;
Peter Zijlstra21805082007-08-25 18:41:53 +0200126
127/*
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200128 * SCHED_OTHER wake-up granularity.
Mike Galbraith172e0822009-09-09 15:41:37 +0200129 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200130 *
131 * This option delays the preemption effects of decoupled workloads
132 * and reduces their over-scheduling. Synchronous workloads will still
133 * have immediate wakeup/sleep latencies.
134 */
Mike Galbraith172e0822009-09-09 15:41:37 +0200135unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
Christian Ehrhardt0bcdcf22009-11-30 12:16:46 +0100136unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200137
Ingo Molnarda84d962007-10-15 17:00:18 +0200138const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
139
Paul Turnera7a4f8a2010-11-15 15:47:06 -0800140/*
141 * The exponential sliding window over which load is averaged for shares
142 * distribution.
143 * (default: 10msec)
144 */
145unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
146
Paul Turnerec12cb72011-07-21 09:43:30 -0700147#ifdef CONFIG_CFS_BANDWIDTH
148/*
149 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
150 * each time a cfs_rq requests quota.
151 *
152 * Note: in the case that the slice exceeds the runtime remaining (either due
153 * to consumption or the quota being specified to be smaller than the slice)
154 * we will always only issue the remaining available time.
155 *
156 * default: 5 msec, units: microseconds
157 */
158unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
159#endif
160
Morten Rasmussen32731632016-07-25 14:34:26 +0100161/*
162 * The margin used when comparing utilization with CPU capacity:
Morten Rasmussen5dbcdde2016-10-14 14:41:12 +0100163 * util * margin < capacity * 1024
Morten Rasmussen32731632016-07-25 14:34:26 +0100164 */
Joonwoo Parkb02fc002017-06-16 11:58:58 -0700165unsigned int sysctl_sched_capacity_margin = 1078; /* ~5% margin */
166unsigned int sysctl_sched_capacity_margin_down = 1205; /* ~15% margin */
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +0530167#define capacity_margin sysctl_sched_capacity_margin
Morten Rasmussen32731632016-07-25 14:34:26 +0100168
Abhijeet Dharmapurikar53ee4232018-06-15 09:34:34 -0700169#ifdef CONFIG_SCHED_WALT
170unsigned int sysctl_sched_min_task_util_for_boost_colocation;
171#endif
Pavankumar Kondeti916e7032019-03-08 10:04:40 +0530172static unsigned int __maybe_unused sched_small_task_threshold = 102;
Abhijeet Dharmapurikar53ee4232018-06-15 09:34:34 -0700173
Paul Gortmaker85276322013-04-19 15:10:50 -0400174static inline void update_load_add(struct load_weight *lw, unsigned long inc)
175{
176 lw->weight += inc;
177 lw->inv_weight = 0;
178}
179
180static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
181{
182 lw->weight -= dec;
183 lw->inv_weight = 0;
184}
185
186static inline void update_load_set(struct load_weight *lw, unsigned long w)
187{
188 lw->weight = w;
189 lw->inv_weight = 0;
190}
191
Peter Zijlstra029632f2011-10-25 10:00:11 +0200192/*
193 * Increase the granularity value when there are more CPUs,
194 * because with more CPUs the 'effective latency' as visible
195 * to users decreases. But the relationship is not linear,
196 * so pick a second-best guess by going with the log2 of the
197 * number of CPUs.
198 *
199 * This idea comes from the SD scheduler of Con Kolivas:
200 */
Nicholas Mc Guire58ac93e2015-05-15 21:05:42 +0200201static unsigned int get_update_sysctl_factor(void)
Peter Zijlstra029632f2011-10-25 10:00:11 +0200202{
Nicholas Mc Guire58ac93e2015-05-15 21:05:42 +0200203 unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
Peter Zijlstra029632f2011-10-25 10:00:11 +0200204 unsigned int factor;
205
206 switch (sysctl_sched_tunable_scaling) {
207 case SCHED_TUNABLESCALING_NONE:
208 factor = 1;
209 break;
210 case SCHED_TUNABLESCALING_LINEAR:
211 factor = cpus;
212 break;
213 case SCHED_TUNABLESCALING_LOG:
214 default:
215 factor = 1 + ilog2(cpus);
216 break;
217 }
218
219 return factor;
220}
221
222static void update_sysctl(void)
223{
224 unsigned int factor = get_update_sysctl_factor();
225
226#define SET_SYSCTL(name) \
227 (sysctl_##name = (factor) * normalized_sysctl_##name)
228 SET_SYSCTL(sched_min_granularity);
229 SET_SYSCTL(sched_latency);
230 SET_SYSCTL(sched_wakeup_granularity);
231#undef SET_SYSCTL
232}
233
234void sched_init_granularity(void)
235{
236 update_sysctl();
237}
238
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100239#define WMULT_CONST (~0U)
Peter Zijlstra029632f2011-10-25 10:00:11 +0200240#define WMULT_SHIFT 32
241
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100242static void __update_inv_weight(struct load_weight *lw)
Peter Zijlstra029632f2011-10-25 10:00:11 +0200243{
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100244 unsigned long w;
Peter Zijlstra029632f2011-10-25 10:00:11 +0200245
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100246 if (likely(lw->inv_weight))
247 return;
248
249 w = scale_load_down(lw->weight);
250
251 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
252 lw->inv_weight = 1;
253 else if (unlikely(!w))
254 lw->inv_weight = WMULT_CONST;
Peter Zijlstra029632f2011-10-25 10:00:11 +0200255 else
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100256 lw->inv_weight = WMULT_CONST / w;
257}
Peter Zijlstra029632f2011-10-25 10:00:11 +0200258
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100259/*
260 * delta_exec * weight / lw.weight
261 * OR
262 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
263 *
Yuyang Du1c3de5e2016-03-30 07:07:51 +0800264 * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100265 * we're guaranteed shift stays positive because inv_weight is guaranteed to
266 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
267 *
268 * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
269 * weight/lw.weight <= 1, and therefore our shift will also be positive.
270 */
271static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
272{
273 u64 fact = scale_load_down(weight);
274 int shift = WMULT_SHIFT;
Peter Zijlstra029632f2011-10-25 10:00:11 +0200275
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100276 __update_inv_weight(lw);
277
278 if (unlikely(fact >> 32)) {
279 while (fact >> 32) {
280 fact >>= 1;
281 shift--;
282 }
Peter Zijlstra029632f2011-10-25 10:00:11 +0200283 }
284
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100285 /* hint to use a 32x32->64 mul */
286 fact = (u64)(u32)fact * lw->inv_weight;
Peter Zijlstra029632f2011-10-25 10:00:11 +0200287
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100288 while (fact >> 32) {
289 fact >>= 1;
290 shift--;
291 }
292
293 return mul_u64_u32_shr(delta_exec, fact, shift);
Peter Zijlstra029632f2011-10-25 10:00:11 +0200294}
295
Peter Zijlstra029632f2011-10-25 10:00:11 +0200296const struct sched_class fair_sched_class;
Peter Zijlstraa4c2f002008-10-17 19:27:03 +0200297
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200298/**************************************************************
299 * CFS operations on generic schedulable entities:
300 */
301
302#ifdef CONFIG_FAIR_GROUP_SCHED
303
304/* cpu runqueue to which this cfs_rq is attached */
305static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
306{
307 return cfs_rq->rq;
308}
309
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200310/* An entity is a task if it doesn't "own" a runqueue */
311#define entity_is_task(se) (!se->my_q)
312
Peter Zijlstra8f488942009-07-24 12:25:30 +0200313static inline struct task_struct *task_of(struct sched_entity *se)
314{
Peter Zijlstra9148a3a2016-09-20 22:34:51 +0200315 SCHED_WARN_ON(!entity_is_task(se));
Peter Zijlstra8f488942009-07-24 12:25:30 +0200316 return container_of(se, struct task_struct, se);
317}
318
Peter Zijlstrab7581492008-04-19 19:45:00 +0200319/* Walk up scheduling entities hierarchy */
320#define for_each_sched_entity(se) \
321 for (; se; se = se->parent)
322
323static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
324{
325 return p->se.cfs_rq;
326}
327
328/* runqueue on which this entity is (to be) queued */
329static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
330{
331 return se->cfs_rq;
332}
333
334/* runqueue "owned" by this group */
335static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
336{
337 return grp->my_q;
338}
339
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -0800340static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
341{
342 if (!cfs_rq->on_list) {
Vincent Guittot96956e22016-11-08 10:53:44 +0100343 struct rq *rq = rq_of(cfs_rq);
344 int cpu = cpu_of(rq);
Paul Turner67e86252010-11-15 15:47:05 -0800345 /*
346 * Ensure we either appear before our parent (if already
347 * enqueued) or force our parent to appear after us when it is
Vincent Guittot96956e22016-11-08 10:53:44 +0100348 * enqueued. The fact that we always enqueue bottom-up
349 * reduces this to two cases and a special case for the root
350 * cfs_rq. Furthermore, it also means that we will always reset
351 * tmp_alone_branch either when the branch is connected
352 * to a tree or when we reach the beg of the tree
Paul Turner67e86252010-11-15 15:47:05 -0800353 */
354 if (cfs_rq->tg->parent &&
Vincent Guittot96956e22016-11-08 10:53:44 +0100355 cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
356 /*
357 * If parent is already on the list, we add the child
358 * just before. Thanks to circular linked property of
359 * the list, this means to put the child at the tail
360 * of the list that starts by parent.
361 */
Paul Turner67e86252010-11-15 15:47:05 -0800362 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
Vincent Guittot96956e22016-11-08 10:53:44 +0100363 &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
364 /*
365 * The branch is now connected to its tree so we can
366 * reset tmp_alone_branch to the beginning of the
367 * list.
368 */
369 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
370 } else if (!cfs_rq->tg->parent) {
371 /*
372 * cfs rq without parent should be put
373 * at the tail of the list.
374 */
375 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
376 &rq->leaf_cfs_rq_list);
377 /*
378 * We have reach the beg of a tree so we can reset
379 * tmp_alone_branch to the beginning of the list.
380 */
381 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
382 } else {
383 /*
384 * The parent has not already been added so we want to
385 * make sure that it will be put after us.
386 * tmp_alone_branch points to the beg of the branch
387 * where we will add parent.
388 */
389 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
390 rq->tmp_alone_branch);
391 /*
392 * update tmp_alone_branch to points to the new beg
393 * of the branch
394 */
395 rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
Paul Turner67e86252010-11-15 15:47:05 -0800396 }
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -0800397
398 cfs_rq->on_list = 1;
399 }
400}
401
402static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
403{
404 if (cfs_rq->on_list) {
405 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
406 cfs_rq->on_list = 0;
407 }
408}
409
Peter Zijlstrab7581492008-04-19 19:45:00 +0200410/* Iterate thr' all leaf cfs_rq's on a runqueue */
411#define for_each_leaf_cfs_rq(rq, cfs_rq) \
412 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
413
414/* Do the two (enqueued) entities belong to the same group ? */
Peter Zijlstrafed14d42012-02-11 06:05:00 +0100415static inline struct cfs_rq *
Peter Zijlstrab7581492008-04-19 19:45:00 +0200416is_same_group(struct sched_entity *se, struct sched_entity *pse)
417{
418 if (se->cfs_rq == pse->cfs_rq)
Peter Zijlstrafed14d42012-02-11 06:05:00 +0100419 return se->cfs_rq;
Peter Zijlstrab7581492008-04-19 19:45:00 +0200420
Peter Zijlstrafed14d42012-02-11 06:05:00 +0100421 return NULL;
Peter Zijlstrab7581492008-04-19 19:45:00 +0200422}
423
424static inline struct sched_entity *parent_entity(struct sched_entity *se)
425{
426 return se->parent;
427}
428
Peter Zijlstra464b7522008-10-24 11:06:15 +0200429static void
430find_matching_se(struct sched_entity **se, struct sched_entity **pse)
431{
432 int se_depth, pse_depth;
433
434 /*
435 * preemption test can be made between sibling entities who are in the
436 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
437 * both tasks until we find their ancestors who are siblings of common
438 * parent.
439 */
440
441 /* First walk up until both entities are at same depth */
Peter Zijlstrafed14d42012-02-11 06:05:00 +0100442 se_depth = (*se)->depth;
443 pse_depth = (*pse)->depth;
Peter Zijlstra464b7522008-10-24 11:06:15 +0200444
445 while (se_depth > pse_depth) {
446 se_depth--;
447 *se = parent_entity(*se);
448 }
449
450 while (pse_depth > se_depth) {
451 pse_depth--;
452 *pse = parent_entity(*pse);
453 }
454
455 while (!is_same_group(*se, *pse)) {
456 *se = parent_entity(*se);
457 *pse = parent_entity(*pse);
458 }
459}
460
Peter Zijlstra8f488942009-07-24 12:25:30 +0200461#else /* !CONFIG_FAIR_GROUP_SCHED */
462
463static inline struct task_struct *task_of(struct sched_entity *se)
464{
465 return container_of(se, struct task_struct, se);
466}
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200467
468static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
469{
470 return container_of(cfs_rq, struct rq, cfs);
471}
472
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200473#define entity_is_task(se) 1
474
Peter Zijlstrab7581492008-04-19 19:45:00 +0200475#define for_each_sched_entity(se) \
476 for (; se; se = NULL)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200477
Peter Zijlstrab7581492008-04-19 19:45:00 +0200478static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200479{
Peter Zijlstrab7581492008-04-19 19:45:00 +0200480 return &task_rq(p)->cfs;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200481}
482
Peter Zijlstrab7581492008-04-19 19:45:00 +0200483static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
484{
485 struct task_struct *p = task_of(se);
486 struct rq *rq = task_rq(p);
487
488 return &rq->cfs;
489}
490
491/* runqueue "owned" by this group */
492static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
493{
494 return NULL;
495}
496
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -0800497static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
498{
499}
500
501static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
502{
503}
504
Peter Zijlstrab7581492008-04-19 19:45:00 +0200505#define for_each_leaf_cfs_rq(rq, cfs_rq) \
506 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
507
Peter Zijlstrab7581492008-04-19 19:45:00 +0200508static inline struct sched_entity *parent_entity(struct sched_entity *se)
509{
510 return NULL;
511}
512
Peter Zijlstra464b7522008-10-24 11:06:15 +0200513static inline void
514find_matching_se(struct sched_entity **se, struct sched_entity **pse)
515{
516}
517
Peter Zijlstrab7581492008-04-19 19:45:00 +0200518#endif /* CONFIG_FAIR_GROUP_SCHED */
519
Peter Zijlstra6c16a6d2012-03-21 13:07:16 -0700520static __always_inline
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100521void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200522
523/**************************************************************
524 * Scheduling class tree data structure manipulation methods:
525 */
526
Andrei Epure1bf08232013-03-12 21:12:24 +0200527static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
Peter Zijlstra02e04312007-10-15 17:00:07 +0200528{
Andrei Epure1bf08232013-03-12 21:12:24 +0200529 s64 delta = (s64)(vruntime - max_vruntime);
Peter Zijlstra368059a2007-10-15 17:00:11 +0200530 if (delta > 0)
Andrei Epure1bf08232013-03-12 21:12:24 +0200531 max_vruntime = vruntime;
Peter Zijlstra02e04312007-10-15 17:00:07 +0200532
Andrei Epure1bf08232013-03-12 21:12:24 +0200533 return max_vruntime;
Peter Zijlstra02e04312007-10-15 17:00:07 +0200534}
535
Ingo Molnar0702e3e2007-10-15 17:00:14 +0200536static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
Peter Zijlstrab0ffd242007-10-15 17:00:12 +0200537{
538 s64 delta = (s64)(vruntime - min_vruntime);
539 if (delta < 0)
540 min_vruntime = vruntime;
541
542 return min_vruntime;
543}
544
Fabio Checconi54fdc582009-07-16 12:32:27 +0200545static inline int entity_before(struct sched_entity *a,
546 struct sched_entity *b)
547{
548 return (s64)(a->vruntime - b->vruntime) < 0;
549}
550
Peter Zijlstra1af5f732008-10-24 11:06:13 +0200551static void update_min_vruntime(struct cfs_rq *cfs_rq)
552{
Peter Zijlstrab60205c2016-09-20 21:58:12 +0200553 struct sched_entity *curr = cfs_rq->curr;
554
Peter Zijlstra1af5f732008-10-24 11:06:13 +0200555 u64 vruntime = cfs_rq->min_vruntime;
556
Peter Zijlstrab60205c2016-09-20 21:58:12 +0200557 if (curr) {
558 if (curr->on_rq)
559 vruntime = curr->vruntime;
560 else
561 curr = NULL;
562 }
Peter Zijlstra1af5f732008-10-24 11:06:13 +0200563
564 if (cfs_rq->rb_leftmost) {
565 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
566 struct sched_entity,
567 run_node);
568
Peter Zijlstrab60205c2016-09-20 21:58:12 +0200569 if (!curr)
Peter Zijlstra1af5f732008-10-24 11:06:13 +0200570 vruntime = se->vruntime;
571 else
572 vruntime = min_vruntime(vruntime, se->vruntime);
573 }
574
Andrei Epure1bf08232013-03-12 21:12:24 +0200575 /* ensure we never gain time by being placed backwards. */
Peter Zijlstra1af5f732008-10-24 11:06:13 +0200576 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
Peter Zijlstra3fe16982011-04-05 17:23:48 +0200577#ifndef CONFIG_64BIT
578 smp_wmb();
579 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
580#endif
Peter Zijlstra1af5f732008-10-24 11:06:13 +0200581}
582
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200583/*
584 * Enqueue an entity into the rb-tree:
585 */
Ingo Molnar0702e3e2007-10-15 17:00:14 +0200586static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200587{
588 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
589 struct rb_node *parent = NULL;
590 struct sched_entity *entry;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200591 int leftmost = 1;
592
593 /*
594 * Find the right place in the rbtree:
595 */
596 while (*link) {
597 parent = *link;
598 entry = rb_entry(parent, struct sched_entity, run_node);
599 /*
600 * We dont care about collisions. Nodes with
601 * the same key stay together.
602 */
Stephan Baerwolf2bd2d6f2011-07-20 14:46:59 +0200603 if (entity_before(se, entry)) {
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200604 link = &parent->rb_left;
605 } else {
606 link = &parent->rb_right;
607 leftmost = 0;
608 }
609 }
610
611 /*
612 * Maintain a cache of leftmost tree entries (it is frequently
613 * used):
614 */
Peter Zijlstra1af5f732008-10-24 11:06:13 +0200615 if (leftmost)
Ingo Molnar57cb4992007-10-15 17:00:11 +0200616 cfs_rq->rb_leftmost = &se->run_node;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200617
618 rb_link_node(&se->run_node, parent, link);
619 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200620}
621
Ingo Molnar0702e3e2007-10-15 17:00:14 +0200622static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200623{
Peter Zijlstra3fe69742008-03-14 20:55:51 +0100624 if (cfs_rq->rb_leftmost == &se->run_node) {
625 struct rb_node *next_node;
Peter Zijlstra3fe69742008-03-14 20:55:51 +0100626
627 next_node = rb_next(&se->run_node);
628 cfs_rq->rb_leftmost = next_node;
Peter Zijlstra3fe69742008-03-14 20:55:51 +0100629 }
Ingo Molnare9acbff2007-10-15 17:00:04 +0200630
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200631 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200632}
633
Peter Zijlstra029632f2011-10-25 10:00:11 +0200634struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200635{
Peter Zijlstraf4b67552008-11-04 21:25:07 +0100636 struct rb_node *left = cfs_rq->rb_leftmost;
637
638 if (!left)
639 return NULL;
640
641 return rb_entry(left, struct sched_entity, run_node);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200642}
643
Rik van Rielac53db52011-02-01 09:51:03 -0500644static struct sched_entity *__pick_next_entity(struct sched_entity *se)
645{
646 struct rb_node *next = rb_next(&se->run_node);
647
648 if (!next)
649 return NULL;
650
651 return rb_entry(next, struct sched_entity, run_node);
652}
653
654#ifdef CONFIG_SCHED_DEBUG
Peter Zijlstra029632f2011-10-25 10:00:11 +0200655struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
Peter Zijlstraaeb73b02007-10-15 17:00:05 +0200656{
Ingo Molnar7eee3e62008-02-22 10:32:21 +0100657 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
Peter Zijlstraaeb73b02007-10-15 17:00:05 +0200658
Balbir Singh70eee742008-02-22 13:25:53 +0530659 if (!last)
660 return NULL;
Ingo Molnar7eee3e62008-02-22 10:32:21 +0100661
662 return rb_entry(last, struct sched_entity, run_node);
Peter Zijlstraaeb73b02007-10-15 17:00:05 +0200663}
664
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200665/**************************************************************
666 * Scheduling class statistics methods:
667 */
668
Christian Ehrhardtacb4a842009-11-30 12:16:48 +0100669int sched_proc_update_handler(struct ctl_table *table, int write,
Alexey Dobriyan8d65af72009-09-23 15:57:19 -0700670 void __user *buffer, size_t *lenp,
Peter Zijlstrab2be5e92007-11-09 22:39:37 +0100671 loff_t *ppos)
672{
Alexey Dobriyan8d65af72009-09-23 15:57:19 -0700673 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
Nicholas Mc Guire58ac93e2015-05-15 21:05:42 +0200674 unsigned int factor = get_update_sysctl_factor();
Peter Zijlstrab2be5e92007-11-09 22:39:37 +0100675
676 if (ret || !write)
677 return ret;
678
679 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
680 sysctl_sched_min_granularity);
681
Christian Ehrhardtacb4a842009-11-30 12:16:48 +0100682#define WRT_SYSCTL(name) \
683 (normalized_sysctl_##name = sysctl_##name / (factor))
684 WRT_SYSCTL(sched_min_granularity);
685 WRT_SYSCTL(sched_latency);
686 WRT_SYSCTL(sched_wakeup_granularity);
Christian Ehrhardtacb4a842009-11-30 12:16:48 +0100687#undef WRT_SYSCTL
688
Peter Zijlstrab2be5e92007-11-09 22:39:37 +0100689 return 0;
690}
691#endif
Ingo Molnar647e7ca2007-10-15 17:00:13 +0200692
693/*
Peter Zijlstraf9c0b092008-10-17 19:27:04 +0200694 * delta /= w
Peter Zijlstraa7be37a2008-06-27 13:41:11 +0200695 */
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100696static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
Peter Zijlstraa7be37a2008-06-27 13:41:11 +0200697{
Peter Zijlstraf9c0b092008-10-17 19:27:04 +0200698 if (unlikely(se->load.weight != NICE_0_LOAD))
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100699 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
Peter Zijlstraa7be37a2008-06-27 13:41:11 +0200700
701 return delta;
702}
703
704/*
Ingo Molnar647e7ca2007-10-15 17:00:13 +0200705 * The idea is to set a period in which each task runs once.
706 *
Borislav Petkov532b1852012-08-08 16:16:04 +0200707 * When there are too many tasks (sched_nr_latency) we have to stretch
Ingo Molnar647e7ca2007-10-15 17:00:13 +0200708 * this period because otherwise the slices get too small.
709 *
710 * p = (nr <= nl) ? l : l*nr/nl
711 */
Peter Zijlstra4d78e7b2007-10-15 17:00:04 +0200712static u64 __sched_period(unsigned long nr_running)
713{
Boqun Feng8e2b0bf2015-07-02 22:25:52 +0800714 if (unlikely(nr_running > sched_nr_latency))
715 return nr_running * sysctl_sched_min_granularity;
716 else
717 return sysctl_sched_latency;
Peter Zijlstra4d78e7b2007-10-15 17:00:04 +0200718}
719
Ingo Molnar647e7ca2007-10-15 17:00:13 +0200720/*
721 * We calculate the wall-time slice from the period by taking a part
722 * proportional to the weight.
723 *
Peter Zijlstraf9c0b092008-10-17 19:27:04 +0200724 * s = p*P[w/rw]
Ingo Molnar647e7ca2007-10-15 17:00:13 +0200725 */
Peter Zijlstra6d0f0ebd2007-10-15 17:00:05 +0200726static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
Peter Zijlstra21805082007-08-25 18:41:53 +0200727{
Mike Galbraith0a582442009-01-02 12:16:42 +0100728 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
Peter Zijlstraf9c0b092008-10-17 19:27:04 +0200729
Mike Galbraith0a582442009-01-02 12:16:42 +0100730 for_each_sched_entity(se) {
Lin Ming6272d682009-01-15 17:17:15 +0100731 struct load_weight *load;
Christian Engelmayer3104bf02009-06-16 10:35:12 +0200732 struct load_weight lw;
Lin Ming6272d682009-01-15 17:17:15 +0100733
734 cfs_rq = cfs_rq_of(se);
735 load = &cfs_rq->load;
Peter Zijlstraf9c0b092008-10-17 19:27:04 +0200736
Mike Galbraith0a582442009-01-02 12:16:42 +0100737 if (unlikely(!se->on_rq)) {
Christian Engelmayer3104bf02009-06-16 10:35:12 +0200738 lw = cfs_rq->load;
Mike Galbraith0a582442009-01-02 12:16:42 +0100739
740 update_load_add(&lw, se->load.weight);
741 load = &lw;
742 }
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100743 slice = __calc_delta(slice, se->load.weight, load);
Mike Galbraith0a582442009-01-02 12:16:42 +0100744 }
745 return slice;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200746}
747
Ingo Molnar647e7ca2007-10-15 17:00:13 +0200748/*
Andrei Epure660cc002013-03-11 12:03:20 +0200749 * We calculate the vruntime slice of a to-be-inserted task.
Ingo Molnar647e7ca2007-10-15 17:00:13 +0200750 *
Peter Zijlstraf9c0b092008-10-17 19:27:04 +0200751 * vs = s/w
Ingo Molnar647e7ca2007-10-15 17:00:13 +0200752 */
Peter Zijlstraf9c0b092008-10-17 19:27:04 +0200753static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnar647e7ca2007-10-15 17:00:13 +0200754{
Peter Zijlstraf9c0b092008-10-17 19:27:04 +0200755 return calc_delta_fair(sched_slice(cfs_rq, se), se);
Peter Zijlstraa7be37a2008-06-27 13:41:11 +0200756}
757
Alex Shia75cdaa2013-06-20 10:18:47 +0800758#ifdef CONFIG_SMP
Morten Rasmussen772bd008c2016-06-22 18:03:13 +0100759static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
Mel Gormanfb13c7e2013-10-07 11:29:17 +0100760static unsigned long task_h_load(struct task_struct *p);
761
Yuyang Du9d89c252015-07-15 08:04:37 +0800762/*
763 * We choose a half-life close to 1 scheduling period.
Leo Yan84fb5a12015-09-15 18:57:37 +0800764 * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
765 * dependent on this value.
Yuyang Du9d89c252015-07-15 08:04:37 +0800766 */
767#define LOAD_AVG_PERIOD 32
768#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
Leo Yan84fb5a12015-09-15 18:57:37 +0800769#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
Alex Shia75cdaa2013-06-20 10:18:47 +0800770
Yuyang Du540247f2015-07-15 08:04:39 +0800771/* Give new sched_entity start runnable values to heavy its load in infant time */
772void init_entity_runnable_average(struct sched_entity *se)
Alex Shia75cdaa2013-06-20 10:18:47 +0800773{
Yuyang Du540247f2015-07-15 08:04:39 +0800774 struct sched_avg *sa = &se->avg;
Alex Shia75cdaa2013-06-20 10:18:47 +0800775
Yuyang Du9d89c252015-07-15 08:04:37 +0800776 sa->last_update_time = 0;
777 /*
778 * sched_avg's period_contrib should be strictly less then 1024, so
779 * we give it 1023 to make sure it is almost a period (1024us), and
780 * will definitely be update (after enqueue).
781 */
782 sa->period_contrib = 1023;
Vincent Guittotb5a9b342016-10-19 14:45:23 +0200783 /*
784 * Tasks are intialized with full load to be seen as heavy tasks until
785 * they get a chance to stabilize to their real load level.
786 * Group entities are intialized with zero load to reflect the fact that
787 * nothing has been attached to the task group yet.
788 */
789 if (entity_is_task(se))
790 sa->load_avg = scale_load_down(se->load.weight);
Yuyang Du9d89c252015-07-15 08:04:37 +0800791 sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
Todd Kjosc6a6f3b2016-03-11 16:44:16 -0800792
Yuyang Du2b8c41d2016-03-30 04:30:56 +0800793 /*
794 * At this point, util_avg won't be used in select_task_rq_fair anyway
795 */
Viresh Kumar92550132017-11-02 15:13:26 +0530796 sa->util_avg = 0;
Yuyang Du2b8c41d2016-03-30 04:30:56 +0800797 sa->util_sum = 0;
Yuyang Du9d89c252015-07-15 08:04:37 +0800798 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
Alex Shia75cdaa2013-06-20 10:18:47 +0800799}
Yuyang Du7ea241a2015-07-15 08:04:42 +0800800
Peter Zijlstra7dc603c2016-06-16 13:29:28 +0200801static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
Vincent Guittot793cfff2016-11-08 10:53:42 +0100802static void attach_entity_cfs_rq(struct sched_entity *se);
Peter Zijlstra7dc603c2016-06-16 13:29:28 +0200803
Yuyang Du2b8c41d2016-03-30 04:30:56 +0800804/*
805 * With new tasks being created, their initial util_avgs are extrapolated
806 * based on the cfs_rq's current util_avg:
807 *
808 * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
809 *
810 * However, in many cases, the above util_avg does not give a desired
811 * value. Moreover, the sum of the util_avgs may be divergent, such
812 * as when the series is a harmonic series.
813 *
814 * To solve this problem, we also cap the util_avg of successive tasks to
815 * only 1/2 of the left utilization budget:
816 *
817 * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
818 *
819 * where n denotes the nth task.
820 *
821 * For example, a simplest series from the beginning would be like:
822 *
823 * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
824 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
825 *
826 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
827 * if util_avg > util_avg_cap.
828 */
829void post_init_entity_util_avg(struct sched_entity *se)
830{
831 struct cfs_rq *cfs_rq = cfs_rq_of(se);
832 struct sched_avg *sa = &se->avg;
Yuyang Du172895e2016-04-05 12:12:27 +0800833 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
Yuyang Du2b8c41d2016-03-30 04:30:56 +0800834
835 if (cap > 0) {
836 if (cfs_rq->avg.util_avg != 0) {
837 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
838 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
839
840 if (sa->util_avg > cap)
841 sa->util_avg = cap;
842 } else {
843 sa->util_avg = cap;
844 }
845 sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
846 }
Peter Zijlstra7dc603c2016-06-16 13:29:28 +0200847
848 if (entity_is_task(se)) {
849 struct task_struct *p = task_of(se);
850 if (p->sched_class != &fair_sched_class) {
851 /*
852 * For !fair tasks do:
853 *
854 update_cfs_rq_load_avg(now, cfs_rq, false);
855 attach_entity_load_avg(cfs_rq, se);
856 switched_from_fair(rq, p);
857 *
858 * such that the next switched_to_fair() has the
859 * expected state.
860 */
Vincent Guittot793cfff2016-11-08 10:53:42 +0100861 se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
Peter Zijlstra7dc603c2016-06-16 13:29:28 +0200862 return;
863 }
864 }
865
Vincent Guittot793cfff2016-11-08 10:53:42 +0100866 attach_entity_cfs_rq(se);
Yuyang Du2b8c41d2016-03-30 04:30:56 +0800867}
868
Peter Zijlstra7dc603c2016-06-16 13:29:28 +0200869#else /* !CONFIG_SMP */
Yuyang Du540247f2015-07-15 08:04:39 +0800870void init_entity_runnable_average(struct sched_entity *se)
Alex Shia75cdaa2013-06-20 10:18:47 +0800871{
872}
Yuyang Du2b8c41d2016-03-30 04:30:56 +0800873void post_init_entity_util_avg(struct sched_entity *se)
874{
875}
Peter Zijlstra3d30544f2016-06-21 14:27:50 +0200876static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
877{
878}
Peter Zijlstra7dc603c2016-06-16 13:29:28 +0200879#endif /* CONFIG_SMP */
Alex Shia75cdaa2013-06-20 10:18:47 +0800880
Peter Zijlstraa7be37a2008-06-27 13:41:11 +0200881/*
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100882 * Update the current task's runtime statistics.
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200883 */
Ingo Molnarb7cc0892007-08-09 11:16:47 +0200884static void update_curr(struct cfs_rq *cfs_rq)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200885{
Ingo Molnar429d43b2007-10-15 17:00:03 +0200886 struct sched_entity *curr = cfs_rq->curr;
Frederic Weisbecker78becc22013-04-12 01:51:02 +0200887 u64 now = rq_clock_task(rq_of(cfs_rq));
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100888 u64 delta_exec;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200889
890 if (unlikely(!curr))
891 return;
892
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100893 delta_exec = now - curr->exec_start;
894 if (unlikely((s64)delta_exec <= 0))
Peter Zijlstra34f28ec2008-12-16 08:45:31 +0100895 return;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200896
Ingo Molnar8ebc91d2007-10-15 17:00:03 +0200897 curr->exec_start = now;
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +0100898
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100899 schedstat_set(curr->statistics.exec_max,
900 max(delta_exec, curr->statistics.exec_max));
901
902 curr->sum_exec_runtime += delta_exec;
Josh Poimboeufae928822016-06-17 12:43:24 -0500903 schedstat_add(cfs_rq->exec_clock, delta_exec);
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100904
905 curr->vruntime += calc_delta_fair(delta_exec, curr);
906 update_min_vruntime(cfs_rq);
907
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +0100908 if (entity_is_task(curr)) {
909 struct task_struct *curtask = task_of(curr);
910
Ingo Molnarf977bb42009-09-13 18:15:54 +0200911 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +0100912 cpuacct_charge(curtask, delta_exec);
Frank Mayharf06febc2008-09-12 09:54:39 -0700913 account_group_exec_runtime(curtask, delta_exec);
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +0100914 }
Paul Turnerec12cb72011-07-21 09:43:30 -0700915
916 account_cfs_rq_runtime(cfs_rq, delta_exec);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200917}
918
Stanislaw Gruszka6e998912014-11-12 16:58:44 +0100919static void update_curr_fair(struct rq *rq)
920{
921 update_curr(cfs_rq_of(&rq->curr->se));
922}
923
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200924static inline void
Ingo Molnar5870db52007-08-09 11:16:47 +0200925update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200926{
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500927 u64 wait_start, prev_wait_start;
928
929 if (!schedstat_enabled())
930 return;
931
932 wait_start = rq_clock(rq_of(cfs_rq));
933 prev_wait_start = schedstat_val(se->statistics.wait_start);
Joonwoo Park3ea94de2015-11-12 19:38:54 -0800934
935 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500936 likely(wait_start > prev_wait_start))
937 wait_start -= prev_wait_start;
Joonwoo Park3ea94de2015-11-12 19:38:54 -0800938
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500939 schedstat_set(se->statistics.wait_start, wait_start);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200940}
941
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500942static inline void
Joonwoo Park3ea94de2015-11-12 19:38:54 -0800943update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
944{
945 struct task_struct *p;
Mel Gormancb251762016-02-05 09:08:36 +0000946 u64 delta;
947
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500948 if (!schedstat_enabled())
949 return;
950
951 delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
Joonwoo Park3ea94de2015-11-12 19:38:54 -0800952
953 if (entity_is_task(se)) {
954 p = task_of(se);
955 if (task_on_rq_migrating(p)) {
956 /*
957 * Preserve migrating task's wait time so wait_start
958 * time stamp can be adjusted to accumulate wait time
959 * prior to migration.
960 */
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500961 schedstat_set(se->statistics.wait_start, delta);
Joonwoo Park3ea94de2015-11-12 19:38:54 -0800962 return;
963 }
964 trace_sched_stat_wait(p, delta);
965 }
966
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500967 schedstat_set(se->statistics.wait_max,
968 max(schedstat_val(se->statistics.wait_max), delta));
969 schedstat_inc(se->statistics.wait_count);
970 schedstat_add(se->statistics.wait_sum, delta);
971 schedstat_set(se->statistics.wait_start, 0);
Joonwoo Park3ea94de2015-11-12 19:38:54 -0800972}
Joonwoo Park3ea94de2015-11-12 19:38:54 -0800973
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500974static inline void
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -0500975update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
976{
977 struct task_struct *tsk = NULL;
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500978 u64 sleep_start, block_start;
979
980 if (!schedstat_enabled())
981 return;
982
983 sleep_start = schedstat_val(se->statistics.sleep_start);
984 block_start = schedstat_val(se->statistics.block_start);
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -0500985
986 if (entity_is_task(se))
987 tsk = task_of(se);
988
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500989 if (sleep_start) {
990 u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -0500991
992 if ((s64)delta < 0)
993 delta = 0;
994
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500995 if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
996 schedstat_set(se->statistics.sleep_max, delta);
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -0500997
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500998 schedstat_set(se->statistics.sleep_start, 0);
999 schedstat_add(se->statistics.sum_sleep_runtime, delta);
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -05001000
1001 if (tsk) {
1002 account_scheduler_latency(tsk, delta >> 10, 1);
1003 trace_sched_stat_sleep(tsk, delta);
1004 }
1005 }
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05001006 if (block_start) {
1007 u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -05001008
1009 if ((s64)delta < 0)
1010 delta = 0;
1011
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05001012 if (unlikely(delta > schedstat_val(se->statistics.block_max)))
1013 schedstat_set(se->statistics.block_max, delta);
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -05001014
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05001015 schedstat_set(se->statistics.block_start, 0);
1016 schedstat_add(se->statistics.sum_sleep_runtime, delta);
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -05001017
1018 if (tsk) {
1019 if (tsk->in_iowait) {
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05001020 schedstat_add(se->statistics.iowait_sum, delta);
1021 schedstat_inc(se->statistics.iowait_count);
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -05001022 trace_sched_stat_iowait(tsk, delta);
1023 }
1024
1025 trace_sched_stat_blocked(tsk, delta);
Riley Andrews4c873b42015-10-02 00:39:53 -07001026 trace_sched_blocked_reason(tsk);
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -05001027
1028 /*
1029 * Blocking time is in units of nanosecs, so shift by
1030 * 20 to get a milliseconds-range estimation of the
1031 * amount of time that the task spent sleeping:
1032 */
1033 if (unlikely(prof_on == SLEEP_PROFILING)) {
1034 profile_hits(SLEEP_PROFILING,
1035 (void *)get_wchan(tsk),
1036 delta >> 20);
1037 }
1038 account_scheduler_latency(tsk, delta >> 10, 0);
1039 }
1040 }
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001041}
1042
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001043/*
1044 * Task is being enqueued - update stats:
1045 */
Mel Gormancb251762016-02-05 09:08:36 +00001046static inline void
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -05001047update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001048{
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05001049 if (!schedstat_enabled())
1050 return;
1051
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001052 /*
1053 * Are we enqueueing a waiting task? (for current tasks
1054 * a dequeue/enqueue event is a NOP)
1055 */
Ingo Molnar429d43b2007-10-15 17:00:03 +02001056 if (se != cfs_rq->curr)
Ingo Molnar5870db52007-08-09 11:16:47 +02001057 update_stats_wait_start(cfs_rq, se);
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -05001058
1059 if (flags & ENQUEUE_WAKEUP)
1060 update_stats_enqueue_sleeper(cfs_rq, se);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001061}
1062
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001063static inline void
Mel Gormancb251762016-02-05 09:08:36 +00001064update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001065{
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05001066
1067 if (!schedstat_enabled())
1068 return;
1069
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001070 /*
1071 * Mark the end of the wait period if dequeueing a
1072 * waiting task:
1073 */
Ingo Molnar429d43b2007-10-15 17:00:03 +02001074 if (se != cfs_rq->curr)
Ingo Molnar9ef0a962007-08-09 11:16:47 +02001075 update_stats_wait_end(cfs_rq, se);
Mel Gormancb251762016-02-05 09:08:36 +00001076
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05001077 if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
1078 struct task_struct *tsk = task_of(se);
Mel Gormancb251762016-02-05 09:08:36 +00001079
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05001080 if (tsk->state & TASK_INTERRUPTIBLE)
1081 schedstat_set(se->statistics.sleep_start,
1082 rq_clock(rq_of(cfs_rq)));
1083 if (tsk->state & TASK_UNINTERRUPTIBLE)
1084 schedstat_set(se->statistics.block_start,
1085 rq_clock(rq_of(cfs_rq)));
Mel Gormancb251762016-02-05 09:08:36 +00001086 }
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001087}
1088
1089/*
1090 * We are picking a new current task - update its stats:
1091 */
1092static inline void
Ingo Molnar79303e92007-08-09 11:16:47 +02001093update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001094{
1095 /*
1096 * We are starting a new run period:
1097 */
Frederic Weisbecker78becc22013-04-12 01:51:02 +02001098 se->exec_start = rq_clock_task(rq_of(cfs_rq));
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001099}
1100
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001101/**************************************************
1102 * Scheduling class queueing methods:
1103 */
1104
Peter Zijlstracbee9f82012-10-25 14:16:43 +02001105#ifdef CONFIG_NUMA_BALANCING
1106/*
Mel Gorman598f0ec2013-10-07 11:28:55 +01001107 * Approximate time to scan a full NUMA task in ms. The task scan period is
1108 * calculated based on the tasks virtual memory size and
1109 * numa_balancing_scan_size.
Peter Zijlstracbee9f82012-10-25 14:16:43 +02001110 */
Mel Gorman598f0ec2013-10-07 11:28:55 +01001111unsigned int sysctl_numa_balancing_scan_period_min = 1000;
1112unsigned int sysctl_numa_balancing_scan_period_max = 60000;
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02001113
1114/* Portion of address space to scan in MB */
1115unsigned int sysctl_numa_balancing_scan_size = 256;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02001116
Peter Zijlstra4b96a29b2012-10-25 14:16:47 +02001117/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
1118unsigned int sysctl_numa_balancing_scan_delay = 1000;
1119
Mel Gorman598f0ec2013-10-07 11:28:55 +01001120static unsigned int task_nr_scan_windows(struct task_struct *p)
1121{
1122 unsigned long rss = 0;
1123 unsigned long nr_scan_pages;
1124
1125 /*
1126 * Calculations based on RSS as non-present and empty pages are skipped
1127 * by the PTE scanner and NUMA hinting faults should be trapped based
1128 * on resident pages
1129 */
1130 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
1131 rss = get_mm_rss(p->mm);
1132 if (!rss)
1133 rss = nr_scan_pages;
1134
1135 rss = round_up(rss, nr_scan_pages);
1136 return rss / nr_scan_pages;
1137}
1138
1139/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
1140#define MAX_SCAN_WINDOW 2560
1141
1142static unsigned int task_scan_min(struct task_struct *p)
1143{
Jason Low316c1608d2015-04-28 13:00:20 -07001144 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
Mel Gorman598f0ec2013-10-07 11:28:55 +01001145 unsigned int scan, floor;
1146 unsigned int windows = 1;
1147
Kirill Tkhai64192652014-10-16 14:39:37 +04001148 if (scan_size < MAX_SCAN_WINDOW)
1149 windows = MAX_SCAN_WINDOW / scan_size;
Mel Gorman598f0ec2013-10-07 11:28:55 +01001150 floor = 1000 / windows;
1151
1152 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1153 return max_t(unsigned int, floor, scan);
1154}
1155
1156static unsigned int task_scan_max(struct task_struct *p)
1157{
1158 unsigned int smin = task_scan_min(p);
1159 unsigned int smax;
1160
1161 /* Watch for min being lower than max due to floor calculations */
1162 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1163 return max(smin, smax);
1164}
1165
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01001166static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1167{
1168 rq->nr_numa_running += (p->numa_preferred_nid != -1);
1169 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1170}
1171
1172static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1173{
1174 rq->nr_numa_running -= (p->numa_preferred_nid != -1);
1175 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1176}
1177
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01001178struct numa_group {
1179 atomic_t refcount;
1180
1181 spinlock_t lock; /* nr_tasks, tasks */
1182 int nr_tasks;
Mel Gormane29cf082013-10-07 11:29:22 +01001183 pid_t gid;
Rik van Riel4142c3e2016-01-25 17:07:39 -05001184 int active_nodes;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01001185
1186 struct rcu_head rcu;
Mel Gorman989348b2013-10-07 11:29:40 +01001187 unsigned long total_faults;
Rik van Riel4142c3e2016-01-25 17:07:39 -05001188 unsigned long max_faults_cpu;
Rik van Riel7e2703e2014-01-27 17:03:45 -05001189 /*
1190 * Faults_cpu is used to decide whether memory should move
1191 * towards the CPU. As a consequence, these stats are weighted
1192 * more by CPU use than by memory faults.
1193 */
Rik van Riel50ec8a42014-01-27 17:03:42 -05001194 unsigned long *faults_cpu;
Mel Gorman989348b2013-10-07 11:29:40 +01001195 unsigned long faults[0];
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01001196};
1197
Rik van Rielbe1e4e72014-01-27 17:03:48 -05001198/* Shared or private faults. */
1199#define NR_NUMA_HINT_FAULT_TYPES 2
1200
1201/* Memory and CPU locality */
1202#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1203
1204/* Averaged statistics, and temporary buffers. */
1205#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1206
Mel Gormane29cf082013-10-07 11:29:22 +01001207pid_t task_numa_group_id(struct task_struct *p)
1208{
1209 return p->numa_group ? p->numa_group->gid : 0;
1210}
1211
Iulia Manda44dba3d2014-10-31 02:13:31 +02001212/*
1213 * The averaged statistics, shared & private, memory & cpu,
1214 * occupy the first half of the array. The second half of the
1215 * array is for current counters, which are averaged into the
1216 * first set by task_numa_placement.
1217 */
1218static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
Mel Gormanac8e8952013-10-07 11:29:03 +01001219{
Iulia Manda44dba3d2014-10-31 02:13:31 +02001220 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
Mel Gormanac8e8952013-10-07 11:29:03 +01001221}
1222
1223static inline unsigned long task_faults(struct task_struct *p, int nid)
1224{
Iulia Manda44dba3d2014-10-31 02:13:31 +02001225 if (!p->numa_faults)
Mel Gormanac8e8952013-10-07 11:29:03 +01001226 return 0;
1227
Iulia Manda44dba3d2014-10-31 02:13:31 +02001228 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1229 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
Mel Gormanac8e8952013-10-07 11:29:03 +01001230}
1231
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001232static inline unsigned long group_faults(struct task_struct *p, int nid)
1233{
1234 if (!p->numa_group)
1235 return 0;
1236
Iulia Manda44dba3d2014-10-31 02:13:31 +02001237 return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1238 p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001239}
1240
Rik van Riel20e07de2014-01-27 17:03:43 -05001241static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1242{
Iulia Manda44dba3d2014-10-31 02:13:31 +02001243 return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
1244 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
Rik van Riel20e07de2014-01-27 17:03:43 -05001245}
1246
Rik van Riel4142c3e2016-01-25 17:07:39 -05001247/*
1248 * A node triggering more than 1/3 as many NUMA faults as the maximum is
1249 * considered part of a numa group's pseudo-interleaving set. Migrations
1250 * between these nodes are slowed down, to allow things to settle down.
1251 */
1252#define ACTIVE_NODE_FRACTION 3
1253
1254static bool numa_is_active_node(int nid, struct numa_group *ng)
1255{
1256 return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1257}
1258
Rik van Riel6c6b1192014-10-17 03:29:52 -04001259/* Handle placement on systems where not all nodes are directly connected. */
1260static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1261 int maxdist, bool task)
1262{
1263 unsigned long score = 0;
1264 int node;
1265
1266 /*
1267 * All nodes are directly connected, and the same distance
1268 * from each other. No need for fancy placement algorithms.
1269 */
1270 if (sched_numa_topology_type == NUMA_DIRECT)
1271 return 0;
1272
1273 /*
1274 * This code is called for each node, introducing N^2 complexity,
1275 * which should be ok given the number of nodes rarely exceeds 8.
1276 */
1277 for_each_online_node(node) {
1278 unsigned long faults;
1279 int dist = node_distance(nid, node);
1280
1281 /*
1282 * The furthest away nodes in the system are not interesting
1283 * for placement; nid was already counted.
1284 */
1285 if (dist == sched_max_numa_distance || node == nid)
1286 continue;
1287
1288 /*
1289 * On systems with a backplane NUMA topology, compare groups
1290 * of nodes, and move tasks towards the group with the most
1291 * memory accesses. When comparing two nodes at distance
1292 * "hoplimit", only nodes closer by than "hoplimit" are part
1293 * of each group. Skip other nodes.
1294 */
1295 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1296 dist > maxdist)
1297 continue;
1298
1299 /* Add up the faults from nearby nodes. */
1300 if (task)
1301 faults = task_faults(p, node);
1302 else
1303 faults = group_faults(p, node);
1304
1305 /*
1306 * On systems with a glueless mesh NUMA topology, there are
1307 * no fixed "groups of nodes". Instead, nodes that are not
1308 * directly connected bounce traffic through intermediate
1309 * nodes; a numa_group can occupy any set of nodes.
1310 * The further away a node is, the less the faults count.
1311 * This seems to result in good task placement.
1312 */
1313 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1314 faults *= (sched_max_numa_distance - dist);
1315 faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1316 }
1317
1318 score += faults;
1319 }
1320
1321 return score;
1322}
1323
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001324/*
1325 * These return the fraction of accesses done by a particular task, or
1326 * task group, on a particular numa node. The group weight is given a
1327 * larger multiplier, in order to group tasks together that are almost
1328 * evenly spread out between numa nodes.
1329 */
Rik van Riel7bd95322014-10-17 03:29:51 -04001330static inline unsigned long task_weight(struct task_struct *p, int nid,
1331 int dist)
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001332{
Rik van Riel7bd95322014-10-17 03:29:51 -04001333 unsigned long faults, total_faults;
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001334
Iulia Manda44dba3d2014-10-31 02:13:31 +02001335 if (!p->numa_faults)
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001336 return 0;
1337
1338 total_faults = p->total_numa_faults;
1339
1340 if (!total_faults)
1341 return 0;
1342
Rik van Riel7bd95322014-10-17 03:29:51 -04001343 faults = task_faults(p, nid);
Rik van Riel6c6b1192014-10-17 03:29:52 -04001344 faults += score_nearby_nodes(p, nid, dist, true);
1345
Rik van Riel7bd95322014-10-17 03:29:51 -04001346 return 1000 * faults / total_faults;
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001347}
1348
Rik van Riel7bd95322014-10-17 03:29:51 -04001349static inline unsigned long group_weight(struct task_struct *p, int nid,
1350 int dist)
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001351{
Rik van Riel7bd95322014-10-17 03:29:51 -04001352 unsigned long faults, total_faults;
1353
1354 if (!p->numa_group)
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001355 return 0;
1356
Rik van Riel7bd95322014-10-17 03:29:51 -04001357 total_faults = p->numa_group->total_faults;
1358
1359 if (!total_faults)
1360 return 0;
1361
1362 faults = group_faults(p, nid);
Rik van Riel6c6b1192014-10-17 03:29:52 -04001363 faults += score_nearby_nodes(p, nid, dist, false);
1364
Rik van Riel7bd95322014-10-17 03:29:51 -04001365 return 1000 * faults / total_faults;
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001366}
1367
Rik van Riel10f39042014-01-27 17:03:44 -05001368bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1369 int src_nid, int dst_cpu)
1370{
1371 struct numa_group *ng = p->numa_group;
1372 int dst_nid = cpu_to_node(dst_cpu);
1373 int last_cpupid, this_cpupid;
1374
1375 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1376
1377 /*
1378 * Multi-stage node selection is used in conjunction with a periodic
1379 * migration fault to build a temporal task<->page relation. By using
1380 * a two-stage filter we remove short/unlikely relations.
1381 *
1382 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1383 * a task's usage of a particular page (n_p) per total usage of this
1384 * page (n_t) (in a given time-span) to a probability.
1385 *
1386 * Our periodic faults will sample this probability and getting the
1387 * same result twice in a row, given these samples are fully
1388 * independent, is then given by P(n)^2, provided our sample period
1389 * is sufficiently short compared to the usage pattern.
1390 *
1391 * This quadric squishes small probabilities, making it less likely we
1392 * act on an unlikely task<->page relation.
1393 */
1394 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1395 if (!cpupid_pid_unset(last_cpupid) &&
1396 cpupid_to_nid(last_cpupid) != dst_nid)
1397 return false;
1398
1399 /* Always allow migrate on private faults */
1400 if (cpupid_match_pid(p, last_cpupid))
1401 return true;
1402
1403 /* A shared fault, but p->numa_group has not been set up yet. */
1404 if (!ng)
1405 return true;
1406
1407 /*
Rik van Riel4142c3e2016-01-25 17:07:39 -05001408 * Destination node is much more heavily used than the source
1409 * node? Allow migration.
Rik van Riel10f39042014-01-27 17:03:44 -05001410 */
Rik van Riel4142c3e2016-01-25 17:07:39 -05001411 if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1412 ACTIVE_NODE_FRACTION)
Rik van Riel10f39042014-01-27 17:03:44 -05001413 return true;
1414
1415 /*
Rik van Riel4142c3e2016-01-25 17:07:39 -05001416 * Distribute memory according to CPU & memory use on each node,
1417 * with 3/4 hysteresis to avoid unnecessary memory migrations:
1418 *
1419 * faults_cpu(dst) 3 faults_cpu(src)
1420 * --------------- * - > ---------------
1421 * faults_mem(dst) 4 faults_mem(src)
Rik van Riel10f39042014-01-27 17:03:44 -05001422 */
Rik van Riel4142c3e2016-01-25 17:07:39 -05001423 return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
1424 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
Rik van Riel10f39042014-01-27 17:03:44 -05001425}
1426
Mel Gormane6628d52013-10-07 11:29:02 +01001427static unsigned long weighted_cpuload(const int cpu);
Mel Gorman58d081b2013-10-07 11:29:10 +01001428static unsigned long source_load(int cpu, int type);
1429static unsigned long target_load(int cpu, int type);
Nicolas Pitreced549f2014-05-26 18:19:38 -04001430static unsigned long capacity_of(int cpu);
Mel Gorman58d081b2013-10-07 11:29:10 +01001431static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
Mel Gormane6628d52013-10-07 11:29:02 +01001432
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001433/* Cached statistics for all CPUs within a node */
Mel Gorman58d081b2013-10-07 11:29:10 +01001434struct numa_stats {
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001435 unsigned long nr_running;
Mel Gorman58d081b2013-10-07 11:29:10 +01001436 unsigned long load;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001437
1438 /* Total compute capacity of CPUs on a node */
Nicolas Pitre5ef20ca2014-05-26 18:19:34 -04001439 unsigned long compute_capacity;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001440
1441 /* Approximate capacity in terms of runnable tasks on a node */
Nicolas Pitre5ef20ca2014-05-26 18:19:34 -04001442 unsigned long task_capacity;
Nicolas Pitre1b6a7492014-05-26 18:19:35 -04001443 int has_free_capacity;
Mel Gorman58d081b2013-10-07 11:29:10 +01001444};
Mel Gormane6628d52013-10-07 11:29:02 +01001445
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001446/*
1447 * XXX borrowed from update_sg_lb_stats
1448 */
1449static void update_numa_stats(struct numa_stats *ns, int nid)
1450{
Rik van Riel83d7f242014-08-04 13:23:28 -04001451 int smt, cpu, cpus = 0;
1452 unsigned long capacity;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001453
1454 memset(ns, 0, sizeof(*ns));
1455 for_each_cpu(cpu, cpumask_of_node(nid)) {
1456 struct rq *rq = cpu_rq(cpu);
1457
1458 ns->nr_running += rq->nr_running;
1459 ns->load += weighted_cpuload(cpu);
Nicolas Pitreced549f2014-05-26 18:19:38 -04001460 ns->compute_capacity += capacity_of(cpu);
Peter Zijlstra5eca82a2013-11-06 18:47:57 +01001461
1462 cpus++;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001463 }
1464
Peter Zijlstra5eca82a2013-11-06 18:47:57 +01001465 /*
1466 * If we raced with hotplug and there are no CPUs left in our mask
1467 * the @ns structure is NULL'ed and task_numa_compare() will
1468 * not find this node attractive.
1469 *
Nicolas Pitre1b6a7492014-05-26 18:19:35 -04001470 * We'll either bail at !has_free_capacity, or we'll detect a huge
1471 * imbalance and bail there.
Peter Zijlstra5eca82a2013-11-06 18:47:57 +01001472 */
1473 if (!cpus)
1474 return;
1475
Rik van Riel83d7f242014-08-04 13:23:28 -04001476 /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1477 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1478 capacity = cpus / smt; /* cores */
1479
1480 ns->task_capacity = min_t(unsigned, capacity,
1481 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
Nicolas Pitre1b6a7492014-05-26 18:19:35 -04001482 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001483}
1484
Mel Gorman58d081b2013-10-07 11:29:10 +01001485struct task_numa_env {
1486 struct task_struct *p;
1487
1488 int src_cpu, src_nid;
1489 int dst_cpu, dst_nid;
1490
1491 struct numa_stats src_stats, dst_stats;
1492
Wanpeng Li40ea2b42013-12-05 19:10:17 +08001493 int imbalance_pct;
Rik van Riel7bd95322014-10-17 03:29:51 -04001494 int dist;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001495
1496 struct task_struct *best_task;
1497 long best_imp;
Mel Gorman58d081b2013-10-07 11:29:10 +01001498 int best_cpu;
1499};
1500
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001501static void task_numa_assign(struct task_numa_env *env,
1502 struct task_struct *p, long imp)
1503{
1504 if (env->best_task)
1505 put_task_struct(env->best_task);
Oleg Nesterovbac78572016-05-18 21:57:33 +02001506 if (p)
1507 get_task_struct(p);
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001508
1509 env->best_task = p;
1510 env->best_imp = imp;
1511 env->best_cpu = env->dst_cpu;
1512}
1513
Rik van Riel28a21742014-06-23 11:46:13 -04001514static bool load_too_imbalanced(long src_load, long dst_load,
Rik van Riele63da032014-05-14 13:22:21 -04001515 struct task_numa_env *env)
1516{
Rik van Riele4991b22015-05-27 15:04:27 -04001517 long imb, old_imb;
1518 long orig_src_load, orig_dst_load;
Rik van Riel28a21742014-06-23 11:46:13 -04001519 long src_capacity, dst_capacity;
1520
1521 /*
1522 * The load is corrected for the CPU capacity available on each node.
1523 *
1524 * src_load dst_load
1525 * ------------ vs ---------
1526 * src_capacity dst_capacity
1527 */
1528 src_capacity = env->src_stats.compute_capacity;
1529 dst_capacity = env->dst_stats.compute_capacity;
Rik van Riele63da032014-05-14 13:22:21 -04001530
1531 /* We care about the slope of the imbalance, not the direction. */
Rik van Riele4991b22015-05-27 15:04:27 -04001532 if (dst_load < src_load)
1533 swap(dst_load, src_load);
Rik van Riele63da032014-05-14 13:22:21 -04001534
1535 /* Is the difference below the threshold? */
Rik van Riele4991b22015-05-27 15:04:27 -04001536 imb = dst_load * src_capacity * 100 -
1537 src_load * dst_capacity * env->imbalance_pct;
Rik van Riele63da032014-05-14 13:22:21 -04001538 if (imb <= 0)
1539 return false;
1540
1541 /*
1542 * The imbalance is above the allowed threshold.
Rik van Riele4991b22015-05-27 15:04:27 -04001543 * Compare it with the old imbalance.
Rik van Riele63da032014-05-14 13:22:21 -04001544 */
Rik van Riel28a21742014-06-23 11:46:13 -04001545 orig_src_load = env->src_stats.load;
Rik van Riele4991b22015-05-27 15:04:27 -04001546 orig_dst_load = env->dst_stats.load;
Rik van Riel28a21742014-06-23 11:46:13 -04001547
Rik van Riele4991b22015-05-27 15:04:27 -04001548 if (orig_dst_load < orig_src_load)
1549 swap(orig_dst_load, orig_src_load);
Rik van Riele63da032014-05-14 13:22:21 -04001550
Rik van Riele4991b22015-05-27 15:04:27 -04001551 old_imb = orig_dst_load * src_capacity * 100 -
1552 orig_src_load * dst_capacity * env->imbalance_pct;
1553
1554 /* Would this change make things worse? */
1555 return (imb > old_imb);
Rik van Riele63da032014-05-14 13:22:21 -04001556}
1557
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001558/*
1559 * This checks if the overall compute and NUMA accesses of the system would
1560 * be improved if the source tasks was migrated to the target dst_cpu taking
1561 * into account that it might be best if task running on the dst_cpu should
1562 * be exchanged with the source task
1563 */
Rik van Riel887c2902013-10-07 11:29:31 +01001564static void task_numa_compare(struct task_numa_env *env,
1565 long taskimp, long groupimp)
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001566{
1567 struct rq *src_rq = cpu_rq(env->src_cpu);
1568 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1569 struct task_struct *cur;
Rik van Riel28a21742014-06-23 11:46:13 -04001570 long src_load, dst_load;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001571 long load;
Rik van Riel1c5d3eb2014-06-23 11:46:15 -04001572 long imp = env->p->numa_group ? groupimp : taskimp;
Rik van Riel0132c3e2014-06-23 11:46:16 -04001573 long moveimp = imp;
Rik van Riel7bd95322014-10-17 03:29:51 -04001574 int dist = env->dist;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001575
1576 rcu_read_lock();
Oleg Nesterovbac78572016-05-18 21:57:33 +02001577 cur = task_rcu_dereference(&dst_rq->curr);
1578 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001579 cur = NULL;
1580
1581 /*
Peter Zijlstra7af68332014-11-10 10:54:35 +01001582 * Because we have preemption enabled we can get migrated around and
1583 * end try selecting ourselves (current == env->p) as a swap candidate.
1584 */
1585 if (cur == env->p)
1586 goto unlock;
1587
1588 /*
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001589 * "imp" is the fault differential for the source task between the
1590 * source and destination node. Calculate the total differential for
1591 * the source task and potential destination task. The more negative
1592 * the value is, the more rmeote accesses that would be expected to
1593 * be incurred if the tasks were swapped.
1594 */
1595 if (cur) {
1596 /* Skip this swap candidate if cannot move to the source cpu */
1597 if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1598 goto unlock;
1599
Rik van Riel887c2902013-10-07 11:29:31 +01001600 /*
1601 * If dst and source tasks are in the same NUMA group, or not
Rik van Rielca28aa532013-10-07 11:29:32 +01001602 * in any group then look only at task weights.
Rik van Riel887c2902013-10-07 11:29:31 +01001603 */
Rik van Rielca28aa532013-10-07 11:29:32 +01001604 if (cur->numa_group == env->p->numa_group) {
Rik van Riel7bd95322014-10-17 03:29:51 -04001605 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1606 task_weight(cur, env->dst_nid, dist);
Rik van Rielca28aa532013-10-07 11:29:32 +01001607 /*
1608 * Add some hysteresis to prevent swapping the
1609 * tasks within a group over tiny differences.
1610 */
1611 if (cur->numa_group)
1612 imp -= imp/16;
Rik van Riel887c2902013-10-07 11:29:31 +01001613 } else {
Rik van Rielca28aa532013-10-07 11:29:32 +01001614 /*
1615 * Compare the group weights. If a task is all by
1616 * itself (not part of a group), use the task weight
1617 * instead.
1618 */
Rik van Rielca28aa532013-10-07 11:29:32 +01001619 if (cur->numa_group)
Rik van Riel7bd95322014-10-17 03:29:51 -04001620 imp += group_weight(cur, env->src_nid, dist) -
1621 group_weight(cur, env->dst_nid, dist);
Rik van Rielca28aa532013-10-07 11:29:32 +01001622 else
Rik van Riel7bd95322014-10-17 03:29:51 -04001623 imp += task_weight(cur, env->src_nid, dist) -
1624 task_weight(cur, env->dst_nid, dist);
Rik van Riel887c2902013-10-07 11:29:31 +01001625 }
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001626 }
1627
Rik van Riel0132c3e2014-06-23 11:46:16 -04001628 if (imp <= env->best_imp && moveimp <= env->best_imp)
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001629 goto unlock;
1630
1631 if (!cur) {
1632 /* Is there capacity at our destination? */
Rik van Rielb932c032014-08-04 13:23:27 -04001633 if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
Nicolas Pitre1b6a7492014-05-26 18:19:35 -04001634 !env->dst_stats.has_free_capacity)
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001635 goto unlock;
1636
1637 goto balance;
1638 }
1639
1640 /* Balance doesn't matter much if we're running a task per cpu */
Rik van Riel0132c3e2014-06-23 11:46:16 -04001641 if (imp > env->best_imp && src_rq->nr_running == 1 &&
1642 dst_rq->nr_running == 1)
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001643 goto assign;
1644
1645 /*
1646 * In the overloaded case, try and keep the load balanced.
1647 */
1648balance:
Peter Zijlstrae720fff2014-07-11 16:01:53 +02001649 load = task_h_load(env->p);
1650 dst_load = env->dst_stats.load + load;
1651 src_load = env->src_stats.load - load;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001652
Rik van Riel0132c3e2014-06-23 11:46:16 -04001653 if (moveimp > imp && moveimp > env->best_imp) {
1654 /*
1655 * If the improvement from just moving env->p direction is
1656 * better than swapping tasks around, check if a move is
1657 * possible. Store a slightly smaller score than moveimp,
1658 * so an actually idle CPU will win.
1659 */
1660 if (!load_too_imbalanced(src_load, dst_load, env)) {
1661 imp = moveimp - 1;
1662 cur = NULL;
1663 goto assign;
1664 }
1665 }
1666
1667 if (imp <= env->best_imp)
1668 goto unlock;
1669
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001670 if (cur) {
Peter Zijlstrae720fff2014-07-11 16:01:53 +02001671 load = task_h_load(cur);
1672 dst_load -= load;
1673 src_load += load;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001674 }
1675
Rik van Riel28a21742014-06-23 11:46:13 -04001676 if (load_too_imbalanced(src_load, dst_load, env))
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001677 goto unlock;
1678
Rik van Rielba7e5a22014-09-04 16:35:30 -04001679 /*
1680 * One idle CPU per node is evaluated for a task numa move.
1681 * Call select_idle_sibling to maybe find a better one.
1682 */
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02001683 if (!cur) {
1684 /*
1685 * select_idle_siblings() uses an per-cpu cpumask that
1686 * can be used from IRQ context.
1687 */
1688 local_irq_disable();
Morten Rasmussen772bd008c2016-06-22 18:03:13 +01001689 env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1690 env->dst_cpu);
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02001691 local_irq_enable();
1692 }
Rik van Rielba7e5a22014-09-04 16:35:30 -04001693
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001694assign:
1695 task_numa_assign(env, cur, imp);
1696unlock:
1697 rcu_read_unlock();
1698}
1699
Rik van Riel887c2902013-10-07 11:29:31 +01001700static void task_numa_find_cpu(struct task_numa_env *env,
1701 long taskimp, long groupimp)
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001702{
1703 int cpu;
1704
1705 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1706 /* Skip this CPU if the source task cannot migrate */
1707 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1708 continue;
1709
1710 env->dst_cpu = cpu;
Rik van Riel887c2902013-10-07 11:29:31 +01001711 task_numa_compare(env, taskimp, groupimp);
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001712 }
1713}
1714
Rik van Riel6f9aad02015-05-28 09:52:49 -04001715/* Only move tasks to a NUMA node less busy than the current node. */
1716static bool numa_has_capacity(struct task_numa_env *env)
1717{
1718 struct numa_stats *src = &env->src_stats;
1719 struct numa_stats *dst = &env->dst_stats;
1720
1721 if (src->has_free_capacity && !dst->has_free_capacity)
1722 return false;
1723
1724 /*
1725 * Only consider a task move if the source has a higher load
1726 * than the destination, corrected for CPU capacity on each node.
1727 *
1728 * src->load dst->load
1729 * --------------------- vs ---------------------
1730 * src->compute_capacity dst->compute_capacity
1731 */
Srikar Dronamraju44dcb042015-06-16 17:26:00 +05301732 if (src->load * dst->compute_capacity * env->imbalance_pct >
1733
1734 dst->load * src->compute_capacity * 100)
Rik van Riel6f9aad02015-05-28 09:52:49 -04001735 return true;
1736
1737 return false;
1738}
1739
Mel Gorman58d081b2013-10-07 11:29:10 +01001740static int task_numa_migrate(struct task_struct *p)
Mel Gormane6628d52013-10-07 11:29:02 +01001741{
Mel Gorman58d081b2013-10-07 11:29:10 +01001742 struct task_numa_env env = {
1743 .p = p,
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001744
Mel Gorman58d081b2013-10-07 11:29:10 +01001745 .src_cpu = task_cpu(p),
Ingo Molnarb32e86b2013-10-07 11:29:30 +01001746 .src_nid = task_node(p),
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001747
1748 .imbalance_pct = 112,
1749
1750 .best_task = NULL,
1751 .best_imp = 0,
Rik van Riel4142c3e2016-01-25 17:07:39 -05001752 .best_cpu = -1,
Mel Gorman58d081b2013-10-07 11:29:10 +01001753 };
1754 struct sched_domain *sd;
Rik van Riel887c2902013-10-07 11:29:31 +01001755 unsigned long taskweight, groupweight;
Rik van Riel7bd95322014-10-17 03:29:51 -04001756 int nid, ret, dist;
Rik van Riel887c2902013-10-07 11:29:31 +01001757 long taskimp, groupimp;
Mel Gormane6628d52013-10-07 11:29:02 +01001758
Mel Gorman58d081b2013-10-07 11:29:10 +01001759 /*
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001760 * Pick the lowest SD_NUMA domain, as that would have the smallest
1761 * imbalance and would be the first to start moving tasks about.
1762 *
1763 * And we want to avoid any moving of tasks about, as that would create
1764 * random movement of tasks -- counter the numa conditions we're trying
1765 * to satisfy here.
Mel Gorman58d081b2013-10-07 11:29:10 +01001766 */
Mel Gormane6628d52013-10-07 11:29:02 +01001767 rcu_read_lock();
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001768 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
Rik van Riel46a73e82013-11-11 19:29:25 -05001769 if (sd)
1770 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
Mel Gormane6628d52013-10-07 11:29:02 +01001771 rcu_read_unlock();
1772
Rik van Riel46a73e82013-11-11 19:29:25 -05001773 /*
1774 * Cpusets can break the scheduler domain tree into smaller
1775 * balance domains, some of which do not cross NUMA boundaries.
1776 * Tasks that are "trapped" in such domains cannot be migrated
1777 * elsewhere, so there is no point in (re)trying.
1778 */
1779 if (unlikely(!sd)) {
Wanpeng Lide1b3012013-12-12 15:23:24 +08001780 p->numa_preferred_nid = task_node(p);
Rik van Riel46a73e82013-11-11 19:29:25 -05001781 return -EINVAL;
1782 }
1783
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001784 env.dst_nid = p->numa_preferred_nid;
Rik van Riel7bd95322014-10-17 03:29:51 -04001785 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1786 taskweight = task_weight(p, env.src_nid, dist);
1787 groupweight = group_weight(p, env.src_nid, dist);
1788 update_numa_stats(&env.src_stats, env.src_nid);
1789 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1790 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001791 update_numa_stats(&env.dst_stats, env.dst_nid);
Mel Gorman58d081b2013-10-07 11:29:10 +01001792
Rik van Riela43455a2014-06-04 16:09:42 -04001793 /* Try to find a spot on the preferred nid. */
Rik van Riel6f9aad02015-05-28 09:52:49 -04001794 if (numa_has_capacity(&env))
1795 task_numa_find_cpu(&env, taskimp, groupimp);
Rik van Riele1dda8a2013-10-07 11:29:19 +01001796
Rik van Riel9de05d42014-10-09 17:27:47 -04001797 /*
1798 * Look at other nodes in these cases:
1799 * - there is no space available on the preferred_nid
1800 * - the task is part of a numa_group that is interleaved across
1801 * multiple NUMA nodes; in order to better consolidate the group,
1802 * we need to check other locations.
1803 */
Rik van Riel4142c3e2016-01-25 17:07:39 -05001804 if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001805 for_each_online_node(nid) {
1806 if (nid == env.src_nid || nid == p->numa_preferred_nid)
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001807 continue;
1808
Rik van Riel7bd95322014-10-17 03:29:51 -04001809 dist = node_distance(env.src_nid, env.dst_nid);
Rik van Riel6c6b1192014-10-17 03:29:52 -04001810 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1811 dist != env.dist) {
1812 taskweight = task_weight(p, env.src_nid, dist);
1813 groupweight = group_weight(p, env.src_nid, dist);
1814 }
Rik van Riel7bd95322014-10-17 03:29:51 -04001815
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001816 /* Only consider nodes where both task and groups benefit */
Rik van Riel7bd95322014-10-17 03:29:51 -04001817 taskimp = task_weight(p, nid, dist) - taskweight;
1818 groupimp = group_weight(p, nid, dist) - groupweight;
Rik van Riel887c2902013-10-07 11:29:31 +01001819 if (taskimp < 0 && groupimp < 0)
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001820 continue;
1821
Rik van Riel7bd95322014-10-17 03:29:51 -04001822 env.dist = dist;
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001823 env.dst_nid = nid;
1824 update_numa_stats(&env.dst_stats, env.dst_nid);
Rik van Riel6f9aad02015-05-28 09:52:49 -04001825 if (numa_has_capacity(&env))
1826 task_numa_find_cpu(&env, taskimp, groupimp);
Mel Gorman58d081b2013-10-07 11:29:10 +01001827 }
1828 }
1829
Rik van Riel68d1b022014-04-11 13:00:29 -04001830 /*
1831 * If the task is part of a workload that spans multiple NUMA nodes,
1832 * and is migrating into one of the workload's active nodes, remember
1833 * this node as the task's preferred numa node, so the workload can
1834 * settle down.
1835 * A task that migrated to a second choice node will be better off
1836 * trying for a better one later. Do not set the preferred node here.
1837 */
Rik van Rieldb015da2014-06-23 11:41:34 -04001838 if (p->numa_group) {
Rik van Riel4142c3e2016-01-25 17:07:39 -05001839 struct numa_group *ng = p->numa_group;
1840
Rik van Rieldb015da2014-06-23 11:41:34 -04001841 if (env.best_cpu == -1)
1842 nid = env.src_nid;
1843 else
1844 nid = env.dst_nid;
1845
Rik van Riel4142c3e2016-01-25 17:07:39 -05001846 if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
Rik van Rieldb015da2014-06-23 11:41:34 -04001847 sched_setnuma(p, env.dst_nid);
1848 }
1849
1850 /* No better CPU than the current one was found. */
1851 if (env.best_cpu == -1)
1852 return -EAGAIN;
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01001853
Rik van Riel04bb2f92013-10-07 11:29:36 +01001854 /*
1855 * Reset the scan period if the task is being rescheduled on an
1856 * alternative node to recheck if the tasks is now properly placed.
1857 */
1858 p->numa_scan_period = task_scan_min(p);
1859
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001860 if (env.best_task == NULL) {
Mel Gorman286549d2014-01-21 15:51:03 -08001861 ret = migrate_task_to(p, env.best_cpu);
1862 if (ret != 0)
1863 trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001864 return ret;
1865 }
1866
1867 ret = migrate_swap(p, env.best_task);
Mel Gorman286549d2014-01-21 15:51:03 -08001868 if (ret != 0)
1869 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001870 put_task_struct(env.best_task);
1871 return ret;
Mel Gormane6628d52013-10-07 11:29:02 +01001872}
1873
Mel Gorman6b9a7462013-10-07 11:29:11 +01001874/* Attempt to migrate a task to a CPU on the preferred node. */
1875static void numa_migrate_preferred(struct task_struct *p)
1876{
Rik van Riel5085e2a2014-04-11 13:00:28 -04001877 unsigned long interval = HZ;
1878
Rik van Riel2739d3e2013-10-07 11:29:41 +01001879 /* This task has no NUMA fault statistics yet */
Iulia Manda44dba3d2014-10-31 02:13:31 +02001880 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
Rik van Riel2739d3e2013-10-07 11:29:41 +01001881 return;
1882
1883 /* Periodically retry migrating the task to the preferred node */
Rik van Riel5085e2a2014-04-11 13:00:28 -04001884 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1885 p->numa_migrate_retry = jiffies + interval;
Rik van Riel2739d3e2013-10-07 11:29:41 +01001886
Mel Gorman6b9a7462013-10-07 11:29:11 +01001887 /* Success if task is already running on preferred CPU */
Wanpeng Lide1b3012013-12-12 15:23:24 +08001888 if (task_node(p) == p->numa_preferred_nid)
Mel Gorman6b9a7462013-10-07 11:29:11 +01001889 return;
1890
Mel Gorman6b9a7462013-10-07 11:29:11 +01001891 /* Otherwise, try migrate to a CPU on the preferred node */
Rik van Riel2739d3e2013-10-07 11:29:41 +01001892 task_numa_migrate(p);
Mel Gorman6b9a7462013-10-07 11:29:11 +01001893}
1894
Rik van Riel04bb2f92013-10-07 11:29:36 +01001895/*
Rik van Riel4142c3e2016-01-25 17:07:39 -05001896 * Find out how many nodes on the workload is actively running on. Do this by
Rik van Riel20e07de2014-01-27 17:03:43 -05001897 * tracking the nodes from which NUMA hinting faults are triggered. This can
1898 * be different from the set of nodes where the workload's memory is currently
1899 * located.
Rik van Riel20e07de2014-01-27 17:03:43 -05001900 */
Rik van Riel4142c3e2016-01-25 17:07:39 -05001901static void numa_group_count_active_nodes(struct numa_group *numa_group)
Rik van Riel20e07de2014-01-27 17:03:43 -05001902{
1903 unsigned long faults, max_faults = 0;
Rik van Riel4142c3e2016-01-25 17:07:39 -05001904 int nid, active_nodes = 0;
Rik van Riel20e07de2014-01-27 17:03:43 -05001905
1906 for_each_online_node(nid) {
1907 faults = group_faults_cpu(numa_group, nid);
1908 if (faults > max_faults)
1909 max_faults = faults;
1910 }
1911
1912 for_each_online_node(nid) {
1913 faults = group_faults_cpu(numa_group, nid);
Rik van Riel4142c3e2016-01-25 17:07:39 -05001914 if (faults * ACTIVE_NODE_FRACTION > max_faults)
1915 active_nodes++;
Rik van Riel20e07de2014-01-27 17:03:43 -05001916 }
Rik van Riel4142c3e2016-01-25 17:07:39 -05001917
1918 numa_group->max_faults_cpu = max_faults;
1919 numa_group->active_nodes = active_nodes;
Rik van Riel20e07de2014-01-27 17:03:43 -05001920}
1921
1922/*
Rik van Riel04bb2f92013-10-07 11:29:36 +01001923 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1924 * increments. The more local the fault statistics are, the higher the scan
Rik van Riela22b4b02014-06-23 11:41:35 -04001925 * period will be for the next scan window. If local/(local+remote) ratio is
1926 * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
1927 * the scan period will decrease. Aim for 70% local accesses.
Rik van Riel04bb2f92013-10-07 11:29:36 +01001928 */
1929#define NUMA_PERIOD_SLOTS 10
Rik van Riela22b4b02014-06-23 11:41:35 -04001930#define NUMA_PERIOD_THRESHOLD 7
Rik van Riel04bb2f92013-10-07 11:29:36 +01001931
1932/*
1933 * Increase the scan period (slow down scanning) if the majority of
1934 * our memory is already on our local node, or if the majority of
1935 * the page accesses are shared with other processes.
1936 * Otherwise, decrease the scan period.
1937 */
1938static void update_task_scan_period(struct task_struct *p,
1939 unsigned long shared, unsigned long private)
1940{
1941 unsigned int period_slot;
1942 int ratio;
1943 int diff;
1944
1945 unsigned long remote = p->numa_faults_locality[0];
1946 unsigned long local = p->numa_faults_locality[1];
1947
1948 /*
1949 * If there were no record hinting faults then either the task is
1950 * completely idle or all activity is areas that are not of interest
Mel Gorman074c2382015-03-25 15:55:42 -07001951 * to automatic numa balancing. Related to that, if there were failed
1952 * migration then it implies we are migrating too quickly or the local
1953 * node is overloaded. In either case, scan slower
Rik van Riel04bb2f92013-10-07 11:29:36 +01001954 */
Mel Gorman074c2382015-03-25 15:55:42 -07001955 if (local + shared == 0 || p->numa_faults_locality[2]) {
Rik van Riel04bb2f92013-10-07 11:29:36 +01001956 p->numa_scan_period = min(p->numa_scan_period_max,
1957 p->numa_scan_period << 1);
1958
1959 p->mm->numa_next_scan = jiffies +
1960 msecs_to_jiffies(p->numa_scan_period);
1961
1962 return;
1963 }
1964
1965 /*
1966 * Prepare to scale scan period relative to the current period.
1967 * == NUMA_PERIOD_THRESHOLD scan period stays the same
1968 * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1969 * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1970 */
1971 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1972 ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1973 if (ratio >= NUMA_PERIOD_THRESHOLD) {
1974 int slot = ratio - NUMA_PERIOD_THRESHOLD;
1975 if (!slot)
1976 slot = 1;
1977 diff = slot * period_slot;
1978 } else {
1979 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1980
1981 /*
1982 * Scale scan rate increases based on sharing. There is an
1983 * inverse relationship between the degree of sharing and
1984 * the adjustment made to the scanning period. Broadly
1985 * speaking the intent is that there is little point
1986 * scanning faster if shared accesses dominate as it may
1987 * simply bounce migrations uselessly
1988 */
Yasuaki Ishimatsu2847c902014-10-22 16:04:35 +09001989 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
Rik van Riel04bb2f92013-10-07 11:29:36 +01001990 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1991 }
1992
1993 p->numa_scan_period = clamp(p->numa_scan_period + diff,
1994 task_scan_min(p), task_scan_max(p));
1995 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1996}
1997
Rik van Riel7e2703e2014-01-27 17:03:45 -05001998/*
1999 * Get the fraction of time the task has been running since the last
2000 * NUMA placement cycle. The scheduler keeps similar statistics, but
2001 * decays those on a 32ms period, which is orders of magnitude off
2002 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
2003 * stats only if the task is so new there are no NUMA statistics yet.
2004 */
2005static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
2006{
2007 u64 runtime, delta, now;
2008 /* Use the start of this time slice to avoid calculations. */
2009 now = p->se.exec_start;
2010 runtime = p->se.sum_exec_runtime;
2011
2012 if (p->last_task_numa_placement) {
2013 delta = runtime - p->last_sum_exec_runtime;
2014 *period = now - p->last_task_numa_placement;
Xie XiuQic9e5f602019-04-20 16:34:16 +08002015
2016 /* Avoid time going backwards, prevent potential divide error: */
2017 if (unlikely((s64)*period < 0))
2018 *period = 0;
Rik van Riel7e2703e2014-01-27 17:03:45 -05002019 } else {
Yuyang Du9d89c252015-07-15 08:04:37 +08002020 delta = p->se.avg.load_sum / p->se.load.weight;
2021 *period = LOAD_AVG_MAX;
Rik van Riel7e2703e2014-01-27 17:03:45 -05002022 }
2023
2024 p->last_sum_exec_runtime = runtime;
2025 p->last_task_numa_placement = now;
2026
2027 return delta;
2028}
2029
Rik van Riel54009412014-10-17 03:29:53 -04002030/*
2031 * Determine the preferred nid for a task in a numa_group. This needs to
2032 * be done in a way that produces consistent results with group_weight,
2033 * otherwise workloads might not converge.
2034 */
2035static int preferred_group_nid(struct task_struct *p, int nid)
2036{
2037 nodemask_t nodes;
2038 int dist;
2039
2040 /* Direct connections between all NUMA nodes. */
2041 if (sched_numa_topology_type == NUMA_DIRECT)
2042 return nid;
2043
2044 /*
2045 * On a system with glueless mesh NUMA topology, group_weight
2046 * scores nodes according to the number of NUMA hinting faults on
2047 * both the node itself, and on nearby nodes.
2048 */
2049 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
2050 unsigned long score, max_score = 0;
2051 int node, max_node = nid;
2052
2053 dist = sched_max_numa_distance;
2054
2055 for_each_online_node(node) {
2056 score = group_weight(p, node, dist);
2057 if (score > max_score) {
2058 max_score = score;
2059 max_node = node;
2060 }
2061 }
2062 return max_node;
2063 }
2064
2065 /*
2066 * Finding the preferred nid in a system with NUMA backplane
2067 * interconnect topology is more involved. The goal is to locate
2068 * tasks from numa_groups near each other in the system, and
2069 * untangle workloads from different sides of the system. This requires
2070 * searching down the hierarchy of node groups, recursively searching
2071 * inside the highest scoring group of nodes. The nodemask tricks
2072 * keep the complexity of the search down.
2073 */
2074 nodes = node_online_map;
2075 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
2076 unsigned long max_faults = 0;
Jan Beulich81907472015-01-23 08:25:38 +00002077 nodemask_t max_group = NODE_MASK_NONE;
Rik van Riel54009412014-10-17 03:29:53 -04002078 int a, b;
2079
2080 /* Are there nodes at this distance from each other? */
2081 if (!find_numa_distance(dist))
2082 continue;
2083
2084 for_each_node_mask(a, nodes) {
2085 unsigned long faults = 0;
2086 nodemask_t this_group;
2087 nodes_clear(this_group);
2088
2089 /* Sum group's NUMA faults; includes a==b case. */
2090 for_each_node_mask(b, nodes) {
2091 if (node_distance(a, b) < dist) {
2092 faults += group_faults(p, b);
2093 node_set(b, this_group);
2094 node_clear(b, nodes);
2095 }
2096 }
2097
2098 /* Remember the top group. */
2099 if (faults > max_faults) {
2100 max_faults = faults;
2101 max_group = this_group;
2102 /*
2103 * subtle: at the smallest distance there is
2104 * just one node left in each "group", the
2105 * winner is the preferred nid.
2106 */
2107 nid = a;
2108 }
2109 }
2110 /* Next round, evaluate the nodes within max_group. */
Jan Beulich890a5402015-02-09 12:30:00 +01002111 if (!max_faults)
2112 break;
Rik van Riel54009412014-10-17 03:29:53 -04002113 nodes = max_group;
2114 }
2115 return nid;
2116}
2117
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002118static void task_numa_placement(struct task_struct *p)
2119{
Mel Gorman83e1d2c2013-10-07 11:29:27 +01002120 int seq, nid, max_nid = -1, max_group_nid = -1;
2121 unsigned long max_faults = 0, max_group_faults = 0;
Rik van Riel04bb2f92013-10-07 11:29:36 +01002122 unsigned long fault_types[2] = { 0, 0 };
Rik van Riel7e2703e2014-01-27 17:03:45 -05002123 unsigned long total_faults;
2124 u64 runtime, period;
Mel Gorman7dbd13e2013-10-07 11:29:29 +01002125 spinlock_t *group_lock = NULL;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002126
Jason Low7e5a2c12015-04-30 17:28:14 -07002127 /*
2128 * The p->mm->numa_scan_seq field gets updated without
2129 * exclusive access. Use READ_ONCE() here to ensure
2130 * that the field is read in a single access:
2131 */
Jason Low316c1608d2015-04-28 13:00:20 -07002132 seq = READ_ONCE(p->mm->numa_scan_seq);
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002133 if (p->numa_scan_seq == seq)
2134 return;
2135 p->numa_scan_seq = seq;
Mel Gorman598f0ec2013-10-07 11:28:55 +01002136 p->numa_scan_period_max = task_scan_max(p);
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002137
Rik van Riel7e2703e2014-01-27 17:03:45 -05002138 total_faults = p->numa_faults_locality[0] +
2139 p->numa_faults_locality[1];
2140 runtime = numa_get_avg_runtime(p, &period);
2141
Mel Gorman7dbd13e2013-10-07 11:29:29 +01002142 /* If the task is part of a group prevent parallel updates to group stats */
2143 if (p->numa_group) {
2144 group_lock = &p->numa_group->lock;
Mike Galbraith60e69ee2014-04-07 10:55:15 +02002145 spin_lock_irq(group_lock);
Mel Gorman7dbd13e2013-10-07 11:29:29 +01002146 }
2147
Mel Gorman688b7582013-10-07 11:28:58 +01002148 /* Find the node with the highest number of faults */
2149 for_each_online_node(nid) {
Iulia Manda44dba3d2014-10-31 02:13:31 +02002150 /* Keep track of the offsets in numa_faults array */
2151 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
Mel Gorman83e1d2c2013-10-07 11:29:27 +01002152 unsigned long faults = 0, group_faults = 0;
Iulia Manda44dba3d2014-10-31 02:13:31 +02002153 int priv;
Mel Gorman745d6142013-10-07 11:28:59 +01002154
Rik van Rielbe1e4e72014-01-27 17:03:48 -05002155 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
Rik van Riel7e2703e2014-01-27 17:03:45 -05002156 long diff, f_diff, f_weight;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002157
Iulia Manda44dba3d2014-10-31 02:13:31 +02002158 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2159 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2160 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2161 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
Mel Gorman745d6142013-10-07 11:28:59 +01002162
Mel Gormanac8e8952013-10-07 11:29:03 +01002163 /* Decay existing window, copy faults since last scan */
Iulia Manda44dba3d2014-10-31 02:13:31 +02002164 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2165 fault_types[priv] += p->numa_faults[membuf_idx];
2166 p->numa_faults[membuf_idx] = 0;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01002167
Rik van Riel7e2703e2014-01-27 17:03:45 -05002168 /*
2169 * Normalize the faults_from, so all tasks in a group
2170 * count according to CPU use, instead of by the raw
2171 * number of faults. Tasks with little runtime have
2172 * little over-all impact on throughput, and thus their
2173 * faults are less important.
2174 */
2175 f_weight = div64_u64(runtime << 16, period + 1);
Iulia Manda44dba3d2014-10-31 02:13:31 +02002176 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
Rik van Riel7e2703e2014-01-27 17:03:45 -05002177 (total_faults + 1);
Iulia Manda44dba3d2014-10-31 02:13:31 +02002178 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2179 p->numa_faults[cpubuf_idx] = 0;
Rik van Riel50ec8a42014-01-27 17:03:42 -05002180
Iulia Manda44dba3d2014-10-31 02:13:31 +02002181 p->numa_faults[mem_idx] += diff;
2182 p->numa_faults[cpu_idx] += f_diff;
2183 faults += p->numa_faults[mem_idx];
Mel Gorman83e1d2c2013-10-07 11:29:27 +01002184 p->total_numa_faults += diff;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002185 if (p->numa_group) {
Iulia Manda44dba3d2014-10-31 02:13:31 +02002186 /*
2187 * safe because we can only change our own group
2188 *
2189 * mem_idx represents the offset for a given
2190 * nid and priv in a specific region because it
2191 * is at the beginning of the numa_faults array.
2192 */
2193 p->numa_group->faults[mem_idx] += diff;
2194 p->numa_group->faults_cpu[mem_idx] += f_diff;
Mel Gorman989348b2013-10-07 11:29:40 +01002195 p->numa_group->total_faults += diff;
Iulia Manda44dba3d2014-10-31 02:13:31 +02002196 group_faults += p->numa_group->faults[mem_idx];
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002197 }
Mel Gormanac8e8952013-10-07 11:29:03 +01002198 }
2199
Mel Gorman688b7582013-10-07 11:28:58 +01002200 if (faults > max_faults) {
2201 max_faults = faults;
2202 max_nid = nid;
2203 }
Mel Gorman83e1d2c2013-10-07 11:29:27 +01002204
2205 if (group_faults > max_group_faults) {
2206 max_group_faults = group_faults;
2207 max_group_nid = nid;
2208 }
2209 }
2210
Rik van Riel04bb2f92013-10-07 11:29:36 +01002211 update_task_scan_period(p, fault_types[0], fault_types[1]);
2212
Mel Gorman7dbd13e2013-10-07 11:29:29 +01002213 if (p->numa_group) {
Rik van Riel4142c3e2016-01-25 17:07:39 -05002214 numa_group_count_active_nodes(p->numa_group);
Mike Galbraith60e69ee2014-04-07 10:55:15 +02002215 spin_unlock_irq(group_lock);
Rik van Riel54009412014-10-17 03:29:53 -04002216 max_nid = preferred_group_nid(p, max_group_nid);
Mel Gorman688b7582013-10-07 11:28:58 +01002217 }
2218
Rik van Rielbb97fc32014-06-04 16:33:15 -04002219 if (max_faults) {
2220 /* Set the new preferred node */
2221 if (max_nid != p->numa_preferred_nid)
2222 sched_setnuma(p, max_nid);
2223
2224 if (task_node(p) != p->numa_preferred_nid)
2225 numa_migrate_preferred(p);
Mel Gorman3a7053b2013-10-07 11:29:00 +01002226 }
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002227}
2228
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002229static inline int get_numa_group(struct numa_group *grp)
2230{
2231 return atomic_inc_not_zero(&grp->refcount);
2232}
2233
2234static inline void put_numa_group(struct numa_group *grp)
2235{
2236 if (atomic_dec_and_test(&grp->refcount))
2237 kfree_rcu(grp, rcu);
2238}
2239
Mel Gorman3e6a9412013-10-07 11:29:35 +01002240static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2241 int *priv)
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002242{
2243 struct numa_group *grp, *my_grp;
2244 struct task_struct *tsk;
2245 bool join = false;
2246 int cpu = cpupid_to_cpu(cpupid);
2247 int i;
2248
2249 if (unlikely(!p->numa_group)) {
2250 unsigned int size = sizeof(struct numa_group) +
Rik van Riel50ec8a42014-01-27 17:03:42 -05002251 4*nr_node_ids*sizeof(unsigned long);
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002252
2253 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2254 if (!grp)
2255 return;
2256
2257 atomic_set(&grp->refcount, 1);
Rik van Riel4142c3e2016-01-25 17:07:39 -05002258 grp->active_nodes = 1;
2259 grp->max_faults_cpu = 0;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002260 spin_lock_init(&grp->lock);
Mel Gormane29cf082013-10-07 11:29:22 +01002261 grp->gid = p->pid;
Rik van Riel50ec8a42014-01-27 17:03:42 -05002262 /* Second half of the array tracks nids where faults happen */
Rik van Rielbe1e4e72014-01-27 17:03:48 -05002263 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
2264 nr_node_ids;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002265
Rik van Rielbe1e4e72014-01-27 17:03:48 -05002266 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
Iulia Manda44dba3d2014-10-31 02:13:31 +02002267 grp->faults[i] = p->numa_faults[i];
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002268
Mel Gorman989348b2013-10-07 11:29:40 +01002269 grp->total_faults = p->total_numa_faults;
Mel Gorman83e1d2c2013-10-07 11:29:27 +01002270
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002271 grp->nr_tasks++;
2272 rcu_assign_pointer(p->numa_group, grp);
2273 }
2274
2275 rcu_read_lock();
Jason Low316c1608d2015-04-28 13:00:20 -07002276 tsk = READ_ONCE(cpu_rq(cpu)->curr);
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002277
2278 if (!cpupid_match_pid(tsk, cpupid))
Peter Zijlstra33547812013-10-09 10:24:48 +02002279 goto no_join;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002280
2281 grp = rcu_dereference(tsk->numa_group);
2282 if (!grp)
Peter Zijlstra33547812013-10-09 10:24:48 +02002283 goto no_join;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002284
2285 my_grp = p->numa_group;
2286 if (grp == my_grp)
Peter Zijlstra33547812013-10-09 10:24:48 +02002287 goto no_join;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002288
2289 /*
2290 * Only join the other group if its bigger; if we're the bigger group,
2291 * the other task will join us.
2292 */
2293 if (my_grp->nr_tasks > grp->nr_tasks)
Peter Zijlstra33547812013-10-09 10:24:48 +02002294 goto no_join;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002295
2296 /*
2297 * Tie-break on the grp address.
2298 */
2299 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
Peter Zijlstra33547812013-10-09 10:24:48 +02002300 goto no_join;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002301
Rik van Rieldabe1d92013-10-07 11:29:34 +01002302 /* Always join threads in the same process. */
2303 if (tsk->mm == current->mm)
2304 join = true;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002305
Rik van Rieldabe1d92013-10-07 11:29:34 +01002306 /* Simple filter to avoid false positives due to PID collisions */
2307 if (flags & TNF_SHARED)
2308 join = true;
2309
Mel Gorman3e6a9412013-10-07 11:29:35 +01002310 /* Update priv based on whether false sharing was detected */
2311 *priv = !join;
2312
Rik van Rieldabe1d92013-10-07 11:29:34 +01002313 if (join && !get_numa_group(grp))
Peter Zijlstra33547812013-10-09 10:24:48 +02002314 goto no_join;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002315
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002316 rcu_read_unlock();
2317
2318 if (!join)
2319 return;
2320
Mike Galbraith60e69ee2014-04-07 10:55:15 +02002321 BUG_ON(irqs_disabled());
2322 double_lock_irq(&my_grp->lock, &grp->lock);
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002323
Rik van Rielbe1e4e72014-01-27 17:03:48 -05002324 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
Iulia Manda44dba3d2014-10-31 02:13:31 +02002325 my_grp->faults[i] -= p->numa_faults[i];
2326 grp->faults[i] += p->numa_faults[i];
Mel Gorman989348b2013-10-07 11:29:40 +01002327 }
2328 my_grp->total_faults -= p->total_numa_faults;
2329 grp->total_faults += p->total_numa_faults;
2330
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002331 my_grp->nr_tasks--;
2332 grp->nr_tasks++;
2333
2334 spin_unlock(&my_grp->lock);
Mike Galbraith60e69ee2014-04-07 10:55:15 +02002335 spin_unlock_irq(&grp->lock);
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002336
2337 rcu_assign_pointer(p->numa_group, grp);
2338
2339 put_numa_group(my_grp);
Peter Zijlstra33547812013-10-09 10:24:48 +02002340 return;
2341
2342no_join:
2343 rcu_read_unlock();
2344 return;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002345}
2346
Jann Horn837ffc92019-07-16 17:20:45 +02002347/*
2348 * Get rid of NUMA staticstics associated with a task (either current or dead).
2349 * If @final is set, the task is dead and has reached refcount zero, so we can
2350 * safely free all relevant data structures. Otherwise, there might be
2351 * concurrent reads from places like load balancing and procfs, and we should
2352 * reset the data back to default state without freeing ->numa_faults.
2353 */
2354void task_numa_free(struct task_struct *p, bool final)
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002355{
2356 struct numa_group *grp = p->numa_group;
Jann Horn837ffc92019-07-16 17:20:45 +02002357 unsigned long *numa_faults = p->numa_faults;
Steven Rostedte9dd6852014-05-27 17:02:04 -04002358 unsigned long flags;
2359 int i;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002360
Jann Horn837ffc92019-07-16 17:20:45 +02002361 if (!numa_faults)
2362 return;
2363
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002364 if (grp) {
Steven Rostedte9dd6852014-05-27 17:02:04 -04002365 spin_lock_irqsave(&grp->lock, flags);
Rik van Rielbe1e4e72014-01-27 17:03:48 -05002366 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
Iulia Manda44dba3d2014-10-31 02:13:31 +02002367 grp->faults[i] -= p->numa_faults[i];
Mel Gorman989348b2013-10-07 11:29:40 +01002368 grp->total_faults -= p->total_numa_faults;
2369
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002370 grp->nr_tasks--;
Steven Rostedte9dd6852014-05-27 17:02:04 -04002371 spin_unlock_irqrestore(&grp->lock, flags);
Andreea-Cristina Bernat35b123e2014-08-22 17:50:43 +03002372 RCU_INIT_POINTER(p->numa_group, NULL);
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002373 put_numa_group(grp);
2374 }
2375
Jann Horn837ffc92019-07-16 17:20:45 +02002376 if (final) {
2377 p->numa_faults = NULL;
2378 kfree(numa_faults);
2379 } else {
2380 p->total_numa_faults = 0;
2381 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2382 numa_faults[i] = 0;
2383 }
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002384}
2385
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002386/*
2387 * Got a PROT_NONE fault for a page on @node.
2388 */
Rik van Riel58b46da2014-01-27 17:03:47 -05002389void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002390{
2391 struct task_struct *p = current;
Peter Zijlstra6688cc02013-10-07 11:29:24 +01002392 bool migrated = flags & TNF_MIGRATED;
Rik van Riel58b46da2014-01-27 17:03:47 -05002393 int cpu_node = task_node(current);
Rik van Riel792568e2014-04-11 13:00:27 -04002394 int local = !!(flags & TNF_FAULT_LOCAL);
Rik van Riel4142c3e2016-01-25 17:07:39 -05002395 struct numa_group *ng;
Mel Gormanac8e8952013-10-07 11:29:03 +01002396 int priv;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002397
Srikar Dronamraju2a595722015-08-11 21:54:21 +05302398 if (!static_branch_likely(&sched_numa_balancing))
Mel Gorman1a687c22012-11-22 11:16:36 +00002399 return;
2400
Mel Gorman9ff1d9f2013-10-07 11:29:04 +01002401 /* for example, ksmd faulting in a user's mm */
2402 if (!p->mm)
2403 return;
2404
Mel Gormanf809ca92013-10-07 11:28:57 +01002405 /* Allocate buffer to track faults on a per-node basis */
Iulia Manda44dba3d2014-10-31 02:13:31 +02002406 if (unlikely(!p->numa_faults)) {
2407 int size = sizeof(*p->numa_faults) *
Rik van Rielbe1e4e72014-01-27 17:03:48 -05002408 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
Mel Gormanf809ca92013-10-07 11:28:57 +01002409
Iulia Manda44dba3d2014-10-31 02:13:31 +02002410 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2411 if (!p->numa_faults)
Mel Gormanf809ca92013-10-07 11:28:57 +01002412 return;
Mel Gorman745d6142013-10-07 11:28:59 +01002413
Mel Gorman83e1d2c2013-10-07 11:29:27 +01002414 p->total_numa_faults = 0;
Rik van Riel04bb2f92013-10-07 11:29:36 +01002415 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
Mel Gormanf809ca92013-10-07 11:28:57 +01002416 }
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002417
Mel Gormanfb003b82012-11-15 09:01:14 +00002418 /*
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002419 * First accesses are treated as private, otherwise consider accesses
2420 * to be private if the accessing pid has not changed
2421 */
2422 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2423 priv = 1;
2424 } else {
2425 priv = cpupid_match_pid(p, last_cpupid);
Peter Zijlstra6688cc02013-10-07 11:29:24 +01002426 if (!priv && !(flags & TNF_NO_GROUP))
Mel Gorman3e6a9412013-10-07 11:29:35 +01002427 task_numa_group(p, last_cpupid, flags, &priv);
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002428 }
2429
Rik van Riel792568e2014-04-11 13:00:27 -04002430 /*
2431 * If a workload spans multiple NUMA nodes, a shared fault that
2432 * occurs wholly within the set of nodes that the workload is
2433 * actively using should be counted as local. This allows the
2434 * scan rate to slow down when a workload has settled down.
2435 */
Rik van Riel4142c3e2016-01-25 17:07:39 -05002436 ng = p->numa_group;
2437 if (!priv && !local && ng && ng->active_nodes > 1 &&
2438 numa_is_active_node(cpu_node, ng) &&
2439 numa_is_active_node(mem_node, ng))
Rik van Riel792568e2014-04-11 13:00:27 -04002440 local = 1;
2441
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002442 task_numa_placement(p);
Mel Gormanf809ca92013-10-07 11:28:57 +01002443
Rik van Riel2739d3e2013-10-07 11:29:41 +01002444 /*
2445 * Retry task to preferred node migration periodically, in case it
2446 * case it previously failed, or the scheduler moved us.
2447 */
2448 if (time_after(jiffies, p->numa_migrate_retry))
Mel Gorman6b9a7462013-10-07 11:29:11 +01002449 numa_migrate_preferred(p);
2450
Ingo Molnarb32e86b2013-10-07 11:29:30 +01002451 if (migrated)
2452 p->numa_pages_migrated += pages;
Mel Gorman074c2382015-03-25 15:55:42 -07002453 if (flags & TNF_MIGRATE_FAIL)
2454 p->numa_faults_locality[2] += pages;
Ingo Molnarb32e86b2013-10-07 11:29:30 +01002455
Iulia Manda44dba3d2014-10-31 02:13:31 +02002456 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2457 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
Rik van Riel792568e2014-04-11 13:00:27 -04002458 p->numa_faults_locality[local] += pages;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002459}
2460
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002461static void reset_ptenuma_scan(struct task_struct *p)
2462{
Jason Low7e5a2c12015-04-30 17:28:14 -07002463 /*
2464 * We only did a read acquisition of the mmap sem, so
2465 * p->mm->numa_scan_seq is written to without exclusive access
2466 * and the update is not guaranteed to be atomic. That's not
2467 * much of an issue though, since this is just used for
2468 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2469 * expensive, to avoid any form of compiler optimizations:
2470 */
Jason Low316c1608d2015-04-28 13:00:20 -07002471 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002472 p->mm->numa_scan_offset = 0;
2473}
2474
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002475/*
2476 * The expensive part of numa migration is done from task_work context.
2477 * Triggered from task_tick_numa().
2478 */
2479void task_numa_work(struct callback_head *work)
2480{
2481 unsigned long migrate, next_scan, now = jiffies;
2482 struct task_struct *p = current;
2483 struct mm_struct *mm = p->mm;
Rik van Riel51170842015-11-05 15:56:23 -05002484 u64 runtime = p->se.sum_exec_runtime;
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002485 struct vm_area_struct *vma;
Mel Gorman9f406042012-11-14 18:34:32 +00002486 unsigned long start, end;
Mel Gorman598f0ec2013-10-07 11:28:55 +01002487 unsigned long nr_pte_updates = 0;
Rik van Riel4620f8c2015-09-11 09:00:27 -04002488 long pages, virtpages;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002489
Peter Zijlstra9148a3a2016-09-20 22:34:51 +02002490 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002491
2492 work->next = work; /* protect against double add */
2493 /*
2494 * Who cares about NUMA placement when they're dying.
2495 *
2496 * NOTE: make sure not to dereference p->mm before this check,
2497 * exit_task_work() happens _after_ exit_mm() so we could be called
2498 * without p->mm even though we still had it when we enqueued this
2499 * work.
2500 */
2501 if (p->flags & PF_EXITING)
2502 return;
2503
Mel Gorman930aa172013-10-07 11:29:37 +01002504 if (!mm->numa_next_scan) {
Mel Gorman7e8d16b2013-10-07 11:28:54 +01002505 mm->numa_next_scan = now +
2506 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
Mel Gormanb8593bf2012-11-21 01:18:23 +00002507 }
2508
2509 /*
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002510 * Enforce maximal scan/migration frequency..
2511 */
2512 migrate = mm->numa_next_scan;
2513 if (time_before(now, migrate))
2514 return;
2515
Mel Gorman598f0ec2013-10-07 11:28:55 +01002516 if (p->numa_scan_period == 0) {
2517 p->numa_scan_period_max = task_scan_max(p);
2518 p->numa_scan_period = task_scan_min(p);
2519 }
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002520
Mel Gormanfb003b82012-11-15 09:01:14 +00002521 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002522 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2523 return;
2524
Mel Gormane14808b2012-11-19 10:59:15 +00002525 /*
Peter Zijlstra19a78d12013-10-07 11:28:51 +01002526 * Delay this task enough that another task of this mm will likely win
2527 * the next time around.
2528 */
2529 p->node_stamp += 2 * TICK_NSEC;
2530
Mel Gorman9f406042012-11-14 18:34:32 +00002531 start = mm->numa_scan_offset;
2532 pages = sysctl_numa_balancing_scan_size;
2533 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
Rik van Riel4620f8c2015-09-11 09:00:27 -04002534 virtpages = pages * 8; /* Scan up to this much virtual space */
Mel Gorman9f406042012-11-14 18:34:32 +00002535 if (!pages)
2536 return;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002537
Rik van Riel4620f8c2015-09-11 09:00:27 -04002538
Vlastimil Babkaa1e7a9e2017-05-15 15:13:16 +02002539 if (!down_read_trylock(&mm->mmap_sem))
2540 return;
Mel Gorman9f406042012-11-14 18:34:32 +00002541 vma = find_vma(mm, start);
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002542 if (!vma) {
2543 reset_ptenuma_scan(p);
Mel Gorman9f406042012-11-14 18:34:32 +00002544 start = 0;
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002545 vma = mm->mmap;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002546 }
Mel Gorman9f406042012-11-14 18:34:32 +00002547 for (; vma; vma = vma->vm_next) {
Naoya Horiguchi6b79c572015-04-07 14:26:47 -07002548 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
Mel Gorman8e76d4e2015-06-10 11:15:00 -07002549 is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002550 continue;
Naoya Horiguchi6b79c572015-04-07 14:26:47 -07002551 }
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002552
Mel Gorman4591ce4f2013-10-07 11:29:13 +01002553 /*
2554 * Shared library pages mapped by multiple processes are not
2555 * migrated as it is expected they are cache replicated. Avoid
2556 * hinting faults in read-only file-backed mappings or the vdso
2557 * as migrating the pages will be of marginal benefit.
2558 */
2559 if (!vma->vm_mm ||
2560 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2561 continue;
2562
Mel Gorman3c67f472013-12-18 17:08:40 -08002563 /*
2564 * Skip inaccessible VMAs to avoid any confusion between
2565 * PROT_NONE and NUMA hinting ptes
2566 */
2567 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2568 continue;
2569
Mel Gorman9f406042012-11-14 18:34:32 +00002570 do {
2571 start = max(start, vma->vm_start);
2572 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2573 end = min(end, vma->vm_end);
Rik van Riel4620f8c2015-09-11 09:00:27 -04002574 nr_pte_updates = change_prot_numa(vma, start, end);
Mel Gorman598f0ec2013-10-07 11:28:55 +01002575
2576 /*
Rik van Riel4620f8c2015-09-11 09:00:27 -04002577 * Try to scan sysctl_numa_balancing_size worth of
2578 * hpages that have at least one present PTE that
2579 * is not already pte-numa. If the VMA contains
2580 * areas that are unused or already full of prot_numa
2581 * PTEs, scan up to virtpages, to skip through those
2582 * areas faster.
Mel Gorman598f0ec2013-10-07 11:28:55 +01002583 */
2584 if (nr_pte_updates)
2585 pages -= (end - start) >> PAGE_SHIFT;
Rik van Riel4620f8c2015-09-11 09:00:27 -04002586 virtpages -= (end - start) >> PAGE_SHIFT;
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002587
Mel Gorman9f406042012-11-14 18:34:32 +00002588 start = end;
Rik van Riel4620f8c2015-09-11 09:00:27 -04002589 if (pages <= 0 || virtpages <= 0)
Mel Gorman9f406042012-11-14 18:34:32 +00002590 goto out;
Rik van Riel3cf19622014-02-18 17:12:44 -05002591
2592 cond_resched();
Mel Gorman9f406042012-11-14 18:34:32 +00002593 } while (end != vma->vm_end);
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002594 }
2595
Mel Gorman9f406042012-11-14 18:34:32 +00002596out:
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002597 /*
Peter Zijlstrac69307d2013-10-07 11:28:41 +01002598 * It is possible to reach the end of the VMA list but the last few
2599 * VMAs are not guaranteed to the vma_migratable. If they are not, we
2600 * would find the !migratable VMA on the next scan but not reset the
2601 * scanner to the start so check it now.
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002602 */
2603 if (vma)
Mel Gorman9f406042012-11-14 18:34:32 +00002604 mm->numa_scan_offset = start;
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002605 else
2606 reset_ptenuma_scan(p);
2607 up_read(&mm->mmap_sem);
Rik van Riel51170842015-11-05 15:56:23 -05002608
2609 /*
2610 * Make sure tasks use at least 32x as much time to run other code
2611 * than they used here, to limit NUMA PTE scanning overhead to 3% max.
2612 * Usually update_task_scan_period slows down scanning enough; on an
2613 * overloaded system we need to limit overhead on a per task basis.
2614 */
2615 if (unlikely(p->se.sum_exec_runtime != runtime)) {
2616 u64 diff = p->se.sum_exec_runtime - runtime;
2617 p->node_stamp += 32 * diff;
2618 }
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002619}
2620
2621/*
2622 * Drive the periodic memory faults..
2623 */
2624void task_tick_numa(struct rq *rq, struct task_struct *curr)
2625{
2626 struct callback_head *work = &curr->numa_work;
2627 u64 period, now;
2628
2629 /*
2630 * We don't care about NUMA placement if we don't have memory.
2631 */
2632 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
2633 return;
2634
2635 /*
2636 * Using runtime rather than walltime has the dual advantage that
2637 * we (mostly) drive the selection from busy threads and that the
2638 * task needs to have done some actual work before we bother with
2639 * NUMA placement.
2640 */
2641 now = curr->se.sum_exec_runtime;
2642 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2643
Rik van Riel25b3e5a2015-11-05 15:56:22 -05002644 if (now > curr->node_stamp + period) {
Peter Zijlstra4b96a29b2012-10-25 14:16:47 +02002645 if (!curr->node_stamp)
Mel Gorman598f0ec2013-10-07 11:28:55 +01002646 curr->numa_scan_period = task_scan_min(curr);
Peter Zijlstra19a78d12013-10-07 11:28:51 +01002647 curr->node_stamp += period;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002648
2649 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2650 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2651 task_work_add(curr, work, true);
2652 }
2653 }
2654}
2655#else
2656static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2657{
2658}
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01002659
2660static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2661{
2662}
2663
2664static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2665{
2666}
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002667#endif /* CONFIG_NUMA_BALANCING */
2668
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02002669static void
2670account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2671{
2672 update_load_add(&cfs_rq->load, se->load.weight);
Peter Zijlstrac09595f2008-06-27 13:41:14 +02002673 if (!parent_entity(se))
Peter Zijlstra029632f2011-10-25 10:00:11 +02002674 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
Peter Zijlstra367456c2012-02-20 21:49:09 +01002675#ifdef CONFIG_SMP
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01002676 if (entity_is_task(se)) {
2677 struct rq *rq = rq_of(cfs_rq);
2678
2679 account_numa_enqueue(rq, task_of(se));
2680 list_add(&se->group_node, &rq->cfs_tasks);
2681 }
Peter Zijlstra367456c2012-02-20 21:49:09 +01002682#endif
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02002683 cfs_rq->nr_running++;
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02002684}
2685
2686static void
2687account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2688{
2689 update_load_sub(&cfs_rq->load, se->load.weight);
Peter Zijlstrac09595f2008-06-27 13:41:14 +02002690 if (!parent_entity(se))
Peter Zijlstra029632f2011-10-25 10:00:11 +02002691 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
Tim Chenbfdb1982016-02-01 14:47:59 -08002692#ifdef CONFIG_SMP
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01002693 if (entity_is_task(se)) {
2694 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
Bharata B Raob87f1722008-09-25 09:53:54 +05302695 list_del_init(&se->group_node);
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01002696 }
Tim Chenbfdb1982016-02-01 14:47:59 -08002697#endif
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02002698 cfs_rq->nr_running--;
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02002699}
2700
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002701#ifdef CONFIG_FAIR_GROUP_SCHED
2702# ifdef CONFIG_SMP
Paul Turner6d5ab292011-01-21 20:45:01 -08002703static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002704{
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02002705 long tg_weight, load, shares;
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002706
Peter Zijlstraea1dc6f2016-06-24 16:11:02 +02002707 /*
2708 * This really should be: cfs_rq->avg.load_avg, but instead we use
2709 * cfs_rq->load.weight, which is its upper bound. This helps ramp up
2710 * the shares for small weight interactive tasks.
2711 */
2712 load = scale_load_down(cfs_rq->load.weight);
2713
2714 tg_weight = atomic_long_read(&tg->load_avg);
2715
2716 /* Ensure tg_weight >= load */
2717 tg_weight -= cfs_rq->tg_load_avg_contrib;
2718 tg_weight += load;
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002719
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002720 shares = (tg->shares * load);
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02002721 if (tg_weight)
2722 shares /= tg_weight;
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002723
2724 if (shares < MIN_SHARES)
2725 shares = MIN_SHARES;
2726 if (shares > tg->shares)
2727 shares = tg->shares;
2728
2729 return shares;
2730}
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002731# else /* CONFIG_SMP */
Paul Turner6d5ab292011-01-21 20:45:01 -08002732static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002733{
2734 return tg->shares;
2735}
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002736# endif /* CONFIG_SMP */
Peter Zijlstraea1dc6f2016-06-24 16:11:02 +02002737
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002738static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2739 unsigned long weight)
2740{
Paul Turner19e5eeb2010-12-15 19:10:18 -08002741 if (se->on_rq) {
2742 /* commit outstanding execution time */
2743 if (cfs_rq->curr == se)
2744 update_curr(cfs_rq);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002745 account_entity_dequeue(cfs_rq, se);
Paul Turner19e5eeb2010-12-15 19:10:18 -08002746 }
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002747
2748 update_load_set(&se->load, weight);
2749
2750 if (se->on_rq)
2751 account_entity_enqueue(cfs_rq, se);
2752}
2753
Paul Turner82958362012-10-04 13:18:31 +02002754static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2755
Vincent Guittot6960f772016-12-21 16:50:26 +01002756static void update_cfs_shares(struct sched_entity *se)
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002757{
Vincent Guittot6960f772016-12-21 16:50:26 +01002758 struct cfs_rq *cfs_rq = group_cfs_rq(se);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002759 struct task_group *tg;
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002760 long shares;
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002761
Vincent Guittot6960f772016-12-21 16:50:26 +01002762 if (!cfs_rq)
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002763 return;
Vincent Guittot6960f772016-12-21 16:50:26 +01002764
2765 if (throttled_hierarchy(cfs_rq))
2766 return;
2767
2768 tg = cfs_rq->tg;
2769
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002770#ifndef CONFIG_SMP
2771 if (likely(se->load.weight == tg->shares))
2772 return;
2773#endif
Paul Turner6d5ab292011-01-21 20:45:01 -08002774 shares = calc_cfs_shares(cfs_rq, tg);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002775
2776 reweight_entity(cfs_rq_of(se), se, shares);
2777}
Vincent Guittot6960f772016-12-21 16:50:26 +01002778
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002779#else /* CONFIG_FAIR_GROUP_SCHED */
Vincent Guittot6960f772016-12-21 16:50:26 +01002780static inline void update_cfs_shares(struct sched_entity *se)
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002781{
2782}
2783#endif /* CONFIG_FAIR_GROUP_SCHED */
2784
Alex Shi141965c2013-06-26 13:05:39 +08002785#ifdef CONFIG_SMP
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07002786u32 sched_get_wake_up_idle(struct task_struct *p)
2787{
2788 u32 enabled = p->flags & PF_WAKE_UP_IDLE;
2789
2790 return !!enabled;
2791}
Olav Haugan58e45fd2017-11-14 10:13:50 -08002792EXPORT_SYMBOL(sched_get_wake_up_idle);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07002793
2794int sched_set_wake_up_idle(struct task_struct *p, int wake_up_idle)
2795{
2796 int enable = !!wake_up_idle;
2797
2798 if (enable)
2799 p->flags |= PF_WAKE_UP_IDLE;
2800 else
2801 p->flags &= ~PF_WAKE_UP_IDLE;
2802
2803 return 0;
2804}
Olav Haugan58e45fd2017-11-14 10:13:50 -08002805EXPORT_SYMBOL(sched_set_wake_up_idle);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07002806
Paul Turner5b51f2f2012-10-04 13:18:32 +02002807/* Precomputed fixed inverse multiplies for multiplication by y^n */
2808static const u32 runnable_avg_yN_inv[] = {
2809 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
2810 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
2811 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
2812 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
2813 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
2814 0x85aac367, 0x82cd8698,
2815};
2816
2817/*
2818 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
2819 * over-estimates when re-combining.
2820 */
2821static const u32 runnable_avg_yN_sum[] = {
2822 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
2823 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
2824 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
2825};
2826
2827/*
Yuyang Du7b20b912016-05-03 05:54:27 +08002828 * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to
2829 * lower integers. See Documentation/scheduler/sched-avg.txt how these
2830 * were generated:
2831 */
2832static const u32 __accumulated_sum_N32[] = {
2833 0, 23371, 35056, 40899, 43820, 45281,
2834 46011, 46376, 46559, 46650, 46696, 46719,
2835};
2836
2837/*
Paul Turner9d85f212012-10-04 13:18:29 +02002838 * Approximate:
2839 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
2840 */
2841static __always_inline u64 decay_load(u64 val, u64 n)
2842{
Paul Turner5b51f2f2012-10-04 13:18:32 +02002843 unsigned int local_n;
2844
2845 if (!n)
2846 return val;
2847 else if (unlikely(n > LOAD_AVG_PERIOD * 63))
2848 return 0;
2849
2850 /* after bounds checking we can collapse to 32-bit */
2851 local_n = n;
2852
2853 /*
2854 * As y^PERIOD = 1/2, we can combine
Zhihui Zhang9c58c792014-09-20 21:24:36 -04002855 * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
2856 * With a look-up table which covers y^n (n<PERIOD)
Paul Turner5b51f2f2012-10-04 13:18:32 +02002857 *
2858 * To achieve constant time decay_load.
2859 */
2860 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
2861 val >>= local_n / LOAD_AVG_PERIOD;
2862 local_n %= LOAD_AVG_PERIOD;
Paul Turner9d85f212012-10-04 13:18:29 +02002863 }
2864
Yuyang Du9d89c252015-07-15 08:04:37 +08002865 val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
2866 return val;
Paul Turner5b51f2f2012-10-04 13:18:32 +02002867}
2868
2869/*
2870 * For updates fully spanning n periods, the contribution to runnable
2871 * average will be: \Sum 1024*y^n
2872 *
2873 * We can compute this reasonably efficiently by combining:
2874 * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
2875 */
2876static u32 __compute_runnable_contrib(u64 n)
2877{
2878 u32 contrib = 0;
2879
2880 if (likely(n <= LOAD_AVG_PERIOD))
2881 return runnable_avg_yN_sum[n];
2882 else if (unlikely(n >= LOAD_AVG_MAX_N))
2883 return LOAD_AVG_MAX;
2884
Yuyang Du7b20b912016-05-03 05:54:27 +08002885 /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
2886 contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
2887 n %= LOAD_AVG_PERIOD;
Paul Turner5b51f2f2012-10-04 13:18:32 +02002888 contrib = decay_load(contrib, n);
2889 return contrib + runnable_avg_yN_sum[n];
Paul Turner9d85f212012-10-04 13:18:29 +02002890}
2891
Peter Zijlstra54a21382015-09-07 15:05:42 +02002892#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
Dietmar Eggemanne0f5f3a2015-08-14 17:23:09 +01002893
Paul Turner9d85f212012-10-04 13:18:29 +02002894/*
2895 * We can represent the historical contribution to runnable average as the
2896 * coefficients of a geometric series. To do this we sub-divide our runnable
2897 * history into segments of approximately 1ms (1024us); label the segment that
2898 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
2899 *
2900 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
2901 * p0 p1 p2
2902 * (now) (~1ms ago) (~2ms ago)
2903 *
2904 * Let u_i denote the fraction of p_i that the entity was runnable.
2905 *
2906 * We then designate the fractions u_i as our co-efficients, yielding the
2907 * following representation of historical load:
2908 * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
2909 *
2910 * We choose y based on the with of a reasonably scheduling period, fixing:
2911 * y^32 = 0.5
2912 *
2913 * This means that the contribution to load ~32ms ago (u_32) will be weighted
2914 * approximately half as much as the contribution to load within the last ms
2915 * (u_0).
2916 *
2917 * When a period "rolls over" and we have new u_0`, multiplying the previous
2918 * sum again by y is sufficient to update:
2919 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
2920 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
2921 */
Yuyang Du9d89c252015-07-15 08:04:37 +08002922static __always_inline int
2923__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
Yuyang Du13962232015-07-15 08:04:41 +08002924 unsigned long weight, int running, struct cfs_rq *cfs_rq)
Paul Turner9d85f212012-10-04 13:18:29 +02002925{
Dietmar Eggemanne0f5f3a2015-08-14 17:23:09 +01002926 u64 delta, scaled_delta, periods;
Yuyang Du9d89c252015-07-15 08:04:37 +08002927 u32 contrib;
Peter Zijlstra6115c792015-09-07 15:09:15 +02002928 unsigned int delta_w, scaled_delta_w, decayed = 0;
Dietmar Eggemann6f2b0452015-09-07 14:57:22 +01002929 unsigned long scale_freq, scale_cpu;
Paul Turner9d85f212012-10-04 13:18:29 +02002930
Yuyang Du9d89c252015-07-15 08:04:37 +08002931 delta = now - sa->last_update_time;
Paul Turner9d85f212012-10-04 13:18:29 +02002932 /*
2933 * This should only happen when time goes backwards, which it
2934 * unfortunately does during sched clock init when we swap over to TSC.
2935 */
2936 if ((s64)delta < 0) {
Yuyang Du9d89c252015-07-15 08:04:37 +08002937 sa->last_update_time = now;
Paul Turner9d85f212012-10-04 13:18:29 +02002938 return 0;
2939 }
2940
2941 /*
2942 * Use 1024ns as the unit of measurement since it's a reasonable
2943 * approximation of 1us and fast to compute.
2944 */
2945 delta >>= 10;
2946 if (!delta)
2947 return 0;
Yuyang Du9d89c252015-07-15 08:04:37 +08002948 sa->last_update_time = now;
Paul Turner9d85f212012-10-04 13:18:29 +02002949
Dietmar Eggemann6f2b0452015-09-07 14:57:22 +01002950 scale_freq = arch_scale_freq_capacity(NULL, cpu);
2951 scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
Juri Lelli0a942002015-11-09 12:06:24 +00002952 trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu);
Dietmar Eggemann6f2b0452015-09-07 14:57:22 +01002953
Paul Turner9d85f212012-10-04 13:18:29 +02002954 /* delta_w is the amount already accumulated against our next period */
Yuyang Du9d89c252015-07-15 08:04:37 +08002955 delta_w = sa->period_contrib;
Paul Turner9d85f212012-10-04 13:18:29 +02002956 if (delta + delta_w >= 1024) {
Paul Turner9d85f212012-10-04 13:18:29 +02002957 decayed = 1;
2958
Yuyang Du9d89c252015-07-15 08:04:37 +08002959 /* how much left for next period will start over, we don't know yet */
2960 sa->period_contrib = 0;
2961
Paul Turner9d85f212012-10-04 13:18:29 +02002962 /*
2963 * Now that we know we're crossing a period boundary, figure
2964 * out how much from delta we need to complete the current
2965 * period and accrue it.
2966 */
2967 delta_w = 1024 - delta_w;
Peter Zijlstra54a21382015-09-07 15:05:42 +02002968 scaled_delta_w = cap_scale(delta_w, scale_freq);
Yuyang Du13962232015-07-15 08:04:41 +08002969 if (weight) {
Dietmar Eggemanne0f5f3a2015-08-14 17:23:09 +01002970 sa->load_sum += weight * scaled_delta_w;
2971 if (cfs_rq) {
2972 cfs_rq->runnable_load_sum +=
2973 weight * scaled_delta_w;
2974 }
Yuyang Du13962232015-07-15 08:04:41 +08002975 }
Vincent Guittot36ee28e2015-02-27 16:54:04 +01002976 if (running)
Peter Zijlstra006cdf02015-09-09 09:06:17 +02002977 sa->util_sum += scaled_delta_w * scale_cpu;
Paul Turner9d85f212012-10-04 13:18:29 +02002978
Paul Turner5b51f2f2012-10-04 13:18:32 +02002979 delta -= delta_w;
Paul Turner9d85f212012-10-04 13:18:29 +02002980
Paul Turner5b51f2f2012-10-04 13:18:32 +02002981 /* Figure out how many additional periods this update spans */
2982 periods = delta / 1024;
2983 delta %= 1024;
2984
Yuyang Du9d89c252015-07-15 08:04:37 +08002985 sa->load_sum = decay_load(sa->load_sum, periods + 1);
Yuyang Du13962232015-07-15 08:04:41 +08002986 if (cfs_rq) {
2987 cfs_rq->runnable_load_sum =
2988 decay_load(cfs_rq->runnable_load_sum, periods + 1);
2989 }
Yuyang Du9d89c252015-07-15 08:04:37 +08002990 sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
Paul Turner5b51f2f2012-10-04 13:18:32 +02002991
2992 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
Yuyang Du9d89c252015-07-15 08:04:37 +08002993 contrib = __compute_runnable_contrib(periods);
Peter Zijlstra54a21382015-09-07 15:05:42 +02002994 contrib = cap_scale(contrib, scale_freq);
Yuyang Du13962232015-07-15 08:04:41 +08002995 if (weight) {
Yuyang Du9d89c252015-07-15 08:04:37 +08002996 sa->load_sum += weight * contrib;
Yuyang Du13962232015-07-15 08:04:41 +08002997 if (cfs_rq)
2998 cfs_rq->runnable_load_sum += weight * contrib;
2999 }
Vincent Guittot36ee28e2015-02-27 16:54:04 +01003000 if (running)
Peter Zijlstra006cdf02015-09-09 09:06:17 +02003001 sa->util_sum += contrib * scale_cpu;
Paul Turner9d85f212012-10-04 13:18:29 +02003002 }
3003
3004 /* Remainder of delta accrued against u_0` */
Peter Zijlstra54a21382015-09-07 15:05:42 +02003005 scaled_delta = cap_scale(delta, scale_freq);
Yuyang Du13962232015-07-15 08:04:41 +08003006 if (weight) {
Dietmar Eggemanne0f5f3a2015-08-14 17:23:09 +01003007 sa->load_sum += weight * scaled_delta;
Yuyang Du13962232015-07-15 08:04:41 +08003008 if (cfs_rq)
Dietmar Eggemanne0f5f3a2015-08-14 17:23:09 +01003009 cfs_rq->runnable_load_sum += weight * scaled_delta;
Yuyang Du13962232015-07-15 08:04:41 +08003010 }
Vincent Guittot36ee28e2015-02-27 16:54:04 +01003011 if (running)
Peter Zijlstra006cdf02015-09-09 09:06:17 +02003012 sa->util_sum += scaled_delta * scale_cpu;
Yuyang Du9d89c252015-07-15 08:04:37 +08003013
3014 sa->period_contrib += delta;
3015
3016 if (decayed) {
3017 sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
Yuyang Du13962232015-07-15 08:04:41 +08003018 if (cfs_rq) {
3019 cfs_rq->runnable_load_avg =
3020 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
3021 }
Peter Zijlstra006cdf02015-09-09 09:06:17 +02003022 sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
Yuyang Du9d89c252015-07-15 08:04:37 +08003023 }
Paul Turner9d85f212012-10-04 13:18:29 +02003024
3025 return decayed;
3026}
3027
Vincent Guittot96956e22016-11-08 10:53:44 +01003028/*
3029 * Signed add and clamp on underflow.
3030 *
3031 * Explicitly do a load-store to ensure the intermediate value never hits
3032 * memory. This allows lockless observations without ever seeing the negative
3033 * values.
3034 */
3035#define add_positive(_ptr, _val) do { \
3036 typeof(_ptr) ptr = (_ptr); \
3037 typeof(_val) val = (_val); \
3038 typeof(*ptr) res, var = READ_ONCE(*ptr); \
3039 \
3040 res = var + val; \
3041 \
3042 if (val < 0 && res > var) \
3043 res = 0; \
3044 \
3045 WRITE_ONCE(*ptr, res); \
3046} while (0)
3047
Paul Turnerc566e8e2012-10-04 13:18:30 +02003048#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra7c3edd22016-07-13 10:56:25 +02003049/**
3050 * update_tg_load_avg - update the tg's load avg
3051 * @cfs_rq: the cfs_rq whose avg changed
3052 * @force: update regardless of how small the difference
3053 *
3054 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
3055 * However, because tg->load_avg is a global value there are performance
3056 * considerations.
3057 *
3058 * In order to avoid having to look at the other cfs_rq's, we use a
3059 * differential update where we store the last value we propagated. This in
3060 * turn allows skipping updates if the differential is 'small'.
3061 *
3062 * Updating tg's load_avg is necessary before update_cfs_share() (which is
3063 * done) and effective_load() (which is not done because it is too costly).
Paul Turnerbb17f652012-10-04 13:18:31 +02003064 */
Yuyang Du9d89c252015-07-15 08:04:37 +08003065static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
Paul Turnerbb17f652012-10-04 13:18:31 +02003066{
Yuyang Du9d89c252015-07-15 08:04:37 +08003067 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
Paul Turnerbb17f652012-10-04 13:18:31 +02003068
Waiman Longaa0b7ae2015-12-02 13:41:50 -05003069 /*
3070 * No need to update load_avg for root_task_group as it is not used.
3071 */
3072 if (cfs_rq->tg == &root_task_group)
3073 return;
3074
Yuyang Du9d89c252015-07-15 08:04:37 +08003075 if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
3076 atomic_long_add(delta, &cfs_rq->tg->load_avg);
3077 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
Paul Turnerbb17f652012-10-04 13:18:31 +02003078 }
Paul Turner8165e142012-10-04 13:18:31 +02003079}
Dietmar Eggemannf5f97392014-02-26 11:19:33 +00003080
Byungchul Parkad936d82015-10-24 01:16:19 +09003081/*
3082 * Called within set_task_rq() right before setting a task's cpu. The
3083 * caller only guarantees p->pi_lock is held; no other assumptions,
3084 * including the state of rq->lock, should be made.
3085 */
3086void set_task_rq_fair(struct sched_entity *se,
3087 struct cfs_rq *prev, struct cfs_rq *next)
3088{
3089 if (!sched_feat(ATTACH_AGE_LOAD))
3090 return;
3091
3092 /*
3093 * We are supposed to update the task to "current" time, then its up to
3094 * date and ready to go to new CPU/cfs_rq. But we have difficulty in
3095 * getting what current time is, so simply throw away the out-of-date
3096 * time. This will result in the wakee task is less decayed, but giving
3097 * the wakee more load sounds not bad.
3098 */
3099 if (se->avg.last_update_time && prev) {
3100 u64 p_last_update_time;
3101 u64 n_last_update_time;
3102
3103#ifndef CONFIG_64BIT
3104 u64 p_last_update_time_copy;
3105 u64 n_last_update_time_copy;
3106
3107 do {
3108 p_last_update_time_copy = prev->load_last_update_time_copy;
3109 n_last_update_time_copy = next->load_last_update_time_copy;
3110
3111 smp_rmb();
3112
3113 p_last_update_time = prev->avg.last_update_time;
3114 n_last_update_time = next->avg.last_update_time;
3115
3116 } while (p_last_update_time != p_last_update_time_copy ||
3117 n_last_update_time != n_last_update_time_copy);
3118#else
3119 p_last_update_time = prev->avg.last_update_time;
3120 n_last_update_time = next->avg.last_update_time;
3121#endif
3122 __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
3123 &se->avg, 0, 0, NULL);
3124 se->avg.last_update_time = n_last_update_time;
3125 }
3126}
Vincent Guittot96956e22016-11-08 10:53:44 +01003127
3128/* Take into account change of utilization of a child task group */
3129static inline void
3130update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
3131{
3132 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3133 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
3134
3135 /* Nothing to update */
3136 if (!delta)
3137 return;
3138
3139 /* Set new sched_entity's utilization */
3140 se->avg.util_avg = gcfs_rq->avg.util_avg;
3141 se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
3142
3143 /* Update parent cfs_rq utilization */
3144 add_positive(&cfs_rq->avg.util_avg, delta);
3145 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
3146}
3147
3148/* Take into account change of load of a child task group */
3149static inline void
3150update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
3151{
3152 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3153 long delta, load = gcfs_rq->avg.load_avg;
3154
3155 /*
3156 * If the load of group cfs_rq is null, the load of the
3157 * sched_entity will also be null so we can skip the formula
3158 */
3159 if (load) {
3160 long tg_load;
3161
3162 /* Get tg's load and ensure tg_load > 0 */
3163 tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
3164
3165 /* Ensure tg_load >= load and updated with current load*/
3166 tg_load -= gcfs_rq->tg_load_avg_contrib;
3167 tg_load += load;
3168
3169 /*
3170 * We need to compute a correction term in the case that the
3171 * task group is consuming more CPU than a task of equal
3172 * weight. A task with a weight equals to tg->shares will have
3173 * a load less or equal to scale_load_down(tg->shares).
3174 * Similarly, the sched_entities that represent the task group
3175 * at parent level, can't have a load higher than
3176 * scale_load_down(tg->shares). And the Sum of sched_entities'
3177 * load must be <= scale_load_down(tg->shares).
3178 */
3179 if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
3180 /* scale gcfs_rq's load into tg's shares*/
3181 load *= scale_load_down(gcfs_rq->tg->shares);
3182 load /= tg_load;
3183 }
3184 }
3185
3186 delta = load - se->avg.load_avg;
3187
3188 /* Nothing to update */
3189 if (!delta)
3190 return;
3191
3192 /* Set new sched_entity's load */
3193 se->avg.load_avg = load;
3194 se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
3195
3196 /* Update parent cfs_rq load */
3197 add_positive(&cfs_rq->avg.load_avg, delta);
3198 cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
3199
3200 /*
3201 * If the sched_entity is already enqueued, we also have to update the
3202 * runnable load avg.
3203 */
3204 if (se->on_rq) {
3205 /* Update parent cfs_rq runnable_load_avg */
3206 add_positive(&cfs_rq->runnable_load_avg, delta);
3207 cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
3208 }
3209}
3210
3211static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
3212{
3213 cfs_rq->propagate_avg = 1;
3214}
3215
3216static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
3217{
3218 struct cfs_rq *cfs_rq = group_cfs_rq(se);
3219
3220 if (!cfs_rq->propagate_avg)
3221 return 0;
3222
3223 cfs_rq->propagate_avg = 0;
3224 return 1;
3225}
3226
3227/* Update task and its cfs_rq load average */
3228static inline int propagate_entity_load_avg(struct sched_entity *se)
3229{
3230 struct cfs_rq *cfs_rq;
3231
3232 if (entity_is_task(se))
3233 return 0;
3234
3235 if (!test_and_clear_tg_cfs_propagate(se))
3236 return 0;
3237
3238 cfs_rq = cfs_rq_of(se);
3239
3240 set_tg_cfs_propagate(cfs_rq);
3241
3242 update_tg_cfs_util(cfs_rq, se);
3243 update_tg_cfs_load(cfs_rq, se);
3244
3245 return 1;
3246}
3247
Vincent Guittot0b4a2f12017-03-17 14:47:22 +01003248/*
3249 * Check if we need to update the load and the utilization of a blocked
3250 * group_entity:
3251 */
3252static inline bool skip_blocked_update(struct sched_entity *se)
3253{
3254 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3255
3256 /*
3257 * If sched_entity still have not zero load or utilization, we have to
3258 * decay it:
3259 */
3260 if (se->avg.load_avg || se->avg.util_avg)
3261 return false;
3262
3263 /*
3264 * If there is a pending propagation, we have to update the load and
3265 * the utilization of the sched_entity:
3266 */
3267 if (gcfs_rq->propagate_avg)
3268 return false;
3269
3270 /*
3271 * Otherwise, the load and the utilization of the sched_entity is
3272 * already zero and there is no pending propagation, so it will be a
3273 * waste of time to try to decay it:
3274 */
3275 return true;
3276}
3277
Peter Zijlstra6e831252014-02-11 16:11:48 +01003278#else /* CONFIG_FAIR_GROUP_SCHED */
Vincent Guittot96956e22016-11-08 10:53:44 +01003279
Yuyang Du9d89c252015-07-15 08:04:37 +08003280static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
Vincent Guittot96956e22016-11-08 10:53:44 +01003281
3282static inline int propagate_entity_load_avg(struct sched_entity *se)
3283{
3284 return 0;
3285}
3286
3287static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
3288
Peter Zijlstra6e831252014-02-11 16:11:48 +01003289#endif /* CONFIG_FAIR_GROUP_SCHED */
Paul Turnerc566e8e2012-10-04 13:18:30 +02003290
Steve Mucklea2c6c912016-03-24 15:26:07 -07003291static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
Yuyang Du9d89c252015-07-15 08:04:37 +08003292{
Rafael J. Wysocki58919e82016-08-16 22:14:55 +02003293 if (&this_rq()->cfs == cfs_rq) {
Steve Muckle21e96f82016-03-21 17:21:07 -07003294 /*
3295 * There are a few boundary cases this might miss but it should
3296 * get called often enough that that should (hopefully) not be
3297 * a real problem -- added to that it only calls on the local
3298 * CPU, so if we enqueue remotely we'll miss an update, but
3299 * the next tick/schedule should update.
3300 *
3301 * It will not get called when we go idle, because the idle
3302 * thread is a different class (!fair), nor will the utilization
3303 * number include things like RT tasks.
3304 *
3305 * As is, the util number is not freq-invariant (we'd have to
3306 * implement arch_scale_freq_capacity() for that).
3307 *
3308 * See cpu_util().
3309 */
Rafael J. Wysocki12bde332016-08-10 03:11:17 +02003310 cpufreq_update_util(rq_of(cfs_rq), 0);
Steve Muckle21e96f82016-03-21 17:21:07 -07003311 }
Steve Mucklea2c6c912016-03-24 15:26:07 -07003312}
3313
Peter Zijlstra89741892016-06-16 10:50:40 +02003314/*
3315 * Unsigned subtract and clamp on underflow.
3316 *
3317 * Explicitly do a load-store to ensure the intermediate value never hits
3318 * memory. This allows lockless observations without ever seeing the negative
3319 * values.
3320 */
3321#define sub_positive(_ptr, _val) do { \
3322 typeof(_ptr) ptr = (_ptr); \
3323 typeof(*ptr) val = (_val); \
3324 typeof(*ptr) res, var = READ_ONCE(*ptr); \
3325 res = var - val; \
3326 if (res > var) \
3327 res = 0; \
3328 WRITE_ONCE(*ptr, res); \
3329} while (0)
3330
Peter Zijlstra3d30544f2016-06-21 14:27:50 +02003331/**
3332 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
3333 * @now: current time, as per cfs_rq_clock_task()
3334 * @cfs_rq: cfs_rq to update
3335 * @update_freq: should we call cfs_rq_util_change() or will the call do so
3336 *
3337 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
3338 * avg. The immediate corollary is that all (fair) tasks must be attached, see
3339 * post_init_entity_util_avg().
3340 *
3341 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
3342 *
Peter Zijlstra7c3edd22016-07-13 10:56:25 +02003343 * Returns true if the load decayed or we removed load.
3344 *
3345 * Since both these conditions indicate a changed cfs_rq->avg.load we should
3346 * call update_tg_load_avg() when this function returns true.
Peter Zijlstra3d30544f2016-06-21 14:27:50 +02003347 */
Steve Mucklea2c6c912016-03-24 15:26:07 -07003348static inline int
3349update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
3350{
3351 struct sched_avg *sa = &cfs_rq->avg;
3352 int decayed, removed_load = 0, removed_util = 0;
3353
3354 if (atomic_long_read(&cfs_rq->removed_load_avg)) {
3355 s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
Peter Zijlstra89741892016-06-16 10:50:40 +02003356 sub_positive(&sa->load_avg, r);
3357 sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
Steve Mucklea2c6c912016-03-24 15:26:07 -07003358 removed_load = 1;
Vincent Guittot3a34bf52016-11-08 10:53:46 +01003359 set_tg_cfs_propagate(cfs_rq);
Steve Mucklea2c6c912016-03-24 15:26:07 -07003360 }
3361
3362 if (atomic_long_read(&cfs_rq->removed_util_avg)) {
3363 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
Peter Zijlstra89741892016-06-16 10:50:40 +02003364 sub_positive(&sa->util_avg, r);
3365 sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
Steve Mucklea2c6c912016-03-24 15:26:07 -07003366 removed_util = 1;
Vincent Guittot3a34bf52016-11-08 10:53:46 +01003367 set_tg_cfs_propagate(cfs_rq);
Steve Mucklea2c6c912016-03-24 15:26:07 -07003368 }
3369
3370 decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
3371 scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
3372
3373#ifndef CONFIG_64BIT
3374 smp_wmb();
3375 cfs_rq->load_last_update_time_copy = sa->last_update_time;
3376#endif
3377
3378 if (update_freq && (decayed || removed_util))
3379 cfs_rq_util_change(cfs_rq);
Steve Muckle21e96f82016-03-21 17:21:07 -07003380
Brendan Jackmanb2246472017-01-10 11:31:01 +00003381 /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */
3382 if (cfs_rq == &rq_of(cfs_rq)->cfs)
3383 trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
Brendan Jackman0f493a72017-01-09 17:20:11 +00003384
Steve Muckle41e0d372016-03-21 17:21:08 -07003385 return decayed || removed_load;
Yuyang Du9d89c252015-07-15 08:04:37 +08003386}
3387
Vincent Guittot96956e22016-11-08 10:53:44 +01003388/*
3389 * Optional action to be done while updating the load average
3390 */
3391#define UPDATE_TG 0x1
3392#define SKIP_AGE_LOAD 0x2
3393
Yuyang Du9d89c252015-07-15 08:04:37 +08003394/* Update task and its cfs_rq load average */
Vincent Guittot96956e22016-11-08 10:53:44 +01003395static inline void update_load_avg(struct sched_entity *se, int flags)
Paul Turner9d85f212012-10-04 13:18:29 +02003396{
Paul Turner2dac7542012-10-04 13:18:30 +02003397 struct cfs_rq *cfs_rq = cfs_rq_of(se);
Yuyang Du9d89c252015-07-15 08:04:37 +08003398 u64 now = cfs_rq_clock_task(cfs_rq);
Rafael J. Wysocki34e2c552016-02-15 20:20:42 +01003399 struct rq *rq = rq_of(cfs_rq);
3400 int cpu = cpu_of(rq);
Vincent Guittot96956e22016-11-08 10:53:44 +01003401 int decayed;
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05303402 void *ptr = NULL;
Paul Turner2dac7542012-10-04 13:18:30 +02003403
Paul Turnerf1b17282012-10-04 13:18:31 +02003404 /*
Yuyang Du9d89c252015-07-15 08:04:37 +08003405 * Track task load average for carrying it to new CPU after migrated, and
3406 * track group sched_entity load average for task_h_load calc in migration
Paul Turnerf1b17282012-10-04 13:18:31 +02003407 */
Vincent Guittot96956e22016-11-08 10:53:44 +01003408 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
3409 __update_load_avg(now, cpu, &se->avg,
Byungchul Parka05e8c52015-08-20 20:21:56 +09003410 se->on_rq * scale_load_down(se->load.weight),
3411 cfs_rq->curr == se, NULL);
Vincent Guittot96956e22016-11-08 10:53:44 +01003412 }
Paul Turnerf1b17282012-10-04 13:18:31 +02003413
Vincent Guittot96956e22016-11-08 10:53:44 +01003414 decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
3415 decayed |= propagate_entity_load_avg(se);
3416
3417 if (decayed && (flags & UPDATE_TG))
Yuyang Du9d89c252015-07-15 08:04:37 +08003418 update_tg_load_avg(cfs_rq, 0);
Juri Lellia4b0c3a2015-11-09 12:07:27 +00003419
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05303420 if (entity_is_task(se)) {
3421#ifdef CONFIG_SCHED_WALT
3422 ptr = (void *)&(task_of(se)->ravg);
3423#endif
3424 trace_sched_load_avg_task(task_of(se), &se->avg, ptr);
3425 }
Yuyang Du9d89c252015-07-15 08:04:37 +08003426}
Paul Turner2dac7542012-10-04 13:18:30 +02003427
Peter Zijlstra3d30544f2016-06-21 14:27:50 +02003428/**
3429 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
3430 * @cfs_rq: cfs_rq to attach to
3431 * @se: sched_entity to attach
3432 *
3433 * Must call update_cfs_rq_load_avg() before this, since we rely on
3434 * cfs_rq->avg.last_update_time being current.
3435 */
Byungchul Parka05e8c52015-08-20 20:21:56 +09003436static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3437{
3438 se->avg.last_update_time = cfs_rq->avg.last_update_time;
3439 cfs_rq->avg.load_avg += se->avg.load_avg;
3440 cfs_rq->avg.load_sum += se->avg.load_sum;
3441 cfs_rq->avg.util_avg += se->avg.util_avg;
3442 cfs_rq->avg.util_sum += se->avg.util_sum;
Vincent Guittot96956e22016-11-08 10:53:44 +01003443 set_tg_cfs_propagate(cfs_rq);
Steve Mucklea2c6c912016-03-24 15:26:07 -07003444
3445 cfs_rq_util_change(cfs_rq);
Byungchul Parka05e8c52015-08-20 20:21:56 +09003446}
3447
Peter Zijlstra3d30544f2016-06-21 14:27:50 +02003448/**
3449 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
3450 * @cfs_rq: cfs_rq to detach from
3451 * @se: sched_entity to detach
3452 *
3453 * Must call update_cfs_rq_load_avg() before this, since we rely on
3454 * cfs_rq->avg.last_update_time being current.
3455 */
Byungchul Parka05e8c52015-08-20 20:21:56 +09003456static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3457{
Byungchul Parka05e8c52015-08-20 20:21:56 +09003458
Peter Zijlstra89741892016-06-16 10:50:40 +02003459 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
3460 sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
3461 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3462 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
Vincent Guittot96956e22016-11-08 10:53:44 +01003463 set_tg_cfs_propagate(cfs_rq);
Steve Mucklea2c6c912016-03-24 15:26:07 -07003464
3465 cfs_rq_util_change(cfs_rq);
Byungchul Parka05e8c52015-08-20 20:21:56 +09003466}
3467
Yuyang Du9d89c252015-07-15 08:04:37 +08003468/* Add the load generated by se into cfs_rq's load average */
3469static inline void
3470enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3471{
3472 struct sched_avg *sa = &se->avg;
Yuyang Du9d89c252015-07-15 08:04:37 +08003473
Yuyang Du13962232015-07-15 08:04:41 +08003474 cfs_rq->runnable_load_avg += sa->load_avg;
3475 cfs_rq->runnable_load_sum += sa->load_sum;
3476
Vincent Guittot96956e22016-11-08 10:53:44 +01003477 if (!sa->last_update_time) {
Byungchul Parka05e8c52015-08-20 20:21:56 +09003478 attach_entity_load_avg(cfs_rq, se);
Yuyang Du9d89c252015-07-15 08:04:37 +08003479 update_tg_load_avg(cfs_rq, 0);
Vincent Guittot96956e22016-11-08 10:53:44 +01003480 }
Paul Turner9ee474f2012-10-04 13:18:30 +02003481}
3482
Yuyang Du13962232015-07-15 08:04:41 +08003483/* Remove the runnable load generated by se from cfs_rq's runnable load average */
3484static inline void
3485dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3486{
Yuyang Du13962232015-07-15 08:04:41 +08003487 cfs_rq->runnable_load_avg =
3488 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
3489 cfs_rq->runnable_load_sum =
Byungchul Parka05e8c52015-08-20 20:21:56 +09003490 max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
Yuyang Du13962232015-07-15 08:04:41 +08003491}
3492
Yuyang Du0905f042015-12-17 07:34:27 +08003493#ifndef CONFIG_64BIT
3494static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3495{
3496 u64 last_update_time_copy;
3497 u64 last_update_time;
3498
3499 do {
3500 last_update_time_copy = cfs_rq->load_last_update_time_copy;
3501 smp_rmb();
3502 last_update_time = cfs_rq->avg.last_update_time;
3503 } while (last_update_time != last_update_time_copy);
3504
3505 return last_update_time;
3506}
3507#else
3508static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3509{
3510 return cfs_rq->avg.last_update_time;
3511}
3512#endif
3513
Paul Turner9ee474f2012-10-04 13:18:30 +02003514/*
Morten Rasmussen355772432016-10-14 14:41:07 +01003515 * Synchronize entity load avg of dequeued entity without locking
3516 * the previous rq.
3517 */
3518void sync_entity_load_avg(struct sched_entity *se)
3519{
3520 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3521 u64 last_update_time;
3522
3523 last_update_time = cfs_rq_last_update_time(cfs_rq);
3524 __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
3525}
3526
3527/*
Yuyang Du9d89c252015-07-15 08:04:37 +08003528 * Task first catches up with cfs_rq, and then subtract
3529 * itself from the cfs_rq (task must be off the queue now).
Paul Turner9ee474f2012-10-04 13:18:30 +02003530 */
Yuyang Du9d89c252015-07-15 08:04:37 +08003531void remove_entity_load_avg(struct sched_entity *se)
Paul Turner9ee474f2012-10-04 13:18:30 +02003532{
Yuyang Du9d89c252015-07-15 08:04:37 +08003533 struct cfs_rq *cfs_rq = cfs_rq_of(se);
Paul Turner9ee474f2012-10-04 13:18:30 +02003534
Yuyang Du0905f042015-12-17 07:34:27 +08003535 /*
Peter Zijlstra7dc603c2016-06-16 13:29:28 +02003536 * tasks cannot exit without having gone through wake_up_new_task() ->
3537 * post_init_entity_util_avg() which will have added things to the
3538 * cfs_rq, so we can remove unconditionally.
3539 *
3540 * Similarly for groups, they will have passed through
3541 * post_init_entity_util_avg() before unregister_sched_fair_group()
3542 * calls this.
Yuyang Du0905f042015-12-17 07:34:27 +08003543 */
Paul Turner9ee474f2012-10-04 13:18:30 +02003544
Morten Rasmussen355772432016-10-14 14:41:07 +01003545 sync_entity_load_avg(se);
Yuyang Du9d89c252015-07-15 08:04:37 +08003546 atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
3547 atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
Paul Turner2dac7542012-10-04 13:18:30 +02003548}
Vincent Guittot642dbc32013-04-18 18:34:26 +02003549
Yuyang Du7ea241a2015-07-15 08:04:42 +08003550static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
3551{
3552 return cfs_rq->runnable_load_avg;
3553}
3554
3555static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3556{
3557 return cfs_rq->avg.load_avg;
3558}
3559
Peter Zijlstra6e831252014-02-11 16:11:48 +01003560static int idle_balance(struct rq *this_rq);
3561
Peter Zijlstra38033c32014-01-23 20:32:21 +01003562#else /* CONFIG_SMP */
3563
Peter Zijlstra01011472016-06-17 11:20:46 +02003564static inline int
3565update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
3566{
3567 return 0;
3568}
3569
Vincent Guittot96956e22016-11-08 10:53:44 +01003570#define UPDATE_TG 0x0
3571#define SKIP_AGE_LOAD 0x0
3572
3573static inline void update_load_avg(struct sched_entity *se, int not_used1)
Rafael J. Wysocki536bd002016-05-06 14:58:43 +02003574{
Rafael J. Wysocki12bde332016-08-10 03:11:17 +02003575 cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
Rafael J. Wysocki536bd002016-05-06 14:58:43 +02003576}
3577
Yuyang Du9d89c252015-07-15 08:04:37 +08003578static inline void
3579enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
Yuyang Du13962232015-07-15 08:04:41 +08003580static inline void
3581dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
Yuyang Du9d89c252015-07-15 08:04:37 +08003582static inline void remove_entity_load_avg(struct sched_entity *se) {}
Peter Zijlstra6e831252014-02-11 16:11:48 +01003583
Byungchul Parka05e8c52015-08-20 20:21:56 +09003584static inline void
3585attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3586static inline void
3587detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3588
Peter Zijlstra6e831252014-02-11 16:11:48 +01003589static inline int idle_balance(struct rq *rq)
3590{
3591 return 0;
3592}
3593
Peter Zijlstra38033c32014-01-23 20:32:21 +01003594#endif /* CONFIG_SMP */
Paul Turner9d85f212012-10-04 13:18:29 +02003595
Peter Zijlstraddc97292007-10-15 17:00:10 +02003596static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
3597{
3598#ifdef CONFIG_SCHED_DEBUG
3599 s64 d = se->vruntime - cfs_rq->min_vruntime;
3600
3601 if (d < 0)
3602 d = -d;
3603
3604 if (d > 3*sysctl_sched_latency)
Josh Poimboeufae928822016-06-17 12:43:24 -05003605 schedstat_inc(cfs_rq->nr_spread_over);
Peter Zijlstraddc97292007-10-15 17:00:10 +02003606#endif
3607}
3608
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003609static void
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02003610place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
3611{
Peter Zijlstra1af5f732008-10-24 11:06:13 +02003612 u64 vruntime = cfs_rq->min_vruntime;
Peter Zijlstra94dfb5e2007-10-15 17:00:05 +02003613
Peter Zijlstra2cb86002007-11-09 22:39:37 +01003614 /*
3615 * The 'current' period is already promised to the current tasks,
3616 * however the extra weight of the new task will slow them down a
3617 * little, place the new task so that it fits in the slot that
3618 * stays open at the end.
3619 */
Peter Zijlstra94dfb5e2007-10-15 17:00:05 +02003620 if (initial && sched_feat(START_DEBIT))
Peter Zijlstraf9c0b092008-10-17 19:27:04 +02003621 vruntime += sched_vslice(cfs_rq, se);
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02003622
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02003623 /* sleeps up to a single latency don't count. */
Mike Galbraith5ca98802010-03-11 17:17:17 +01003624 if (!initial) {
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02003625 unsigned long thresh = sysctl_sched_latency;
Peter Zijlstraa7be37a2008-06-27 13:41:11 +02003626
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02003627 /*
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02003628 * Halve their sleep time's effect, to allow
3629 * for a gentler effect of sleepers:
3630 */
3631 if (sched_feat(GENTLE_FAIR_SLEEPERS))
3632 thresh >>= 1;
Ingo Molnar51e03042009-09-16 08:54:45 +02003633
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02003634 vruntime -= thresh;
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02003635 }
3636
Mike Galbraithb5d9d732009-09-08 11:12:28 +02003637 /* ensure we never gain time by being placed backwards. */
Viresh Kumar16c8f1c2012-11-08 13:33:46 +05303638 se->vruntime = max_vruntime(se->vruntime, vruntime);
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02003639}
3640
Paul Turnerd3d9dc32011-07-21 09:43:39 -07003641static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
3642
Mel Gormancb251762016-02-05 09:08:36 +00003643static inline void check_schedstat_required(void)
3644{
3645#ifdef CONFIG_SCHEDSTATS
3646 if (schedstat_enabled())
3647 return;
3648
3649 /* Force schedstat enabled if a dependent tracepoint is active */
3650 if (trace_sched_stat_wait_enabled() ||
3651 trace_sched_stat_sleep_enabled() ||
3652 trace_sched_stat_iowait_enabled() ||
3653 trace_sched_stat_blocked_enabled() ||
3654 trace_sched_stat_runtime_enabled()) {
Josh Poimboeufeda8dca2016-06-13 02:32:09 -05003655 printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
Mel Gormancb251762016-02-05 09:08:36 +00003656 "stat_blocked and stat_runtime require the "
3657 "kernel parameter schedstats=enabled or "
3658 "kernel.sched_schedstats=1\n");
3659 }
3660#endif
3661}
3662
Peter Zijlstrab5179ac2016-05-11 16:10:34 +02003663
3664/*
3665 * MIGRATION
3666 *
3667 * dequeue
3668 * update_curr()
3669 * update_min_vruntime()
3670 * vruntime -= min_vruntime
3671 *
3672 * enqueue
3673 * update_curr()
3674 * update_min_vruntime()
3675 * vruntime += min_vruntime
3676 *
3677 * this way the vruntime transition between RQs is done when both
3678 * min_vruntime are up-to-date.
3679 *
3680 * WAKEUP (remote)
3681 *
Peter Zijlstra59efa0b2016-05-10 18:24:37 +02003682 * ->migrate_task_rq_fair() (p->state == TASK_WAKING)
Peter Zijlstrab5179ac2016-05-11 16:10:34 +02003683 * vruntime -= min_vruntime
3684 *
3685 * enqueue
3686 * update_curr()
3687 * update_min_vruntime()
3688 * vruntime += min_vruntime
3689 *
3690 * this way we don't have the most up-to-date min_vruntime on the originating
3691 * CPU and an up-to-date min_vruntime on the destination CPU.
3692 */
3693
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02003694static void
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01003695enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003696{
Peter Zijlstra2f950352016-05-11 19:27:56 +02003697 bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
3698 bool curr = cfs_rq->curr == se;
Peter Zijlstra3a47d512016-03-09 13:04:03 +01003699
Ingo Molnar53d3bc72016-05-11 08:25:53 +02003700 /*
Peter Zijlstra2f950352016-05-11 19:27:56 +02003701 * If we're the current task, we must renormalise before calling
3702 * update_curr().
Ingo Molnar53d3bc72016-05-11 08:25:53 +02003703 */
Peter Zijlstra2f950352016-05-11 19:27:56 +02003704 if (renorm && curr)
3705 se->vruntime += cfs_rq->min_vruntime;
3706
Ingo Molnarb7cc0892007-08-09 11:16:47 +02003707 update_curr(cfs_rq);
Peter Zijlstra2f950352016-05-11 19:27:56 +02003708
3709 /*
3710 * Otherwise, renormalise after, such that we're placed at the current
3711 * moment in time, instead of some random moment in the past. Being
3712 * placed in the past could significantly boost this task to the
3713 * fairness detriment of existing tasks.
3714 */
3715 if (renorm && !curr)
3716 se->vruntime += cfs_rq->min_vruntime;
3717
Vincent Guittot6960f772016-12-21 16:50:26 +01003718 /*
3719 * When enqueuing a sched_entity, we must:
3720 * - Update loads to have both entity and cfs_rq synced with now.
3721 * - Add its load to cfs_rq->runnable_avg
3722 * - For group_entity, update its weight to reflect the new share of
3723 * its group cfs_rq
3724 * - Add its new weight to cfs_rq->load.weight
3725 */
Vincent Guittot96956e22016-11-08 10:53:44 +01003726 update_load_avg(se, UPDATE_TG);
Yuyang Du9d89c252015-07-15 08:04:37 +08003727 enqueue_entity_load_avg(cfs_rq, se);
Vincent Guittot6960f772016-12-21 16:50:26 +01003728 update_cfs_shares(se);
Linus Torvalds17bc14b2012-12-14 07:20:43 -08003729 account_entity_enqueue(cfs_rq, se);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003730
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -05003731 if (flags & ENQUEUE_WAKEUP)
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02003732 place_entity(cfs_rq, se, 0);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003733
Mel Gormancb251762016-02-05 09:08:36 +00003734 check_schedstat_required();
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05003735 update_stats_enqueue(cfs_rq, se, flags);
3736 check_spread(cfs_rq, se);
Peter Zijlstra2f950352016-05-11 19:27:56 +02003737 if (!curr)
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02003738 __enqueue_entity(cfs_rq, se);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08003739 se->on_rq = 1;
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -08003740
Paul Turnerd3d9dc32011-07-21 09:43:39 -07003741 if (cfs_rq->nr_running == 1) {
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -08003742 list_add_leaf_cfs_rq(cfs_rq);
Paul Turnerd3d9dc32011-07-21 09:43:39 -07003743 check_enqueue_throttle(cfs_rq);
3744 }
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003745}
3746
Rik van Riel2c13c9192011-02-01 09:48:37 -05003747static void __clear_buddies_last(struct sched_entity *se)
Peter Zijlstra2002c692008-11-11 11:52:33 +01003748{
Rik van Riel2c13c9192011-02-01 09:48:37 -05003749 for_each_sched_entity(se) {
3750 struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstraf1044792012-02-11 06:05:00 +01003751 if (cfs_rq->last != se)
Rik van Riel2c13c9192011-02-01 09:48:37 -05003752 break;
Peter Zijlstraf1044792012-02-11 06:05:00 +01003753
3754 cfs_rq->last = NULL;
Rik van Riel2c13c9192011-02-01 09:48:37 -05003755 }
3756}
Peter Zijlstra2002c692008-11-11 11:52:33 +01003757
Rik van Riel2c13c9192011-02-01 09:48:37 -05003758static void __clear_buddies_next(struct sched_entity *se)
3759{
3760 for_each_sched_entity(se) {
3761 struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstraf1044792012-02-11 06:05:00 +01003762 if (cfs_rq->next != se)
Rik van Riel2c13c9192011-02-01 09:48:37 -05003763 break;
Peter Zijlstraf1044792012-02-11 06:05:00 +01003764
3765 cfs_rq->next = NULL;
Rik van Riel2c13c9192011-02-01 09:48:37 -05003766 }
Peter Zijlstra2002c692008-11-11 11:52:33 +01003767}
3768
Rik van Rielac53db52011-02-01 09:51:03 -05003769static void __clear_buddies_skip(struct sched_entity *se)
3770{
3771 for_each_sched_entity(se) {
3772 struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstraf1044792012-02-11 06:05:00 +01003773 if (cfs_rq->skip != se)
Rik van Rielac53db52011-02-01 09:51:03 -05003774 break;
Peter Zijlstraf1044792012-02-11 06:05:00 +01003775
3776 cfs_rq->skip = NULL;
Rik van Rielac53db52011-02-01 09:51:03 -05003777 }
3778}
3779
Peter Zijlstraa571bbe2009-01-28 14:51:40 +01003780static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
3781{
Rik van Riel2c13c9192011-02-01 09:48:37 -05003782 if (cfs_rq->last == se)
3783 __clear_buddies_last(se);
3784
3785 if (cfs_rq->next == se)
3786 __clear_buddies_next(se);
Rik van Rielac53db52011-02-01 09:51:03 -05003787
3788 if (cfs_rq->skip == se)
3789 __clear_buddies_skip(se);
Peter Zijlstraa571bbe2009-01-28 14:51:40 +01003790}
3791
Peter Zijlstra6c16a6d2012-03-21 13:07:16 -07003792static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
Paul Turnerd8b49862011-07-21 09:43:41 -07003793
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003794static void
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01003795dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003796{
Dmitry Adamushkoa2a2d682007-10-15 17:00:13 +02003797 /*
3798 * Update run-time statistics of the 'current'.
3799 */
3800 update_curr(cfs_rq);
Vincent Guittot6960f772016-12-21 16:50:26 +01003801
3802 /*
3803 * When dequeuing a sched_entity, we must:
3804 * - Update loads to have both entity and cfs_rq synced with now.
3805 * - Substract its load from the cfs_rq->runnable_avg.
3806 * - Substract its previous weight from cfs_rq->load.weight.
3807 * - For group entity, update its weight to reflect the new share
3808 * of its group cfs_rq.
3809 */
Vincent Guittot96956e22016-11-08 10:53:44 +01003810 update_load_avg(se, UPDATE_TG);
Yuyang Du13962232015-07-15 08:04:41 +08003811 dequeue_entity_load_avg(cfs_rq, se);
Dmitry Adamushkoa2a2d682007-10-15 17:00:13 +02003812
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05003813 update_stats_dequeue(cfs_rq, se, flags);
Peter Zijlstra67e9fb22007-10-15 17:00:10 +02003814
Peter Zijlstra2002c692008-11-11 11:52:33 +01003815 clear_buddies(cfs_rq, se);
Peter Zijlstra47932412008-11-04 21:25:09 +01003816
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02003817 if (se != cfs_rq->curr)
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02003818 __dequeue_entity(cfs_rq, se);
Linus Torvalds17bc14b2012-12-14 07:20:43 -08003819 se->on_rq = 0;
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02003820 account_entity_dequeue(cfs_rq, se);
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01003821
3822 /*
Peter Zijlstrab60205c2016-09-20 21:58:12 +02003823 * Normalize after update_curr(); which will also have moved
3824 * min_vruntime if @se is the one holding it back. But before doing
3825 * update_min_vruntime() again, which will discount @se's position and
3826 * can move min_vruntime forward still more.
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01003827 */
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01003828 if (!(flags & DEQUEUE_SLEEP))
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01003829 se->vruntime -= cfs_rq->min_vruntime;
Peter Zijlstra1e876232011-05-17 16:21:10 -07003830
Paul Turnerd8b49862011-07-21 09:43:41 -07003831 /* return excess runtime on last dequeue */
3832 return_cfs_rq_runtime(cfs_rq);
3833
Vincent Guittot6960f772016-12-21 16:50:26 +01003834 update_cfs_shares(se);
Peter Zijlstrab60205c2016-09-20 21:58:12 +02003835
3836 /*
3837 * Now advance min_vruntime if @se was the entity holding it back,
3838 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
3839 * put back on, and if we advance min_vruntime, we'll be placed back
3840 * further than we started -- ie. we'll be penalized.
3841 */
Song Muchun2530be52018-10-14 19:26:12 +08003842 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
Peter Zijlstrab60205c2016-09-20 21:58:12 +02003843 update_min_vruntime(cfs_rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003844}
3845
3846/*
3847 * Preempt the current task with a newly woken task if needed:
3848 */
Peter Zijlstra7c92e542007-09-05 14:32:49 +02003849static void
Ingo Molnar2e09bf52007-10-15 17:00:05 +02003850check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003851{
Peter Zijlstra11697832007-09-05 14:32:49 +02003852 unsigned long ideal_runtime, delta_exec;
Wang Xingchaof4cfb332011-09-16 13:35:52 -04003853 struct sched_entity *se;
3854 s64 delta;
Peter Zijlstra11697832007-09-05 14:32:49 +02003855
Peter Zijlstra6d0f0ebd2007-10-15 17:00:05 +02003856 ideal_runtime = sched_slice(cfs_rq, curr);
Peter Zijlstra11697832007-09-05 14:32:49 +02003857 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
Mike Galbraitha9f3e2b2009-01-28 14:51:39 +01003858 if (delta_exec > ideal_runtime) {
Kirill Tkhai88751252014-06-29 00:03:57 +04003859 resched_curr(rq_of(cfs_rq));
Mike Galbraitha9f3e2b2009-01-28 14:51:39 +01003860 /*
3861 * The current task ran long enough, ensure it doesn't get
3862 * re-elected due to buddy favours.
3863 */
3864 clear_buddies(cfs_rq, curr);
Mike Galbraithf685cea2009-10-23 23:09:22 +02003865 return;
3866 }
3867
3868 /*
3869 * Ensure that a task that missed wakeup preemption by a
3870 * narrow margin doesn't have to wait for a full slice.
3871 * This also mitigates buddy induced latencies under load.
3872 */
Mike Galbraithf685cea2009-10-23 23:09:22 +02003873 if (delta_exec < sysctl_sched_min_granularity)
3874 return;
3875
Wang Xingchaof4cfb332011-09-16 13:35:52 -04003876 se = __pick_first_entity(cfs_rq);
3877 delta = curr->vruntime - se->vruntime;
Mike Galbraithf685cea2009-10-23 23:09:22 +02003878
Wang Xingchaof4cfb332011-09-16 13:35:52 -04003879 if (delta < 0)
3880 return;
Mike Galbraithd7d82942011-01-05 05:41:17 +01003881
Wang Xingchaof4cfb332011-09-16 13:35:52 -04003882 if (delta > ideal_runtime)
Kirill Tkhai88751252014-06-29 00:03:57 +04003883 resched_curr(rq_of(cfs_rq));
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003884}
3885
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02003886static void
Ingo Molnar8494f412007-08-09 11:16:48 +02003887set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003888{
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02003889 /* 'current' is not kept within the tree. */
3890 if (se->on_rq) {
3891 /*
3892 * Any task has to be enqueued before it get to execute on
3893 * a CPU. So account for the time it spent waiting on the
3894 * runqueue.
3895 */
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05003896 update_stats_wait_end(cfs_rq, se);
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02003897 __dequeue_entity(cfs_rq, se);
Vincent Guittot96956e22016-11-08 10:53:44 +01003898 update_load_avg(se, UPDATE_TG);
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02003899 }
3900
Ingo Molnar79303e92007-08-09 11:16:47 +02003901 update_stats_curr_start(cfs_rq, se);
Ingo Molnar429d43b2007-10-15 17:00:03 +02003902 cfs_rq->curr = se;
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05003903
Ingo Molnareba1ed42007-10-15 17:00:02 +02003904 /*
3905 * Track our maximum slice length, if the CPU's load is at
3906 * least twice that of our own weight (i.e. dont track it
3907 * when there are only lesser-weight tasks around):
3908 */
Mel Gormancb251762016-02-05 09:08:36 +00003909 if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05003910 schedstat_set(se->statistics.slice_max,
3911 max((u64)schedstat_val(se->statistics.slice_max),
3912 se->sum_exec_runtime - se->prev_sum_exec_runtime));
Ingo Molnareba1ed42007-10-15 17:00:02 +02003913 }
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05003914
Peter Zijlstra4a55b452007-09-05 14:32:49 +02003915 se->prev_sum_exec_runtime = se->sum_exec_runtime;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003916}
3917
Peter Zijlstra3f3a4902008-10-24 11:06:16 +02003918static int
3919wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
3920
Rik van Rielac53db52011-02-01 09:51:03 -05003921/*
3922 * Pick the next process, keeping these things in mind, in this order:
3923 * 1) keep things fair between processes/task groups
3924 * 2) pick the "next" process, since someone really wants that to run
3925 * 3) pick the "last" process, for cache locality
3926 * 4) do not run the "skip" process, if something else is available
3927 */
Peter Zijlstra678d5712012-02-11 06:05:00 +01003928static struct sched_entity *
3929pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
Peter Zijlstraaa2ac252008-03-14 21:12:12 +01003930{
Peter Zijlstra678d5712012-02-11 06:05:00 +01003931 struct sched_entity *left = __pick_first_entity(cfs_rq);
3932 struct sched_entity *se;
Maria Yu72d01d12019-07-19 19:25:09 +08003933 bool strict_skip = false;
Peter Zijlstra678d5712012-02-11 06:05:00 +01003934
3935 /*
3936 * If curr is set we have to see if its left of the leftmost entity
3937 * still in the tree, provided there was anything in the tree at all.
3938 */
3939 if (!left || (curr && entity_before(curr, left)))
3940 left = curr;
3941
3942 se = left; /* ideally we run the leftmost entity */
Peter Zijlstraf4b67552008-11-04 21:25:07 +01003943
Rik van Rielac53db52011-02-01 09:51:03 -05003944 /*
3945 * Avoid running the skip buddy, if running something else can
3946 * be done without getting too unfair.
3947 */
3948 if (cfs_rq->skip == se) {
Peter Zijlstra678d5712012-02-11 06:05:00 +01003949 struct sched_entity *second;
3950
3951 if (se == curr) {
3952 second = __pick_first_entity(cfs_rq);
Maria Yu72d01d12019-07-19 19:25:09 +08003953 if (sched_feat(STRICT_SKIP_BUDDY))
3954 strict_skip = true;
Peter Zijlstra678d5712012-02-11 06:05:00 +01003955 } else {
3956 second = __pick_next_entity(se);
3957 if (!second || (curr && entity_before(curr, second)))
3958 second = curr;
3959 }
3960
Maria Yu72d01d12019-07-19 19:25:09 +08003961 if (second && (strict_skip ||
Maria Yu6bab9be2019-05-29 12:30:47 +08003962 wakeup_preempt_entity(second, left) < 1))
Rik van Rielac53db52011-02-01 09:51:03 -05003963 se = second;
3964 }
Peter Zijlstraaa2ac252008-03-14 21:12:12 +01003965
Mike Galbraithf685cea2009-10-23 23:09:22 +02003966 /*
3967 * Prefer last buddy, try to return the CPU to a preempted task.
3968 */
3969 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
3970 se = cfs_rq->last;
3971
Rik van Rielac53db52011-02-01 09:51:03 -05003972 /*
3973 * Someone really wants this to run. If it's not unfair, run it.
3974 */
3975 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
3976 se = cfs_rq->next;
3977
Mike Galbraithf685cea2009-10-23 23:09:22 +02003978 clear_buddies(cfs_rq, se);
Peter Zijlstra47932412008-11-04 21:25:09 +01003979
3980 return se;
Peter Zijlstraaa2ac252008-03-14 21:12:12 +01003981}
3982
Peter Zijlstra678d5712012-02-11 06:05:00 +01003983static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
Paul Turnerd3d9dc32011-07-21 09:43:39 -07003984
Ingo Molnarab6cde22007-08-09 11:16:48 +02003985static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003986{
3987 /*
3988 * If still on the runqueue then deactivate_task()
3989 * was not called and update_curr() has to be done:
3990 */
3991 if (prev->on_rq)
Ingo Molnarb7cc0892007-08-09 11:16:47 +02003992 update_curr(cfs_rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003993
Paul Turnerd3d9dc32011-07-21 09:43:39 -07003994 /* throttle cfs_rqs exceeding runtime */
3995 check_cfs_rq_runtime(cfs_rq);
3996
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05003997 check_spread(cfs_rq, prev);
Mel Gormancb251762016-02-05 09:08:36 +00003998
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02003999 if (prev->on_rq) {
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05004000 update_stats_wait_start(cfs_rq, prev);
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02004001 /* Put 'current' back into the tree. */
4002 __enqueue_entity(cfs_rq, prev);
Paul Turner9d85f212012-10-04 13:18:29 +02004003 /* in !on_rq case, update occurred at dequeue */
Yuyang Du9d89c252015-07-15 08:04:37 +08004004 update_load_avg(prev, 0);
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02004005 }
Ingo Molnar429d43b2007-10-15 17:00:03 +02004006 cfs_rq->curr = NULL;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004007}
4008
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01004009static void
4010entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004011{
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004012 /*
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02004013 * Update run-time statistics of the 'current'.
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004014 */
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02004015 update_curr(cfs_rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004016
Paul Turner43365bd2010-12-15 19:10:17 -08004017 /*
Paul Turner9d85f212012-10-04 13:18:29 +02004018 * Ensure that runnable average is periodically updated.
4019 */
Vincent Guittot96956e22016-11-08 10:53:44 +01004020 update_load_avg(curr, UPDATE_TG);
Vincent Guittot6960f772016-12-21 16:50:26 +01004021 update_cfs_shares(curr);
Paul Turner9d85f212012-10-04 13:18:29 +02004022
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01004023#ifdef CONFIG_SCHED_HRTICK
4024 /*
4025 * queued ticks are scheduled to match the slice, so don't bother
4026 * validating it and just reschedule.
4027 */
Harvey Harrison983ed7a2008-04-24 18:17:55 -07004028 if (queued) {
Kirill Tkhai88751252014-06-29 00:03:57 +04004029 resched_curr(rq_of(cfs_rq));
Harvey Harrison983ed7a2008-04-24 18:17:55 -07004030 return;
4031 }
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01004032 /*
4033 * don't let the period tick interfere with the hrtick preemption
4034 */
4035 if (!sched_feat(DOUBLE_TICK) &&
4036 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
4037 return;
4038#endif
4039
Yong Zhang2c2efae2011-07-29 16:20:33 +08004040 if (cfs_rq->nr_running > 1)
Ingo Molnar2e09bf52007-10-15 17:00:05 +02004041 check_preempt_tick(cfs_rq, curr);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004042}
4043
Paul Turnerab84d312011-07-21 09:43:28 -07004044
4045/**************************************************
4046 * CFS bandwidth control machinery
4047 */
4048
4049#ifdef CONFIG_CFS_BANDWIDTH
Peter Zijlstra029632f2011-10-25 10:00:11 +02004050
4051#ifdef HAVE_JUMP_LABEL
Ingo Molnarc5905af2012-02-24 08:31:31 +01004052static struct static_key __cfs_bandwidth_used;
Peter Zijlstra029632f2011-10-25 10:00:11 +02004053
4054static inline bool cfs_bandwidth_used(void)
4055{
Ingo Molnarc5905af2012-02-24 08:31:31 +01004056 return static_key_false(&__cfs_bandwidth_used);
Peter Zijlstra029632f2011-10-25 10:00:11 +02004057}
4058
Ben Segall1ee14e62013-10-16 11:16:12 -07004059void cfs_bandwidth_usage_inc(void)
Peter Zijlstra029632f2011-10-25 10:00:11 +02004060{
Ben Segall1ee14e62013-10-16 11:16:12 -07004061 static_key_slow_inc(&__cfs_bandwidth_used);
4062}
4063
4064void cfs_bandwidth_usage_dec(void)
4065{
4066 static_key_slow_dec(&__cfs_bandwidth_used);
Peter Zijlstra029632f2011-10-25 10:00:11 +02004067}
4068#else /* HAVE_JUMP_LABEL */
4069static bool cfs_bandwidth_used(void)
4070{
4071 return true;
4072}
4073
Ben Segall1ee14e62013-10-16 11:16:12 -07004074void cfs_bandwidth_usage_inc(void) {}
4075void cfs_bandwidth_usage_dec(void) {}
Peter Zijlstra029632f2011-10-25 10:00:11 +02004076#endif /* HAVE_JUMP_LABEL */
4077
Paul Turnerab84d312011-07-21 09:43:28 -07004078/*
4079 * default period for cfs group bandwidth.
4080 * default: 0.1s, units: nanoseconds
4081 */
4082static inline u64 default_cfs_period(void)
4083{
4084 return 100000000ULL;
4085}
Paul Turnerec12cb72011-07-21 09:43:30 -07004086
4087static inline u64 sched_cfs_bandwidth_slice(void)
4088{
4089 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
4090}
4091
Paul Turnera9cf55b2011-07-21 09:43:32 -07004092/*
4093 * Replenish runtime according to assigned quota and update expiration time.
4094 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
4095 * additional synchronization around rq->lock.
4096 *
4097 * requires cfs_b->lock
4098 */
Peter Zijlstra029632f2011-10-25 10:00:11 +02004099void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
Paul Turnera9cf55b2011-07-21 09:43:32 -07004100{
4101 u64 now;
4102
4103 if (cfs_b->quota == RUNTIME_INF)
4104 return;
4105
4106 now = sched_clock_cpu(smp_processor_id());
4107 cfs_b->runtime = cfs_b->quota;
4108 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
4109}
4110
Peter Zijlstra029632f2011-10-25 10:00:11 +02004111static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4112{
4113 return &tg->cfs_bandwidth;
4114}
4115
Paul Turnerf1b17282012-10-04 13:18:31 +02004116/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
4117static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4118{
4119 if (unlikely(cfs_rq->throttle_count))
Xunlei Pang1a99ae32016-05-10 21:03:18 +08004120 return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
Paul Turnerf1b17282012-10-04 13:18:31 +02004121
Frederic Weisbecker78becc22013-04-12 01:51:02 +02004122 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
Paul Turnerf1b17282012-10-04 13:18:31 +02004123}
4124
Paul Turner85dac902011-07-21 09:43:33 -07004125/* returns 0 on failure to allocate runtime */
4126static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
Paul Turnerec12cb72011-07-21 09:43:30 -07004127{
4128 struct task_group *tg = cfs_rq->tg;
4129 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
Paul Turnera9cf55b2011-07-21 09:43:32 -07004130 u64 amount = 0, min_amount, expires;
Paul Turnerec12cb72011-07-21 09:43:30 -07004131
4132 /* note: this is a positive sum as runtime_remaining <= 0 */
4133 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
4134
4135 raw_spin_lock(&cfs_b->lock);
4136 if (cfs_b->quota == RUNTIME_INF)
4137 amount = min_amount;
Paul Turner58088ad2011-07-21 09:43:31 -07004138 else {
Peter Zijlstra77a4d1a2015-04-15 11:41:57 +02004139 start_cfs_bandwidth(cfs_b);
Paul Turner58088ad2011-07-21 09:43:31 -07004140
4141 if (cfs_b->runtime > 0) {
4142 amount = min(cfs_b->runtime, min_amount);
4143 cfs_b->runtime -= amount;
4144 cfs_b->idle = 0;
4145 }
Paul Turnerec12cb72011-07-21 09:43:30 -07004146 }
Paul Turnera9cf55b2011-07-21 09:43:32 -07004147 expires = cfs_b->runtime_expires;
Paul Turnerec12cb72011-07-21 09:43:30 -07004148 raw_spin_unlock(&cfs_b->lock);
4149
4150 cfs_rq->runtime_remaining += amount;
Paul Turnera9cf55b2011-07-21 09:43:32 -07004151 /*
4152 * we may have advanced our local expiration to account for allowed
4153 * spread between our sched_clock and the one on which runtime was
4154 * issued.
4155 */
4156 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
4157 cfs_rq->runtime_expires = expires;
Paul Turner85dac902011-07-21 09:43:33 -07004158
4159 return cfs_rq->runtime_remaining > 0;
Paul Turnera9cf55b2011-07-21 09:43:32 -07004160}
4161
4162/*
4163 * Note: This depends on the synchronization provided by sched_clock and the
4164 * fact that rq->clock snapshots this value.
4165 */
4166static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4167{
4168 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
Paul Turnera9cf55b2011-07-21 09:43:32 -07004169
4170 /* if the deadline is ahead of our clock, nothing to do */
Frederic Weisbecker78becc22013-04-12 01:51:02 +02004171 if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
Paul Turnera9cf55b2011-07-21 09:43:32 -07004172 return;
4173
4174 if (cfs_rq->runtime_remaining < 0)
4175 return;
4176
4177 /*
4178 * If the local deadline has passed we have to consider the
4179 * possibility that our sched_clock is 'fast' and the global deadline
4180 * has not truly expired.
4181 *
4182 * Fortunately we can check determine whether this the case by checking
Ben Segall51f21762014-05-19 15:49:45 -07004183 * whether the global deadline has advanced. It is valid to compare
4184 * cfs_b->runtime_expires without any locks since we only care about
4185 * exact equality, so a partial write will still work.
Paul Turnera9cf55b2011-07-21 09:43:32 -07004186 */
4187
Ben Segall51f21762014-05-19 15:49:45 -07004188 if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
Paul Turnera9cf55b2011-07-21 09:43:32 -07004189 /* extend local deadline, drift is bounded above by 2 ticks */
4190 cfs_rq->runtime_expires += TICK_NSEC;
4191 } else {
4192 /* global deadline is ahead, expiration has passed */
4193 cfs_rq->runtime_remaining = 0;
4194 }
Paul Turnerec12cb72011-07-21 09:43:30 -07004195}
4196
Peter Zijlstra9dbdb152013-11-18 18:27:06 +01004197static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
Paul Turnerec12cb72011-07-21 09:43:30 -07004198{
Paul Turnera9cf55b2011-07-21 09:43:32 -07004199 /* dock delta_exec before expiring quota (as it could span periods) */
Paul Turnerec12cb72011-07-21 09:43:30 -07004200 cfs_rq->runtime_remaining -= delta_exec;
Paul Turnera9cf55b2011-07-21 09:43:32 -07004201 expire_cfs_rq_runtime(cfs_rq);
4202
4203 if (likely(cfs_rq->runtime_remaining > 0))
Paul Turnerec12cb72011-07-21 09:43:30 -07004204 return;
4205
Liangyan0a3989a2019-08-26 20:16:33 +08004206 if (cfs_rq->throttled)
4207 return;
Paul Turner85dac902011-07-21 09:43:33 -07004208 /*
4209 * if we're unable to extend our runtime we resched so that the active
4210 * hierarchy can be throttled
4211 */
4212 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
Kirill Tkhai88751252014-06-29 00:03:57 +04004213 resched_curr(rq_of(cfs_rq));
Paul Turnerec12cb72011-07-21 09:43:30 -07004214}
4215
Peter Zijlstra6c16a6d2012-03-21 13:07:16 -07004216static __always_inline
Peter Zijlstra9dbdb152013-11-18 18:27:06 +01004217void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
Paul Turnerec12cb72011-07-21 09:43:30 -07004218{
Paul Turner56f570e2011-11-07 20:26:33 -08004219 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
Paul Turnerec12cb72011-07-21 09:43:30 -07004220 return;
4221
4222 __account_cfs_rq_runtime(cfs_rq, delta_exec);
4223}
4224
Paul Turner85dac902011-07-21 09:43:33 -07004225static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4226{
Paul Turner56f570e2011-11-07 20:26:33 -08004227 return cfs_bandwidth_used() && cfs_rq->throttled;
Paul Turner85dac902011-07-21 09:43:33 -07004228}
4229
Paul Turner64660c82011-07-21 09:43:36 -07004230/* check whether cfs_rq, or any parent, is throttled */
4231static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4232{
Paul Turner56f570e2011-11-07 20:26:33 -08004233 return cfs_bandwidth_used() && cfs_rq->throttle_count;
Paul Turner64660c82011-07-21 09:43:36 -07004234}
4235
4236/*
4237 * Ensure that neither of the group entities corresponding to src_cpu or
4238 * dest_cpu are members of a throttled hierarchy when performing group
4239 * load-balance operations.
4240 */
4241static inline int throttled_lb_pair(struct task_group *tg,
4242 int src_cpu, int dest_cpu)
4243{
4244 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
4245
4246 src_cfs_rq = tg->cfs_rq[src_cpu];
4247 dest_cfs_rq = tg->cfs_rq[dest_cpu];
4248
4249 return throttled_hierarchy(src_cfs_rq) ||
4250 throttled_hierarchy(dest_cfs_rq);
4251}
4252
4253/* updated child weight may affect parent so we have to do this bottom up */
4254static int tg_unthrottle_up(struct task_group *tg, void *data)
4255{
4256 struct rq *rq = data;
4257 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4258
4259 cfs_rq->throttle_count--;
Paul Turner64660c82011-07-21 09:43:36 -07004260 if (!cfs_rq->throttle_count) {
Paul Turnerf1b17282012-10-04 13:18:31 +02004261 /* adjust cfs_rq_clock_task() */
Frederic Weisbecker78becc22013-04-12 01:51:02 +02004262 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
Paul Turnerf1b17282012-10-04 13:18:31 +02004263 cfs_rq->throttled_clock_task;
Paul Turner64660c82011-07-21 09:43:36 -07004264 }
Paul Turner64660c82011-07-21 09:43:36 -07004265
4266 return 0;
4267}
4268
4269static int tg_throttle_down(struct task_group *tg, void *data)
4270{
4271 struct rq *rq = data;
4272 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4273
Paul Turner82958362012-10-04 13:18:31 +02004274 /* group is entering throttled state, stop time */
4275 if (!cfs_rq->throttle_count)
Frederic Weisbecker78becc22013-04-12 01:51:02 +02004276 cfs_rq->throttled_clock_task = rq_clock_task(rq);
Paul Turner64660c82011-07-21 09:43:36 -07004277 cfs_rq->throttle_count++;
4278
4279 return 0;
4280}
4281
Pavankumar Kondeti351d3fa2019-09-04 11:28:58 +05304282#ifdef CONFIG_SCHED_WALT
4283static inline void walt_propagate_cumulative_runnable_avg(u64 *accumulated,
4284 u64 value, bool add)
4285{
4286 if (add)
4287 *accumulated += value;
4288 else
4289 *accumulated -= value;
4290}
4291#else
4292/*
4293 * Provide a nop definition since cumulative_runnable_avg is not
4294 * available in rq or cfs_rq when WALT is not enabled.
4295 */
4296#define walt_propagate_cumulative_runnable_avg(...)
4297#endif
4298
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004299static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner85dac902011-07-21 09:43:33 -07004300{
4301 struct rq *rq = rq_of(cfs_rq);
4302 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4303 struct sched_entity *se;
4304 long task_delta, dequeue = 1;
Peter Zijlstra77a4d1a2015-04-15 11:41:57 +02004305 bool empty;
Paul Turner85dac902011-07-21 09:43:33 -07004306
4307 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
4308
Paul Turnerf1b17282012-10-04 13:18:31 +02004309 /* freeze hierarchy runnable averages while throttled */
Paul Turner64660c82011-07-21 09:43:36 -07004310 rcu_read_lock();
4311 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
4312 rcu_read_unlock();
Paul Turner85dac902011-07-21 09:43:33 -07004313
4314 task_delta = cfs_rq->h_nr_running;
4315 for_each_sched_entity(se) {
4316 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4317 /* throttled entity or throttle-on-deactivate */
4318 if (!se->on_rq)
4319 break;
4320
4321 if (dequeue)
4322 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
4323 qcfs_rq->h_nr_running -= task_delta;
Pavankumar Kondeti39c695e2017-07-20 16:05:51 +05304324 walt_dec_throttled_cfs_rq_stats(&qcfs_rq->walt_stats, cfs_rq);
Pavankumar Kondeti351d3fa2019-09-04 11:28:58 +05304325 walt_propagate_cumulative_runnable_avg(
4326 &qcfs_rq->cumulative_runnable_avg,
4327 cfs_rq->cumulative_runnable_avg, false);
Paul Turner85dac902011-07-21 09:43:33 -07004328
4329 if (qcfs_rq->load.weight)
4330 dequeue = 0;
4331 }
4332
Pavankumar Kondeti39c695e2017-07-20 16:05:51 +05304333 if (!se) {
Kirill Tkhai72465442014-05-09 03:00:14 +04004334 sub_nr_running(rq, task_delta);
Pavankumar Kondeti39c695e2017-07-20 16:05:51 +05304335 walt_dec_throttled_cfs_rq_stats(&rq->walt_stats, cfs_rq);
Pavankumar Kondeti351d3fa2019-09-04 11:28:58 +05304336 walt_propagate_cumulative_runnable_avg(
4337 &rq->cumulative_runnable_avg,
4338 cfs_rq->cumulative_runnable_avg, false);
Pavankumar Kondeti39c695e2017-07-20 16:05:51 +05304339 }
Paul Turner85dac902011-07-21 09:43:33 -07004340
4341 cfs_rq->throttled = 1;
Frederic Weisbecker78becc22013-04-12 01:51:02 +02004342 cfs_rq->throttled_clock = rq_clock(rq);
Paul Turner85dac902011-07-21 09:43:33 -07004343 raw_spin_lock(&cfs_b->lock);
Cong Wangd49db342015-06-24 12:41:47 -07004344 empty = list_empty(&cfs_b->throttled_cfs_rq);
Peter Zijlstra77a4d1a2015-04-15 11:41:57 +02004345
Ben Segallc06f04c2014-06-20 15:21:20 -07004346 /*
4347 * Add to the _head_ of the list, so that an already-started
Phil Auldbc1fccc2018-10-08 10:36:40 -04004348 * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
4349 * not running add to the tail so that later runqueues don't get starved.
Ben Segallc06f04c2014-06-20 15:21:20 -07004350 */
Phil Auldbc1fccc2018-10-08 10:36:40 -04004351 if (cfs_b->distribute_running)
4352 list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4353 else
4354 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
Peter Zijlstra77a4d1a2015-04-15 11:41:57 +02004355
4356 /*
4357 * If we're the first throttled task, make sure the bandwidth
4358 * timer is running.
4359 */
4360 if (empty)
4361 start_cfs_bandwidth(cfs_b);
4362
Paul Turner85dac902011-07-21 09:43:33 -07004363 raw_spin_unlock(&cfs_b->lock);
4364}
4365
Peter Zijlstra029632f2011-10-25 10:00:11 +02004366void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner671fd9d2011-07-21 09:43:34 -07004367{
4368 struct rq *rq = rq_of(cfs_rq);
4369 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4370 struct sched_entity *se;
4371 int enqueue = 1;
4372 long task_delta;
Pavankumar Kondeti5d73fcc2017-01-11 15:45:54 +05304373 struct cfs_rq *tcfs_rq __maybe_unused = cfs_rq;
Paul Turner671fd9d2011-07-21 09:43:34 -07004374
Michael Wang22b958d2013-06-04 14:23:39 +08004375 se = cfs_rq->tg->se[cpu_of(rq)];
Paul Turner671fd9d2011-07-21 09:43:34 -07004376
4377 cfs_rq->throttled = 0;
Frederic Weisbecker1a55af22013-04-12 01:51:01 +02004378
4379 update_rq_clock(rq);
4380
Paul Turner671fd9d2011-07-21 09:43:34 -07004381 raw_spin_lock(&cfs_b->lock);
Frederic Weisbecker78becc22013-04-12 01:51:02 +02004382 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
Paul Turner671fd9d2011-07-21 09:43:34 -07004383 list_del_rcu(&cfs_rq->throttled_list);
4384 raw_spin_unlock(&cfs_b->lock);
4385
Paul Turner64660c82011-07-21 09:43:36 -07004386 /* update hierarchical throttle state */
4387 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
4388
Paul Turner671fd9d2011-07-21 09:43:34 -07004389 if (!cfs_rq->load.weight)
4390 return;
4391
4392 task_delta = cfs_rq->h_nr_running;
4393 for_each_sched_entity(se) {
4394 if (se->on_rq)
4395 enqueue = 0;
4396
4397 cfs_rq = cfs_rq_of(se);
4398 if (enqueue)
4399 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
4400 cfs_rq->h_nr_running += task_delta;
Pavankumar Kondeti39c695e2017-07-20 16:05:51 +05304401 walt_inc_throttled_cfs_rq_stats(&cfs_rq->walt_stats, tcfs_rq);
Pavankumar Kondeti351d3fa2019-09-04 11:28:58 +05304402 walt_propagate_cumulative_runnable_avg(
4403 &cfs_rq->cumulative_runnable_avg,
4404 tcfs_rq->cumulative_runnable_avg, true);
Paul Turner671fd9d2011-07-21 09:43:34 -07004405
4406 if (cfs_rq_throttled(cfs_rq))
4407 break;
4408 }
4409
Pavankumar Kondeti39c695e2017-07-20 16:05:51 +05304410 if (!se) {
Kirill Tkhai72465442014-05-09 03:00:14 +04004411 add_nr_running(rq, task_delta);
Pavankumar Kondeti39c695e2017-07-20 16:05:51 +05304412 walt_inc_throttled_cfs_rq_stats(&rq->walt_stats, tcfs_rq);
Pavankumar Kondeti351d3fa2019-09-04 11:28:58 +05304413 walt_propagate_cumulative_runnable_avg(
4414 &rq->cumulative_runnable_avg,
4415 tcfs_rq->cumulative_runnable_avg, true);
Pavankumar Kondeti39c695e2017-07-20 16:05:51 +05304416 }
Paul Turner671fd9d2011-07-21 09:43:34 -07004417
4418 /* determine whether we need to wake up potentially idle cpu */
4419 if (rq->curr == rq->idle && rq->cfs.nr_running)
Kirill Tkhai88751252014-06-29 00:03:57 +04004420 resched_curr(rq);
Paul Turner671fd9d2011-07-21 09:43:34 -07004421}
4422
4423static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
4424 u64 remaining, u64 expires)
4425{
4426 struct cfs_rq *cfs_rq;
Ben Segallc06f04c2014-06-20 15:21:20 -07004427 u64 runtime;
4428 u64 starting_runtime = remaining;
Paul Turner671fd9d2011-07-21 09:43:34 -07004429
4430 rcu_read_lock();
4431 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
4432 throttled_list) {
4433 struct rq *rq = rq_of(cfs_rq);
4434
4435 raw_spin_lock(&rq->lock);
4436 if (!cfs_rq_throttled(cfs_rq))
4437 goto next;
4438
Liangyan0a3989a2019-08-26 20:16:33 +08004439 /* By the above check, this should never be true */
4440 SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
4441
Paul Turner671fd9d2011-07-21 09:43:34 -07004442 runtime = -cfs_rq->runtime_remaining + 1;
4443 if (runtime > remaining)
4444 runtime = remaining;
4445 remaining -= runtime;
4446
4447 cfs_rq->runtime_remaining += runtime;
4448 cfs_rq->runtime_expires = expires;
4449
4450 /* we check whether we're throttled above */
4451 if (cfs_rq->runtime_remaining > 0)
4452 unthrottle_cfs_rq(cfs_rq);
4453
4454next:
4455 raw_spin_unlock(&rq->lock);
4456
4457 if (!remaining)
4458 break;
4459 }
4460 rcu_read_unlock();
4461
Ben Segallc06f04c2014-06-20 15:21:20 -07004462 return starting_runtime - remaining;
Paul Turner671fd9d2011-07-21 09:43:34 -07004463}
4464
Paul Turner58088ad2011-07-21 09:43:31 -07004465/*
4466 * Responsible for refilling a task_group's bandwidth and unthrottling its
4467 * cfs_rqs as appropriate. If there has been no activity within the last
4468 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
4469 * used to track this state.
4470 */
4471static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
4472{
Paul Turner671fd9d2011-07-21 09:43:34 -07004473 u64 runtime, runtime_expires;
Ben Segall51f21762014-05-19 15:49:45 -07004474 int throttled;
Paul Turner58088ad2011-07-21 09:43:31 -07004475
Paul Turner58088ad2011-07-21 09:43:31 -07004476 /* no need to continue the timer with no bandwidth constraint */
4477 if (cfs_b->quota == RUNTIME_INF)
Ben Segall51f21762014-05-19 15:49:45 -07004478 goto out_deactivate;
Paul Turner58088ad2011-07-21 09:43:31 -07004479
Paul Turner671fd9d2011-07-21 09:43:34 -07004480 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
Nikhil Raoe8da1b12011-07-21 09:43:40 -07004481 cfs_b->nr_periods += overrun;
Paul Turner671fd9d2011-07-21 09:43:34 -07004482
Ben Segall51f21762014-05-19 15:49:45 -07004483 /*
4484 * idle depends on !throttled (for the case of a large deficit), and if
4485 * we're going inactive then everything else can be deferred
4486 */
4487 if (cfs_b->idle && !throttled)
4488 goto out_deactivate;
Paul Turnera9cf55b2011-07-21 09:43:32 -07004489
4490 __refill_cfs_bandwidth_runtime(cfs_b);
4491
Paul Turner671fd9d2011-07-21 09:43:34 -07004492 if (!throttled) {
4493 /* mark as potentially idle for the upcoming period */
4494 cfs_b->idle = 1;
Ben Segall51f21762014-05-19 15:49:45 -07004495 return 0;
Paul Turner671fd9d2011-07-21 09:43:34 -07004496 }
Paul Turner58088ad2011-07-21 09:43:31 -07004497
Nikhil Raoe8da1b12011-07-21 09:43:40 -07004498 /* account preceding periods in which throttling occurred */
4499 cfs_b->nr_throttled += overrun;
4500
Paul Turner671fd9d2011-07-21 09:43:34 -07004501 runtime_expires = cfs_b->runtime_expires;
Paul Turner671fd9d2011-07-21 09:43:34 -07004502
4503 /*
Ben Segallc06f04c2014-06-20 15:21:20 -07004504 * This check is repeated as we are holding onto the new bandwidth while
4505 * we unthrottle. This can potentially race with an unthrottled group
4506 * trying to acquire new bandwidth from the global pool. This can result
4507 * in us over-using our runtime if it is all used during this loop, but
4508 * only by limited amounts in that extreme case.
Paul Turner671fd9d2011-07-21 09:43:34 -07004509 */
Phil Auldbc1fccc2018-10-08 10:36:40 -04004510 while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
Ben Segallc06f04c2014-06-20 15:21:20 -07004511 runtime = cfs_b->runtime;
Phil Auldbc1fccc2018-10-08 10:36:40 -04004512 cfs_b->distribute_running = 1;
Paul Turner671fd9d2011-07-21 09:43:34 -07004513 raw_spin_unlock(&cfs_b->lock);
4514 /* we can't nest cfs_b->lock while distributing bandwidth */
4515 runtime = distribute_cfs_runtime(cfs_b, runtime,
4516 runtime_expires);
4517 raw_spin_lock(&cfs_b->lock);
4518
Phil Auldbc1fccc2018-10-08 10:36:40 -04004519 cfs_b->distribute_running = 0;
Paul Turner671fd9d2011-07-21 09:43:34 -07004520 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
Ben Segallc06f04c2014-06-20 15:21:20 -07004521
4522 cfs_b->runtime -= min(runtime, cfs_b->runtime);
Paul Turner671fd9d2011-07-21 09:43:34 -07004523 }
4524
Paul Turner671fd9d2011-07-21 09:43:34 -07004525 /*
4526 * While we are ensured activity in the period following an
4527 * unthrottle, this also covers the case in which the new bandwidth is
4528 * insufficient to cover the existing bandwidth deficit. (Forcing the
4529 * timer to remain active while there are any throttled entities.)
4530 */
4531 cfs_b->idle = 0;
Paul Turner58088ad2011-07-21 09:43:31 -07004532
Ben Segall51f21762014-05-19 15:49:45 -07004533 return 0;
4534
4535out_deactivate:
Ben Segall51f21762014-05-19 15:49:45 -07004536 return 1;
Paul Turner58088ad2011-07-21 09:43:31 -07004537}
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004538
Paul Turnerd8b49862011-07-21 09:43:41 -07004539/* a cfs_rq won't donate quota below this amount */
4540static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
4541/* minimum remaining period time to redistribute slack quota */
4542static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
4543/* how long we wait to gather additional slack before distributing */
4544static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
4545
Ben Segalldb06e782013-10-16 11:16:17 -07004546/*
4547 * Are we near the end of the current quota period?
4548 *
4549 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
Thomas Gleixner4961b6e2015-04-14 21:09:05 +00004550 * hrtimer base being cleared by hrtimer_start. In the case of
Ben Segalldb06e782013-10-16 11:16:17 -07004551 * migrate_hrtimers, base is never cleared, so we are fine.
4552 */
Paul Turnerd8b49862011-07-21 09:43:41 -07004553static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
4554{
4555 struct hrtimer *refresh_timer = &cfs_b->period_timer;
4556 u64 remaining;
4557
4558 /* if the call-back is running a quota refresh is already occurring */
4559 if (hrtimer_callback_running(refresh_timer))
4560 return 1;
4561
4562 /* is a quota refresh about to occur? */
4563 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
4564 if (remaining < min_expire)
4565 return 1;
4566
4567 return 0;
4568}
4569
4570static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
4571{
4572 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
4573
4574 /* if there's a quota refresh soon don't bother with slack */
4575 if (runtime_refresh_within(cfs_b, min_left))
4576 return;
4577
Peter Zijlstra4cfafd32015-05-14 12:23:11 +02004578 hrtimer_start(&cfs_b->slack_timer,
4579 ns_to_ktime(cfs_bandwidth_slack_period),
4580 HRTIMER_MODE_REL);
Paul Turnerd8b49862011-07-21 09:43:41 -07004581}
4582
4583/* we know any runtime found here is valid as update_curr() precedes return */
4584static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4585{
4586 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4587 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
4588
4589 if (slack_runtime <= 0)
4590 return;
4591
4592 raw_spin_lock(&cfs_b->lock);
4593 if (cfs_b->quota != RUNTIME_INF &&
4594 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
4595 cfs_b->runtime += slack_runtime;
4596
4597 /* we are under rq->lock, defer unthrottling using a timer */
4598 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
4599 !list_empty(&cfs_b->throttled_cfs_rq))
4600 start_cfs_slack_bandwidth(cfs_b);
4601 }
4602 raw_spin_unlock(&cfs_b->lock);
4603
4604 /* even if it's not valid for return we don't want to try again */
4605 cfs_rq->runtime_remaining -= slack_runtime;
4606}
4607
4608static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4609{
Paul Turner56f570e2011-11-07 20:26:33 -08004610 if (!cfs_bandwidth_used())
4611 return;
4612
Paul Turnerfccfdc62011-11-07 20:26:34 -08004613 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
Paul Turnerd8b49862011-07-21 09:43:41 -07004614 return;
4615
4616 __return_cfs_rq_runtime(cfs_rq);
4617}
4618
4619/*
4620 * This is done with a timer (instead of inline with bandwidth return) since
4621 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
4622 */
4623static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
4624{
4625 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
4626 u64 expires;
4627
4628 /* confirm we're still not at a refresh boundary */
Paul Turnerd8b49862011-07-21 09:43:41 -07004629 raw_spin_lock(&cfs_b->lock);
Phil Auldbc1fccc2018-10-08 10:36:40 -04004630 if (cfs_b->distribute_running) {
4631 raw_spin_unlock(&cfs_b->lock);
4632 return;
4633 }
4634
Ben Segalldb06e782013-10-16 11:16:17 -07004635 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
4636 raw_spin_unlock(&cfs_b->lock);
4637 return;
4638 }
4639
Ben Segallc06f04c2014-06-20 15:21:20 -07004640 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
Paul Turnerd8b49862011-07-21 09:43:41 -07004641 runtime = cfs_b->runtime;
Ben Segallc06f04c2014-06-20 15:21:20 -07004642
Paul Turnerd8b49862011-07-21 09:43:41 -07004643 expires = cfs_b->runtime_expires;
Phil Auldbc1fccc2018-10-08 10:36:40 -04004644 if (runtime)
4645 cfs_b->distribute_running = 1;
4646
Paul Turnerd8b49862011-07-21 09:43:41 -07004647 raw_spin_unlock(&cfs_b->lock);
4648
4649 if (!runtime)
4650 return;
4651
4652 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
4653
4654 raw_spin_lock(&cfs_b->lock);
4655 if (expires == cfs_b->runtime_expires)
Ben Segallc06f04c2014-06-20 15:21:20 -07004656 cfs_b->runtime -= min(runtime, cfs_b->runtime);
Phil Auldbc1fccc2018-10-08 10:36:40 -04004657 cfs_b->distribute_running = 0;
Paul Turnerd8b49862011-07-21 09:43:41 -07004658 raw_spin_unlock(&cfs_b->lock);
4659}
4660
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004661/*
4662 * When a group wakes up we want to make sure that its quota is not already
4663 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
4664 * runtime as update_curr() throttling can not not trigger until it's on-rq.
4665 */
4666static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
4667{
Paul Turner56f570e2011-11-07 20:26:33 -08004668 if (!cfs_bandwidth_used())
4669 return;
4670
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004671 /* an active group must be handled by the update_curr()->put() path */
4672 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
4673 return;
4674
4675 /* ensure the group is not already throttled */
4676 if (cfs_rq_throttled(cfs_rq))
4677 return;
4678
4679 /* update runtime allocation */
4680 account_cfs_rq_runtime(cfs_rq, 0);
4681 if (cfs_rq->runtime_remaining <= 0)
4682 throttle_cfs_rq(cfs_rq);
4683}
4684
Peter Zijlstra55e16d32016-06-22 15:14:26 +02004685static void sync_throttle(struct task_group *tg, int cpu)
4686{
4687 struct cfs_rq *pcfs_rq, *cfs_rq;
4688
4689 if (!cfs_bandwidth_used())
4690 return;
4691
4692 if (!tg->parent)
4693 return;
4694
4695 cfs_rq = tg->cfs_rq[cpu];
4696 pcfs_rq = tg->parent->cfs_rq[cpu];
4697
4698 cfs_rq->throttle_count = pcfs_rq->throttle_count;
Xunlei Pangb8922122016-07-09 15:54:22 +08004699 cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
Peter Zijlstra55e16d32016-06-22 15:14:26 +02004700}
4701
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004702/* conditionally throttle active cfs_rq's from put_prev_entity() */
Peter Zijlstra678d5712012-02-11 06:05:00 +01004703static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004704{
Paul Turner56f570e2011-11-07 20:26:33 -08004705 if (!cfs_bandwidth_used())
Peter Zijlstra678d5712012-02-11 06:05:00 +01004706 return false;
Paul Turner56f570e2011-11-07 20:26:33 -08004707
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004708 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
Peter Zijlstra678d5712012-02-11 06:05:00 +01004709 return false;
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004710
4711 /*
4712 * it's possible for a throttled entity to be forced into a running
4713 * state (e.g. set_curr_task), in this case we're finished.
4714 */
4715 if (cfs_rq_throttled(cfs_rq))
Peter Zijlstra678d5712012-02-11 06:05:00 +01004716 return true;
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004717
4718 throttle_cfs_rq(cfs_rq);
Peter Zijlstra678d5712012-02-11 06:05:00 +01004719 return true;
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004720}
Peter Zijlstra029632f2011-10-25 10:00:11 +02004721
Peter Zijlstra029632f2011-10-25 10:00:11 +02004722static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
4723{
4724 struct cfs_bandwidth *cfs_b =
4725 container_of(timer, struct cfs_bandwidth, slack_timer);
Peter Zijlstra77a4d1a2015-04-15 11:41:57 +02004726
Peter Zijlstra029632f2011-10-25 10:00:11 +02004727 do_sched_cfs_slack_timer(cfs_b);
4728
4729 return HRTIMER_NORESTART;
4730}
4731
Phil Auld33f2a3e2019-04-23 19:51:06 -04004732extern const u64 max_cfs_quota_period;
4733
Peter Zijlstra029632f2011-10-25 10:00:11 +02004734static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4735{
4736 struct cfs_bandwidth *cfs_b =
4737 container_of(timer, struct cfs_bandwidth, period_timer);
Peter Zijlstra029632f2011-10-25 10:00:11 +02004738 int overrun;
4739 int idle = 0;
Phil Auld33f2a3e2019-04-23 19:51:06 -04004740 int count = 0;
Peter Zijlstra029632f2011-10-25 10:00:11 +02004741
Ben Segall51f21762014-05-19 15:49:45 -07004742 raw_spin_lock(&cfs_b->lock);
Peter Zijlstra029632f2011-10-25 10:00:11 +02004743 for (;;) {
Peter Zijlstra77a4d1a2015-04-15 11:41:57 +02004744 overrun = hrtimer_forward_now(timer, cfs_b->period);
Peter Zijlstra029632f2011-10-25 10:00:11 +02004745 if (!overrun)
4746 break;
4747
Phil Auld33f2a3e2019-04-23 19:51:06 -04004748 if (++count > 3) {
4749 u64 new, old = ktime_to_ns(cfs_b->period);
4750
Xuewei Zhangbdb6fa82019-10-03 17:12:43 -07004751 /*
4752 * Grow period by a factor of 2 to avoid losing precision.
4753 * Precision loss in the quota/period ratio can cause __cfs_schedulable
4754 * to fail.
4755 */
4756 new = old * 2;
4757 if (new < max_cfs_quota_period) {
4758 cfs_b->period = ns_to_ktime(new);
4759 cfs_b->quota *= 2;
Phil Auld33f2a3e2019-04-23 19:51:06 -04004760
Xuewei Zhangbdb6fa82019-10-03 17:12:43 -07004761 pr_warn_ratelimited(
4762 "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
4763 smp_processor_id(),
4764 div_u64(new, NSEC_PER_USEC),
4765 div_u64(cfs_b->quota, NSEC_PER_USEC));
4766 } else {
4767 pr_warn_ratelimited(
4768 "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
4769 smp_processor_id(),
4770 div_u64(old, NSEC_PER_USEC),
4771 div_u64(cfs_b->quota, NSEC_PER_USEC));
4772 }
Phil Auld33f2a3e2019-04-23 19:51:06 -04004773
4774 /* reset count so we don't come right back in here */
4775 count = 0;
4776 }
4777
Peter Zijlstra029632f2011-10-25 10:00:11 +02004778 idle = do_sched_cfs_period_timer(cfs_b, overrun);
4779 }
Peter Zijlstra4cfafd32015-05-14 12:23:11 +02004780 if (idle)
4781 cfs_b->period_active = 0;
Ben Segall51f21762014-05-19 15:49:45 -07004782 raw_spin_unlock(&cfs_b->lock);
Peter Zijlstra029632f2011-10-25 10:00:11 +02004783
4784 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
4785}
4786
4787void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4788{
4789 raw_spin_lock_init(&cfs_b->lock);
4790 cfs_b->runtime = 0;
4791 cfs_b->quota = RUNTIME_INF;
4792 cfs_b->period = ns_to_ktime(default_cfs_period());
4793
4794 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
Peter Zijlstra4cfafd32015-05-14 12:23:11 +02004795 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
Peter Zijlstra029632f2011-10-25 10:00:11 +02004796 cfs_b->period_timer.function = sched_cfs_period_timer;
4797 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4798 cfs_b->slack_timer.function = sched_cfs_slack_timer;
Phil Auldbc1fccc2018-10-08 10:36:40 -04004799 cfs_b->distribute_running = 0;
Peter Zijlstra029632f2011-10-25 10:00:11 +02004800}
4801
4802static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4803{
4804 cfs_rq->runtime_enabled = 0;
4805 INIT_LIST_HEAD(&cfs_rq->throttled_list);
Pavankumar Kondeti39c695e2017-07-20 16:05:51 +05304806 walt_init_cfs_rq_stats(cfs_rq);
Peter Zijlstra029632f2011-10-25 10:00:11 +02004807}
4808
Peter Zijlstra77a4d1a2015-04-15 11:41:57 +02004809void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
Peter Zijlstra029632f2011-10-25 10:00:11 +02004810{
Peter Zijlstra4cfafd32015-05-14 12:23:11 +02004811 lockdep_assert_held(&cfs_b->lock);
Peter Zijlstra029632f2011-10-25 10:00:11 +02004812
Peter Zijlstra4cfafd32015-05-14 12:23:11 +02004813 if (!cfs_b->period_active) {
4814 cfs_b->period_active = 1;
4815 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
4816 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
4817 }
Peter Zijlstra029632f2011-10-25 10:00:11 +02004818}
4819
4820static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4821{
Tetsuo Handa7f1a1692014-12-25 15:51:21 +09004822 /* init_cfs_bandwidth() was not called */
4823 if (!cfs_b->throttled_cfs_rq.next)
4824 return;
4825
Peter Zijlstra029632f2011-10-25 10:00:11 +02004826 hrtimer_cancel(&cfs_b->period_timer);
4827 hrtimer_cancel(&cfs_b->slack_timer);
4828}
4829
Kirill Tkhai0e59bda2014-06-25 12:19:42 +04004830static void __maybe_unused update_runtime_enabled(struct rq *rq)
4831{
4832 struct cfs_rq *cfs_rq;
4833
4834 for_each_leaf_cfs_rq(rq, cfs_rq) {
4835 struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
4836
4837 raw_spin_lock(&cfs_b->lock);
4838 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
4839 raw_spin_unlock(&cfs_b->lock);
4840 }
4841}
4842
Arnd Bergmann38dc3342013-01-25 14:14:22 +00004843static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
Peter Zijlstra029632f2011-10-25 10:00:11 +02004844{
4845 struct cfs_rq *cfs_rq;
4846
4847 for_each_leaf_cfs_rq(rq, cfs_rq) {
Peter Zijlstra029632f2011-10-25 10:00:11 +02004848 if (!cfs_rq->runtime_enabled)
4849 continue;
4850
4851 /*
4852 * clock_task is not advancing so we just need to make sure
4853 * there's some valid quota amount
4854 */
Ben Segall51f21762014-05-19 15:49:45 -07004855 cfs_rq->runtime_remaining = 1;
Kirill Tkhai0e59bda2014-06-25 12:19:42 +04004856 /*
4857 * Offline rq is schedulable till cpu is completely disabled
4858 * in take_cpu_down(), so we prevent new cfs throttling here.
4859 */
4860 cfs_rq->runtime_enabled = 0;
4861
Peter Zijlstra029632f2011-10-25 10:00:11 +02004862 if (cfs_rq_throttled(cfs_rq))
4863 unthrottle_cfs_rq(cfs_rq);
4864 }
4865}
4866
Pavankumar Kondeti351d3fa2019-09-04 11:28:58 +05304867#ifdef CONFIG_SCHED_WALT
4868static void walt_fixup_cumulative_runnable_avg_fair(struct rq *rq,
4869 struct task_struct *p,
4870 u64 new_task_load)
4871{
4872 struct cfs_rq *cfs_rq;
4873 struct sched_entity *se = &p->se;
4874 s64 task_load_delta = (s64)new_task_load - p->ravg.demand;
4875
4876 for_each_sched_entity(se) {
4877 cfs_rq = cfs_rq_of(se);
4878
4879 cfs_rq->cumulative_runnable_avg += task_load_delta;
4880 if (cfs_rq_throttled(cfs_rq))
4881 break;
4882 }
4883
4884 /* Fix up rq only if we didn't find any throttled cfs_rq */
4885 if (!se)
4886 walt_fixup_cumulative_runnable_avg(rq, p, new_task_load);
4887}
4888
4889#endif /* CONFIG_SCHED_WALT */
4890
Peter Zijlstra029632f2011-10-25 10:00:11 +02004891#else /* CONFIG_CFS_BANDWIDTH */
Paul Turnerf1b17282012-10-04 13:18:31 +02004892static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4893{
Frederic Weisbecker78becc22013-04-12 01:51:02 +02004894 return rq_clock_task(rq_of(cfs_rq));
Paul Turnerf1b17282012-10-04 13:18:31 +02004895}
4896
Peter Zijlstra9dbdb152013-11-18 18:27:06 +01004897static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
Peter Zijlstra678d5712012-02-11 06:05:00 +01004898static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004899static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
Peter Zijlstra55e16d32016-06-22 15:14:26 +02004900static inline void sync_throttle(struct task_group *tg, int cpu) {}
Peter Zijlstra6c16a6d2012-03-21 13:07:16 -07004901static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Paul Turner85dac902011-07-21 09:43:33 -07004902
4903static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4904{
4905 return 0;
4906}
Paul Turner64660c82011-07-21 09:43:36 -07004907
4908static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4909{
4910 return 0;
4911}
4912
4913static inline int throttled_lb_pair(struct task_group *tg,
4914 int src_cpu, int dest_cpu)
4915{
4916 return 0;
4917}
Peter Zijlstra029632f2011-10-25 10:00:11 +02004918
4919void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4920
4921#ifdef CONFIG_FAIR_GROUP_SCHED
4922static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Paul Turnerab84d312011-07-21 09:43:28 -07004923#endif
4924
Peter Zijlstra029632f2011-10-25 10:00:11 +02004925static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4926{
4927 return NULL;
4928}
4929static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
Kirill Tkhai0e59bda2014-06-25 12:19:42 +04004930static inline void update_runtime_enabled(struct rq *rq) {}
Peter Boonstoppela4c96ae2012-08-09 15:34:47 -07004931static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
Peter Zijlstra029632f2011-10-25 10:00:11 +02004932
Pavankumar Kondeti351d3fa2019-09-04 11:28:58 +05304933#define walt_fixup_cumulative_runnable_avg_fair \
4934 walt_fixup_cumulative_runnable_avg
4935
Peter Zijlstra029632f2011-10-25 10:00:11 +02004936#endif /* CONFIG_CFS_BANDWIDTH */
4937
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004938/**************************************************
4939 * CFS operations on tasks:
4940 */
4941
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01004942#ifdef CONFIG_SCHED_HRTICK
4943static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
4944{
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01004945 struct sched_entity *se = &p->se;
4946 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4947
Peter Zijlstra9148a3a2016-09-20 22:34:51 +02004948 SCHED_WARN_ON(task_rq(p) != rq);
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01004949
Srivatsa Vaddagiri8bf46a32016-09-16 18:28:51 -07004950 if (rq->cfs.h_nr_running > 1) {
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01004951 u64 slice = sched_slice(cfs_rq, se);
4952 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
4953 s64 delta = slice - ran;
4954
4955 if (delta < 0) {
4956 if (rq->curr == p)
Kirill Tkhai88751252014-06-29 00:03:57 +04004957 resched_curr(rq);
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01004958 return;
4959 }
Peter Zijlstra31656512008-07-18 18:01:23 +02004960 hrtick_start(rq, delta);
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01004961 }
4962}
Peter Zijlstraa4c2f002008-10-17 19:27:03 +02004963
4964/*
4965 * called from enqueue/dequeue and updates the hrtick when the
4966 * current task is from our class and nr_running is low enough
4967 * to matter.
4968 */
4969static void hrtick_update(struct rq *rq)
4970{
4971 struct task_struct *curr = rq->curr;
4972
Mike Galbraithb39e66e2011-11-22 15:20:07 +01004973 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
Peter Zijlstraa4c2f002008-10-17 19:27:03 +02004974 return;
4975
4976 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
4977 hrtick_start_fair(rq, curr);
4978}
Dhaval Giani55e12e52008-06-24 23:39:43 +05304979#else /* !CONFIG_SCHED_HRTICK */
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01004980static inline void
4981hrtick_start_fair(struct rq *rq, struct task_struct *p)
4982{
4983}
Peter Zijlstraa4c2f002008-10-17 19:27:03 +02004984
4985static inline void hrtick_update(struct rq *rq)
4986{
4987}
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01004988#endif
4989
Patrick Bellasi2178e842016-07-22 11:35:59 +01004990#ifdef CONFIG_SMP
Juri Lelli4585a262015-08-19 19:47:12 +01004991static unsigned long capacity_orig_of(int cpu);
Juri Lelli43aac892015-06-26 12:14:23 +01004992static unsigned long cpu_util(int cpu);
Patrick Bellasi2178e842016-07-22 11:35:59 +01004993#endif
Morten Rasmussena562dfc2015-05-09 16:49:57 +01004994
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004995/*
4996 * The enqueue_task method is called before nr_running is
4997 * increased. Here we update the fair scheduling stats and
4998 * then put the task into the rbtree:
4999 */
Thomas Gleixnerea87bb72010-01-20 20:58:57 +00005000static void
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01005001enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005002{
5003 struct cfs_rq *cfs_rq;
Peter Zijlstra62fb1852008-02-25 17:34:02 +01005004 struct sched_entity *se = &p->se;
Patrick Bellasi2178e842016-07-22 11:35:59 +01005005#ifdef CONFIG_SMP
Juri Lelli43aac892015-06-26 12:14:23 +01005006 int task_new = flags & ENQUEUE_WAKEUP_NEW;
Patrick Bellasi2178e842016-07-22 11:35:59 +01005007#endif
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005008
Pavankumar Kondeti39c695e2017-07-20 16:05:51 +05305009#ifdef CONFIG_SCHED_WALT
5010 p->misfit = !task_fits_max(p, rq->cpu);
5011#endif
Rafael J. Wysocki8c34ab12016-09-09 23:59:33 +02005012 /*
5013 * If in_iowait is set, the code below may not trigger any cpufreq
5014 * utilization updates, so do it here explicitly with the IOWAIT flag
5015 * passed.
5016 */
5017 if (p->in_iowait)
5018 cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
5019
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005020 for_each_sched_entity(se) {
Peter Zijlstra62fb1852008-02-25 17:34:02 +01005021 if (se->on_rq)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005022 break;
5023 cfs_rq = cfs_rq_of(se);
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01005024 enqueue_entity(cfs_rq, se, flags);
Paul Turner85dac902011-07-21 09:43:33 -07005025
5026 /*
5027 * end evaluation on encountering a throttled cfs_rq
5028 *
5029 * note: in the case of encountering a throttled cfs_rq we will
5030 * post the final h_nr_running increment below.
Peter Zijlstrae210bff2016-06-16 18:51:48 +02005031 */
Paul Turner85dac902011-07-21 09:43:33 -07005032 if (cfs_rq_throttled(cfs_rq))
5033 break;
Paul Turner953bfcd2011-07-21 09:43:27 -07005034 cfs_rq->h_nr_running++;
Pavankumar Kondeti39c695e2017-07-20 16:05:51 +05305035 walt_inc_cfs_rq_stats(cfs_rq, p);
Paul Turner85dac902011-07-21 09:43:33 -07005036
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01005037 flags = ENQUEUE_WAKEUP;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005038 }
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005039
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005040 for_each_sched_entity(se) {
Lin Ming0f317142011-07-22 09:14:31 +08005041 cfs_rq = cfs_rq_of(se);
Paul Turner953bfcd2011-07-21 09:43:27 -07005042 cfs_rq->h_nr_running++;
Pavankumar Kondeti39c695e2017-07-20 16:05:51 +05305043 walt_inc_cfs_rq_stats(cfs_rq, p);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005044
Paul Turner85dac902011-07-21 09:43:33 -07005045 if (cfs_rq_throttled(cfs_rq))
5046 break;
5047
Vincent Guittot96956e22016-11-08 10:53:44 +01005048 update_load_avg(se, UPDATE_TG);
Vincent Guittot6960f772016-12-21 16:50:26 +01005049 update_cfs_shares(se);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005050 }
5051
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07005052 if (!se) {
Kirill Tkhai72465442014-05-09 03:00:14 +04005053 add_nr_running(rq, 1);
Pavankumar Kondetid3370502017-07-20 11:47:13 +05305054 inc_rq_walt_stats(rq, p);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07005055 }
Yuyang Ducd126af2015-07-15 08:04:36 +08005056
Patrick Bellasi2178e842016-07-22 11:35:59 +01005057#ifdef CONFIG_SMP
5058
Patrick Bellasie72491c2016-08-24 11:27:27 +01005059 /*
5060 * Update SchedTune accounting.
5061 *
5062 * We do it before updating the CPU capacity to ensure the
5063 * boost value of the current task is accounted for in the
5064 * selection of the OPP.
5065 *
5066 * We do it also in the case where we enqueue a throttled task;
5067 * we could argue that a throttled task should not boost a CPU,
5068 * however:
5069 * a) properly implementing CPU boosting considering throttled
5070 * tasks will increase a lot the complexity of the solution
5071 * b) it's not easy to quantify the benefits introduced by
5072 * such a more complex solution.
5073 * Thus, for the time being we go for the simple solution and boost
5074 * also for throttled RQs.
5075 */
5076 schedtune_enqueue_task(p, cpu_of(rq));
5077
Patrick Bellasi2178e842016-07-22 11:35:59 +01005078 if (!se) {
Morten Rasmussena562dfc2015-05-09 16:49:57 +01005079 if (!task_new && !rq->rd->overutilized &&
Patrick Bellasi8e45d942016-02-10 09:24:36 +00005080 cpu_overutilized(rq->cpu)) {
Morten Rasmussena562dfc2015-05-09 16:49:57 +01005081 rq->rd->overutilized = true;
Patrick Bellasi8e45d942016-02-10 09:24:36 +00005082 trace_sched_overutilized(true);
5083 }
Morten Rasmussena562dfc2015-05-09 16:49:57 +01005084 }
Patrick Bellasid2489002016-07-28 18:44:40 +01005085
Patrick Bellasi2178e842016-07-22 11:35:59 +01005086#endif /* CONFIG_SMP */
Peter Zijlstraa4c2f002008-10-17 19:27:03 +02005087 hrtick_update(rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005088}
5089
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07005090static void set_next_buddy(struct sched_entity *se);
5091
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005092/*
5093 * The dequeue_task method is called before nr_running is
5094 * decreased. We remove the task from the rbtree and
5095 * update the fair scheduling stats:
5096 */
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01005097static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005098{
5099 struct cfs_rq *cfs_rq;
Peter Zijlstra62fb1852008-02-25 17:34:02 +01005100 struct sched_entity *se = &p->se;
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07005101 int task_sleep = flags & DEQUEUE_SLEEP;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005102
5103 for_each_sched_entity(se) {
5104 cfs_rq = cfs_rq_of(se);
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01005105 dequeue_entity(cfs_rq, se, flags);
Paul Turner85dac902011-07-21 09:43:33 -07005106
5107 /*
5108 * end evaluation on encountering a throttled cfs_rq
5109 *
5110 * note: in the case of encountering a throttled cfs_rq we will
5111 * post the final h_nr_running decrement below.
5112 */
5113 if (cfs_rq_throttled(cfs_rq))
5114 break;
Paul Turner953bfcd2011-07-21 09:43:27 -07005115 cfs_rq->h_nr_running--;
Pavankumar Kondeti39c695e2017-07-20 16:05:51 +05305116 walt_dec_cfs_rq_stats(cfs_rq, p);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005117
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005118 /* Don't dequeue parent if it has other entities besides us */
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07005119 if (cfs_rq->load.weight) {
Konstantin Khlebnikov754bd592016-06-16 15:57:15 +03005120 /* Avoid re-evaluating load for this entity: */
5121 se = parent_entity(se);
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07005122 /*
5123 * Bias pick_next to pick a task from this cfs_rq, as
5124 * p is sleeping when it is within its sched_slice.
5125 */
Konstantin Khlebnikov754bd592016-06-16 15:57:15 +03005126 if (task_sleep && se && !throttled_hierarchy(cfs_rq))
5127 set_next_buddy(se);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005128 break;
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07005129 }
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01005130 flags |= DEQUEUE_SLEEP;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005131 }
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005132
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005133 for_each_sched_entity(se) {
Lin Ming0f317142011-07-22 09:14:31 +08005134 cfs_rq = cfs_rq_of(se);
Paul Turner953bfcd2011-07-21 09:43:27 -07005135 cfs_rq->h_nr_running--;
Pavankumar Kondeti39c695e2017-07-20 16:05:51 +05305136 walt_dec_cfs_rq_stats(cfs_rq, p);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005137
Paul Turner85dac902011-07-21 09:43:33 -07005138 if (cfs_rq_throttled(cfs_rq))
5139 break;
5140
Vincent Guittot96956e22016-11-08 10:53:44 +01005141 update_load_avg(se, UPDATE_TG);
Vincent Guittot6960f772016-12-21 16:50:26 +01005142 update_cfs_shares(se);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005143 }
5144
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07005145 if (!se) {
Kirill Tkhai72465442014-05-09 03:00:14 +04005146 sub_nr_running(rq, 1);
Pavankumar Kondetid3370502017-07-20 11:47:13 +05305147 dec_rq_walt_stats(rq, p);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07005148 }
Yuyang Ducd126af2015-07-15 08:04:36 +08005149
Patrick Bellasi2178e842016-07-22 11:35:59 +01005150#ifdef CONFIG_SMP
5151
Patrick Bellasie72491c2016-08-24 11:27:27 +01005152 /*
5153 * Update SchedTune accounting
5154 *
5155 * We do it before updating the CPU capacity to ensure the
5156 * boost value of the current task is accounted for in the
5157 * selection of the OPP.
5158 */
5159 schedtune_dequeue_task(p, cpu_of(rq));
5160
Patrick Bellasi2178e842016-07-22 11:35:59 +01005161#endif /* CONFIG_SMP */
5162
Peter Zijlstraa4c2f002008-10-17 19:27:03 +02005163 hrtick_update(rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005164}
5165
Gregory Haskinse7693a32008-01-25 21:08:09 +01005166#ifdef CONFIG_SMP
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02005167
5168/* Working cpumask for: load_balance, load_balance_newidle. */
5169DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
5170DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
5171
Joonwoo Parka5e601e2017-09-20 16:13:03 -07005172#ifdef CONFIG_SCHED_CORE_ROTATE
5173static int rotate_cpu_start;
5174static DEFINE_SPINLOCK(rotate_lock);
5175static unsigned long avoid_prev_cpu_last;
5176
5177static struct find_first_cpu_bit_env first_cpu_bit_env = {
5178 .avoid_prev_cpu_last = &avoid_prev_cpu_last,
5179 .rotate_cpu_start = &rotate_cpu_start,
5180 .interval = HZ,
5181 .rotate_lock = &rotate_lock,
5182};
5183#endif
5184
Frederic Weisbecker9fd81dd2016-04-19 17:36:51 +02005185#ifdef CONFIG_NO_HZ_COMMON
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005186/*
5187 * per rq 'load' arrray crap; XXX kill this.
5188 */
5189
5190/*
Peter Zijlstrad937cdc2015-10-19 13:49:30 +02005191 * The exact cpuload calculated at every tick would be:
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005192 *
Peter Zijlstrad937cdc2015-10-19 13:49:30 +02005193 * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
5194 *
5195 * If a cpu misses updates for n ticks (as it was idle) and update gets
5196 * called on the n+1-th tick when cpu may be busy, then we have:
5197 *
5198 * load_n = (1 - 1/2^i)^n * load_0
5199 * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005200 *
5201 * decay_load_missed() below does efficient calculation of
Peter Zijlstrad937cdc2015-10-19 13:49:30 +02005202 *
5203 * load' = (1 - 1/2^i)^n * load
5204 *
5205 * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
5206 * This allows us to precompute the above in said factors, thereby allowing the
5207 * reduction of an arbitrary n in O(log_2 n) steps. (See also
5208 * fixed_power_int())
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005209 *
5210 * The calculation is approximated on a 128 point scale.
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005211 */
5212#define DEGRADE_SHIFT 7
Peter Zijlstrad937cdc2015-10-19 13:49:30 +02005213
5214static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
5215static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
5216 { 0, 0, 0, 0, 0, 0, 0, 0 },
5217 { 64, 32, 8, 0, 0, 0, 0, 0 },
5218 { 96, 72, 40, 12, 1, 0, 0, 0 },
5219 { 112, 98, 75, 43, 15, 1, 0, 0 },
5220 { 120, 112, 98, 76, 45, 16, 2, 0 }
5221};
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005222
5223/*
5224 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
5225 * would be when CPU is idle and so we just decay the old load without
5226 * adding any new load.
5227 */
5228static unsigned long
5229decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
5230{
5231 int j = 0;
5232
5233 if (!missed_updates)
5234 return load;
5235
5236 if (missed_updates >= degrade_zero_ticks[idx])
5237 return 0;
5238
5239 if (idx == 1)
5240 return load >> missed_updates;
5241
5242 while (missed_updates) {
5243 if (missed_updates % 2)
5244 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
5245
5246 missed_updates >>= 1;
5247 j++;
5248 }
5249 return load;
5250}
Frederic Weisbecker9fd81dd2016-04-19 17:36:51 +02005251#endif /* CONFIG_NO_HZ_COMMON */
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005252
Byungchul Park59543272015-10-14 18:47:35 +09005253/**
Frederic Weisbeckercee1afc2016-04-13 15:56:50 +02005254 * __cpu_load_update - update the rq->cpu_load[] statistics
Byungchul Park59543272015-10-14 18:47:35 +09005255 * @this_rq: The rq to update statistics for
5256 * @this_load: The current load
5257 * @pending_updates: The number of missed updates
Byungchul Park59543272015-10-14 18:47:35 +09005258 *
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005259 * Update rq->cpu_load[] statistics. This function is usually called every
Byungchul Park59543272015-10-14 18:47:35 +09005260 * scheduler tick (TICK_NSEC).
5261 *
5262 * This function computes a decaying average:
5263 *
5264 * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
5265 *
5266 * Because of NOHZ it might not get called on every tick which gives need for
5267 * the @pending_updates argument.
5268 *
5269 * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
5270 * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
5271 * = A * (A * load[i]_n-2 + B) + B
5272 * = A * (A * (A * load[i]_n-3 + B) + B) + B
5273 * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
5274 * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
5275 * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
5276 * = (1 - 1/2^i)^n * (load[i]_0 - load) + load
5277 *
5278 * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
5279 * any change in load would have resulted in the tick being turned back on.
5280 *
5281 * For regular NOHZ, this reduces to:
5282 *
5283 * load[i]_n = (1 - 1/2^i)^n * load[i]_0
5284 *
5285 * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005286 * term.
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005287 */
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005288static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
5289 unsigned long pending_updates)
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005290{
Frederic Weisbecker9fd81dd2016-04-19 17:36:51 +02005291 unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005292 int i, scale;
5293
5294 this_rq->nr_load_updates++;
5295
5296 /* Update our load: */
5297 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
5298 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
5299 unsigned long old_load, new_load;
5300
5301 /* scale is effectively 1 << i now, and >> i divides by scale */
5302
Byungchul Park7400d3b2016-01-15 16:07:49 +09005303 old_load = this_rq->cpu_load[i];
Frederic Weisbecker9fd81dd2016-04-19 17:36:51 +02005304#ifdef CONFIG_NO_HZ_COMMON
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005305 old_load = decay_load_missed(old_load, pending_updates - 1, i);
Byungchul Park7400d3b2016-01-15 16:07:49 +09005306 if (tickless_load) {
5307 old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
5308 /*
5309 * old_load can never be a negative value because a
5310 * decayed tickless_load cannot be greater than the
5311 * original tickless_load.
5312 */
5313 old_load += tickless_load;
5314 }
Frederic Weisbecker9fd81dd2016-04-19 17:36:51 +02005315#endif
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005316 new_load = this_load;
5317 /*
5318 * Round up the averaging division if load is increasing. This
5319 * prevents us from getting stuck on 9 if the load is 10, for
5320 * example.
5321 */
5322 if (new_load > old_load)
5323 new_load += scale - 1;
5324
5325 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
5326 }
5327
5328 sched_avg_update(this_rq);
5329}
5330
Yuyang Du7ea241a2015-07-15 08:04:42 +08005331/* Used instead of source_load when we know the type == 0 */
5332static unsigned long weighted_cpuload(const int cpu)
5333{
5334 return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
5335}
5336
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005337#ifdef CONFIG_NO_HZ_COMMON
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005338/*
5339 * There is no sane way to deal with nohz on smp when using jiffies because the
5340 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
5341 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
5342 *
5343 * Therefore we need to avoid the delta approach from the regular tick when
5344 * possible since that would seriously skew the load calculation. This is why we
5345 * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
5346 * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
5347 * loop exit, nohz_idle_balance, nohz full exit...)
5348 *
5349 * This means we might still be one tick off for nohz periods.
5350 */
5351
5352static void cpu_load_update_nohz(struct rq *this_rq,
5353 unsigned long curr_jiffies,
5354 unsigned long load)
Frederic Weisbeckerbe68a682016-01-13 17:01:29 +01005355{
5356 unsigned long pending_updates;
5357
5358 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
5359 if (pending_updates) {
5360 this_rq->last_load_update_tick = curr_jiffies;
5361 /*
5362 * In the regular NOHZ case, we were idle, this means load 0.
5363 * In the NOHZ_FULL case, we were non-idle, we should consider
5364 * its weighted load.
5365 */
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005366 cpu_load_update(this_rq, load, pending_updates);
Frederic Weisbeckerbe68a682016-01-13 17:01:29 +01005367 }
5368}
5369
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005370/*
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005371 * Called from nohz_idle_balance() to update the load ratings before doing the
5372 * idle balance.
5373 */
Frederic Weisbeckercee1afc2016-04-13 15:56:50 +02005374static void cpu_load_update_idle(struct rq *this_rq)
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005375{
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005376 /*
5377 * bail if there's load or we're actually up-to-date.
5378 */
Frederic Weisbeckerbe68a682016-01-13 17:01:29 +01005379 if (weighted_cpuload(cpu_of(this_rq)))
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005380 return;
5381
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005382 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005383}
5384
5385/*
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005386 * Record CPU load on nohz entry so we know the tickless load to account
5387 * on nohz exit. cpu_load[0] happens then to be updated more frequently
5388 * than other cpu_load[idx] but it should be fine as cpu_load readers
5389 * shouldn't rely into synchronized cpu_load[*] updates.
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005390 */
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005391void cpu_load_update_nohz_start(void)
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005392{
5393 struct rq *this_rq = this_rq();
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005394
5395 /*
5396 * This is all lockless but should be fine. If weighted_cpuload changes
5397 * concurrently we'll exit nohz. And cpu_load write can race with
5398 * cpu_load_update_idle() but both updater would be writing the same.
5399 */
5400 this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
5401}
5402
5403/*
5404 * Account the tickless load in the end of a nohz frame.
5405 */
5406void cpu_load_update_nohz_stop(void)
5407{
Jason Low316c1608d2015-04-28 13:00:20 -07005408 unsigned long curr_jiffies = READ_ONCE(jiffies);
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005409 struct rq *this_rq = this_rq();
5410 unsigned long load;
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005411
5412 if (curr_jiffies == this_rq->last_load_update_tick)
5413 return;
5414
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005415 load = weighted_cpuload(cpu_of(this_rq));
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005416 raw_spin_lock(&this_rq->lock);
Matt Flemingb52fad22016-05-03 20:46:54 +01005417 update_rq_clock(this_rq);
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005418 cpu_load_update_nohz(this_rq, curr_jiffies, load);
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005419 raw_spin_unlock(&this_rq->lock);
5420}
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005421#else /* !CONFIG_NO_HZ_COMMON */
5422static inline void cpu_load_update_nohz(struct rq *this_rq,
5423 unsigned long curr_jiffies,
5424 unsigned long load) { }
5425#endif /* CONFIG_NO_HZ_COMMON */
5426
5427static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
5428{
Frederic Weisbecker9fd81dd2016-04-19 17:36:51 +02005429#ifdef CONFIG_NO_HZ_COMMON
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005430 /* See the mess around cpu_load_update_nohz(). */
5431 this_rq->last_load_update_tick = READ_ONCE(jiffies);
Frederic Weisbecker9fd81dd2016-04-19 17:36:51 +02005432#endif
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005433 cpu_load_update(this_rq, load, 1);
5434}
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005435
5436/*
5437 * Called from scheduler_tick()
5438 */
Frederic Weisbeckercee1afc2016-04-13 15:56:50 +02005439void cpu_load_update_active(struct rq *this_rq)
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005440{
Yuyang Du7ea241a2015-07-15 08:04:42 +08005441 unsigned long load = weighted_cpuload(cpu_of(this_rq));
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005442
5443 if (tick_nohz_tick_stopped())
5444 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
5445 else
5446 cpu_load_update_periodic(this_rq, load);
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005447}
5448
Peter Zijlstra029632f2011-10-25 10:00:11 +02005449/*
5450 * Return a low guess at the load of a migration-source cpu weighted
5451 * according to the scheduling class and "nice" value.
5452 *
5453 * We want to under-estimate the load of migration sources, to
5454 * balance conservatively.
5455 */
5456static unsigned long source_load(int cpu, int type)
5457{
5458 struct rq *rq = cpu_rq(cpu);
5459 unsigned long total = weighted_cpuload(cpu);
5460
5461 if (type == 0 || !sched_feat(LB_BIAS))
5462 return total;
5463
5464 return min(rq->cpu_load[type-1], total);
5465}
5466
5467/*
5468 * Return a high guess at the load of a migration-target cpu weighted
5469 * according to the scheduling class and "nice" value.
5470 */
5471static unsigned long target_load(int cpu, int type)
5472{
5473 struct rq *rq = cpu_rq(cpu);
5474 unsigned long total = weighted_cpuload(cpu);
5475
5476 if (type == 0 || !sched_feat(LB_BIAS))
5477 return total;
5478
5479 return max(rq->cpu_load[type-1], total);
5480}
5481
Vincent Guittotca6d75e2015-02-27 16:54:09 +01005482
Peter Zijlstra029632f2011-10-25 10:00:11 +02005483static unsigned long cpu_avg_load_per_task(int cpu)
5484{
5485 struct rq *rq = cpu_rq(cpu);
Jason Low316c1608d2015-04-28 13:00:20 -07005486 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
Yuyang Du7ea241a2015-07-15 08:04:42 +08005487 unsigned long load_avg = weighted_cpuload(cpu);
Peter Zijlstra029632f2011-10-25 10:00:11 +02005488
5489 if (nr_running)
Alex Shib92486c2013-06-20 10:18:50 +08005490 return load_avg / nr_running;
Peter Zijlstra029632f2011-10-25 10:00:11 +02005491
5492 return 0;
5493}
5494
Peter Zijlstrabb3469a2008-06-27 13:41:27 +02005495#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstraf5bfb7d2008-06-27 13:41:39 +02005496/*
5497 * effective_load() calculates the load change as seen from the root_task_group
5498 *
5499 * Adding load to a group doesn't make a group heavier, but can cause movement
5500 * of group shares between cpus. Assuming the shares were perfectly aligned one
5501 * can calculate the shift in shares.
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02005502 *
5503 * Calculate the effective load difference if @wl is added (subtracted) to @tg
5504 * on this @cpu and results in a total addition (subtraction) of @wg to the
5505 * total group weight.
5506 *
5507 * Given a runqueue weight distribution (rw_i) we can compute a shares
5508 * distribution (s_i) using:
5509 *
5510 * s_i = rw_i / \Sum rw_j (1)
5511 *
5512 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
5513 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
5514 * shares distribution (s_i):
5515 *
5516 * rw_i = { 2, 4, 1, 0 }
5517 * s_i = { 2/7, 4/7, 1/7, 0 }
5518 *
5519 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
5520 * task used to run on and the CPU the waker is running on), we need to
5521 * compute the effect of waking a task on either CPU and, in case of a sync
5522 * wakeup, compute the effect of the current task going to sleep.
5523 *
5524 * So for a change of @wl to the local @cpu with an overall group weight change
5525 * of @wl we can compute the new shares distribution (s'_i) using:
5526 *
5527 * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
5528 *
5529 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
5530 * differences in waking a task to CPU 0. The additional task changes the
5531 * weight and shares distributions like:
5532 *
5533 * rw'_i = { 3, 4, 1, 0 }
5534 * s'_i = { 3/8, 4/8, 1/8, 0 }
5535 *
5536 * We can then compute the difference in effective weight by using:
5537 *
5538 * dw_i = S * (s'_i - s_i) (3)
5539 *
5540 * Where 'S' is the group weight as seen by its parent.
5541 *
5542 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
5543 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
5544 * 4/7) times the weight of the group.
Peter Zijlstraf5bfb7d2008-06-27 13:41:39 +02005545 */
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005546static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
Peter Zijlstrabb3469a2008-06-27 13:41:27 +02005547{
Peter Zijlstra4be9daa2008-06-27 13:41:30 +02005548 struct sched_entity *se = tg->se[cpu];
Peter Zijlstraf1d239f2008-06-27 13:41:38 +02005549
Rik van Riel9722c2d2014-01-06 11:39:12 +00005550 if (!tg->parent) /* the trivial, non-cgroup case */
Peter Zijlstraf1d239f2008-06-27 13:41:38 +02005551 return wl;
5552
Peter Zijlstra4be9daa2008-06-27 13:41:30 +02005553 for_each_sched_entity(se) {
Peter Zijlstra7dd49122016-06-24 15:53:54 +02005554 struct cfs_rq *cfs_rq = se->my_q;
5555 long W, w = cfs_rq_load_avg(cfs_rq);
Peter Zijlstrabb3469a2008-06-27 13:41:27 +02005556
Peter Zijlstra7dd49122016-06-24 15:53:54 +02005557 tg = cfs_rq->tg;
Peter Zijlstra4be9daa2008-06-27 13:41:30 +02005558
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02005559 /*
5560 * W = @wg + \Sum rw_j
5561 */
Peter Zijlstra7dd49122016-06-24 15:53:54 +02005562 W = wg + atomic_long_read(&tg->load_avg);
5563
5564 /* Ensure \Sum rw_j >= rw_i */
5565 W -= cfs_rq->tg_load_avg_contrib;
5566 W += w;
Peter Zijlstra4be9daa2008-06-27 13:41:30 +02005567
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02005568 /*
5569 * w = rw_i + @wl
5570 */
Peter Zijlstra7dd49122016-06-24 15:53:54 +02005571 w += wl;
Peter Zijlstra940959e2008-09-23 15:33:42 +02005572
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02005573 /*
5574 * wl = S * s'_i; see (2)
5575 */
5576 if (W > 0 && w < W)
Dietmar Eggemannab522e32016-08-22 15:00:41 +01005577 wl = (w * (long)scale_load_down(tg->shares)) / W;
Paul Turner977dda72011-01-14 17:57:50 -08005578 else
Dietmar Eggemannab522e32016-08-22 15:00:41 +01005579 wl = scale_load_down(tg->shares);
Peter Zijlstra940959e2008-09-23 15:33:42 +02005580
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02005581 /*
5582 * Per the above, wl is the new se->load.weight value; since
5583 * those are clipped to [MIN_SHARES, ...) do so now. See
5584 * calc_cfs_shares().
5585 */
Paul Turner977dda72011-01-14 17:57:50 -08005586 if (wl < MIN_SHARES)
5587 wl = MIN_SHARES;
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02005588
5589 /*
5590 * wl = dw_i = S * (s'_i - s_i); see (3)
5591 */
Yuyang Du9d89c252015-07-15 08:04:37 +08005592 wl -= se->avg.load_avg;
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02005593
5594 /*
5595 * Recursively apply this logic to all parent groups to compute
5596 * the final effective load change on the root group. Since
5597 * only the @tg group gets extra weight, all parent groups can
5598 * only redistribute existing shares. @wl is the shift in shares
5599 * resulting from this level per the above.
5600 */
Peter Zijlstra4be9daa2008-06-27 13:41:30 +02005601 wg = 0;
Peter Zijlstra4be9daa2008-06-27 13:41:30 +02005602 }
5603
5604 return wl;
Peter Zijlstrabb3469a2008-06-27 13:41:27 +02005605}
5606#else
Peter Zijlstra4be9daa2008-06-27 13:41:30 +02005607
Mel Gorman58d081b2013-10-07 11:29:10 +01005608static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
Peter Zijlstra4be9daa2008-06-27 13:41:30 +02005609{
Peter Zijlstra83378262008-06-27 13:41:37 +02005610 return wl;
Peter Zijlstrabb3469a2008-06-27 13:41:27 +02005611}
Peter Zijlstra4be9daa2008-06-27 13:41:30 +02005612
Peter Zijlstrabb3469a2008-06-27 13:41:27 +02005613#endif
5614
Peter Zijlstrac58d25f2016-05-12 09:19:59 +02005615static void record_wakee(struct task_struct *p)
5616{
5617 /*
5618 * Only decay a single time; tasks that have less then 1 wakeup per
5619 * jiffy will not have built up many flips.
5620 */
5621 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
5622 current->wakee_flips >>= 1;
5623 current->wakee_flip_decay_ts = jiffies;
5624 }
5625
5626 if (current->last_wakee != p) {
5627 current->last_wakee = p;
5628 current->wakee_flips++;
5629 }
5630}
5631
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02005632/*
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05305633 * Externally visible function. Let's keep the one above
5634 * so that the check is inlined/optimized in the sched paths.
5635 */
5636bool sched_is_energy_aware(void)
5637{
5638 return energy_aware();
5639}
5640
5641/*
Juri Lelli2f8ed122015-04-30 17:35:23 +01005642 * Returns the current capacity of cpu after applying both
5643 * cpu and freq scaling.
5644 */
5645unsigned long capacity_curr_of(int cpu)
5646{
5647 return cpu_rq(cpu)->cpu_capacity_orig *
5648 arch_scale_freq_capacity(NULL, cpu)
5649 >> SCHED_CAPACITY_SHIFT;
5650}
5651
Vikram Mulukutla8f013d02017-04-12 19:01:03 -07005652/*
Ionela Voinescu00c143e2017-12-07 19:43:46 +00005653 * Returns the current capacity of cpu after applying both
5654 * cpu and min freq scaling.
Vikram Mulukutla8f013d02017-04-12 19:01:03 -07005655 */
Ionela Voinescu00c143e2017-12-07 19:43:46 +00005656unsigned long capacity_min_of(int cpu)
Vikram Mulukutla8f013d02017-04-12 19:01:03 -07005657{
Ionela Voinescu00c143e2017-12-07 19:43:46 +00005658 if (!sched_feat(MIN_CAPACITY_CAPPING))
5659 return 0;
5660 return arch_scale_cpu_capacity(NULL, cpu) *
5661 arch_scale_min_freq_capacity(NULL, cpu)
5662 >> SCHED_CAPACITY_SHIFT;
Vikram Mulukutla8f013d02017-04-12 19:01:03 -07005663}
5664
Ionela Voinescu00c143e2017-12-07 19:43:46 +00005665/*
Patrick Bellasieca58022017-07-05 10:59:59 +01005666 * CPU candidates.
5667 *
5668 * These are labels to reference CPU candidates for an energy_diff.
5669 * Currently we support only two possible candidates: the task's previous CPU
5670 * and another candiate CPU.
5671 * More advanced/aggressive EAS selection policies can consider more
5672 * candidates.
5673 */
5674#define EAS_CPU_PRV 0
5675#define EAS_CPU_NXT 1
5676#define EAS_CPU_BKP 2
5677#define EAS_CPU_CNT 3
5678
5679/*
5680 * energy_diff - supports the computation of the estimated energy impact in
5681 * moving a "task"'s "util_delta" between different CPU candidates.
5682 */
Morten Rasmussena455fa72015-01-02 14:21:56 +00005683struct energy_env {
Patrick Bellasieca58022017-07-05 10:59:59 +01005684 /* Utilization to move */
5685 struct task_struct *p;
5686 int util_delta;
5687
5688 /* Mask of CPUs candidates to evaluate */
5689 cpumask_t cpus_mask;
5690
5691 /* CPU candidates to evaluate */
5692 struct {
5693
5694 /* CPU ID, must be in cpus_mask */
5695 int cpu_id;
5696
5697 /*
5698 * Index (into sched_group_energy::cap_states) of the OPP the
5699 * CPU needs to run at if the task is placed on it.
5700 * This includes the both active and blocked load, due to
5701 * other tasks on this CPU, as well as the task's own
5702 * utilization.
5703 */
5704 int cap_idx;
5705 int cap;
5706
5707 /* Estimated system energy */
5708 unsigned int energy;
5709
5710 /* Estimated energy variation wrt EAS_CPU_PRV */
5711 int nrg_delta;
5712
5713 } cpu[EAS_CPU_CNT];
5714
5715 /*
5716 * Index (into energy_env::cpu) of the morst energy efficient CPU for
5717 * the specified energy_env::task
5718 */
5719 int next_idx;
5720
5721 /* Support data */
Morten Rasmussena455fa72015-01-02 14:21:56 +00005722 struct sched_group *sg_top;
5723 struct sched_group *sg_cap;
Patrick Bellasieca58022017-07-05 10:59:59 +01005724 struct sched_group *sg;
Morten Rasmussena455fa72015-01-02 14:21:56 +00005725};
5726
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05305727static int cpu_util_wake(int cpu, struct task_struct *p);
5728
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02005729/*
Morten Rasmussena455fa72015-01-02 14:21:56 +00005730 * __cpu_norm_util() returns the cpu util relative to a specific capacity,
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05305731 * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE], which is useful for
5732 * energy calculations.
5733 *
5734 * Since util is a scale-invariant utilization defined as:
Morten Rasmussen61bf6252014-12-18 14:47:18 +00005735 *
5736 * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time
5737 *
5738 * the normalized util can be found using the specific capacity.
5739 *
5740 * capacity = capacity_orig * curr_freq/max_freq
5741 *
5742 * norm_util = running_time/time ~ util/capacity
5743 */
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05305744static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity)
Morten Rasmussen61bf6252014-12-18 14:47:18 +00005745{
Morten Rasmussen61bf6252014-12-18 14:47:18 +00005746 if (util >= capacity)
5747 return SCHED_CAPACITY_SCALE;
5748
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05305749 return (util << SCHED_CAPACITY_SHIFT)/capacity;
Morten Rasmussen61bf6252014-12-18 14:47:18 +00005750}
5751
Patrick Bellasieca58022017-07-05 10:59:59 +01005752static unsigned long group_max_util(struct energy_env *eenv, int cpu_idx)
Syed Rameez Mustafa20acfe72017-01-30 09:35:46 +05305753{
Morten Rasmussen61bf6252014-12-18 14:47:18 +00005754 unsigned long max_util = 0;
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05305755 unsigned long util;
5756 int cpu;
Morten Rasmussen61bf6252014-12-18 14:47:18 +00005757
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05305758 for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) {
Chris Redpath505be1f2017-09-12 14:48:29 +01005759 util = cpu_util_wake(cpu, eenv->p);
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05305760
5761 /*
5762 * If we are looking at the target CPU specified by the eenv,
5763 * then we should add the (estimated) utilization of the task
5764 * assuming we will wake it up on that CPU.
5765 */
Patrick Bellasieca58022017-07-05 10:59:59 +01005766 if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id))
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05305767 util += eenv->util_delta;
5768
5769 max_util = max(max_util, util);
Ionela Voinescu42c96142017-12-07 19:50:45 +00005770
5771 /*
5772 * Take into account any minimum frequency imposed
5773 * elsewhere which limits the energy states available
5774 * If the MIN_CAPACITY_CAPPING feature is not enabled
5775 * capacity_min_of will return 0 (not capped).
5776 */
5777 max_util = max(max_util, capacity_min_of(cpu));
5778
Morten Rasmussena455fa72015-01-02 14:21:56 +00005779 }
Morten Rasmussen61bf6252014-12-18 14:47:18 +00005780
5781 return max_util;
5782}
5783
5784/*
5785 * group_norm_util() returns the approximated group util relative to it's
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05305786 * current capacity (busy ratio), in the range [0..SCHED_LOAD_SCALE], for use
5787 * in energy calculations.
5788 *
5789 * Since task executions may or may not overlap in time in the group the true
5790 * normalized util is between MAX(cpu_norm_util(i)) and SUM(cpu_norm_util(i))
5791 * when iterating over all CPUs in the group.
5792 * The latter estimate is used as it leads to a more pessimistic energy
Morten Rasmussen61bf6252014-12-18 14:47:18 +00005793 * estimate (more busy).
5794 */
Morten Rasmussena455fa72015-01-02 14:21:56 +00005795static unsigned
Patrick Bellasieca58022017-07-05 10:59:59 +01005796long group_norm_util(struct energy_env *eenv, int cpu_idx)
Morten Rasmussen61bf6252014-12-18 14:47:18 +00005797{
Patrick Bellasieca58022017-07-05 10:59:59 +01005798 unsigned long capacity = eenv->cpu[cpu_idx].cap;
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05305799 unsigned long util, util_sum = 0;
5800 int cpu;
Morten Rasmussen61bf6252014-12-18 14:47:18 +00005801
Patrick Bellasieca58022017-07-05 10:59:59 +01005802 for_each_cpu(cpu, sched_group_cpus(eenv->sg)) {
Chris Redpath505be1f2017-09-12 14:48:29 +01005803 util = cpu_util_wake(cpu, eenv->p);
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05305804
5805 /*
5806 * If we are looking at the target CPU specified by the eenv,
5807 * then we should add the (estimated) utilization of the task
5808 * assuming we will wake it up on that CPU.
5809 */
Patrick Bellasieca58022017-07-05 10:59:59 +01005810 if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id))
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05305811 util += eenv->util_delta;
5812
5813 util_sum += __cpu_norm_util(util, capacity);
Morten Rasmussena455fa72015-01-02 14:21:56 +00005814 }
Morten Rasmussen61bf6252014-12-18 14:47:18 +00005815
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05305816 return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE);
Morten Rasmussen61bf6252014-12-18 14:47:18 +00005817}
5818
Patrick Bellasieca58022017-07-05 10:59:59 +01005819static int find_new_capacity(struct energy_env *eenv, int cpu_idx)
Morten Rasmussen61bf6252014-12-18 14:47:18 +00005820{
Patrick Bellasieca58022017-07-05 10:59:59 +01005821 const struct sched_group_energy *sge = eenv->sg->sge;
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05305822 int idx, max_idx = sge->nr_cap_states - 1;
Patrick Bellasieca58022017-07-05 10:59:59 +01005823 unsigned long util = group_max_util(eenv, cpu_idx);
Joonwoo Park5dee3f12016-12-15 12:06:19 -08005824
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05305825 /* default is max_cap if we don't find a match */
Patrick Bellasieca58022017-07-05 10:59:59 +01005826 eenv->cpu[cpu_idx].cap_idx = max_idx;
5827 eenv->cpu[cpu_idx].cap = sge->cap_states[max_idx].cap;
Morten Rasmussen61bf6252014-12-18 14:47:18 +00005828
5829 for (idx = 0; idx < sge->nr_cap_states; idx++) {
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05305830 if (sge->cap_states[idx].cap >= util) {
Patrick Bellasieca58022017-07-05 10:59:59 +01005831 /* Keep track of SG's capacity */
5832 eenv->cpu[cpu_idx].cap_idx = idx;
5833 eenv->cpu[cpu_idx].cap = sge->cap_states[idx].cap;
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05305834 break;
5835 }
Morten Rasmussen61bf6252014-12-18 14:47:18 +00005836 }
5837
Patrick Bellasieca58022017-07-05 10:59:59 +01005838 return eenv->cpu[cpu_idx].cap_idx;
Joonwoo Park5dee3f12016-12-15 12:06:19 -08005839}
5840
Patrick Bellasieca58022017-07-05 10:59:59 +01005841static int group_idle_state(struct energy_env *eenv, int cpu_idx)
Joonwoo Park5dee3f12016-12-15 12:06:19 -08005842{
Patrick Bellasieca58022017-07-05 10:59:59 +01005843 struct sched_group *sg = eenv->sg;
Dietmar Eggemann1f884f42015-01-27 14:04:17 +00005844 int i, state = INT_MAX;
Chris Redpathda03fc12017-07-04 10:23:03 +01005845 int src_in_grp, dst_in_grp;
5846 long grp_util = 0;
Dietmar Eggemann1f884f42015-01-27 14:04:17 +00005847
5848 /* Find the shallowest idle state in the sched group. */
5849 for_each_cpu(i, sched_group_cpus(sg))
5850 state = min(state, idle_get_state_idx(cpu_rq(i)));
5851
Joonwoo Parkbb5b0e62017-06-01 14:54:23 -07005852 if (unlikely(state == INT_MAX))
5853 return -EINVAL;
5854
Dietmar Eggemann1f884f42015-01-27 14:04:17 +00005855 /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
5856 state++;
5857
Patrick Bellasieca58022017-07-05 10:59:59 +01005858 src_in_grp = cpumask_test_cpu(eenv->cpu[EAS_CPU_PRV].cpu_id,
5859 sched_group_cpus(sg));
5860 dst_in_grp = cpumask_test_cpu(eenv->cpu[cpu_idx].cpu_id,
5861 sched_group_cpus(sg));
Chris Redpathda03fc12017-07-04 10:23:03 +01005862 if (src_in_grp == dst_in_grp) {
5863 /* both CPUs under consideration are in the same group or not in
5864 * either group, migration should leave idle state the same.
5865 */
5866 goto end;
5867 }
Ke Wang97bbdb02017-11-01 16:07:38 +08005868
5869 /*
5870 * Try to estimate if a deeper idle state is
5871 * achievable when we move the task.
Chris Redpathda03fc12017-07-04 10:23:03 +01005872 */
Ke Wang97bbdb02017-11-01 16:07:38 +08005873 for_each_cpu(i, sched_group_cpus(sg)) {
Chris Redpath505be1f2017-09-12 14:48:29 +01005874 grp_util += cpu_util_wake(i, eenv->p);
Patrick Bellasieca58022017-07-05 10:59:59 +01005875 if (unlikely(i == eenv->cpu[cpu_idx].cpu_id))
Ke Wang97bbdb02017-11-01 16:07:38 +08005876 grp_util += eenv->util_delta;
5877 }
Chris Redpathda03fc12017-07-04 10:23:03 +01005878
5879 if (grp_util <=
5880 ((long)sg->sgc->max_capacity * (int)sg->group_weight)) {
5881 /* after moving, this group is at most partly
5882 * occupied, so it should have some idle time.
5883 */
5884 int max_idle_state_idx = sg->sge->nr_idle_states - 2;
5885 int new_state = grp_util * max_idle_state_idx;
5886 if (grp_util <= 0)
5887 /* group will have no util, use lowest state */
5888 new_state = max_idle_state_idx + 1;
5889 else {
5890 /* for partially idle, linearly map util to idle
5891 * states, excluding the lowest one. This does not
5892 * correspond to the state we expect to enter in
5893 * reality, but an indication of what might happen.
5894 */
5895 new_state = min(max_idle_state_idx, (int)
5896 (new_state / sg->sgc->max_capacity));
5897 new_state = max_idle_state_idx - new_state;
5898 }
5899 state = new_state;
5900 } else {
5901 /* After moving, the group will be fully occupied
5902 * so assume it will not be idle at all.
5903 */
5904 state = 0;
5905 }
5906end:
Dietmar Eggemann1f884f42015-01-27 14:04:17 +00005907 return state;
5908}
5909
Morten Rasmussen61bf6252014-12-18 14:47:18 +00005910/*
Patrick Bellasieca58022017-07-05 10:59:59 +01005911 * calc_sg_energy: compute energy for the eenv's SG (i.e. eenv->sg).
5912 *
5913 * This works in iterations to compute the SG's energy for each CPU
5914 * candidate defined by the energy_env's cpu array.
Patrick Bellasi1fac7062017-07-31 11:21:37 +01005915 *
5916 * NOTE: in the following computations for busy_energy and idle_energy we do
5917 * not shift by SCHED_CAPACITY_SHIFT in order to reduce rounding errors.
5918 * The required scaling will be performed just one time, by the calling
5919 * functions, once we accumulated the contributons for all the SGs.
Morten Rasmussen61bf6252014-12-18 14:47:18 +00005920 */
Joonwoo Parkbb5b0e62017-06-01 14:54:23 -07005921static int calc_sg_energy(struct energy_env *eenv)
Morten Rasmussen61bf6252014-12-18 14:47:18 +00005922{
Patrick Bellasieca58022017-07-05 10:59:59 +01005923 struct sched_group *sg = eenv->sg;
5924 int busy_energy, idle_energy;
5925 unsigned int busy_power;
5926 unsigned int idle_power;
5927 unsigned long sg_util;
5928 int cap_idx, idle_idx;
5929 int total_energy = 0;
5930 int cpu_idx;
5931
5932 for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
5933
5934
5935 if (eenv->cpu[cpu_idx].cpu_id == -1)
5936 continue;
5937 /* Compute ACTIVE energy */
5938 cap_idx = find_new_capacity(eenv, cpu_idx);
5939 busy_power = sg->sge->cap_states[cap_idx].power;
5940 /*
5941 * in order to calculate cpu_norm_util, we need to know which
5942 * capacity level the group will be at, so calculate that first
5943 */
5944 sg_util = group_norm_util(eenv, cpu_idx);
5945
5946 busy_energy = sg_util * busy_power;
Patrick Bellasieca58022017-07-05 10:59:59 +01005947
5948 /* Compute IDLE energy */
5949 idle_idx = group_idle_state(eenv, cpu_idx);
Joonwoo Parkbb5b0e62017-06-01 14:54:23 -07005950 if (unlikely(idle_idx < 0))
5951 return idle_idx;
Joonwoo Parkecedc7a2017-06-07 11:51:54 -07005952 if (idle_idx > sg->sge->nr_idle_states - 1)
5953 idle_idx = sg->sge->nr_idle_states - 1;
5954
Patrick Bellasieca58022017-07-05 10:59:59 +01005955 idle_power = sg->sge->idle_states[idle_idx].power;
5956
5957 idle_energy = SCHED_CAPACITY_SCALE - sg_util;
5958 idle_energy *= idle_power;
Patrick Bellasieca58022017-07-05 10:59:59 +01005959
5960 total_energy = busy_energy + idle_energy;
5961 eenv->cpu[cpu_idx].energy += total_energy;
5962 }
Joonwoo Parkbb5b0e62017-06-01 14:54:23 -07005963 return 0;
Patrick Bellasieca58022017-07-05 10:59:59 +01005964}
5965
5966/*
5967 * compute_energy() computes the absolute variation in energy consumption by
5968 * moving eenv.util_delta from EAS_CPU_PRV to EAS_CPU_NXT.
5969 *
5970 * NOTE: compute_energy() may fail when racing with sched_domain updates, in
5971 * which case we abort by returning -EINVAL.
5972 */
5973static int compute_energy(struct energy_env *eenv)
Morten Rasmussen61bf6252014-12-18 14:47:18 +00005974{
Morten Rasmussen61bf6252014-12-18 14:47:18 +00005975 struct cpumask visit_cpus;
Chris Redpathe2cc9502018-01-24 09:25:24 +00005976 int cpu_count;
Morten Rasmussen61bf6252014-12-18 14:47:18 +00005977
Morten Rasmussena455fa72015-01-02 14:21:56 +00005978 WARN_ON(!eenv->sg_top->sge);
Morten Rasmussen61bf6252014-12-18 14:47:18 +00005979
Morten Rasmussena455fa72015-01-02 14:21:56 +00005980 cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top));
Chris Redpathe2cc9502018-01-24 09:25:24 +00005981 /* If a cpu is hotplugged in while we are in this function,
5982 * it does not appear in the existing visit_cpus mask
5983 * which came from the sched_group pointer of the
5984 * sched_domain pointed at by sd_ea for either the prev
5985 * or next cpu and was dereferenced in __energy_diff.
5986 * Since we will dereference sd_scs later as we iterate
5987 * through the CPUs we expect to visit, new CPUs can
5988 * be present which are not in the visit_cpus mask.
5989 * Guard this with cpu_count.
5990 */
5991 cpu_count = cpumask_weight(&visit_cpus);
Morten Rasmussen61bf6252014-12-18 14:47:18 +00005992
5993 while (!cpumask_empty(&visit_cpus)) {
5994 struct sched_group *sg_shared_cap = NULL;
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05305995 int cpu = cpumask_first(&visit_cpus);
5996 struct sched_domain *sd;
Morten Rasmussen61bf6252014-12-18 14:47:18 +00005997
5998 /*
5999 * Is the group utilization affected by cpus outside this
6000 * sched_group?
Chris Redpathe2cc9502018-01-24 09:25:24 +00006001 * This sd may have groups with cpus which were not present
6002 * when we took visit_cpus.
Morten Rasmussen61bf6252014-12-18 14:47:18 +00006003 */
6004 sd = rcu_dereference(per_cpu(sd_scs, cpu));
Morten Rasmussene4188502017-02-06 16:28:53 +00006005 if (sd && sd->parent)
Morten Rasmussen61bf6252014-12-18 14:47:18 +00006006 sg_shared_cap = sd->parent->groups;
6007
6008 for_each_domain(cpu, sd) {
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306009 struct sched_group *sg = sd->groups;
Morten Rasmussen61bf6252014-12-18 14:47:18 +00006010
6011 /* Has this sched_domain already been visited? */
6012 if (sd->child && group_first_cpu(sg) != cpu)
6013 break;
6014
6015 do {
Patrick Bellasieca58022017-07-05 10:59:59 +01006016 eenv->sg_cap = sg;
Morten Rasmussen61bf6252014-12-18 14:47:18 +00006017 if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
Morten Rasmussena455fa72015-01-02 14:21:56 +00006018 eenv->sg_cap = sg_shared_cap;
Morten Rasmussen61bf6252014-12-18 14:47:18 +00006019
Patrick Bellasieca58022017-07-05 10:59:59 +01006020 /*
6021 * Compute the energy for all the candidate
6022 * CPUs in the current visited SG.
6023 */
6024 eenv->sg = sg;
Joonwoo Parkbb5b0e62017-06-01 14:54:23 -07006025 if (calc_sg_energy(eenv))
6026 return -EINVAL;
Patrick Bellasi632905f2016-01-14 18:35:13 +00006027
Patrick Bellasieca58022017-07-05 10:59:59 +01006028 /* remove CPUs we have just visited */
Chris Redpathe2cc9502018-01-24 09:25:24 +00006029 if (!sd->child) {
6030 /*
6031 * cpu_count here is the number of
6032 * cpus we expect to visit in this
6033 * calculation. If we race against
6034 * hotplug, we can have extra cpus
6035 * added to the groups we are
6036 * iterating which do not appear in
6037 * the visit_cpus mask. In that case
6038 * we are not able to calculate energy
6039 * without restarting so we will bail
6040 * out and use prev_cpu this time.
6041 */
6042 if (!cpu_count)
6043 return -EINVAL;
Morten Rasmussen61bf6252014-12-18 14:47:18 +00006044 cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));
Chris Redpathe2cc9502018-01-24 09:25:24 +00006045 cpu_count--;
6046 }
Morten Rasmussen61bf6252014-12-18 14:47:18 +00006047
Samer Xie47d813f2019-03-04 15:30:44 +08006048 if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top)) &&
6049 sd->child)
Morten Rasmussen61bf6252014-12-18 14:47:18 +00006050 goto next_cpu;
6051
6052 } while (sg = sg->next, sg != sd->groups);
6053 }
Morten Rasmussene4188502017-02-06 16:28:53 +00006054
6055 /*
6056 * If we raced with hotplug and got an sd NULL-pointer;
6057 * returning a wrong energy estimation is better than
6058 * entering an infinite loop.
Chris Redpathe2cc9502018-01-24 09:25:24 +00006059 * Specifically: If a cpu is unplugged after we took
6060 * the visit_cpus mask, it no longer has an sd_scs
6061 * pointer, so when we dereference it, we get NULL.
Morten Rasmussene4188502017-02-06 16:28:53 +00006062 */
6063 if (cpumask_test_cpu(cpu, &visit_cpus))
6064 return -EINVAL;
Morten Rasmussen61bf6252014-12-18 14:47:18 +00006065next_cpu:
Todd Kjos64f6fd12016-06-16 16:33:54 -07006066 cpumask_clear_cpu(cpu, &visit_cpus);
Morten Rasmussen61bf6252014-12-18 14:47:18 +00006067 continue;
6068 }
6069
Morten Rasmussena455fa72015-01-02 14:21:56 +00006070 return 0;
Morten Rasmussen61bf6252014-12-18 14:47:18 +00006071}
6072
Morten Rasmussen931bd822015-01-06 17:34:05 +00006073static inline bool cpu_in_sg(struct sched_group *sg, int cpu)
6074{
6075 return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg));
6076}
6077
6078/*
Patrick Bellasieca58022017-07-05 10:59:59 +01006079 * select_energy_cpu_idx(): estimate the energy impact of changing the
6080 * utilization distribution.
6081 *
6082 * The eenv parameter specifies the changes: utilisation amount and a pair of
6083 * possible CPU candidates (the previous CPU and a different target CPU).
6084 *
6085 * This function returns the index of a CPU candidate specified by the
6086 * energy_env which corresponds to the first CPU saving energy.
6087 * Thus, 0 (EAS_CPU_PRV) means that non of the CPU candidate is more energy
6088 * efficient than running on prev_cpu. This is also the value returned in case
6089 * of abort due to error conditions during the computations.
6090 * A value greater than zero means that the first energy-efficient CPU is the
6091 * one represented by eenv->cpu[eenv->next_idx].cpu_id.
Morten Rasmussen931bd822015-01-06 17:34:05 +00006092 */
Patrick Bellasieca58022017-07-05 10:59:59 +01006093static inline int select_energy_cpu_idx(struct energy_env *eenv)
Morten Rasmussen931bd822015-01-06 17:34:05 +00006094{
6095 struct sched_domain *sd;
6096 struct sched_group *sg;
Patrick Bellasi326e4472017-09-12 14:57:51 +01006097 int sd_cpu = -1;
Patrick Bellasieca58022017-07-05 10:59:59 +01006098 int cpu_idx;
Patrick Bellasi326e4472017-09-12 14:57:51 +01006099 int margin;
Morten Rasmussen931bd822015-01-06 17:34:05 +00006100
Patrick Bellasieca58022017-07-05 10:59:59 +01006101 sd_cpu = eenv->cpu[EAS_CPU_PRV].cpu_id;
Morten Rasmussen931bd822015-01-06 17:34:05 +00006102 sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
Morten Rasmussen931bd822015-01-06 17:34:05 +00006103 if (!sd)
Patrick Bellasieca58022017-07-05 10:59:59 +01006104 return EAS_CPU_PRV;
6105
6106 cpumask_clear(&eenv->cpus_mask);
6107 for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
6108 int cpu = eenv->cpu[cpu_idx].cpu_id;
6109
6110 if (cpu < 0)
6111 continue;
6112 cpumask_set_cpu(cpu, &eenv->cpus_mask);
6113 }
Morten Rasmussen931bd822015-01-06 17:34:05 +00006114
6115 sg = sd->groups;
Morten Rasmussen931bd822015-01-06 17:34:05 +00006116 do {
Patrick Bellasieca58022017-07-05 10:59:59 +01006117 /* Skip SGs which do not contains a candidate CPU */
6118 if (!cpumask_intersects(&eenv->cpus_mask, sched_group_cpus(sg)))
6119 continue;
Morten Rasmussen931bd822015-01-06 17:34:05 +00006120
Patrick Bellasieca58022017-07-05 10:59:59 +01006121 eenv->sg_top = sg;
Patrick Bellasi1fac7062017-07-31 11:21:37 +01006122 /* energy is unscaled to reduce rounding errors */
Patrick Bellasieca58022017-07-05 10:59:59 +01006123 if (compute_energy(eenv) == -EINVAL)
6124 return EAS_CPU_PRV;
Morten Rasmussen931bd822015-01-06 17:34:05 +00006125
Morten Rasmussen931bd822015-01-06 17:34:05 +00006126 } while (sg = sg->next, sg != sd->groups);
6127
Patrick Bellasi1fac7062017-07-31 11:21:37 +01006128 /* Scale energy before comparisons */
6129 for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx)
6130 eenv->cpu[cpu_idx].energy >>= SCHED_CAPACITY_SHIFT;
6131
Morten Rasmussen53838922016-03-30 14:20:12 +01006132 /*
Patrick Bellasieca58022017-07-05 10:59:59 +01006133 * Compute the dead-zone margin used to prevent too many task
6134 * migrations with negligible energy savings.
6135 * An energy saving is considered meaningful if it reduces the energy
6136 * consumption of EAS_CPU_PRV CPU candidate by at least ~1.56%
Morten Rasmussen53838922016-03-30 14:20:12 +01006137 */
Patrick Bellasieca58022017-07-05 10:59:59 +01006138 margin = eenv->cpu[EAS_CPU_PRV].energy >> 6;
Morten Rasmussen53838922016-03-30 14:20:12 +01006139
Patrick Bellasieca58022017-07-05 10:59:59 +01006140 /*
6141 * By default the EAS_CPU_PRV CPU is considered the most energy
6142 * efficient, with a 0 energy variation.
6143 */
6144 eenv->next_idx = EAS_CPU_PRV;
Morten Rasmussen53838922016-03-30 14:20:12 +01006145
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05306146 trace_sched_energy_diff(eenv->p, eenv->cpu[EAS_CPU_PRV].cpu_id,
6147 eenv->cpu[EAS_CPU_PRV].energy,
6148 eenv->cpu[EAS_CPU_NXT].cpu_id,
6149 eenv->cpu[EAS_CPU_NXT].energy,
6150 eenv->cpu[EAS_CPU_BKP].cpu_id,
6151 eenv->cpu[EAS_CPU_BKP].energy);
Patrick Bellasieca58022017-07-05 10:59:59 +01006152 /*
6153 * Compare the other CPU candidates to find a CPU which can be
6154 * more energy efficient then EAS_CPU_PRV
6155 */
6156 for (cpu_idx = EAS_CPU_NXT; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
6157 /* Skip not valid scheduled candidates */
6158 if (eenv->cpu[cpu_idx].cpu_id < 0)
6159 continue;
6160 /* Compute energy delta wrt EAS_CPU_PRV */
6161 eenv->cpu[cpu_idx].nrg_delta =
6162 eenv->cpu[cpu_idx].energy -
6163 eenv->cpu[EAS_CPU_PRV].energy;
6164 /* filter energy variations within the dead-zone margin */
6165 if (abs(eenv->cpu[cpu_idx].nrg_delta) < margin)
6166 eenv->cpu[cpu_idx].nrg_delta = 0;
6167 /* update the schedule candidate with min(nrg_delta) */
6168 if (eenv->cpu[cpu_idx].nrg_delta <
6169 eenv->cpu[eenv->next_idx].nrg_delta) {
6170 eenv->next_idx = cpu_idx;
Quentin Perrete93894c2017-12-11 14:56:12 +00006171 if (sched_feat(FBT_STRICT_ORDER))
6172 break;
Patrick Bellasieca58022017-07-05 10:59:59 +01006173 }
Chris Redpath293edee2017-03-27 18:20:20 +01006174 }
Patrick Bellasic5b20422016-07-29 15:45:57 +01006175
Patrick Bellasieca58022017-07-05 10:59:59 +01006176 return eenv->next_idx;
Patrick Bellasic5b20422016-07-29 15:45:57 +01006177}
6178
Morten Rasmussen61bf6252014-12-18 14:47:18 +00006179/*
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02006180 * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
Peter Zijlstrac58d25f2016-05-12 09:19:59 +02006181 *
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02006182 * A waker of many should wake a different task than the one last awakened
Peter Zijlstrac58d25f2016-05-12 09:19:59 +02006183 * at a frequency roughly N times higher than one of its wakees.
6184 *
6185 * In order to determine whether we should let the load spread vs consolidating
6186 * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
6187 * partner, and a factor of lls_size higher frequency in the other.
6188 *
6189 * With both conditions met, we can be relatively sure that the relationship is
6190 * non-monogamous, with partner count exceeding socket size.
6191 *
6192 * Waker/wakee being client/server, worker/dispatcher, interrupt source or
6193 * whatever is irrelevant, spread criteria is apparent partner count exceeds
6194 * socket size.
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02006195 */
Michael Wang62470412013-07-04 12:55:51 +08006196static int wake_wide(struct task_struct *p)
6197{
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02006198 unsigned int master = current->wakee_flips;
6199 unsigned int slave = p->wakee_flips;
Peter Zijlstra7d9ffa82013-07-04 12:56:46 +08006200 int factor = this_cpu_read(sd_llc_size);
Michael Wang62470412013-07-04 12:55:51 +08006201
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02006202 if (master < slave)
6203 swap(master, slave);
6204 if (slave < factor || master < slave * factor)
6205 return 0;
6206 return 1;
Michael Wang62470412013-07-04 12:55:51 +08006207}
6208
Morten Rasmussen772bd008c2016-06-22 18:03:13 +01006209static int wake_affine(struct sched_domain *sd, struct task_struct *p,
6210 int prev_cpu, int sync)
Ingo Molnar098fb9d2008-03-16 20:36:10 +01006211{
Paul Turnere37b6a72011-01-21 20:44:59 -08006212 s64 this_load, load;
Vincent Guittotbd61c982014-08-26 13:06:50 +02006213 s64 this_eff_load, prev_eff_load;
Morten Rasmussen772bd008c2016-06-22 18:03:13 +01006214 int idx, this_cpu;
Peter Zijlstrac88d5912009-09-10 13:50:02 +02006215 struct task_group *tg;
Peter Zijlstra83378262008-06-27 13:41:37 +02006216 unsigned long weight;
Mike Galbraithb3137bc2008-05-29 11:11:41 +02006217 int balanced;
Ingo Molnar098fb9d2008-03-16 20:36:10 +01006218
Peter Zijlstrac88d5912009-09-10 13:50:02 +02006219 idx = sd->wake_idx;
6220 this_cpu = smp_processor_id();
Peter Zijlstrac88d5912009-09-10 13:50:02 +02006221 load = source_load(prev_cpu, idx);
6222 this_load = target_load(this_cpu, idx);
Ingo Molnar098fb9d2008-03-16 20:36:10 +01006223
6224 /*
Ingo Molnar098fb9d2008-03-16 20:36:10 +01006225 * If sync wakeup then subtract the (maximum possible)
6226 * effect of the currently running task from the load
6227 * of the current CPU:
6228 */
Peter Zijlstra83378262008-06-27 13:41:37 +02006229 if (sync) {
6230 tg = task_group(current);
Yuyang Du9d89c252015-07-15 08:04:37 +08006231 weight = current->se.avg.load_avg;
Ingo Molnar098fb9d2008-03-16 20:36:10 +01006232
Peter Zijlstrac88d5912009-09-10 13:50:02 +02006233 this_load += effective_load(tg, this_cpu, -weight, -weight);
Peter Zijlstra83378262008-06-27 13:41:37 +02006234 load += effective_load(tg, prev_cpu, 0, -weight);
6235 }
6236
6237 tg = task_group(p);
Yuyang Du9d89c252015-07-15 08:04:37 +08006238 weight = p->se.avg.load_avg;
Peter Zijlstra83378262008-06-27 13:41:37 +02006239
Peter Zijlstra71a29aa2009-09-07 18:28:05 +02006240 /*
6241 * In low-load situations, where prev_cpu is idle and this_cpu is idle
Peter Zijlstrac88d5912009-09-10 13:50:02 +02006242 * due to the sync cause above having dropped this_load to 0, we'll
6243 * always have an imbalance, but there's really nothing you can do
6244 * about that, so that's good too.
Peter Zijlstra71a29aa2009-09-07 18:28:05 +02006245 *
6246 * Otherwise check if either cpus are near enough in load to allow this
6247 * task to be woken on this_cpu.
6248 */
Vincent Guittotbd61c982014-08-26 13:06:50 +02006249 this_eff_load = 100;
6250 this_eff_load *= capacity_of(prev_cpu);
Peter Zijlstrae51fd5e2010-05-31 12:37:30 +02006251
Vincent Guittotbd61c982014-08-26 13:06:50 +02006252 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
6253 prev_eff_load *= capacity_of(this_cpu);
6254
6255 if (this_load > 0) {
Peter Zijlstrae51fd5e2010-05-31 12:37:30 +02006256 this_eff_load *= this_load +
6257 effective_load(tg, this_cpu, weight, weight);
6258
Peter Zijlstrae51fd5e2010-05-31 12:37:30 +02006259 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
Vincent Guittotbd61c982014-08-26 13:06:50 +02006260 }
Peter Zijlstrae51fd5e2010-05-31 12:37:30 +02006261
Vincent Guittotbd61c982014-08-26 13:06:50 +02006262 balanced = this_eff_load <= prev_eff_load;
Mike Galbraithb3137bc2008-05-29 11:11:41 +02006263
Josh Poimboeufae928822016-06-17 12:43:24 -05006264 schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
Mike Galbraithb3137bc2008-05-29 11:11:41 +02006265
Vincent Guittot05bfb652014-08-26 13:06:45 +02006266 if (!balanced)
6267 return 0;
Ingo Molnar098fb9d2008-03-16 20:36:10 +01006268
Josh Poimboeufae928822016-06-17 12:43:24 -05006269 schedstat_inc(sd->ttwu_move_affine);
6270 schedstat_inc(p->se.statistics.nr_wakeups_affine);
Vincent Guittot05bfb652014-08-26 13:06:45 +02006271
6272 return 1;
Ingo Molnar098fb9d2008-03-16 20:36:10 +01006273}
6274
Chris Redpath505be1f2017-09-12 14:48:29 +01006275static inline unsigned long boosted_task_util(struct task_struct *p);
Patrick Bellasi9b2b8da2016-01-14 18:31:53 +00006276
Morten Rasmussenb9ac0092015-05-09 19:53:49 +01006277static inline bool __task_fits(struct task_struct *p, int cpu, int util)
6278{
Joonwoo Park01388ef2017-01-20 10:54:34 -08006279 unsigned int margin;
Morten Rasmussenb9ac0092015-05-09 19:53:49 +01006280
Patrick Bellasi9b2b8da2016-01-14 18:31:53 +00006281 util += boosted_task_util(p);
Morten Rasmussenb9ac0092015-05-09 19:53:49 +01006282
Joonwoo Park01388ef2017-01-20 10:54:34 -08006283 if (capacity_orig_of(task_cpu(p)) > capacity_orig_of(cpu))
Joonwoo Parkb02fc002017-06-16 11:58:58 -07006284 margin = sysctl_sched_capacity_margin_down;
Joonwoo Park01388ef2017-01-20 10:54:34 -08006285 else
Joonwoo Parkb02fc002017-06-16 11:58:58 -07006286 margin = sysctl_sched_capacity_margin;
Joonwoo Park01388ef2017-01-20 10:54:34 -08006287
Syed Rameez Mustafae21dd3c2017-03-07 11:25:39 -08006288 return (capacity_orig_of(cpu) * 1024) > (util * margin);
Morten Rasmussenb9ac0092015-05-09 19:53:49 +01006289}
6290
6291static inline bool task_fits_max(struct task_struct *p, int cpu)
6292{
Leo Yan5bc59022016-12-22 23:58:46 +08006293 unsigned long capacity = capacity_orig_of(cpu);
Dietmar Eggemannbbb138b2015-09-26 18:19:54 +01006294 unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val;
Morten Rasmussenb9ac0092015-05-09 19:53:49 +01006295
6296 if (capacity == max_capacity)
6297 return true;
6298
Abhijeet Dharmapurikar53ee4232018-06-15 09:34:34 -07006299 if (task_boost_policy(p) == SCHED_BOOST_ON_BIG)
Syed Rameez Mustafa20acfe72017-01-30 09:35:46 +05306300 return false;
6301
Morten Rasmussenb9ac0092015-05-09 19:53:49 +01006302 return __task_fits(p, cpu, 0);
6303}
6304
Maria Yuc28f0392019-05-15 11:45:50 +08006305static inline bool cpu_check_overutil_condition(int cpu,
6306 unsigned long util)
6307{
6308 return (capacity_orig_of(cpu) * 1024) < (util * capacity_margin);
6309}
6310
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306311bool __cpu_overutilized(int cpu, int delta)
Morten Rasmussenb9ac0092015-05-09 19:53:49 +01006312{
Satya Durga Srinivasu Prabhala999b62a2018-05-24 20:32:16 -07006313 return (capacity_orig_of(cpu) * 1024) <
6314 ((cpu_util(cpu) + delta) * capacity_margin);
Joonwoo Park2b901d52017-01-25 17:45:56 -08006315}
6316
Joonwoo Parkdc3420d2017-01-31 11:14:43 -08006317bool cpu_overutilized(int cpu)
Dietmar Eggemann90f309f2015-01-26 19:47:28 +00006318{
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306319 return __cpu_overutilized(cpu, 0);
Dietmar Eggemann90f309f2015-01-26 19:47:28 +00006320}
6321
Patrick Bellasib08685b2015-06-22 18:32:36 +01006322#ifdef CONFIG_SCHED_TUNE
6323
Patrick Bellasid8460c72016-10-13 17:31:24 +01006324struct reciprocal_value schedtune_spc_rdiv;
6325
Srinath Sridharane71c4252016-07-28 17:28:55 +01006326static long
6327schedtune_margin(unsigned long signal, long boost)
Patrick Bellasib08685b2015-06-22 18:32:36 +01006328{
Srinath Sridharane71c4252016-07-28 17:28:55 +01006329 long long margin = 0;
Patrick Bellasib08685b2015-06-22 18:32:36 +01006330
6331 /*
6332 * Signal proportional compensation (SPC)
6333 *
6334 * The Boost (B) value is used to compute a Margin (M) which is
6335 * proportional to the complement of the original Signal (S):
Patrick Bellasid8460c72016-10-13 17:31:24 +01006336 * M = B * (SCHED_CAPACITY_SCALE - S)
Patrick Bellasib08685b2015-06-22 18:32:36 +01006337 * The obtained M could be used by the caller to "boost" S.
6338 */
Srinath Sridharane71c4252016-07-28 17:28:55 +01006339 if (boost >= 0) {
6340 margin = SCHED_CAPACITY_SCALE - signal;
6341 margin *= boost;
Patrick Bellasic964a2b2018-02-12 15:56:18 +00006342 } else {
Srinath Sridharane71c4252016-07-28 17:28:55 +01006343 margin = -signal * boost;
Patrick Bellasic964a2b2018-02-12 15:56:18 +00006344 }
Patrick Bellasid8460c72016-10-13 17:31:24 +01006345
6346 margin = reciprocal_divide(margin, schedtune_spc_rdiv);
Srinath Sridharane71c4252016-07-28 17:28:55 +01006347 if (boost < 0)
6348 margin *= -1;
Patrick Bellasic964a2b2018-02-12 15:56:18 +00006349
Patrick Bellasib08685b2015-06-22 18:32:36 +01006350 return margin;
6351}
6352
Srinath Sridharane71c4252016-07-28 17:28:55 +01006353static inline int
Patrick Bellasiedd28d32015-07-07 15:33:20 +01006354schedtune_cpu_margin(unsigned long util, int cpu)
Patrick Bellasicaa24e42015-06-26 09:55:06 +01006355{
Patrick Bellasia33034d2016-07-28 17:42:36 +01006356 int boost = schedtune_cpu_boost(cpu);
Patrick Bellasicaa24e42015-06-26 09:55:06 +01006357
6358 if (boost == 0)
6359 return 0;
6360
6361 return schedtune_margin(util, boost);
6362}
6363
Srinath Sridharane71c4252016-07-28 17:28:55 +01006364static inline long
Chris Redpath505be1f2017-09-12 14:48:29 +01006365schedtune_task_margin(struct task_struct *p)
Patrick Bellasi9b2b8da2016-01-14 18:31:53 +00006366{
Chris Redpath505be1f2017-09-12 14:48:29 +01006367 int boost = schedtune_task_boost(p);
Patrick Bellasi9b2b8da2016-01-14 18:31:53 +00006368 unsigned long util;
Srinath Sridharane71c4252016-07-28 17:28:55 +01006369 long margin;
Patrick Bellasi9b2b8da2016-01-14 18:31:53 +00006370
Patrick Bellasi9b2b8da2016-01-14 18:31:53 +00006371 if (boost == 0)
6372 return 0;
6373
Chris Redpath505be1f2017-09-12 14:48:29 +01006374 util = task_util(p);
Patrick Bellasi9b2b8da2016-01-14 18:31:53 +00006375 margin = schedtune_margin(util, boost);
6376
6377 return margin;
6378}
6379
Patrick Bellasicaa24e42015-06-26 09:55:06 +01006380#else /* CONFIG_SCHED_TUNE */
6381
Srinath Sridharane71c4252016-07-28 17:28:55 +01006382static inline int
Patrick Bellasiedd28d32015-07-07 15:33:20 +01006383schedtune_cpu_margin(unsigned long util, int cpu)
Patrick Bellasicaa24e42015-06-26 09:55:06 +01006384{
6385 return 0;
6386}
6387
Srinath Sridharane71c4252016-07-28 17:28:55 +01006388static inline int
Chris Redpath505be1f2017-09-12 14:48:29 +01006389schedtune_task_margin(struct task_struct *p)
Patrick Bellasi9b2b8da2016-01-14 18:31:53 +00006390{
6391 return 0;
6392}
6393
Patrick Bellasib08685b2015-06-22 18:32:36 +01006394#endif /* CONFIG_SCHED_TUNE */
6395
Juri Lellic6e94382016-12-14 16:10:10 +00006396unsigned long
Pavankumar Kondeti12912ba2017-11-24 10:21:46 +05306397boosted_cpu_util(int cpu, struct sched_walt_cpu_load *walt_load)
Patrick Bellasicaa24e42015-06-26 09:55:06 +01006398{
Pavankumar Kondeti12912ba2017-11-24 10:21:46 +05306399 unsigned long util = cpu_util_freq(cpu, walt_load);
Srinath Sridharane71c4252016-07-28 17:28:55 +01006400 long margin = schedtune_cpu_margin(util, cpu);
Patrick Bellasicaa24e42015-06-26 09:55:06 +01006401
Patrick Bellasicccead12015-06-22 13:51:07 +01006402 trace_sched_boost_cpu(cpu, util, margin);
6403
Patrick Bellasicaa24e42015-06-26 09:55:06 +01006404 return util + margin;
6405}
6406
Patrick Bellasi9b2b8da2016-01-14 18:31:53 +00006407static inline unsigned long
Chris Redpath505be1f2017-09-12 14:48:29 +01006408boosted_task_util(struct task_struct *p)
Patrick Bellasi9b2b8da2016-01-14 18:31:53 +00006409{
Chris Redpath505be1f2017-09-12 14:48:29 +01006410 unsigned long util = task_util(p);
6411 long margin = schedtune_task_margin(p);
Patrick Bellasi9b2b8da2016-01-14 18:31:53 +00006412
Chris Redpath505be1f2017-09-12 14:48:29 +01006413 trace_sched_boost_task(p, util, margin);
Patrick Bellasiecccdb72016-01-14 18:43:37 +00006414
Patrick Bellasi9b2b8da2016-01-14 18:31:53 +00006415 return util + margin;
6416}
6417
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306418static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
6419{
Joel Fernandes7fd40752017-11-09 10:52:19 -08006420 return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306421}
6422
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006423/*
6424 * find_idlest_group finds and returns the least busy CPU group within the
6425 * domain.
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306426 *
6427 * Assumes p is allowed on at least one CPU in sd.
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006428 */
6429static struct sched_group *
Peter Zijlstra78e7ed52009-09-03 13:16:51 +02006430find_idlest_group(struct sched_domain *sd, struct task_struct *p,
Vincent Guittotc44f2a02013-10-18 13:52:21 +02006431 int this_cpu, int sd_flag)
Gregory Haskinse7693a32008-01-25 21:08:09 +01006432{
Andi Kleenb3bd3de2010-08-10 14:17:51 -07006433 struct sched_group *idlest = NULL, *group = sd->groups;
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306434 struct sched_group *most_spare_sg = NULL;
6435 unsigned long min_runnable_load = ULONG_MAX;
6436 unsigned long this_runnable_load = ULONG_MAX;
6437 unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
6438 unsigned long most_spare = 0, this_spare = 0;
Vincent Guittotc44f2a02013-10-18 13:52:21 +02006439 int load_idx = sd->forkexec_idx;
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306440 int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
6441 unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
6442 (sd->imbalance_pct-100) / 100;
Joonwoo Parkb02fc002017-06-16 11:58:58 -07006443
Vincent Guittotc44f2a02013-10-18 13:52:21 +02006444 if (sd_flag & SD_BALANCE_WAKE)
6445 load_idx = sd->wake_idx;
6446
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006447 do {
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306448 unsigned long load, avg_load, runnable_load;
6449 unsigned long spare_cap, max_spare_cap;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006450 int local_group;
6451 int i;
Gregory Haskinse7693a32008-01-25 21:08:09 +01006452
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006453 /* Skip over this group if it has no CPUs allowed */
6454 if (!cpumask_intersects(sched_group_cpus(group),
Peter Zijlstrafa17b502011-06-16 12:23:22 +02006455 tsk_cpus_allowed(p)))
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006456 continue;
6457
6458 local_group = cpumask_test_cpu(this_cpu,
6459 sched_group_cpus(group));
6460
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306461 /*
6462 * Tally up the load of all CPUs in the group and find
6463 * the group containing the CPU with most spare capacity.
6464 */
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006465 avg_load = 0;
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306466 runnable_load = 0;
6467 max_spare_cap = 0;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006468
6469 for_each_cpu(i, sched_group_cpus(group)) {
6470 /* Bias balancing toward cpus of our domain */
6471 if (local_group)
6472 load = source_load(i, load_idx);
6473 else
6474 load = target_load(i, load_idx);
6475
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306476 runnable_load += load;
Morten Rasmussenb9ac0092015-05-09 19:53:49 +01006477
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306478 avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
Morten Rasmussende9b6362015-07-06 15:01:10 +01006479
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306480 spare_cap = capacity_spare_wake(i, p);
6481
6482 if (spare_cap > max_spare_cap)
6483 max_spare_cap = spare_cap;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006484 }
6485
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04006486 /* Adjust by relative CPU capacity of the group */
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306487 avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
6488 group->sgc->capacity;
6489 runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
6490 group->sgc->capacity;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006491
6492 if (local_group) {
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306493 this_runnable_load = runnable_load;
6494 this_avg_load = avg_load;
6495 this_spare = max_spare_cap;
6496 } else {
6497 if (min_runnable_load > (runnable_load + imbalance)) {
6498 /*
6499 * The runnable load is significantly smaller
6500 * so we can pick this new cpu
6501 */
6502 min_runnable_load = runnable_load;
6503 min_avg_load = avg_load;
6504 idlest = group;
6505 } else if ((runnable_load < (min_runnable_load + imbalance)) &&
6506 (100*min_avg_load > imbalance_scale*avg_load)) {
6507 /*
6508 * The runnable loads are close so we take
6509 * into account blocked load through avg_load
6510 * which is blocked + runnable load
6511 */
6512 min_avg_load = avg_load;
6513 idlest = group;
6514 }
6515
6516 if (most_spare < max_spare_cap) {
6517 most_spare = max_spare_cap;
6518 most_spare_sg = group;
6519 }
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006520 }
6521 } while (group = group->next, group != sd->groups);
6522
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306523 /*
6524 * The cross-over point between using spare capacity or least load
6525 * is too conservative for high utilization tasks on partially
6526 * utilized systems if we require spare_capacity > task_util(p),
6527 * so we allow for some task stuffing by using
6528 * spare_capacity > task_util(p)/2.
6529 * spare capacity can't be used for fork because the utilization has
6530 * not been set yet as it need to get a rq to init the utilization
6531 */
6532 if (sd_flag & SD_BALANCE_FORK)
6533 goto skip_spare;
Morten Rasmussenb9ac0092015-05-09 19:53:49 +01006534
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306535 if (this_spare > task_util(p) / 2 &&
6536 imbalance_scale*this_spare > 100*most_spare)
6537 return NULL;
6538 else if (most_spare > task_util(p) / 2)
6539 return most_spare_sg;
Morten Rasmussende9b6362015-07-06 15:01:10 +01006540
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306541skip_spare:
6542 if (!idlest ||
6543 (min_runnable_load > (this_runnable_load + imbalance)) ||
6544 ((this_runnable_load < (min_runnable_load + imbalance)) &&
6545 (100*this_avg_load < imbalance_scale*min_avg_load)))
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006546 return NULL;
6547 return idlest;
6548}
6549
6550/*
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306551 * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006552 */
6553static int
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306554find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006555{
6556 unsigned long load, min_load = ULONG_MAX;
Nicolas Pitre83a0a962014-09-04 11:32:10 -04006557 unsigned int min_exit_latency = UINT_MAX;
6558 u64 latest_idle_timestamp = 0;
6559 int least_loaded_cpu = this_cpu;
6560 int shallowest_idle_cpu = -1;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006561 int i;
6562
Morten Rasmusseneaecf412016-06-22 18:03:14 +01006563 /* Check if we have any choice: */
6564 if (group->group_weight == 1)
6565 return cpumask_first(sched_group_cpus(group));
6566
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006567 /* Traverse only the allowed CPUs */
Peter Zijlstrafa17b502011-06-16 12:23:22 +02006568 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306569 if (idle_cpu(i)) {
Nicolas Pitre83a0a962014-09-04 11:32:10 -04006570 struct rq *rq = cpu_rq(i);
6571 struct cpuidle_state *idle = idle_get_state(rq);
6572 if (idle && idle->exit_latency < min_exit_latency) {
6573 /*
6574 * We give priority to a CPU whose idle state
6575 * has the smallest exit latency irrespective
6576 * of any idle timestamp.
6577 */
6578 min_exit_latency = idle->exit_latency;
6579 latest_idle_timestamp = rq->idle_stamp;
6580 shallowest_idle_cpu = i;
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306581 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
Nicolas Pitre83a0a962014-09-04 11:32:10 -04006582 rq->idle_stamp > latest_idle_timestamp) {
6583 /*
6584 * If equal or no active idle state, then
6585 * the most recently idled CPU might have
6586 * a warmer cache.
6587 */
6588 latest_idle_timestamp = rq->idle_stamp;
6589 shallowest_idle_cpu = i;
6590 }
Yao Dongdong9f967422014-10-28 04:08:06 +00006591 } else if (shallowest_idle_cpu == -1) {
Nicolas Pitre83a0a962014-09-04 11:32:10 -04006592 load = weighted_cpuload(i);
6593 if (load < min_load || (load == min_load && i == this_cpu)) {
6594 min_load = load;
6595 least_loaded_cpu = i;
6596 }
Gregory Haskinse7693a32008-01-25 21:08:09 +01006597 }
6598 }
6599
Nicolas Pitre83a0a962014-09-04 11:32:10 -04006600 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006601}
Gregory Haskinse7693a32008-01-25 21:08:09 +01006602
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306603static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
6604 int cpu, int prev_cpu, int sd_flag)
6605{
6606 int wu = sd_flag & SD_BALANCE_WAKE;
6607 int cas_cpu = -1;
6608 int new_cpu = cpu;
6609
6610 if (wu) {
6611 schedstat_inc(p->se.statistics.nr_wakeups_cas_attempts);
6612 schedstat_inc(this_rq()->eas_stats.cas_attempts);
6613 }
6614
6615 if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
6616 return prev_cpu;
6617
6618 while (sd) {
6619 struct sched_group *group;
6620 struct sched_domain *tmp;
6621 int weight;
6622
6623 if (wu)
6624 schedstat_inc(sd->eas_stats.cas_attempts);
6625
6626 if (!(sd->flags & sd_flag)) {
6627 sd = sd->child;
6628 continue;
6629 }
6630
6631 group = find_idlest_group(sd, p, cpu, sd_flag);
6632 if (!group) {
6633 sd = sd->child;
6634 continue;
6635 }
6636
6637 new_cpu = find_idlest_group_cpu(group, p, cpu);
6638 if (new_cpu == cpu) {
6639 /* Now try balancing at a lower domain level of cpu */
6640 sd = sd->child;
6641 continue;
6642 }
6643
6644 /* Now try balancing at a lower domain level of new_cpu */
6645 cpu = cas_cpu = new_cpu;
6646 weight = sd->span_weight;
6647 sd = NULL;
6648 for_each_domain(cpu, tmp) {
6649 if (weight <= tmp->span_weight)
6650 break;
6651 if (tmp->flags & sd_flag)
6652 sd = tmp;
6653 }
6654 /* while loop will break here if sd == NULL */
6655 }
6656
6657 if (wu && (cas_cpu >= 0)) {
6658 schedstat_inc(p->se.statistics.nr_wakeups_cas_count);
6659 schedstat_inc(this_rq()->eas_stats.cas_count);
6660 }
6661
6662 return new_cpu;
6663}
6664
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006665#ifdef CONFIG_SCHED_SMT
6666
6667static inline void set_idle_cores(int cpu, int val)
6668{
6669 struct sched_domain_shared *sds;
6670
6671 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
6672 if (sds)
6673 WRITE_ONCE(sds->has_idle_cores, val);
6674}
6675
6676static inline bool test_idle_cores(int cpu, bool def)
6677{
6678 struct sched_domain_shared *sds;
6679
6680 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
6681 if (sds)
6682 return READ_ONCE(sds->has_idle_cores);
6683
6684 return def;
6685}
6686
6687/*
6688 * Scans the local SMT mask to see if the entire core is idle, and records this
6689 * information in sd_llc_shared->has_idle_cores.
6690 *
6691 * Since SMT siblings share all cache levels, inspecting this limited remote
6692 * state should be fairly cheap.
6693 */
Greg Kroah-Hartmana0a93e32017-07-19 09:58:49 +02006694void update_idle_core(struct rq *rq)
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006695{
6696 int core = cpu_of(rq);
6697 int cpu;
6698
6699 rcu_read_lock();
6700 if (test_idle_cores(core, true))
6701 goto unlock;
6702
6703 for_each_cpu(cpu, cpu_smt_mask(core)) {
6704 if (cpu == core)
6705 continue;
6706
6707 if (!idle_cpu(cpu))
6708 goto unlock;
6709 }
6710
6711 set_idle_cores(core, 1);
6712unlock:
6713 rcu_read_unlock();
6714}
6715
6716/*
6717 * Scan the entire LLC domain for idle cores; this dynamically switches off if
6718 * there are no idle cores left in the system; tracked through
6719 * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
6720 */
6721static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
6722{
6723 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
Peter Zijlstra542ebc92017-04-14 14:20:05 +02006724 int core, cpu;
Peter Zijlstra1b568f02016-05-09 10:38:41 +02006725
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006726 if (!test_idle_cores(target, false))
6727 return -1;
6728
6729 cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
6730
Peter Zijlstra542ebc92017-04-14 14:20:05 +02006731 for_each_cpu_wrap(core, cpus, target) {
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006732 bool idle = true;
6733
6734 for_each_cpu(cpu, cpu_smt_mask(core)) {
6735 cpumask_clear_cpu(cpu, cpus);
6736 if (!idle_cpu(cpu))
6737 idle = false;
6738 }
6739
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306740 if (idle)
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006741 return core;
6742 }
6743
6744 /*
6745 * Failed to find an idle core; stop looking for one.
6746 */
6747 set_idle_cores(target, 0);
6748
6749 return -1;
6750}
6751
6752/*
6753 * Scan the local SMT mask for idle CPUs.
6754 */
6755static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
6756{
6757 int cpu;
6758
6759 for_each_cpu(cpu, cpu_smt_mask(target)) {
6760 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
6761 continue;
6762 if (idle_cpu(cpu))
6763 return cpu;
6764 }
6765
6766 return -1;
6767}
6768
6769#else /* CONFIG_SCHED_SMT */
6770
6771static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
6772{
6773 return -1;
6774}
6775
6776static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
6777{
6778 return -1;
6779}
6780
6781#endif /* CONFIG_SCHED_SMT */
6782
6783/*
6784 * Scan the LLC domain for idle CPUs; this is dynamically regulated by
6785 * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
6786 * average idle time for this rq (as found in rq->avg_idle).
6787 */
6788static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
6789{
Wanpeng Li9cfb38a2016-10-09 08:04:03 +08006790 struct sched_domain *this_sd;
6791 u64 avg_cost, avg_idle = this_rq()->avg_idle;
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006792 u64 time, cost;
6793 s64 delta;
Peter Zijlstra542ebc92017-04-14 14:20:05 +02006794 int cpu;
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006795
Wanpeng Li9cfb38a2016-10-09 08:04:03 +08006796 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
6797 if (!this_sd)
6798 return -1;
6799
6800 avg_cost = this_sd->avg_scan_cost;
6801
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006802 /*
6803 * Due to large variance we need a large fuzz factor; hackbench in
6804 * particularly is sensitive here.
6805 */
Peter Zijlstra4e4a9eb2017-03-01 11:24:35 +01006806 if (sched_feat(SIS_AVG_CPU) && (avg_idle / 512) < avg_cost)
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006807 return -1;
6808
6809 time = local_clock();
6810
Peter Zijlstra542ebc92017-04-14 14:20:05 +02006811 for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006812 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
6813 continue;
6814 if (idle_cpu(cpu))
6815 break;
6816 }
6817
6818 time = local_clock() - time;
6819 cost = this_sd->avg_scan_cost;
6820 delta = (s64)(time - cost) / 8;
6821 this_sd->avg_scan_cost += delta;
6822
6823 return cpu;
6824}
6825
6826/*
6827 * Try and locate an idle core/thread in the LLC cache domain.
Peter Zijlstraa50bde52009-11-12 15:55:28 +01006828 */
Morten Rasmussen772bd008c2016-06-22 18:03:13 +01006829static int select_idle_sibling(struct task_struct *p, int prev, int target)
Peter Zijlstraa50bde52009-11-12 15:55:28 +01006830{
Suresh Siddha99bd5e22010-03-31 16:47:45 -07006831 struct sched_domain *sd;
Srinath Sridharanbf47bdd2016-07-14 09:57:29 +01006832 struct sched_group *sg;
6833 int i = task_cpu(p);
Dietmar Eggemann56ffdd62017-01-16 12:42:59 +00006834 int best_idle_cpu = -1;
6835 int best_idle_cstate = INT_MAX;
6836 unsigned long best_idle_capacity = ULONG_MAX;
Mike Galbraithe0a79f52013-01-28 12:19:25 +01006837
Dietmar Eggemannaf88a162017-03-22 18:23:13 +00006838 schedstat_inc(p->se.statistics.nr_wakeups_sis_attempts);
6839 schedstat_inc(this_rq()->eas_stats.sis_attempts);
Mike Galbraithe0a79f52013-01-28 12:19:25 +01006840
Srinath Sridharanbf47bdd2016-07-14 09:57:29 +01006841 if (!sysctl_sched_cstate_aware) {
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306842 if (idle_cpu(target)) {
Dietmar Eggemannaf88a162017-03-22 18:23:13 +00006843 schedstat_inc(p->se.statistics.nr_wakeups_sis_idle);
6844 schedstat_inc(this_rq()->eas_stats.sis_idle);
Srinath Sridharanbf47bdd2016-07-14 09:57:29 +01006845 return target;
Dietmar Eggemannaf88a162017-03-22 18:23:13 +00006846 }
Srinath Sridharanbf47bdd2016-07-14 09:57:29 +01006847
6848 /*
6849 * If the prevous cpu is cache affine and idle, don't be stupid.
6850 */
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306851 if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) {
Dietmar Eggemannaf88a162017-03-22 18:23:13 +00006852 schedstat_inc(p->se.statistics.nr_wakeups_sis_cache_affine);
6853 schedstat_inc(this_rq()->eas_stats.sis_cache_affine);
Srinath Sridharanbf47bdd2016-07-14 09:57:29 +01006854 return i;
Dietmar Eggemannaf88a162017-03-22 18:23:13 +00006855 }
Srinath Sridharanbf47bdd2016-07-14 09:57:29 +01006856
6857 sd = rcu_dereference(per_cpu(sd_llc, target));
6858 if (!sd)
6859 return target;
6860
6861 i = select_idle_core(p, sd, target);
6862 if ((unsigned)i < nr_cpumask_bits)
6863 return i;
6864
6865 i = select_idle_cpu(p, sd, target);
6866 if ((unsigned)i < nr_cpumask_bits)
6867 return i;
6868
6869 i = select_idle_smt(p, sd, target);
6870 if ((unsigned)i < nr_cpumask_bits)
6871 return i;
6872 }
Peter Zijlstraa50bde52009-11-12 15:55:28 +01006873
6874 /*
Srinath Sridharanbf47bdd2016-07-14 09:57:29 +01006875 * Otherwise, iterate the domains and find an elegible idle cpu.
Peter Zijlstraa50bde52009-11-12 15:55:28 +01006876 */
Peter Zijlstra518cd622011-12-07 15:07:31 +01006877 sd = rcu_dereference(per_cpu(sd_llc, target));
Srinath Sridharanbf47bdd2016-07-14 09:57:29 +01006878 for_each_lower_domain(sd) {
6879 sg = sd->groups;
6880 do {
6881 if (!cpumask_intersects(sched_group_cpus(sg),
6882 tsk_cpus_allowed(p)))
6883 goto next;
Morten Rasmussen772bd008c2016-06-22 18:03:13 +01006884
Ingo Molnar098fb9d2008-03-16 20:36:10 +01006885
Srinath Sridharanbf47bdd2016-07-14 09:57:29 +01006886 if (sysctl_sched_cstate_aware) {
6887 for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
Dietmar Eggemann56ffdd62017-01-16 12:42:59 +00006888 int idle_idx = idle_get_state_idx(cpu_rq(i));
Srinath Sridharanbf47bdd2016-07-14 09:57:29 +01006889 unsigned long new_usage = boosted_task_util(p);
6890 unsigned long capacity_orig = capacity_orig_of(i);
Syed Rameez Mustafa20acfe72017-01-30 09:35:46 +05306891
Srinath Sridharanbf47bdd2016-07-14 09:57:29 +01006892 if (new_usage > capacity_orig || !idle_cpu(i))
6893 goto next;
Mike Galbraith970e1782012-06-12 05:18:32 +02006894
Dietmar Eggemannaf88a162017-03-22 18:23:13 +00006895 if (i == target && new_usage <= capacity_curr_of(target)) {
6896 schedstat_inc(p->se.statistics.nr_wakeups_sis_suff_cap);
6897 schedstat_inc(this_rq()->eas_stats.sis_suff_cap);
6898 schedstat_inc(sd->eas_stats.sis_suff_cap);
Srinath Sridharanbf47bdd2016-07-14 09:57:29 +01006899 return target;
Dietmar Eggemannaf88a162017-03-22 18:23:13 +00006900 }
Linus Torvalds37407ea2012-09-16 12:29:43 -07006901
Dietmar Eggemann56ffdd62017-01-16 12:42:59 +00006902 if (idle_idx < best_idle_cstate &&
6903 capacity_orig <= best_idle_capacity) {
6904 best_idle_cpu = i;
Srinath Sridharanbf47bdd2016-07-14 09:57:29 +01006905 best_idle_cstate = idle_idx;
6906 best_idle_capacity = capacity_orig;
6907 }
6908 }
6909 } else {
6910 for_each_cpu(i, sched_group_cpus(sg)) {
6911 if (i == target || !idle_cpu(i))
6912 goto next;
6913 }
6914
6915 target = cpumask_first_and(sched_group_cpus(sg),
6916 tsk_cpus_allowed(p));
Dietmar Eggemannaf88a162017-03-22 18:23:13 +00006917 schedstat_inc(p->se.statistics.nr_wakeups_sis_idle_cpu);
6918 schedstat_inc(this_rq()->eas_stats.sis_idle_cpu);
6919 schedstat_inc(sd->eas_stats.sis_idle_cpu);
Srinath Sridharanbf47bdd2016-07-14 09:57:29 +01006920 goto done;
6921 }
6922next:
6923 sg = sg->next;
6924 } while (sg != sd->groups);
6925 }
Dietmar Eggemann56ffdd62017-01-16 12:42:59 +00006926
6927 if (best_idle_cpu >= 0)
6928 target = best_idle_cpu;
Srinath Sridharanbf47bdd2016-07-14 09:57:29 +01006929
6930done:
Dietmar Eggemannaf88a162017-03-22 18:23:13 +00006931 schedstat_inc(p->se.statistics.nr_wakeups_sis_count);
6932 schedstat_inc(this_rq()->eas_stats.sis_count);
6933
Peter Zijlstraa50bde52009-11-12 15:55:28 +01006934 return target;
6935}
Chris Redpath7de1b832017-02-28 17:27:28 +00006936
Joonwoo Park4fdf00d2017-02-17 11:42:44 -08006937/*
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306938 * cpu_util_wake: Compute cpu utilization with any contributions from
6939 * the waking task p removed. check_for_migration() looks for a better CPU of
6940 * rq->curr. For that case we should return cpu util with contributions from
6941 * currently running task p removed.
Joonwoo Park4fdf00d2017-02-17 11:42:44 -08006942 */
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306943static int cpu_util_wake(int cpu, struct task_struct *p)
Joonwoo Park4fdf00d2017-02-17 11:42:44 -08006944{
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306945 unsigned long util, capacity;
Pavankumar Kondetieb486cc2017-06-19 15:28:50 +05306946
Joonwoo Park2b901d52017-01-25 17:45:56 -08006947#ifdef CONFIG_SCHED_WALT
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306948 /*
6949 * WALT does not decay idle tasks in the same manner
6950 * as PELT, so it makes little sense to subtract task
6951 * utilization from cpu utilization. Instead just use
6952 * cpu_util for this case.
6953 */
6954 if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
6955 p->state == TASK_WAKING)
6956 return cpu_util(cpu);
Joonwoo Park2b901d52017-01-25 17:45:56 -08006957#endif
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05306958 /* Task has no contribution or is new */
6959 if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
6960 return cpu_util(cpu);
6961
6962 capacity = capacity_orig_of(cpu);
6963 util = max_t(long, cpu_util(cpu) - task_util(p), 0);
6964
6965 return (util >= capacity) ? capacity : util;
6966}
6967
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05306968struct find_best_target_env {
6969 struct cpumask *rtg_target;
6970 bool need_idle;
Abhijeet Dharmapurikar53ee4232018-06-15 09:34:34 -07006971 int placement_boost;
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05306972 bool avoid_prev_cpu;
6973};
6974
Pavankumar Kondeti398bb7d2018-04-03 15:12:07 +05306975#ifdef CONFIG_SCHED_WALT
6976static unsigned long cpu_estimated_capacity(int cpu, struct task_struct *p)
6977{
6978 unsigned long tutil, estimated_capacity;
6979
6980 if (task_in_cum_window_demand(cpu_rq(cpu), p))
6981 tutil = 0;
6982 else
6983 tutil = task_util(p);
6984
6985 estimated_capacity = cpu_util_cum(cpu, tutil);
6986
6987 return estimated_capacity;
6988}
6989#else
6990static unsigned long cpu_estimated_capacity(int cpu, struct task_struct *p)
6991{
6992 return cpu_util_wake(cpu, p);
6993}
6994#endif
6995
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05306996static bool is_packing_eligible(struct task_struct *p, int target_cpu,
6997 struct find_best_target_env *fbt_env,
Pavankumar Kondeti646fe8f2018-03-23 08:10:20 +05306998 unsigned int target_cpus_count,
6999 int best_idle_cstate)
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307000{
Pavankumar Kondeti398bb7d2018-04-03 15:12:07 +05307001 unsigned long estimated_capacity;
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307002
7003 if (fbt_env->placement_boost || fbt_env->need_idle)
7004 return false;
7005
Pavankumar Kondeti646fe8f2018-03-23 08:10:20 +05307006 if (best_idle_cstate == -1)
7007 return false;
7008
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307009 if (target_cpus_count != 1)
7010 return true;
7011
Pavankumar Kondeti398bb7d2018-04-03 15:12:07 +05307012 estimated_capacity = cpu_estimated_capacity(target_cpu, p);
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307013 estimated_capacity = add_capacity_margin(estimated_capacity,
7014 target_cpu);
7015
7016 /*
7017 * If there is only one active CPU and it is already above its current
7018 * capacity, avoid placing additional task on the CPU.
7019 */
7020 return (estimated_capacity <= capacity_curr_of(target_cpu));
7021}
7022
Satya Durga Srinivasu Prabhalaa7fc94a2018-06-05 13:23:19 -07007023static int start_cpu(struct task_struct *p, bool boosted,
7024 struct cpumask *rtg_target)
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307025{
7026 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
Satya Durga Srinivasu Prabhalaa7fc94a2018-06-05 13:23:19 -07007027 int start_cpu = -1;
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307028
Satya Durga Srinivasu Prabhalaa7fc94a2018-06-05 13:23:19 -07007029 if (boosted)
7030 return rd->max_cap_orig_cpu;
7031
Pavankumar Kondeti16c70452018-06-29 16:41:33 +05307032 /* A task always fits on its rtg_target */
7033 if (rtg_target) {
7034 int rtg_target_cpu = cpumask_first_and(rtg_target,
7035 cpu_online_mask);
7036
7037 if (rtg_target_cpu < nr_cpu_ids)
7038 return rtg_target_cpu;
7039 }
7040
Satya Durga Srinivasu Prabhalaa7fc94a2018-06-05 13:23:19 -07007041 /* Where the task should land based on its demand */
7042 if (rd->min_cap_orig_cpu != -1
7043 && task_fits_max(p, rd->min_cap_orig_cpu))
7044 start_cpu = rd->min_cap_orig_cpu;
7045 else
7046 start_cpu = rd->max_cap_orig_cpu;
7047
Pavankumar Kondetie1856da2018-02-19 17:01:24 +05307048 return walt_start_cpu(start_cpu);
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307049}
7050
Pavankumar Kondetif9b1af12018-04-03 15:16:54 +05307051unsigned int sched_smp_overlap_capacity;
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307052static inline int find_best_target(struct task_struct *p, int *backup_cpu,
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307053 bool boosted, bool prefer_idle,
7054 struct find_best_target_env *fbt_env)
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307055{
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307056 unsigned long min_util = boosted_task_util(p);
7057 unsigned long target_capacity = ULONG_MAX;
7058 unsigned long min_wake_util = ULONG_MAX;
7059 unsigned long target_max_spare_cap = 0;
7060 unsigned long target_util = ULONG_MAX;
7061 unsigned long best_active_util = ULONG_MAX;
Ionela Voinescuad920122017-12-07 20:09:11 +00007062 unsigned long target_idle_max_spare_cap = 0;
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307063 int best_idle_cstate = INT_MAX;
7064 struct sched_domain *sd;
7065 struct sched_group *sg;
7066 int best_active_cpu = -1;
7067 int best_idle_cpu = -1;
7068 int target_cpu = -1;
7069 int cpu, i;
Satya Durga Srinivasu Prabhalaa7fc94a2018-06-05 13:23:19 -07007070 long spare_cap, most_spare_cap = 0;
7071 int most_spare_cap_cpu = -1;
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307072 unsigned int active_cpus_count = 0;
Pavankumar Kondetief13e1e2018-06-21 20:17:32 +05307073 int isolated_candidate = -1;
Abhijeet Dharmapurikar1a734c22018-06-26 14:24:37 -07007074 int prev_cpu = task_cpu(p);
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307075
7076 *backup_cpu = -1;
7077
7078 schedstat_inc(p->se.statistics.nr_wakeups_fbt_attempts);
7079 schedstat_inc(this_rq()->eas_stats.fbt_attempts);
7080
7081 /* Find start CPU based on boost value */
Satya Durga Srinivasu Prabhalaa7fc94a2018-06-05 13:23:19 -07007082 cpu = start_cpu(p, boosted, fbt_env->rtg_target);
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307083 if (cpu < 0) {
7084 schedstat_inc(p->se.statistics.nr_wakeups_fbt_no_cpu);
7085 schedstat_inc(this_rq()->eas_stats.fbt_no_cpu);
7086 return -1;
7087 }
7088
7089 /* Find SD for the start CPU */
7090 sd = rcu_dereference(per_cpu(sd_ea, cpu));
7091 if (!sd) {
7092 schedstat_inc(p->se.statistics.nr_wakeups_fbt_no_sd);
7093 schedstat_inc(this_rq()->eas_stats.fbt_no_sd);
7094 return -1;
7095 }
7096
7097 /* Scan CPUs in all SDs */
7098 sg = sd->groups;
7099 do {
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307100 cpumask_t search_cpus;
7101 bool do_rotate = false, avoid_prev_cpu = false;
7102
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307103 cpumask_copy(&search_cpus, tsk_cpus_allowed(p));
7104 cpumask_and(&search_cpus, &search_cpus, sched_group_cpus(sg));
7105 i = find_first_cpu_bit(p, &search_cpus, sg, &avoid_prev_cpu,
7106 &do_rotate, &first_cpu_bit_env);
7107 if (do_rotate)
7108 fbt_env->avoid_prev_cpu = avoid_prev_cpu;
7109
7110retry:
7111 while ((i = cpumask_next(i, &search_cpus)) < nr_cpu_ids) {
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307112 unsigned long capacity_curr = capacity_curr_of(i);
7113 unsigned long capacity_orig = capacity_orig_of(i);
Ionela Voinescuad920122017-12-07 20:09:11 +00007114 unsigned long wake_util, new_util, min_capped_util;
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307115
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307116 cpumask_clear_cpu(i, &search_cpus);
Pavankumar Kondetief13e1e2018-06-21 20:17:32 +05307117
Abhijeet Dharmapurikar2dba40b2018-06-19 14:19:29 -07007118 trace_sched_cpu_util(i);
Pavankumar Kondetief13e1e2018-06-21 20:17:32 +05307119 if (!cpu_online(i) || cpu_isolated(i))
7120 continue;
7121
7122 isolated_candidate = i;
7123
Abhijeet Dharmapurikar1a734c22018-06-26 14:24:37 -07007124 if (avoid_prev_cpu && i == prev_cpu)
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307125 continue;
7126
Pavankumar Kondetief13e1e2018-06-21 20:17:32 +05307127 if (walt_cpu_high_irqload(i) || is_reserved(i))
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307128 continue;
7129
7130 /*
7131 * p's blocked utilization is still accounted for on prev_cpu
7132 * so prev_cpu will receive a negative bias due to the double
7133 * accounting. However, the blocked utilization may be zero.
7134 */
7135 wake_util = cpu_util_wake(i, p);
7136 new_util = wake_util + task_util(p);
Satya Durga Srinivasu Prabhalaa7fc94a2018-06-05 13:23:19 -07007137 spare_cap = capacity_orig_of(i) - wake_util;
7138
7139 if (spare_cap > most_spare_cap) {
7140 most_spare_cap = spare_cap;
7141 most_spare_cap_cpu = i;
7142 }
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307143
7144 /*
7145 * Ensure minimum capacity to grant the required boost.
7146 * The target CPU can be already at a capacity level higher
7147 * than the one required to boost the task.
7148 */
7149 new_util = max(min_util, new_util);
Ionela Voinescuad920122017-12-07 20:09:11 +00007150
7151 /*
7152 * Include minimum capacity constraint:
7153 * new_util contains the required utilization including
7154 * boost. min_capped_util also takes into account a
7155 * minimum capacity cap imposed on the CPU by external
7156 * actors.
7157 */
7158 min_capped_util = max(new_util, capacity_min_of(i));
7159
Maria Yuc28f0392019-05-15 11:45:50 +08007160 if (cpu_check_overutil_condition(i, new_util))
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307161 continue;
7162
7163 /*
7164 * Case A) Latency sensitive tasks
7165 *
7166 * Unconditionally favoring tasks that prefer idle CPU to
7167 * improve latency.
7168 *
7169 * Looking for:
7170 * - an idle CPU, whatever its idle_state is, since
7171 * the first CPUs we explore are more likely to be
7172 * reserved for latency sensitive tasks.
7173 * - a non idle CPU where the task fits in its current
7174 * capacity and has the maximum spare capacity.
7175 * - a non idle CPU with lower contention from other
7176 * tasks and running at the lowest possible OPP.
7177 *
7178 * The last two goals tries to favor a non idle CPU
7179 * where the task can run as if it is "almost alone".
7180 * A maximum spare capacity CPU is favoured since
7181 * the task already fits into that CPU's capacity
7182 * without waiting for an OPP chance.
7183 *
7184 * The following code path is the only one in the CPUs
7185 * exploration loop which is always used by
7186 * prefer_idle tasks. It exits the loop with wither a
7187 * best_active_cpu or a target_cpu which should
7188 * represent an optimal choice for latency sensitive
7189 * tasks.
7190 */
7191 if (prefer_idle) {
7192
7193 /*
7194 * Case A.1: IDLE CPU
7195 * Return the first IDLE CPU we find.
7196 */
7197 if (idle_cpu(i)) {
7198 schedstat_inc(p->se.statistics.nr_wakeups_fbt_pref_idle);
7199 schedstat_inc(this_rq()->eas_stats.fbt_pref_idle);
7200
7201 trace_sched_find_best_target(p,
7202 prefer_idle, min_util,
7203 cpu, best_idle_cpu,
Abhijeet Dharmapurikar2dba40b2018-06-19 14:19:29 -07007204 best_active_cpu,
7205 i, -1);
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307206
7207 return i;
7208 }
7209
7210 /*
7211 * Case A.2: Target ACTIVE CPU
7212 * Favor CPUs with max spare capacity.
7213 */
7214 if ((capacity_curr > new_util) &&
7215 (capacity_orig - new_util > target_max_spare_cap)) {
7216 target_max_spare_cap = capacity_orig - new_util;
7217 target_cpu = i;
7218 continue;
7219 }
7220 if (target_cpu != -1)
7221 continue;
7222
7223
7224 /*
7225 * Case A.3: Backup ACTIVE CPU
7226 * Favor CPUs with:
7227 * - lower utilization due to other tasks
7228 * - lower utilization with the task in
7229 */
7230 if (wake_util > min_wake_util)
7231 continue;
7232 if (new_util > best_active_util)
7233 continue;
7234 min_wake_util = wake_util;
7235 best_active_util = new_util;
7236 best_active_cpu = i;
7237 continue;
7238 }
7239
7240 /*
Pavankumar Kondeti602c51e2018-02-08 16:43:45 +05307241 * Favor CPUs with smaller capacity for Non latency
7242 * sensitive tasks.
7243 */
7244 if (capacity_orig > target_capacity)
7245 continue;
7246
7247 /*
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307248 * Case B) Non latency sensitive tasks on IDLE CPUs.
7249 *
7250 * Find an optimal backup IDLE CPU for non latency
7251 * sensitive tasks.
7252 *
7253 * Looking for:
7254 * - minimizing the capacity_orig,
7255 * i.e. preferring LITTLE CPUs
7256 * - favoring shallowest idle states
7257 * i.e. avoid to wakeup deep-idle CPUs
7258 *
7259 * The following code path is used by non latency
7260 * sensitive tasks if IDLE CPUs are available. If at
7261 * least one of such CPUs are available it sets the
7262 * best_idle_cpu to the most suitable idle CPU to be
7263 * selected.
7264 *
7265 * If idle CPUs are available, favour these CPUs to
7266 * improve performances by spreading tasks.
7267 * Indeed, the energy_diff() computed by the caller
7268 * will take care to ensure the minimization of energy
7269 * consumptions without affecting performance.
7270 */
7271 if (idle_cpu(i)) {
7272 int idle_idx = idle_get_state_idx(cpu_rq(i));
7273
Ionela Voinescuad920122017-12-07 20:09:11 +00007274 /* Favor CPUs that won't end up running at a
7275 * high OPP.
7276 */
7277 if ((capacity_orig - min_capped_util) <
7278 target_idle_max_spare_cap)
7279 continue;
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307280
7281 /*
7282 * Skip CPUs in deeper idle state, but only
7283 * if they are also less energy efficient.
7284 * IOW, prefer a deep IDLE LITTLE CPU vs a
7285 * shallow idle big CPU.
7286 */
7287 if (sysctl_sched_cstate_aware &&
7288 best_idle_cstate <= idle_idx)
7289 continue;
7290
7291 /* Keep track of best idle CPU */
Pavankumar Kondeti602c51e2018-02-08 16:43:45 +05307292 target_capacity = capacity_orig;
Ionela Voinescuad920122017-12-07 20:09:11 +00007293 target_idle_max_spare_cap = capacity_orig -
7294 min_capped_util;
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307295 best_idle_cstate = idle_idx;
7296 best_idle_cpu = i;
7297 continue;
7298 }
7299
7300 /*
Pavankumar Kondeti448f9ac2018-03-08 12:29:07 +05307301 * Consider only idle CPUs for active migration.
7302 */
7303 if (p->state == TASK_RUNNING)
7304 continue;
7305
7306 /*
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307307 * Case C) Non latency sensitive tasks on ACTIVE CPUs.
7308 *
7309 * Pack tasks in the most energy efficient capacities.
7310 *
7311 * This task packing strategy prefers more energy
7312 * efficient CPUs (i.e. pack on smaller maximum
7313 * capacity CPUs) while also trying to spread tasks to
7314 * run them all at the lower OPP.
7315 *
7316 * This assumes for example that it's more energy
7317 * efficient to run two tasks on two CPUs at a lower
7318 * OPP than packing both on a single CPU but running
7319 * that CPU at an higher OPP.
7320 *
7321 * Thus, this case keep track of the CPU with the
7322 * smallest maximum capacity and highest spare maximum
7323 * capacity.
7324 */
7325
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307326 active_cpus_count++;
7327
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307328 /* Favor CPUs with maximum spare capacity */
Ionela Voinescu881859c2017-12-07 20:09:50 +00007329 if ((capacity_orig - min_capped_util) <
7330 target_max_spare_cap)
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307331 continue;
7332
Ionela Voinescu881859c2017-12-07 20:09:50 +00007333 target_max_spare_cap = capacity_orig - min_capped_util;
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307334 target_capacity = capacity_orig;
7335 target_util = new_util;
7336 target_cpu = i;
Joonwoo Parkcc7d1272017-01-26 14:47:00 -08007337 }
Morten Rasmussen4017a8e2015-05-09 20:03:19 +01007338
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307339 if (do_rotate) {
7340 /*
7341 * We started iteration somewhere in the middle of
7342 * cpumask. Iterate once again from bit 0 to the
7343 * previous starting point bit.
7344 */
7345 do_rotate = false;
7346 i = -1;
7347 goto retry;
7348 }
7349
Pavankumar Kondetie1856da2018-02-19 17:01:24 +05307350 if (!sysctl_sched_is_big_little && !prefer_idle) {
7351
7352 /*
7353 * If we find an idle CPU in the primary cluster,
7354 * stop the search. We select this idle CPU or
7355 * the active CPU (if there is one), whichever
7356 * saves the energy.
7357 */
7358 if (best_idle_cpu != -1)
7359 break;
7360
Abhijeet Dharmapurikar53ee4232018-06-15 09:34:34 -07007361 if (fbt_env->placement_boost != SCHED_BOOST_NONE) {
Pavankumar Kondetie1856da2018-02-19 17:01:24 +05307362 target_capacity = ULONG_MAX;
7363 continue;
7364 }
7365
7366 /*
7367 * If we found an active CPU and its utilization
7368 * is below the minimum packing threshold (overlap),
7369 * no need to search further. Otherwise reset
7370 * the target_capacity and continue the search.
7371 */
7372 if (target_cpu != -1 && target_util <
7373 sched_smp_overlap_capacity)
7374 break;
7375
7376 target_capacity = ULONG_MAX;
7377 }
Abhijeet Dharmapurikarabda8d32018-06-20 22:11:59 -07007378 /*
7379 * if we have found a target cpu within a group, don't bother
7380 * checking other groups.
7381 */
7382 if (target_capacity != ULONG_MAX)
7383 break;
7384
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307385 } while (sg = sg->next, sg != sd->groups);
Joonwoo Parkbeaecb42017-03-31 17:26:25 -07007386
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307387 if (best_idle_cpu != -1 && !is_packing_eligible(p, target_cpu, fbt_env,
Pavankumar Kondeti646fe8f2018-03-23 08:10:20 +05307388 active_cpus_count, best_idle_cstate)) {
Abhijeet Dharmapurikar1a734c22018-06-26 14:24:37 -07007389 if (target_cpu == prev_cpu)
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307390 fbt_env->avoid_prev_cpu = true;
7391
7392 target_cpu = best_idle_cpu;
7393 best_idle_cpu = -1;
7394 }
7395
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307396 /*
7397 * For non latency sensitive tasks, cases B and C in the previous loop,
7398 * we pick the best IDLE CPU only if we was not able to find a target
7399 * ACTIVE CPU.
7400 *
7401 * Policies priorities:
7402 *
7403 * - prefer_idle tasks:
7404 *
7405 * a) IDLE CPU available, we return immediately
7406 * b) ACTIVE CPU where task fits and has the bigger maximum spare
7407 * capacity (i.e. target_cpu)
7408 * c) ACTIVE CPU with less contention due to other tasks
7409 * (i.e. best_active_cpu)
7410 *
7411 * - NON prefer_idle tasks:
7412 *
7413 * a) ACTIVE CPU: target_cpu
7414 * b) IDLE CPU: best_idle_cpu
7415 */
7416 if (target_cpu == -1)
7417 target_cpu = prefer_idle
7418 ? best_active_cpu
7419 : best_idle_cpu;
7420 else
7421 *backup_cpu = prefer_idle
7422 ? best_active_cpu
7423 : best_idle_cpu;
7424
Satya Durga Srinivasu Prabhalaa7fc94a2018-06-05 13:23:19 -07007425 if (target_cpu == -1 && most_spare_cap_cpu != -1 &&
7426 /* ensure we use active cpu for active migration */
7427 !(p->state == TASK_RUNNING && !idle_cpu(most_spare_cap_cpu)))
7428 target_cpu = most_spare_cap_cpu;
7429
Lingutla Chandrasekharee8cf2e2018-09-10 16:56:22 +05307430 if (cpu_isolated(prev_cpu)) {
Pavankumar Kondetief13e1e2018-06-21 20:17:32 +05307431 fbt_env->avoid_prev_cpu = true;
Lingutla Chandrasekharee8cf2e2018-09-10 16:56:22 +05307432 if (target_cpu == -1 && isolated_candidate != -1)
7433 target_cpu = isolated_candidate;
Pavankumar Kondetief13e1e2018-06-21 20:17:32 +05307434 }
7435
Abhijeet Dharmapurikar1a734c22018-06-26 14:24:37 -07007436 /*
7437 * - It is possible for target and backup
7438 * to select same CPU - if so, drop backup
7439 *
7440 * - The next step of energy evaluation includes
7441 * prev_cpu. Drop target or backup if it is
7442 * same as prev_cpu.
7443 */
7444 if (*backup_cpu == target_cpu || *backup_cpu == prev_cpu)
7445 *backup_cpu = -1;
7446
7447 if (target_cpu == prev_cpu) {
7448 target_cpu = *backup_cpu;
7449 *backup_cpu = -1;
7450 }
7451
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307452 trace_sched_find_best_target(p, prefer_idle, min_util, cpu,
7453 best_idle_cpu, best_active_cpu,
Abhijeet Dharmapurikar2dba40b2018-06-19 14:19:29 -07007454 target_cpu, *backup_cpu);
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307455
7456 schedstat_inc(p->se.statistics.nr_wakeups_fbt_count);
7457 schedstat_inc(this_rq()->eas_stats.fbt_count);
7458
7459 return target_cpu;
7460}
7461
7462/*
7463 * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
7464 * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
7465 *
7466 * In that case WAKE_AFFINE doesn't make sense and we'll let
7467 * BALANCE_WAKE sort things out.
7468 */
7469static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
7470{
7471 long min_cap, max_cap;
7472 min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
7473 max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
7474 /* Minimum capacity is close to max, no need to abort wake_affine */
7475 if (max_cap - min_cap < max_cap >> 3)
7476 return 0;
7477
7478 /* Bring task utilization in sync with prev_cpu */
7479 sync_entity_load_avg(&p->se);
7480
7481 return min_cap * 1024 < task_util(p) * capacity_margin;
7482}
7483
Dmitry Adamushkoa2a2d682007-10-15 17:00:13 +02007484static inline int wake_to_idle(struct task_struct *p)
Ingo Molnar1799e352007-09-19 23:34:46 +02007485{
Dmitry Adamushko2b1e3152007-10-15 17:00:12 +02007486 return (current->flags & PF_WAKE_UP_IDLE) ||
Ingo Molnar1799e352007-09-19 23:34:46 +02007487 (p->flags & PF_WAKE_UP_IDLE);
7488}
7489
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307490static inline bool
7491bias_to_waker_cpu(struct task_struct *p, int cpu, struct cpumask *rtg_target)
Dmitry Adamushko2b1e3152007-10-15 17:00:12 +02007492{
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307493 int rtg_target_cpu = rtg_target ? cpumask_first(rtg_target) : cpu;
Ingo Molnar1799e352007-09-19 23:34:46 +02007494
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307495 return cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) &&
7496 cpu_active(cpu) && !cpu_isolated(cpu) &&
7497 capacity_orig_of(cpu) >= capacity_orig_of(rtg_target_cpu) &&
Puja Gupta2ceabae2019-01-08 14:13:25 -08007498 task_fits_max(p, cpu);
Gregory Haskinse7693a32008-01-25 21:08:09 +01007499}
7500
Pavankumar Kondeti7cc02922018-03-23 11:15:21 +05307501#define SCHED_SELECT_PREV_CPU_NSEC 2000000
7502#define SCHED_FORCE_CPU_SELECTION_NSEC 20000000
7503
7504static inline bool
7505bias_to_prev_cpu(struct task_struct *p, struct cpumask *rtg_target)
7506{
7507 int prev_cpu = task_cpu(p);
7508#ifdef CONFIG_SCHED_WALT
7509 u64 ms = p->ravg.mark_start;
7510#else
7511 u64 ms = sched_clock();
7512#endif
7513
7514 if (cpu_isolated(prev_cpu) || !idle_cpu(prev_cpu))
7515 return false;
7516
7517 if (!ms)
7518 return false;
7519
7520 if (ms - p->last_cpu_selected_ts >= SCHED_SELECT_PREV_CPU_NSEC) {
7521 p->last_cpu_selected_ts = ms;
7522 return false;
7523 }
7524
7525 if (ms - p->last_sleep_ts >= SCHED_SELECT_PREV_CPU_NSEC)
7526 return false;
7527
7528 if (rtg_target && !cpumask_test_cpu(prev_cpu, rtg_target))
7529 return false;
7530
7531 return true;
7532}
7533
Pavankumar Kondeti5eaea372018-04-03 14:36:52 +05307534#ifdef CONFIG_SCHED_WALT
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307535static inline struct cpumask *find_rtg_target(struct task_struct *p)
Gregory Haskinse7693a32008-01-25 21:08:09 +01007536{
Gregory Haskinse7693a32008-01-25 21:08:09 +01007537 struct related_thread_group *grp;
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307538 struct cpumask *rtg_target;
Gregory Haskinse7693a32008-01-25 21:08:09 +01007539
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307540 rcu_read_lock();
Gregory Haskinse7693a32008-01-25 21:08:09 +01007541
Gregory Haskinse7693a32008-01-25 21:08:09 +01007542 grp = task_related_thread_group(p);
Abhijeet Dharmapurikar53ee4232018-06-15 09:34:34 -07007543 if (grp && grp->preferred_cluster &&
7544 (task_util(p) >
7545 sysctl_sched_min_task_util_for_boost_colocation)) {
Gregory Haskinse7693a32008-01-25 21:08:09 +01007546 rtg_target = &grp->preferred_cluster->cpus;
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307547 if (!task_fits_max(p, cpumask_first(rtg_target)))
7548 rtg_target = NULL;
7549 } else {
7550 rtg_target = NULL;
Gregory Haskinse7693a32008-01-25 21:08:09 +01007551 }
7552
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307553 rcu_read_unlock();
Peter Zijlstraa50bde52009-11-12 15:55:28 +01007554
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307555 return rtg_target;
7556}
Pavankumar Kondeti5eaea372018-04-03 14:36:52 +05307557#else
7558static inline struct cpumask *find_rtg_target(struct task_struct *p)
7559{
7560 return NULL;
7561}
7562#endif
Peter Zijlstraa50bde52009-11-12 15:55:28 +01007563
Lingutla Chandrasekhar1d926052018-04-27 15:21:09 +05307564enum fastpaths {
7565 NONE = 0,
7566 SYNC_WAKEUP,
7567 PREV_CPU_BIAS,
7568};
7569
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307570static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync)
7571{
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307572 bool boosted, prefer_idle;
Patrick Bellasi271e65c2017-09-12 15:01:17 +01007573 struct sched_domain *sd;
7574 int target_cpu;
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307575 int backup_cpu = -1;
7576 int next_cpu = -1;
7577 struct cpumask *rtg_target = find_rtg_target(p);
7578 struct find_best_target_env fbt_env;
Pavankumar Kondeti152adab2018-04-03 12:33:27 +05307579 u64 start_t = 0;
Lingutla Chandrasekhar1d926052018-04-27 15:21:09 +05307580 int fastpath = 0;
Pavankumar Kondeti152adab2018-04-03 12:33:27 +05307581
7582 if (trace_sched_task_util_enabled())
7583 start_t = sched_clock();
Peter Zijlstraa50bde52009-11-12 15:55:28 +01007584
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307585 schedstat_inc(p->se.statistics.nr_wakeups_secb_attempts);
7586 schedstat_inc(this_rq()->eas_stats.secb_attempts);
Peter Zijlstraa50bde52009-11-12 15:55:28 +01007587
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307588#ifdef CONFIG_CGROUP_SCHEDTUNE
7589 boosted = schedtune_task_boost(p) > 0;
7590 prefer_idle = schedtune_prefer_idle(p) > 0;
7591#else
7592 boosted = get_sysctl_sched_cfs_boost() > 0;
7593 prefer_idle = 0;
7594#endif
Peter Zijlstraa50bde52009-11-12 15:55:28 +01007595
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307596 fbt_env.rtg_target = rtg_target;
Pavankumar Kondetif07ab012018-02-19 17:01:10 +05307597 if (sched_feat(EAS_USE_NEED_IDLE) && prefer_idle) {
7598 fbt_env.need_idle = true;
7599 prefer_idle = false;
7600 } else {
7601 fbt_env.need_idle = wake_to_idle(p);
Dietmar Eggemann9e91d612015-08-14 17:23:12 +01007602 }
Abhijeet Dharmapurikar53ee4232018-06-15 09:34:34 -07007603
7604 fbt_env.placement_boost = task_boost_policy(p);
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307605 fbt_env.avoid_prev_cpu = false;
Dietmar Eggemann231678b2015-08-14 17:23:13 +01007606
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307607 if (prefer_idle || fbt_env.need_idle)
7608 sync = 0;
Dietmar Eggemann231678b2015-08-14 17:23:13 +01007609
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307610 if (sysctl_sched_sync_hint_enable && sync) {
7611 int cpu = smp_processor_id();
Peter Zijlstraaaee1202009-09-10 13:36:25 +02007612
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307613 if (bias_to_waker_cpu(p, cpu, rtg_target)) {
7614 schedstat_inc(p->se.statistics.nr_wakeups_secb_sync);
7615 schedstat_inc(this_rq()->eas_stats.secb_sync);
Lingutla Chandrasekhar1d926052018-04-27 15:21:09 +05307616 target_cpu = cpu;
7617 fastpath = SYNC_WAKEUP;
7618 goto out;
Morten Rasmussende91b9c2014-02-18 14:14:24 +00007619 }
Peter Zijlstraaaee1202009-09-10 13:36:25 +02007620 }
Morten Rasmussende91b9c2014-02-18 14:14:24 +00007621
Lingutla Chandrasekhar1d926052018-04-27 15:21:09 +05307622 if (bias_to_prev_cpu(p, rtg_target)) {
7623 target_cpu = prev_cpu;
7624 fastpath = PREV_CPU_BIAS;
7625 goto out;
7626 }
Pavankumar Kondeti7cc02922018-03-23 11:15:21 +05307627
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307628 sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
Patrick Bellasi271e65c2017-09-12 15:01:17 +01007629 if (!sd) {
7630 target_cpu = prev_cpu;
Joel Fernandes (Google)d35fc8e2018-06-18 11:54:56 -07007631 goto out;
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02007632 }
Suresh Siddha99bd5e22010-03-31 16:47:45 -07007633
Patrick Bellasi271e65c2017-09-12 15:01:17 +01007634 sync_entity_load_avg(&p->se);
7635
7636 /* Find a cpu with sufficient capacity */
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307637 next_cpu = find_best_target(p, &backup_cpu, boosted, prefer_idle,
7638 &fbt_env);
Patrick Bellasi271e65c2017-09-12 15:01:17 +01007639 if (next_cpu == -1) {
7640 target_cpu = prev_cpu;
Joel Fernandes (Google)d35fc8e2018-06-18 11:54:56 -07007641 goto out;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02007642 }
Peter Zijlstrae4f428882009-12-16 18:04:34 +01007643
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307644 if (fbt_env.placement_boost || fbt_env.need_idle ||
7645 fbt_env.avoid_prev_cpu || (rtg_target &&
Abhijeet Dharmapurikar023ebc72018-07-05 14:57:03 -07007646 (!cpumask_test_cpu(prev_cpu, rtg_target) ||
7647 cpumask_test_cpu(next_cpu, rtg_target)))) {
Pavankumar Kondeti4fa21cd2018-02-01 19:11:21 +05307648 target_cpu = next_cpu;
Joel Fernandes (Google)d35fc8e2018-06-18 11:54:56 -07007649 goto out;
Ingo Molnar098fb9d2008-03-16 20:36:10 +01007650 }
Gregory Haskinse7693a32008-01-25 21:08:09 +01007651
Patrick Bellasi271e65c2017-09-12 15:01:17 +01007652 /* Unconditionally prefer IDLE CPUs for boosted/prefer_idle tasks */
7653 if ((boosted || prefer_idle) && idle_cpu(next_cpu)) {
7654 schedstat_inc(p->se.statistics.nr_wakeups_secb_idle_bt);
7655 schedstat_inc(this_rq()->eas_stats.secb_idle_bt);
7656 target_cpu = next_cpu;
Joel Fernandes (Google)d35fc8e2018-06-18 11:54:56 -07007657 goto out;
Patrick Bellasi271e65c2017-09-12 15:01:17 +01007658 }
7659
7660 target_cpu = prev_cpu;
7661 if (next_cpu != prev_cpu) {
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307662 int delta = 0;
Suresh Siddha99bd5e22010-03-31 16:47:45 -07007663 struct energy_env eenv = {
Chris Redpath505be1f2017-09-12 14:48:29 +01007664 .p = p,
Patrick Bellasieca58022017-07-05 10:59:59 +01007665 .util_delta = task_util(p),
7666 /* Task's previous CPU candidate */
7667 .cpu[EAS_CPU_PRV] = {
7668 .cpu_id = prev_cpu,
7669 },
7670 /* Main alternative CPU candidate */
7671 .cpu[EAS_CPU_NXT] = {
7672 .cpu_id = next_cpu,
7673 },
7674 /* Backup alternative CPU candidate */
7675 .cpu[EAS_CPU_BKP] = {
7676 .cpu_id = backup_cpu,
7677 },
Gregory Haskinse7693a32008-01-25 21:08:09 +01007678 };
7679
Peter Zijlstrae4f428882009-12-16 18:04:34 +01007680
7681#ifdef CONFIG_SCHED_WALT
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307682 if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
7683 p->state == TASK_WAKING)
7684 delta = task_util(p);
Gregory Haskinse7693a32008-01-25 21:08:09 +01007685#endif
7686 /* Not enough spare capacity on previous cpu */
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307687 if (__cpu_overutilized(prev_cpu, delta)) {
7688 schedstat_inc(p->se.statistics.nr_wakeups_secb_insuff_cap);
7689 schedstat_inc(this_rq()->eas_stats.secb_insuff_cap);
Patrick Bellasi271e65c2017-09-12 15:01:17 +01007690 target_cpu = next_cpu;
Joel Fernandes (Google)d35fc8e2018-06-18 11:54:56 -07007691 goto out;
Peter Zijlstraae154be2009-09-10 14:40:57 +02007692 }
7693
Patrick Bellasieca58022017-07-05 10:59:59 +01007694 /* Check if EAS_CPU_NXT is a more energy efficient CPU */
7695 if (select_energy_cpu_idx(&eenv) != EAS_CPU_PRV) {
7696 schedstat_inc(p->se.statistics.nr_wakeups_secb_nrg_sav);
7697 schedstat_inc(this_rq()->eas_stats.secb_nrg_sav);
7698 target_cpu = eenv.cpu[eenv.next_idx].cpu_id;
Joel Fernandes (Google)d35fc8e2018-06-18 11:54:56 -07007699 goto out;
Suresh Siddha99bd5e22010-03-31 16:47:45 -07007700 }
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307701
Patrick Bellasieca58022017-07-05 10:59:59 +01007702 schedstat_inc(p->se.statistics.nr_wakeups_secb_no_nrg_sav);
7703 schedstat_inc(this_rq()->eas_stats.secb_no_nrg_sav);
7704 target_cpu = prev_cpu;
Joel Fernandes (Google)d35fc8e2018-06-18 11:54:56 -07007705 goto out;
Suresh Siddha99bd5e22010-03-31 16:47:45 -07007706 }
Alex Shif03542a2012-07-26 08:55:34 +08007707
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307708 schedstat_inc(p->se.statistics.nr_wakeups_secb_count);
7709 schedstat_inc(this_rq()->eas_stats.secb_count);
7710
Lingutla Chandrasekhar1d926052018-04-27 15:21:09 +05307711out:
7712 trace_sched_task_util(p, next_cpu, backup_cpu, target_cpu, sync,
7713 fbt_env.need_idle, fastpath,
7714 fbt_env.placement_boost, rtg_target ?
7715 cpumask_first(rtg_target) : -1, start_t);
Gregory Haskinse7693a32008-01-25 21:08:09 +01007716 return target_cpu;
7717}
Peter Zijlstraaaee1202009-09-10 13:36:25 +02007718
7719/*
7720 * select_task_rq_fair: Select target runqueue for the waking task in domains
7721 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
7722 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
7723 *
Peter Zijlstra59abf022009-09-16 08:28:30 +02007724 * Balances load by selecting the idlest cpu in the idlest group, or under
Peter Zijlstraae154be2009-09-10 14:40:57 +02007725 * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
7726 *
7727 * Returns the target cpu number.
7728 *
Gregory Haskinse7693a32008-01-25 21:08:09 +01007729 * preempt must be disabled.
Peter Zijlstraae154be2009-09-10 14:40:57 +02007730 */
7731static int
7732select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
7733{
Gregory Haskinse7693a32008-01-25 21:08:09 +01007734 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
Peter Zijlstraae154be2009-09-10 14:40:57 +02007735 int cpu = smp_processor_id();
Ingo Molnar4ae7d5c2008-03-19 01:42:00 +01007736 int new_cpu = prev_cpu;
Peter Zijlstra59abf022009-09-16 08:28:30 +02007737 int want_affine = 0;
7738 int sync = wake_flags & WF_SYNC;
7739
7740 if (sd_flag & SD_BALANCE_WAKE) {
Peter Zijlstra29cd8ba2009-09-17 09:01:14 +02007741 record_wakee(p);
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307742 want_affine = (!wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
7743 cpumask_test_cpu(cpu, tsk_cpus_allowed(p)));
Peter Zijlstraaaee1202009-09-10 13:36:25 +02007744 }
Peter Zijlstrafe3bcfe2009-11-12 15:55:29 +01007745
Joel Fernandes (Google)d35fc8e2018-06-18 11:54:56 -07007746 if (energy_aware()) {
7747 rcu_read_lock();
7748 new_cpu = select_energy_cpu_brute(p, prev_cpu, sync);
7749 rcu_read_unlock();
7750 return new_cpu;
7751 }
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307752
Ingo Molnar098fb9d2008-03-16 20:36:10 +01007753 rcu_read_lock();
Ingo Molnar4ae7d5c2008-03-19 01:42:00 +01007754 for_each_domain(cpu, tmp) {
Gregory Haskinse7693a32008-01-25 21:08:09 +01007755 if (!(tmp->flags & SD_LOAD_BALANCE))
Peter Zijlstraaaee1202009-09-10 13:36:25 +02007756 break;
Peter Zijlstrafe3bcfe2009-11-12 15:55:29 +01007757
7758 /*
7759 * If both cpu and prev_cpu are part of this domain,
7760 * cpu is a valid SD_WAKE_AFFINE target.
7761 */
Suresh Siddha99bd5e22010-03-31 16:47:45 -07007762 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
7763 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
7764 affine_sd = tmp;
7765 break;
Peter Zijlstrac88d5912009-09-10 13:50:02 +02007766 }
7767
Alex Shif03542a2012-07-26 08:55:34 +08007768 if (tmp->flags & sd_flag)
Peter Zijlstra29cd8ba2009-09-17 09:01:14 +02007769 sd = tmp;
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02007770 else if (!want_affine)
7771 break;
Peter Zijlstrac88d5912009-09-10 13:50:02 +02007772 }
Peter Zijlstraaaee1202009-09-10 13:36:25 +02007773
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02007774 if (affine_sd) {
7775 sd = NULL; /* Prefer wake_affine over balance flags */
Morten Rasmussen772bd008c2016-06-22 18:03:13 +01007776 if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02007777 new_cpu = cpu;
Mike Galbraith8b911ac2010-03-11 17:17:16 +01007778 }
Peter Zijlstra3b640892009-09-16 13:44:33 +02007779
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307780 if (sd && !(sd_flag & SD_BALANCE_FORK)) {
7781 /*
7782 * We're going to need the task's util for capacity_spare_wake
7783 * in find_idlest_group. Sync it up to prev_cpu's
7784 * last_update_time.
7785 */
7786 sync_entity_load_avg(&p->se);
7787 }
7788
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02007789 if (!sd) {
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307790 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
Morten Rasmussen772bd008c2016-06-22 18:03:13 +01007791 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02007792
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05307793 } else {
7794 new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
Gregory Haskinse7693a32008-01-25 21:08:09 +01007795 }
Peter Zijlstradce840a2011-04-07 14:09:50 +02007796 rcu_read_unlock();
Gregory Haskinse7693a32008-01-25 21:08:09 +01007797
Peter Zijlstrac88d5912009-09-10 13:50:02 +02007798 return new_cpu;
Gregory Haskinse7693a32008-01-25 21:08:09 +01007799}
Paul Turner0a74bef2012-10-04 13:18:30 +02007800
7801/*
7802 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
7803 * cfs_rq_of(p) references at time of call are still valid and identify the
Byungchul Park525628c2015-11-18 09:34:59 +09007804 * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
Paul Turner0a74bef2012-10-04 13:18:30 +02007805 */
xiaofeng.yan5a4fd032015-09-23 14:55:59 +08007806static void migrate_task_rq_fair(struct task_struct *p)
Paul Turner0a74bef2012-10-04 13:18:30 +02007807{
Paul Turneraff3e492012-10-04 13:18:30 +02007808 /*
Peter Zijlstra59efa0b2016-05-10 18:24:37 +02007809 * As blocked tasks retain absolute vruntime the migration needs to
7810 * deal with this by subtracting the old and adding the new
7811 * min_vruntime -- the latter is done by enqueue_entity() when placing
7812 * the task on the new runqueue.
7813 */
7814 if (p->state == TASK_WAKING) {
7815 struct sched_entity *se = &p->se;
7816 struct cfs_rq *cfs_rq = cfs_rq_of(se);
7817 u64 min_vruntime;
7818
7819#ifndef CONFIG_64BIT
7820 u64 min_vruntime_copy;
7821
7822 do {
7823 min_vruntime_copy = cfs_rq->min_vruntime_copy;
7824 smp_rmb();
7825 min_vruntime = cfs_rq->min_vruntime;
7826 } while (min_vruntime != min_vruntime_copy);
7827#else
7828 min_vruntime = cfs_rq->min_vruntime;
7829#endif
7830
7831 se->vruntime -= min_vruntime;
7832 }
7833
7834 /*
Yuyang Du9d89c252015-07-15 08:04:37 +08007835 * We are supposed to update the task to "current" time, then its up to date
7836 * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
7837 * what current time is, so simply throw away the out-of-date time. This
7838 * will result in the wakee task is less decayed, but giving the wakee more
7839 * load sounds not bad.
Paul Turneraff3e492012-10-04 13:18:30 +02007840 */
Yuyang Du9d89c252015-07-15 08:04:37 +08007841 remove_entity_load_avg(&p->se);
7842
7843 /* Tell new CPU we are migrated */
7844 p->se.avg.last_update_time = 0;
Ben Segall3944a922014-05-15 15:59:20 -07007845
7846 /* We have migrated, no longer consider this task hot */
Yuyang Du9d89c252015-07-15 08:04:37 +08007847 p->se.exec_start = 0;
Paul Turner0a74bef2012-10-04 13:18:30 +02007848}
Yuyang Du12695572015-07-15 08:04:40 +08007849
7850static void task_dead_fair(struct task_struct *p)
7851{
7852 remove_entity_load_avg(&p->se);
7853}
Patrick Bellasi2178e842016-07-22 11:35:59 +01007854#else
7855#define task_fits_max(p, cpu) true
Gregory Haskinse7693a32008-01-25 21:08:09 +01007856#endif /* CONFIG_SMP */
7857
Peter Zijlstrae52fb7c2009-01-14 12:39:19 +01007858static unsigned long
7859wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
Peter Zijlstra0bbd3332008-04-19 19:44:57 +02007860{
7861 unsigned long gran = sysctl_sched_wakeup_granularity;
7862
7863 /*
Peter Zijlstrae52fb7c2009-01-14 12:39:19 +01007864 * Since its curr running now, convert the gran from real-time
7865 * to virtual-time in his units.
Mike Galbraith13814d42010-03-11 17:17:04 +01007866 *
7867 * By using 'se' instead of 'curr' we penalize light tasks, so
7868 * they get preempted easier. That is, if 'se' < 'curr' then
7869 * the resulting gran will be larger, therefore penalizing the
7870 * lighter, if otoh 'se' > 'curr' then the resulting gran will
7871 * be smaller, again penalizing the lighter task.
7872 *
7873 * This is especially important for buddies when the leftmost
7874 * task is higher priority than the buddy.
Peter Zijlstra0bbd3332008-04-19 19:44:57 +02007875 */
Shaohua Lif4ad9bd2011-04-08 12:53:09 +08007876 return calc_delta_fair(gran, se);
Peter Zijlstra0bbd3332008-04-19 19:44:57 +02007877}
7878
7879/*
Peter Zijlstra464b7522008-10-24 11:06:15 +02007880 * Should 'se' preempt 'curr'.
7881 *
7882 * |s1
7883 * |s2
7884 * |s3
7885 * g
7886 * |<--->|c
7887 *
7888 * w(c, s1) = -1
7889 * w(c, s2) = 0
7890 * w(c, s3) = 1
7891 *
7892 */
7893static int
7894wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
7895{
7896 s64 gran, vdiff = curr->vruntime - se->vruntime;
7897
7898 if (vdiff <= 0)
7899 return -1;
7900
Peter Zijlstrae52fb7c2009-01-14 12:39:19 +01007901 gran = wakeup_gran(curr, se);
Peter Zijlstra464b7522008-10-24 11:06:15 +02007902 if (vdiff > gran)
7903 return 1;
7904
7905 return 0;
7906}
7907
Peter Zijlstra02479092008-11-04 21:25:10 +01007908static void set_last_buddy(struct sched_entity *se)
7909{
Venkatesh Pallipadi69c80f32011-04-13 18:21:09 -07007910 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
7911 return;
7912
7913 for_each_sched_entity(se)
7914 cfs_rq_of(se)->last = se;
Peter Zijlstra02479092008-11-04 21:25:10 +01007915}
7916
7917static void set_next_buddy(struct sched_entity *se)
7918{
Venkatesh Pallipadi69c80f32011-04-13 18:21:09 -07007919 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
7920 return;
7921
7922 for_each_sched_entity(se)
7923 cfs_rq_of(se)->next = se;
Peter Zijlstra02479092008-11-04 21:25:10 +01007924}
7925
Rik van Rielac53db52011-02-01 09:51:03 -05007926static void set_skip_buddy(struct sched_entity *se)
7927{
Venkatesh Pallipadi69c80f32011-04-13 18:21:09 -07007928 for_each_sched_entity(se)
7929 cfs_rq_of(se)->skip = se;
Rik van Rielac53db52011-02-01 09:51:03 -05007930}
7931
Peter Zijlstra464b7522008-10-24 11:06:15 +02007932/*
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02007933 * Preempt the current task with a newly woken task if needed:
7934 */
Peter Zijlstra5a9b86f2009-09-16 13:47:58 +02007935static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02007936{
7937 struct task_struct *curr = rq->curr;
Srivatsa Vaddagiri8651a862007-10-15 17:00:12 +02007938 struct sched_entity *se = &curr->se, *pse = &p->se;
Mike Galbraith03e89e42008-12-16 08:45:30 +01007939 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
Mike Galbraithf685cea2009-10-23 23:09:22 +02007940 int scale = cfs_rq->nr_running >= sched_nr_latency;
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07007941 int next_buddy_marked = 0;
Mike Galbraith03e89e42008-12-16 08:45:30 +01007942
Ingo Molnar4ae7d5c2008-03-19 01:42:00 +01007943 if (unlikely(se == pse))
7944 return;
7945
Paul Turner5238cdd2011-07-21 09:43:37 -07007946 /*
Kirill Tkhai163122b2014-08-20 13:48:29 +04007947 * This is possible from callers such as attach_tasks(), in which we
Paul Turner5238cdd2011-07-21 09:43:37 -07007948 * unconditionally check_prempt_curr() after an enqueue (which may have
7949 * lead to a throttle). This both saves work and prevents false
7950 * next-buddy nomination below.
7951 */
7952 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
7953 return;
7954
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07007955 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
Mike Galbraith3cb63d52009-09-11 12:01:17 +02007956 set_next_buddy(pse);
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07007957 next_buddy_marked = 1;
7958 }
Peter Zijlstra57fdc262008-09-23 15:33:45 +02007959
Bharata B Raoaec0a512008-08-28 14:42:49 +05307960 /*
7961 * We can come here with TIF_NEED_RESCHED already set from new task
7962 * wake up path.
Paul Turner5238cdd2011-07-21 09:43:37 -07007963 *
7964 * Note: this also catches the edge-case of curr being in a throttled
7965 * group (e.g. via set_curr_task), since update_curr() (in the
7966 * enqueue of curr) will have resulted in resched being set. This
7967 * prevents us from potentially nominating it as a false LAST_BUDDY
7968 * below.
Bharata B Raoaec0a512008-08-28 14:42:49 +05307969 */
7970 if (test_tsk_need_resched(curr))
7971 return;
7972
Darren Harta2f5c9a2011-02-22 13:04:33 -08007973 /* Idle tasks are by definition preempted by non-idle tasks. */
7974 if (unlikely(curr->policy == SCHED_IDLE) &&
7975 likely(p->policy != SCHED_IDLE))
7976 goto preempt;
7977
Ingo Molnar91c234b2007-10-15 17:00:18 +02007978 /*
Darren Harta2f5c9a2011-02-22 13:04:33 -08007979 * Batch and idle tasks do not preempt non-idle tasks (their preemption
7980 * is driven by the tick):
Ingo Molnar91c234b2007-10-15 17:00:18 +02007981 */
Ingo Molnar8ed92e52012-10-14 14:28:50 +02007982 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
Ingo Molnar91c234b2007-10-15 17:00:18 +02007983 return;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02007984
Peter Zijlstra3a7e73a2009-11-28 18:51:02 +01007985 find_matching_se(&se, &pse);
Paul Turner9bbd7372011-07-05 19:07:21 -07007986 update_curr(cfs_rq_of(se));
Peter Zijlstra3a7e73a2009-11-28 18:51:02 +01007987 BUG_ON(!pse);
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07007988 if (wakeup_preempt_entity(se, pse) == 1) {
7989 /*
7990 * Bias pick_next to pick the sched entity that is
7991 * triggering this preemption.
7992 */
7993 if (!next_buddy_marked)
7994 set_next_buddy(pse);
Peter Zijlstra3a7e73a2009-11-28 18:51:02 +01007995 goto preempt;
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07007996 }
Jupyung Leea65ac742009-11-17 18:51:40 +09007997
Peter Zijlstra3a7e73a2009-11-28 18:51:02 +01007998 return;
7999
8000preempt:
Kirill Tkhai88751252014-06-29 00:03:57 +04008001 resched_curr(rq);
Peter Zijlstra3a7e73a2009-11-28 18:51:02 +01008002 /*
8003 * Only set the backward buddy when the current task is still
8004 * on the rq. This can happen when a wakeup gets interleaved
8005 * with schedule on the ->pre_schedule() or idle_balance()
8006 * point, either of which can * drop the rq lock.
8007 *
8008 * Also, during early boot the idle thread is in the fair class,
8009 * for obvious reasons its a bad idea to schedule back to it.
8010 */
8011 if (unlikely(!se->on_rq || curr == rq->idle))
8012 return;
8013
8014 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
8015 set_last_buddy(se);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02008016}
8017
Peter Zijlstra606dba22012-02-11 06:05:00 +01008018static struct task_struct *
Matt Fleming5a91d732016-09-21 14:38:10 +01008019pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02008020{
8021 struct cfs_rq *cfs_rq = &rq->cfs;
8022 struct sched_entity *se;
Peter Zijlstra678d5712012-02-11 06:05:00 +01008023 struct task_struct *p;
Peter Zijlstra37e117c2014-02-14 12:25:08 +01008024 int new_tasks;
Peter Zijlstra678d5712012-02-11 06:05:00 +01008025
Peter Zijlstra6e831252014-02-11 16:11:48 +01008026again:
Peter Zijlstra678d5712012-02-11 06:05:00 +01008027#ifdef CONFIG_FAIR_GROUP_SCHED
8028 if (!cfs_rq->nr_running)
Peter Zijlstra38033c32014-01-23 20:32:21 +01008029 goto idle;
Peter Zijlstra678d5712012-02-11 06:05:00 +01008030
Peter Zijlstra3f1d2a32014-02-12 10:49:30 +01008031 if (prev->sched_class != &fair_sched_class)
Peter Zijlstra678d5712012-02-11 06:05:00 +01008032 goto simple;
8033
8034 /*
8035 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
8036 * likely that a next task is from the same cgroup as the current.
8037 *
8038 * Therefore attempt to avoid putting and setting the entire cgroup
8039 * hierarchy, only change the part that actually changes.
8040 */
8041
8042 do {
8043 struct sched_entity *curr = cfs_rq->curr;
8044
8045 /*
8046 * Since we got here without doing put_prev_entity() we also
8047 * have to consider cfs_rq->curr. If it is still a runnable
8048 * entity, update_curr() will update its vruntime, otherwise
8049 * forget we've ever seen it.
8050 */
Ben Segall54d27362015-04-06 15:28:10 -07008051 if (curr) {
8052 if (curr->on_rq)
8053 update_curr(cfs_rq);
8054 else
8055 curr = NULL;
Peter Zijlstra678d5712012-02-11 06:05:00 +01008056
Ben Segall54d27362015-04-06 15:28:10 -07008057 /*
8058 * This call to check_cfs_rq_runtime() will do the
8059 * throttle and dequeue its entity in the parent(s).
8060 * Therefore the 'simple' nr_running test will indeed
8061 * be correct.
8062 */
8063 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
8064 goto simple;
8065 }
Peter Zijlstra678d5712012-02-11 06:05:00 +01008066
8067 se = pick_next_entity(cfs_rq, curr);
8068 cfs_rq = group_cfs_rq(se);
8069 } while (cfs_rq);
8070
8071 p = task_of(se);
8072
8073 /*
8074 * Since we haven't yet done put_prev_entity and if the selected task
8075 * is a different task than we started out with, try and touch the
8076 * least amount of cfs_rqs.
8077 */
8078 if (prev != p) {
8079 struct sched_entity *pse = &prev->se;
8080
8081 while (!(cfs_rq = is_same_group(se, pse))) {
8082 int se_depth = se->depth;
8083 int pse_depth = pse->depth;
8084
8085 if (se_depth <= pse_depth) {
8086 put_prev_entity(cfs_rq_of(pse), pse);
8087 pse = parent_entity(pse);
8088 }
8089 if (se_depth >= pse_depth) {
8090 set_next_entity(cfs_rq_of(se), se);
8091 se = parent_entity(se);
8092 }
8093 }
8094
8095 put_prev_entity(cfs_rq, pse);
8096 set_next_entity(cfs_rq, se);
8097 }
8098
8099 if (hrtick_enabled(rq))
8100 hrtick_start_fair(rq, p);
8101
Morten Rasmussen4c6a8242016-02-25 12:47:54 +00008102 rq->misfit_task = !task_fits_max(p, rq->cpu);
8103
Peter Zijlstra678d5712012-02-11 06:05:00 +01008104 return p;
8105simple:
8106 cfs_rq = &rq->cfs;
8107#endif
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02008108
Tim Blechmann36ace272009-11-24 11:55:45 +01008109 if (!cfs_rq->nr_running)
Peter Zijlstra38033c32014-01-23 20:32:21 +01008110 goto idle;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02008111
Peter Zijlstra3f1d2a32014-02-12 10:49:30 +01008112 put_prev_task(rq, prev);
Peter Zijlstra606dba22012-02-11 06:05:00 +01008113
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02008114 do {
Peter Zijlstra678d5712012-02-11 06:05:00 +01008115 se = pick_next_entity(cfs_rq, NULL);
Peter Zijlstraf4b67552008-11-04 21:25:07 +01008116 set_next_entity(cfs_rq, se);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02008117 cfs_rq = group_cfs_rq(se);
8118 } while (cfs_rq);
8119
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01008120 p = task_of(se);
Peter Zijlstra678d5712012-02-11 06:05:00 +01008121
Mike Galbraithb39e66e2011-11-22 15:20:07 +01008122 if (hrtick_enabled(rq))
8123 hrtick_start_fair(rq, p);
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01008124
Morten Rasmussen4c6a8242016-02-25 12:47:54 +00008125 rq->misfit_task = !task_fits_max(p, rq->cpu);
8126
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01008127 return p;
Peter Zijlstra38033c32014-01-23 20:32:21 +01008128
8129idle:
Morten Rasmussen4c6a8242016-02-25 12:47:54 +00008130 rq->misfit_task = 0;
Peter Zijlstracbce1a62015-06-11 14:46:54 +02008131 /*
8132 * This is OK, because current is on_cpu, which avoids it being picked
8133 * for load-balance and preemption/IRQs are still disabled avoiding
8134 * further scheduler activity on it and we're being very careful to
8135 * re-start the picking loop.
8136 */
Matt Fleming5a91d732016-09-21 14:38:10 +01008137 rq_unpin_lock(rq, rf);
Kirill Tkhaie4aa3582014-03-06 13:31:55 +04008138 new_tasks = idle_balance(rq);
Matt Fleming5a91d732016-09-21 14:38:10 +01008139 rq_repin_lock(rq, rf);
Peter Zijlstra37e117c2014-02-14 12:25:08 +01008140 /*
8141 * Because idle_balance() releases (and re-acquires) rq->lock, it is
8142 * possible for any higher priority task to appear. In that case we
8143 * must re-start the pick_next_entity() loop.
8144 */
Kirill Tkhaie4aa3582014-03-06 13:31:55 +04008145 if (new_tasks < 0)
Peter Zijlstra37e117c2014-02-14 12:25:08 +01008146 return RETRY_TASK;
8147
Kirill Tkhaie4aa3582014-03-06 13:31:55 +04008148 if (new_tasks > 0)
Peter Zijlstra38033c32014-01-23 20:32:21 +01008149 goto again;
Peter Zijlstra38033c32014-01-23 20:32:21 +01008150
8151 return NULL;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02008152}
8153
8154/*
8155 * Account for a descheduled task:
8156 */
Ingo Molnar31ee5292007-08-09 11:16:49 +02008157static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02008158{
8159 struct sched_entity *se = &prev->se;
8160 struct cfs_rq *cfs_rq;
8161
8162 for_each_sched_entity(se) {
8163 cfs_rq = cfs_rq_of(se);
Ingo Molnarab6cde22007-08-09 11:16:48 +02008164 put_prev_entity(cfs_rq, se);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02008165 }
8166}
8167
Rik van Rielac53db52011-02-01 09:51:03 -05008168/*
8169 * sched_yield() is very simple
8170 *
8171 * The magic of dealing with the ->skip buddy is in pick_next_entity.
8172 */
8173static void yield_task_fair(struct rq *rq)
8174{
8175 struct task_struct *curr = rq->curr;
8176 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
8177 struct sched_entity *se = &curr->se;
8178
8179 /*
8180 * Are we the only task in the tree?
8181 */
8182 if (unlikely(rq->nr_running == 1))
8183 return;
8184
8185 clear_buddies(cfs_rq, se);
8186
8187 if (curr->policy != SCHED_BATCH) {
8188 update_rq_clock(rq);
8189 /*
8190 * Update run-time statistics of the 'current'.
8191 */
8192 update_curr(cfs_rq);
Mike Galbraith916671c2011-11-22 15:21:26 +01008193 /*
8194 * Tell update_rq_clock() that we've just updated,
8195 * so we don't do microscopic update in schedule()
8196 * and double the fastpath cost.
8197 */
Peter Zijlstra9edfbfe2015-01-05 11:18:11 +01008198 rq_clock_skip_update(rq, true);
Rik van Rielac53db52011-02-01 09:51:03 -05008199 }
8200
8201 set_skip_buddy(se);
8202}
8203
Mike Galbraithd95f4122011-02-01 09:50:51 -05008204static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
8205{
8206 struct sched_entity *se = &p->se;
8207
Paul Turner5238cdd2011-07-21 09:43:37 -07008208 /* throttled hierarchies are not runnable */
8209 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
Mike Galbraithd95f4122011-02-01 09:50:51 -05008210 return false;
8211
8212 /* Tell the scheduler that we'd really like pse to run next. */
8213 set_next_buddy(se);
8214
Mike Galbraithd95f4122011-02-01 09:50:51 -05008215 yield_task_fair(rq);
8216
8217 return true;
8218}
8219
Peter Williams681f3e62007-10-24 18:23:51 +02008220#ifdef CONFIG_SMP
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02008221/**************************************************
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02008222 * Fair scheduling class load-balancing methods.
8223 *
8224 * BASICS
8225 *
8226 * The purpose of load-balancing is to achieve the same basic fairness the
8227 * per-cpu scheduler provides, namely provide a proportional amount of compute
8228 * time to each task. This is expressed in the following equation:
8229 *
8230 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
8231 *
8232 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
8233 * W_i,0 is defined as:
8234 *
8235 * W_i,0 = \Sum_j w_i,j (2)
8236 *
8237 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
Yuyang Du1c3de5e2016-03-30 07:07:51 +08008238 * is derived from the nice value as per sched_prio_to_weight[].
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02008239 *
8240 * The weight average is an exponential decay average of the instantaneous
8241 * weight:
8242 *
8243 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
8244 *
Nicolas Pitreced549f2014-05-26 18:19:38 -04008245 * C_i is the compute capacity of cpu i, typically it is the
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02008246 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
8247 * can also include other factors [XXX].
8248 *
8249 * To achieve this balance we define a measure of imbalance which follows
8250 * directly from (1):
8251 *
Nicolas Pitreced549f2014-05-26 18:19:38 -04008252 * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02008253 *
8254 * We them move tasks around to minimize the imbalance. In the continuous
8255 * function space it is obvious this converges, in the discrete case we get
8256 * a few fun cases generally called infeasible weight scenarios.
8257 *
8258 * [XXX expand on:
8259 * - infeasible weights;
8260 * - local vs global optima in the discrete case. ]
8261 *
8262 *
8263 * SCHED DOMAINS
8264 *
8265 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
8266 * for all i,j solution, we create a tree of cpus that follows the hardware
8267 * topology where each level pairs two lower groups (or better). This results
8268 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
8269 * tree to only the first of the previous level and we decrease the frequency
8270 * of load-balance at each level inv. proportional to the number of cpus in
8271 * the groups.
8272 *
8273 * This yields:
8274 *
8275 * log_2 n 1 n
8276 * \Sum { --- * --- * 2^i } = O(n) (5)
8277 * i = 0 2^i 2^i
8278 * `- size of each group
8279 * | | `- number of cpus doing load-balance
8280 * | `- freq
8281 * `- sum over all levels
8282 *
8283 * Coupled with a limit on how many tasks we can migrate every balance pass,
8284 * this makes (5) the runtime complexity of the balancer.
8285 *
8286 * An important property here is that each CPU is still (indirectly) connected
8287 * to every other cpu in at most O(log n) steps:
8288 *
8289 * The adjacency matrix of the resulting graph is given by:
8290 *
Byungchul Park97a71422015-07-05 18:33:48 +09008291 * log_2 n
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02008292 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
8293 * k = 0
8294 *
8295 * And you'll find that:
8296 *
8297 * A^(log_2 n)_i,j != 0 for all i,j (7)
8298 *
8299 * Showing there's indeed a path between every cpu in at most O(log n) steps.
8300 * The task movement gives a factor of O(m), giving a convergence complexity
8301 * of:
8302 *
8303 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
8304 *
8305 *
8306 * WORK CONSERVING
8307 *
8308 * In order to avoid CPUs going idle while there's still work to do, new idle
8309 * balancing is more aggressive and has the newly idle cpu iterate up the domain
8310 * tree itself instead of relying on other CPUs to bring it work.
8311 *
8312 * This adds some complexity to both (5) and (8) but it reduces the total idle
8313 * time.
8314 *
8315 * [XXX more?]
8316 *
8317 *
8318 * CGROUPS
8319 *
8320 * Cgroups make a horror show out of (2), instead of a simple sum we get:
8321 *
8322 * s_k,i
8323 * W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
8324 * S_k
8325 *
8326 * Where
8327 *
8328 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
8329 *
8330 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
8331 *
8332 * The big problem is S_k, its a global sum needed to compute a local (W_i)
8333 * property.
8334 *
8335 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
8336 * rewrite all of this once again.]
Byungchul Park97a71422015-07-05 18:33:48 +09008337 */
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02008338
Hiroshi Shimamotoed387b72012-01-31 11:40:32 +09008339static unsigned long __read_mostly max_load_balance_interval = HZ/10;
8340
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01008341enum fbq_type { regular, remote, all };
8342
Morten Rasmussen4c6a8242016-02-25 12:47:54 +00008343enum group_type {
8344 group_other = 0,
8345 group_misfit_task,
8346 group_imbalanced,
8347 group_overloaded,
8348};
8349
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01008350#define LBF_ALL_PINNED 0x01
Peter Zijlstra367456c2012-02-20 21:49:09 +01008351#define LBF_NEED_BREAK 0x02
Peter Zijlstra62633222013-08-19 12:41:09 +02008352#define LBF_DST_PINNED 0x04
8353#define LBF_SOME_PINNED 0x08
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008354#define LBF_BIG_TASK_ACTIVE_BALANCE 0x80
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008355#define LBF_IGNORE_BIG_TASKS 0x100
8356#define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200
8357#define LBF_MOVED_RELATED_THREAD_GROUP_TASK 0x400
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01008358
8359struct lb_env {
8360 struct sched_domain *sd;
8361
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01008362 struct rq *src_rq;
Prashanth Nageshappa85c1e7d2012-06-19 17:47:34 +05308363 int src_cpu;
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01008364
8365 int dst_cpu;
8366 struct rq *dst_rq;
8367
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05308368 struct cpumask *dst_grpmask;
8369 int new_dst_cpu;
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01008370 enum cpu_idle_type idle;
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008371 long imbalance;
Morten Rasmussen94beeae2015-07-02 17:16:34 +01008372 unsigned int src_grp_nr_running;
Michael Wangb94031302012-07-12 16:10:13 +08008373 /* The set of CPUs under consideration for load-balancing */
8374 struct cpumask *cpus;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008375 unsigned int busiest_grp_capacity;
8376 unsigned int busiest_nr_running;
Michael Wangb94031302012-07-12 16:10:13 +08008377
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01008378 unsigned int flags;
Peter Zijlstra367456c2012-02-20 21:49:09 +01008379
8380 unsigned int loop;
8381 unsigned int loop_break;
8382 unsigned int loop_max;
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01008383
8384 enum fbq_type fbq_type;
Morten Rasmussenf95e8de2016-02-25 12:51:35 +00008385 enum group_type busiest_group_type;
Kirill Tkhai163122b2014-08-20 13:48:29 +04008386 struct list_head tasks;
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01008387};
8388
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008389/*
Peter Zijlstra029632f2011-10-25 10:00:11 +02008390 * Is this task likely cache-hot:
8391 */
Hillf Danton5d5e2b12014-06-10 10:58:43 +02008392static int task_hot(struct task_struct *p, struct lb_env *env)
Peter Zijlstra029632f2011-10-25 10:00:11 +02008393{
8394 s64 delta;
8395
Kirill Tkhaie5673f22014-08-20 13:48:01 +04008396 lockdep_assert_held(&env->src_rq->lock);
8397
Peter Zijlstra029632f2011-10-25 10:00:11 +02008398 if (p->sched_class != &fair_sched_class)
8399 return 0;
8400
8401 if (unlikely(p->policy == SCHED_IDLE))
8402 return 0;
8403
8404 /*
8405 * Buddy candidates are cache hot:
8406 */
Hillf Danton5d5e2b12014-06-10 10:58:43 +02008407 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
Peter Zijlstra029632f2011-10-25 10:00:11 +02008408 (&p->se == cfs_rq_of(&p->se)->next ||
8409 &p->se == cfs_rq_of(&p->se)->last))
8410 return 1;
8411
8412 if (sysctl_sched_migration_cost == -1)
8413 return 1;
8414 if (sysctl_sched_migration_cost == 0)
8415 return 0;
8416
Hillf Danton5d5e2b12014-06-10 10:58:43 +02008417 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
Peter Zijlstra029632f2011-10-25 10:00:11 +02008418
8419 return delta < (s64)sysctl_sched_migration_cost;
8420}
8421
Mel Gorman3a7053b2013-10-07 11:29:00 +01008422#ifdef CONFIG_NUMA_BALANCING
Rik van Rielc1ceac62015-05-14 22:59:36 -04008423/*
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05308424 * Returns 1, if task migration degrades locality
8425 * Returns 0, if task migration improves locality i.e migration preferred.
8426 * Returns -1, if task migration is not affected by locality.
Rik van Rielc1ceac62015-05-14 22:59:36 -04008427 */
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05308428static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
Mel Gorman3a7053b2013-10-07 11:29:00 +01008429{
Rik van Rielb1ad0652014-05-15 13:03:06 -04008430 struct numa_group *numa_group = rcu_dereference(p->numa_group);
Rik van Rielc1ceac62015-05-14 22:59:36 -04008431 unsigned long src_faults, dst_faults;
Mel Gorman3a7053b2013-10-07 11:29:00 +01008432 int src_nid, dst_nid;
8433
Srikar Dronamraju2a595722015-08-11 21:54:21 +05308434 if (!static_branch_likely(&sched_numa_balancing))
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05308435 return -1;
8436
Srikar Dronamrajuc3b9bc52015-08-11 16:30:12 +05308437 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05308438 return -1;
Mel Gorman7a0f3082013-10-07 11:29:01 +01008439
8440 src_nid = cpu_to_node(env->src_cpu);
8441 dst_nid = cpu_to_node(env->dst_cpu);
8442
Mel Gorman83e1d2c2013-10-07 11:29:27 +01008443 if (src_nid == dst_nid)
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05308444 return -1;
Mel Gorman7a0f3082013-10-07 11:29:01 +01008445
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05308446 /* Migrating away from the preferred node is always bad. */
8447 if (src_nid == p->numa_preferred_nid) {
8448 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
8449 return 1;
8450 else
8451 return -1;
8452 }
Mel Gorman83e1d2c2013-10-07 11:29:27 +01008453
Rik van Rielc1ceac62015-05-14 22:59:36 -04008454 /* Encourage migration to the preferred node. */
8455 if (dst_nid == p->numa_preferred_nid)
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05308456 return 0;
Rik van Rielc1ceac62015-05-14 22:59:36 -04008457
8458 if (numa_group) {
8459 src_faults = group_faults(p, src_nid);
8460 dst_faults = group_faults(p, dst_nid);
8461 } else {
8462 src_faults = task_faults(p, src_nid);
8463 dst_faults = task_faults(p, dst_nid);
8464 }
8465
8466 return dst_faults < src_faults;
Mel Gorman7a0f3082013-10-07 11:29:01 +01008467}
8468
Mel Gorman3a7053b2013-10-07 11:29:00 +01008469#else
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05308470static inline int migrate_degrades_locality(struct task_struct *p,
Mel Gorman3a7053b2013-10-07 11:29:00 +01008471 struct lb_env *env)
8472{
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05308473 return -1;
Mel Gorman7a0f3082013-10-07 11:29:01 +01008474}
Mel Gorman3a7053b2013-10-07 11:29:00 +01008475#endif
8476
Peter Zijlstra029632f2011-10-25 10:00:11 +02008477/*
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008478 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
8479 */
8480static
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01008481int can_migrate_task(struct task_struct *p, struct lb_env *env)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008482{
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05308483 int tsk_cache_hot;
Kirill Tkhaie5673f22014-08-20 13:48:01 +04008484
8485 lockdep_assert_held(&env->src_rq->lock);
8486
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008487 /*
8488 * We do not migrate tasks that are:
Joonsoo Kimd3198082013-04-23 17:27:40 +09008489 * 1) throttled_lb_pair, or
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008490 * 2) cannot be migrated to this CPU due to cpus_allowed, or
Joonsoo Kimd3198082013-04-23 17:27:40 +09008491 * 3) running (obviously), or
8492 * 4) are cache-hot on their current CPU.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008493 */
Joonsoo Kimd3198082013-04-23 17:27:40 +09008494 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
8495 return 0;
8496
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01008497 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
Joonsoo Kime02e60c2013-04-23 17:27:42 +09008498 int cpu;
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05308499
Josh Poimboeufae928822016-06-17 12:43:24 -05008500 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05308501
Peter Zijlstra62633222013-08-19 12:41:09 +02008502 env->flags |= LBF_SOME_PINNED;
8503
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05308504 /*
8505 * Remember if this task can be migrated to any other cpu in
8506 * our sched_group. We may want to revisit it if we couldn't
8507 * meet load balance goals by pulling other tasks on src_cpu.
8508 *
8509 * Also avoid computing new_dst_cpu if we have already computed
8510 * one in current iteration.
8511 */
Peter Zijlstra62633222013-08-19 12:41:09 +02008512 if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05308513 return 0;
8514
Joonsoo Kime02e60c2013-04-23 17:27:42 +09008515 /* Prevent to re-select dst_cpu via env's cpus */
8516 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
8517 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
Peter Zijlstra62633222013-08-19 12:41:09 +02008518 env->flags |= LBF_DST_PINNED;
Joonsoo Kime02e60c2013-04-23 17:27:42 +09008519 env->new_dst_cpu = cpu;
8520 break;
8521 }
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05308522 }
Joonsoo Kime02e60c2013-04-23 17:27:42 +09008523
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008524 return 0;
8525 }
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05308526
Pavankumar Kondetic6f1dd82018-09-04 16:29:58 +05308527 /* Record that we found atleast one task that could run on dst_cpu */
8528 env->flags &= ~LBF_ALL_PINNED;
8529
Joonwoo Park5be62152017-02-09 14:45:57 -08008530 if (energy_aware() && !env->dst_rq->rd->overutilized &&
Pavankumar Kondeti027adbe2018-11-08 09:37:58 +05308531 env->idle == CPU_NEWLY_IDLE &&
8532 !task_in_related_thread_group(p)) {
Joonwoo Park5be62152017-02-09 14:45:57 -08008533 long util_cum_dst, util_cum_src;
8534 unsigned long demand;
8535
8536 demand = task_util(p);
8537 util_cum_dst = cpu_util_cum(env->dst_cpu, 0) + demand;
8538 util_cum_src = cpu_util_cum(env->src_cpu, 0) - demand;
8539
8540 if (util_cum_dst > util_cum_src)
8541 return 0;
8542 }
8543
Vikram Mulukutla5e6ecd72017-03-27 15:01:58 -07008544#ifdef CONFIG_SCHED_WALT
Syed Rameez Mustafa20acfe72017-01-30 09:35:46 +05308545 if (env->flags & LBF_IGNORE_PREFERRED_CLUSTER_TASKS &&
8546 !preferred_cluster(cpu_rq(env->dst_cpu)->cluster, p))
8547 return 0;
8548
Vikram Mulukutla5e6ecd72017-03-27 15:01:58 -07008549 /* Don't detach task if it doesn't fit on the destination */
8550 if (env->flags & LBF_IGNORE_BIG_TASKS &&
8551 !task_fits_max(p, env->dst_cpu))
8552 return 0;
8553#endif
8554
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01008555 if (task_running(env->src_rq, p)) {
Josh Poimboeufae928822016-06-17 12:43:24 -05008556 schedstat_inc(p->se.statistics.nr_failed_migrations_running);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008557 return 0;
8558 }
8559
8560 /*
8561 * Aggressive migration if:
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008562 * 1) IDLE or NEWLY_IDLE balance.
8563 * 2) destination numa is preferred
8564 * 3) task is cache cold, or
8565 * 4) too many balance attempts have failed.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008566 */
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05308567 tsk_cache_hot = migrate_degrades_locality(p, env);
8568 if (tsk_cache_hot == -1)
8569 tsk_cache_hot = task_hot(p, env);
Mel Gorman3a7053b2013-10-07 11:29:00 +01008570
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008571 if (env->idle != CPU_NOT_IDLE || tsk_cache_hot <= 0 ||
Kirill Tkhai7a96c232014-09-22 22:36:12 +04008572 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05308573 if (tsk_cache_hot == 1) {
Josh Poimboeufae928822016-06-17 12:43:24 -05008574 schedstat_inc(env->sd->lb_hot_gained[env->idle]);
8575 schedstat_inc(p->se.statistics.nr_forced_migrations);
Mel Gorman3a7053b2013-10-07 11:29:00 +01008576 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008577 return 1;
8578 }
8579
Josh Poimboeufae928822016-06-17 12:43:24 -05008580 schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
Zhang Hang4e2dcb72013-04-10 14:04:55 +08008581 return 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008582}
8583
Peter Zijlstra897c3952009-12-17 17:45:42 +01008584/*
Kirill Tkhai163122b2014-08-20 13:48:29 +04008585 * detach_task() -- detach the task for the migration specified in env
Peter Zijlstra897c3952009-12-17 17:45:42 +01008586 */
Kirill Tkhai163122b2014-08-20 13:48:29 +04008587static void detach_task(struct task_struct *p, struct lb_env *env)
8588{
8589 lockdep_assert_held(&env->src_rq->lock);
8590
Kirill Tkhai163122b2014-08-20 13:48:29 +04008591 p->on_rq = TASK_ON_RQ_MIGRATING;
Joonwoo Park3ea94de2015-11-12 19:38:54 -08008592 deactivate_task(env->src_rq, p, 0);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008593 double_lock_balance(env->src_rq, env->dst_rq);
Kirill Tkhai163122b2014-08-20 13:48:29 +04008594 set_task_cpu(p, env->dst_cpu);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008595 if (task_in_related_thread_group(p))
8596 env->flags |= LBF_MOVED_RELATED_THREAD_GROUP_TASK;
8597 double_unlock_balance(env->src_rq, env->dst_rq);
Kirill Tkhai163122b2014-08-20 13:48:29 +04008598}
8599
8600/*
Kirill Tkhaie5673f22014-08-20 13:48:01 +04008601 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
Peter Zijlstra897c3952009-12-17 17:45:42 +01008602 * part of active balancing operations within "domain".
Peter Zijlstra897c3952009-12-17 17:45:42 +01008603 *
Kirill Tkhaie5673f22014-08-20 13:48:01 +04008604 * Returns a task if successful and NULL otherwise.
Peter Zijlstra897c3952009-12-17 17:45:42 +01008605 */
Kirill Tkhaie5673f22014-08-20 13:48:01 +04008606static struct task_struct *detach_one_task(struct lb_env *env)
Peter Zijlstra897c3952009-12-17 17:45:42 +01008607{
8608 struct task_struct *p, *n;
Peter Zijlstra897c3952009-12-17 17:45:42 +01008609
Kirill Tkhaie5673f22014-08-20 13:48:01 +04008610 lockdep_assert_held(&env->src_rq->lock);
8611
Peter Zijlstra367456c2012-02-20 21:49:09 +01008612 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
Peter Zijlstra367456c2012-02-20 21:49:09 +01008613 if (!can_migrate_task(p, env))
8614 continue;
Peter Zijlstra897c3952009-12-17 17:45:42 +01008615
Kirill Tkhai163122b2014-08-20 13:48:29 +04008616 detach_task(p, env);
Kirill Tkhaie5673f22014-08-20 13:48:01 +04008617
Peter Zijlstra367456c2012-02-20 21:49:09 +01008618 /*
Kirill Tkhaie5673f22014-08-20 13:48:01 +04008619 * Right now, this is only the second place where
Kirill Tkhai163122b2014-08-20 13:48:29 +04008620 * lb_gained[env->idle] is updated (other is detach_tasks)
Kirill Tkhaie5673f22014-08-20 13:48:01 +04008621 * so we can safely collect stats here rather than
Kirill Tkhai163122b2014-08-20 13:48:29 +04008622 * inside detach_tasks().
Peter Zijlstra367456c2012-02-20 21:49:09 +01008623 */
Josh Poimboeufae928822016-06-17 12:43:24 -05008624 schedstat_inc(env->sd->lb_gained[env->idle]);
Kirill Tkhaie5673f22014-08-20 13:48:01 +04008625 return p;
Peter Zijlstra897c3952009-12-17 17:45:42 +01008626 }
Kirill Tkhaie5673f22014-08-20 13:48:01 +04008627 return NULL;
Peter Zijlstra897c3952009-12-17 17:45:42 +01008628}
8629
Peter Zijlstraeb953082012-04-17 13:38:40 +02008630static const unsigned int sched_nr_migrate_break = 32;
8631
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01008632/*
Kirill Tkhai163122b2014-08-20 13:48:29 +04008633 * detach_tasks() -- tries to detach up to imbalance weighted load from
8634 * busiest_rq, as part of a balancing operation within domain "sd".
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01008635 *
Kirill Tkhai163122b2014-08-20 13:48:29 +04008636 * Returns number of detached tasks if successful and 0 otherwise.
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01008637 */
Kirill Tkhai163122b2014-08-20 13:48:29 +04008638static int detach_tasks(struct lb_env *env)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008639{
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01008640 struct list_head *tasks = &env->src_rq->cfs_tasks;
8641 struct task_struct *p;
Peter Zijlstra367456c2012-02-20 21:49:09 +01008642 unsigned long load;
Kirill Tkhai163122b2014-08-20 13:48:29 +04008643 int detached = 0;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008644 int orig_loop = env->loop;
Kirill Tkhai163122b2014-08-20 13:48:29 +04008645
8646 lockdep_assert_held(&env->src_rq->lock);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008647
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008648 if (env->imbalance <= 0)
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01008649 return 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008650
Pavankumar Kondeti8327a0e2017-01-23 06:39:12 +05308651 if (!same_cluster(env->dst_cpu, env->src_cpu))
8652 env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS;
8653
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07008654 if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu))
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008655 env->flags |= LBF_IGNORE_BIG_TASKS;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008656
8657redo:
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01008658 while (!list_empty(tasks)) {
Yuyang Du985d3a42015-07-06 06:11:51 +08008659 /*
8660 * We don't want to steal all, otherwise we may be treated likewise,
8661 * which could at worst lead to a livelock crash.
8662 */
8663 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
8664 break;
8665
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01008666 p = list_first_entry(tasks, struct task_struct, se.group_node);
8667
Peter Zijlstra367456c2012-02-20 21:49:09 +01008668 env->loop++;
8669 /* We've more or less seen every task there is, call it quits */
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01008670 if (env->loop > env->loop_max)
Peter Zijlstra367456c2012-02-20 21:49:09 +01008671 break;
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01008672
8673 /* take a breather every nr_migrate tasks */
Peter Zijlstra367456c2012-02-20 21:49:09 +01008674 if (env->loop > env->loop_break) {
Peter Zijlstraeb953082012-04-17 13:38:40 +02008675 env->loop_break += sched_nr_migrate_break;
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01008676 env->flags |= LBF_NEED_BREAK;
Peter Zijlstraee00e662009-12-17 17:25:20 +01008677 break;
Peter Zijlstraa195f002011-09-22 15:30:18 +02008678 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008679
Joonsoo Kimd3198082013-04-23 17:27:40 +09008680 if (!can_migrate_task(p, env))
Peter Zijlstra367456c2012-02-20 21:49:09 +01008681 goto next;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008682
Peter Zijlstra367456c2012-02-20 21:49:09 +01008683 load = task_h_load(p);
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01008684
Peter Zijlstraeb953082012-04-17 13:38:40 +02008685 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
Peter Zijlstra367456c2012-02-20 21:49:09 +01008686 goto next;
8687
Maria Yu69eab5c2019-04-15 12:41:12 +08008688 /*
8689 * p is not running task when we goes until here, so if p is one
8690 * of the 2 task in src cpu rq and not the running one,
8691 * that means it is the only task that can be balanced.
8692 * So only when there is other tasks can be balanced or
8693 * there is situation to ignore big task, it is needed
8694 * to skip the task load bigger than 2*imbalance.
8695 */
8696 if (((cpu_rq(env->src_cpu)->nr_running > 2) ||
8697 (env->flags & LBF_IGNORE_BIG_TASKS)) &&
8698 ((load / 2) > env->imbalance))
Peter Zijlstra367456c2012-02-20 21:49:09 +01008699 goto next;
8700
Kirill Tkhai163122b2014-08-20 13:48:29 +04008701 detach_task(p, env);
8702 list_add(&p->se.group_node, &env->tasks);
8703
8704 detached++;
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008705 env->imbalance -= load;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008706
8707#ifdef CONFIG_PREEMPT
Peter Zijlstraee00e662009-12-17 17:25:20 +01008708 /*
8709 * NEWIDLE balancing is a source of latency, so preemptible
Kirill Tkhai163122b2014-08-20 13:48:29 +04008710 * kernels will stop after the first task is detached to minimize
Peter Zijlstraee00e662009-12-17 17:25:20 +01008711 * the critical section.
8712 */
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01008713 if (env->idle == CPU_NEWLY_IDLE)
Peter Zijlstraee00e662009-12-17 17:25:20 +01008714 break;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008715#endif
8716
Peter Zijlstraee00e662009-12-17 17:25:20 +01008717 /*
8718 * We only want to steal up to the prescribed amount of
8719 * weighted load.
8720 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008721 if (env->imbalance <= 0)
Peter Zijlstraee00e662009-12-17 17:25:20 +01008722 break;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008723
Peter Zijlstra367456c2012-02-20 21:49:09 +01008724 continue;
8725next:
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01008726 list_move_tail(&p->se.group_node, tasks);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008727 }
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01008728
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008729 if (env->flags & (LBF_IGNORE_BIG_TASKS |
8730 LBF_IGNORE_PREFERRED_CLUSTER_TASKS) && !detached) {
8731 tasks = &env->src_rq->cfs_tasks;
8732 env->flags &= ~(LBF_IGNORE_BIG_TASKS |
8733 LBF_IGNORE_PREFERRED_CLUSTER_TASKS);
8734 env->loop = orig_loop;
8735 goto redo;
8736 }
8737
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008738 /*
Kirill Tkhai163122b2014-08-20 13:48:29 +04008739 * Right now, this is one of only two places we collect this stat
8740 * so we can safely collect detach_one_task() stats here rather
8741 * than inside detach_one_task().
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008742 */
Josh Poimboeufae928822016-06-17 12:43:24 -05008743 schedstat_add(env->sd->lb_gained[env->idle], detached);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008744
Kirill Tkhai163122b2014-08-20 13:48:29 +04008745 return detached;
8746}
8747
8748/*
8749 * attach_task() -- attach the task detached by detach_task() to its new rq.
8750 */
8751static void attach_task(struct rq *rq, struct task_struct *p)
8752{
8753 lockdep_assert_held(&rq->lock);
8754
8755 BUG_ON(task_rq(p) != rq);
Kirill Tkhai163122b2014-08-20 13:48:29 +04008756 activate_task(rq, p, 0);
Joonwoo Park3ea94de2015-11-12 19:38:54 -08008757 p->on_rq = TASK_ON_RQ_QUEUED;
Kirill Tkhai163122b2014-08-20 13:48:29 +04008758 check_preempt_curr(rq, p, 0);
8759}
8760
8761/*
8762 * attach_one_task() -- attaches the task returned from detach_one_task() to
8763 * its new rq.
8764 */
8765static void attach_one_task(struct rq *rq, struct task_struct *p)
8766{
8767 raw_spin_lock(&rq->lock);
8768 attach_task(rq, p);
8769 raw_spin_unlock(&rq->lock);
8770}
8771
8772/*
8773 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
8774 * new rq.
8775 */
8776static void attach_tasks(struct lb_env *env)
8777{
8778 struct list_head *tasks = &env->tasks;
8779 struct task_struct *p;
8780
8781 raw_spin_lock(&env->dst_rq->lock);
8782
8783 while (!list_empty(tasks)) {
8784 p = list_first_entry(tasks, struct task_struct, se.group_node);
8785 list_del_init(&p->se.group_node);
8786
8787 attach_task(env->dst_rq, p);
8788 }
8789
8790 raw_spin_unlock(&env->dst_rq->lock);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008791}
8792
Peter Zijlstra230059de2009-12-17 17:47:12 +01008793#ifdef CONFIG_FAIR_GROUP_SCHED
Paul Turner48a16752012-10-04 13:18:31 +02008794static void update_blocked_averages(int cpu)
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08008795{
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08008796 struct rq *rq = cpu_rq(cpu);
Paul Turner48a16752012-10-04 13:18:31 +02008797 struct cfs_rq *cfs_rq;
8798 unsigned long flags;
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08008799
Paul Turner48a16752012-10-04 13:18:31 +02008800 raw_spin_lock_irqsave(&rq->lock, flags);
8801 update_rq_clock(rq);
Yuyang Du9d89c252015-07-15 08:04:37 +08008802
Peter Zijlstra9763b672011-07-13 13:09:25 +02008803 /*
8804 * Iterates the task_group tree in a bottom up fashion, see
8805 * list_add_leaf_cfs_rq() for details.
8806 */
Paul Turner64660c82011-07-21 09:43:36 -07008807 for_each_leaf_cfs_rq(rq, cfs_rq) {
Vincent Guittot0b4a2f12017-03-17 14:47:22 +01008808 struct sched_entity *se;
8809
Yuyang Du9d89c252015-07-15 08:04:37 +08008810 /* throttled entities do not contribute to load */
8811 if (throttled_hierarchy(cfs_rq))
8812 continue;
Paul Turner48a16752012-10-04 13:18:31 +02008813
Steve Mucklea2c6c912016-03-24 15:26:07 -07008814 if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
Yuyang Du9d89c252015-07-15 08:04:37 +08008815 update_tg_load_avg(cfs_rq, 0);
Vincent Guittot3a34bf52016-11-08 10:53:46 +01008816
Vincent Guittot0b4a2f12017-03-17 14:47:22 +01008817 /* Propagate pending load changes to the parent, if any: */
8818 se = cfs_rq->tg->se[cpu];
8819 if (se && !skip_blocked_update(se))
8820 update_load_avg(se, 0);
Yuyang Du9d89c252015-07-15 08:04:37 +08008821 }
Paul Turner48a16752012-10-04 13:18:31 +02008822 raw_spin_unlock_irqrestore(&rq->lock, flags);
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08008823}
8824
Peter Zijlstra9763b672011-07-13 13:09:25 +02008825/*
Vladimir Davydov68520792013-07-15 17:49:19 +04008826 * Compute the hierarchical load factor for cfs_rq and all its ascendants.
Peter Zijlstra9763b672011-07-13 13:09:25 +02008827 * This needs to be done in a top-down fashion because the load of a child
8828 * group is a fraction of its parents load.
8829 */
Vladimir Davydov68520792013-07-15 17:49:19 +04008830static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
Peter Zijlstra9763b672011-07-13 13:09:25 +02008831{
Vladimir Davydov68520792013-07-15 17:49:19 +04008832 struct rq *rq = rq_of(cfs_rq);
8833 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
Peter Zijlstraa35b6462012-08-08 21:46:40 +02008834 unsigned long now = jiffies;
Vladimir Davydov68520792013-07-15 17:49:19 +04008835 unsigned long load;
Peter Zijlstraa35b6462012-08-08 21:46:40 +02008836
Vladimir Davydov68520792013-07-15 17:49:19 +04008837 if (cfs_rq->last_h_load_update == now)
Peter Zijlstraa35b6462012-08-08 21:46:40 +02008838 return;
8839
Mel Gorman6d1e1da2019-03-19 12:36:10 +00008840 WRITE_ONCE(cfs_rq->h_load_next, NULL);
Vladimir Davydov68520792013-07-15 17:49:19 +04008841 for_each_sched_entity(se) {
8842 cfs_rq = cfs_rq_of(se);
Mel Gorman6d1e1da2019-03-19 12:36:10 +00008843 WRITE_ONCE(cfs_rq->h_load_next, se);
Vladimir Davydov68520792013-07-15 17:49:19 +04008844 if (cfs_rq->last_h_load_update == now)
8845 break;
8846 }
Peter Zijlstraa35b6462012-08-08 21:46:40 +02008847
Vladimir Davydov68520792013-07-15 17:49:19 +04008848 if (!se) {
Yuyang Du7ea241a2015-07-15 08:04:42 +08008849 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
Vladimir Davydov68520792013-07-15 17:49:19 +04008850 cfs_rq->last_h_load_update = now;
8851 }
8852
Mel Gorman6d1e1da2019-03-19 12:36:10 +00008853 while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
Vladimir Davydov68520792013-07-15 17:49:19 +04008854 load = cfs_rq->h_load;
Yuyang Du7ea241a2015-07-15 08:04:42 +08008855 load = div64_ul(load * se->avg.load_avg,
8856 cfs_rq_load_avg(cfs_rq) + 1);
Vladimir Davydov68520792013-07-15 17:49:19 +04008857 cfs_rq = group_cfs_rq(se);
8858 cfs_rq->h_load = load;
8859 cfs_rq->last_h_load_update = now;
8860 }
Peter Zijlstra9763b672011-07-13 13:09:25 +02008861}
8862
Peter Zijlstra367456c2012-02-20 21:49:09 +01008863static unsigned long task_h_load(struct task_struct *p)
Peter Zijlstra230059de2009-12-17 17:47:12 +01008864{
Peter Zijlstra367456c2012-02-20 21:49:09 +01008865 struct cfs_rq *cfs_rq = task_cfs_rq(p);
Peter Zijlstra230059de2009-12-17 17:47:12 +01008866
Vladimir Davydov68520792013-07-15 17:49:19 +04008867 update_cfs_rq_h_load(cfs_rq);
Yuyang Du9d89c252015-07-15 08:04:37 +08008868 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
Yuyang Du7ea241a2015-07-15 08:04:42 +08008869 cfs_rq_load_avg(cfs_rq) + 1);
Peter Zijlstra230059de2009-12-17 17:47:12 +01008870}
8871#else
Paul Turner48a16752012-10-04 13:18:31 +02008872static inline void update_blocked_averages(int cpu)
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08008873{
Vincent Guittot6c1d47c2015-07-15 08:04:38 +08008874 struct rq *rq = cpu_rq(cpu);
8875 struct cfs_rq *cfs_rq = &rq->cfs;
8876 unsigned long flags;
8877
8878 raw_spin_lock_irqsave(&rq->lock, flags);
8879 update_rq_clock(rq);
Steve Mucklea2c6c912016-03-24 15:26:07 -07008880 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
Vincent Guittot6c1d47c2015-07-15 08:04:38 +08008881 raw_spin_unlock_irqrestore(&rq->lock, flags);
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08008882}
8883
Peter Zijlstra367456c2012-02-20 21:49:09 +01008884static unsigned long task_h_load(struct task_struct *p)
8885{
Yuyang Du9d89c252015-07-15 08:04:37 +08008886 return p->se.avg.load_avg;
Peter Zijlstra230059de2009-12-17 17:47:12 +01008887}
8888#endif
8889
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008890/********** Helpers for find_busiest_group ************************/
Rik van Rielcaeb1782014-07-28 14:16:28 -04008891
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008892/*
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008893 * sg_lb_stats - stats of a sched_group required for load_balancing
8894 */
8895struct sg_lb_stats {
8896 unsigned long avg_load; /*Avg load across the CPUs of the group */
8897 unsigned long group_load; /* Total load over the CPUs of the group */
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008898 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008899 unsigned long load_per_task;
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008900 unsigned long group_capacity;
Dietmar Eggemann9e91d612015-08-14 17:23:12 +01008901 unsigned long group_util; /* Total utilization of the group */
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02008902 unsigned int sum_nr_running; /* Nr tasks running in the group */
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02008903 unsigned int idle_cpus;
8904 unsigned int group_weight;
Rik van Rielcaeb1782014-07-28 14:16:28 -04008905 enum group_type group_type;
Vincent Guittotea678212015-02-27 16:54:11 +01008906 int group_no_capacity;
Morten Rasmussen4c6a8242016-02-25 12:47:54 +00008907 int group_misfit_task; /* A cpu has a task too big for its capacity */
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01008908#ifdef CONFIG_NUMA_BALANCING
8909 unsigned int nr_numa_running;
8910 unsigned int nr_preferred_running;
8911#endif
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008912};
8913
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008914/*
8915 * sd_lb_stats - Structure to store the statistics of a sched_domain
8916 * during load balancing.
8917 */
8918struct sd_lb_stats {
8919 struct sched_group *busiest; /* Busiest group in this sd */
8920 struct sched_group *local; /* Local group in this sd */
8921 unsigned long total_load; /* Total load of all groups in sd */
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008922 unsigned long total_capacity; /* Total capacity of all groups in sd */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008923 unsigned long avg_load; /* Average load across all groups in sd */
8924
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008925 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02008926 struct sg_lb_stats local_stat; /* Statistics of the local group */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008927};
8928
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02008929static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
8930{
8931 /*
8932 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
8933 * local_stat because update_sg_lb_stats() does a full clear/assignment.
8934 * We must however clear busiest_stat::avg_load because
8935 * update_sd_pick_busiest() reads this before assignment.
8936 */
8937 *sds = (struct sd_lb_stats){
8938 .busiest = NULL,
8939 .local = NULL,
8940 .total_load = 0UL,
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008941 .total_capacity = 0UL,
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02008942 .busiest_stat = {
8943 .avg_load = 0UL,
Rik van Rielcaeb1782014-07-28 14:16:28 -04008944 .sum_nr_running = 0,
8945 .group_type = group_other,
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02008946 },
8947 };
8948}
8949
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008950/**
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008951 * get_sd_load_idx - Obtain the load index for a given sched domain.
8952 * @sd: The sched_domain whose load_idx is to be obtained.
Kamalesh Babulaled1b7732013-10-13 23:06:15 +05308953 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
Yacine Belkadie69f6182013-07-12 20:45:47 +02008954 *
8955 * Return: The load index.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008956 */
8957static inline int get_sd_load_idx(struct sched_domain *sd,
8958 enum cpu_idle_type idle)
8959{
8960 int load_idx;
8961
8962 switch (idle) {
8963 case CPU_NOT_IDLE:
8964 load_idx = sd->busy_idx;
8965 break;
8966
8967 case CPU_NEWLY_IDLE:
8968 load_idx = sd->newidle_idx;
8969 break;
8970 default:
8971 load_idx = sd->idle_idx;
8972 break;
8973 }
8974
8975 return load_idx;
8976}
8977
Nicolas Pitreced549f2014-05-26 18:19:38 -04008978static unsigned long scale_rt_capacity(int cpu)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008979{
8980 struct rq *rq = cpu_rq(cpu);
Vincent Guittotb5b48602015-02-27 16:54:08 +01008981 u64 total, used, age_stamp, avg;
Peter Zijlstracadefd32014-02-27 10:40:35 +01008982 s64 delta;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008983
Peter Zijlstrab654f7d2012-05-22 14:04:28 +02008984 /*
8985 * Since we're reading these variables without serialization make sure
8986 * we read them once before doing sanity checks on them.
8987 */
Jason Low316c1608d2015-04-28 13:00:20 -07008988 age_stamp = READ_ONCE(rq->age_stamp);
8989 avg = READ_ONCE(rq->rt_avg);
Peter Zijlstracebde6d2015-01-05 11:18:10 +01008990 delta = __rq_clock_broken(rq) - age_stamp;
Venkatesh Pallipadiaa483802010-10-04 17:03:22 -07008991
Peter Zijlstracadefd32014-02-27 10:40:35 +01008992 if (unlikely(delta < 0))
8993 delta = 0;
8994
8995 total = sched_avg_period() + delta;
Peter Zijlstrab654f7d2012-05-22 14:04:28 +02008996
Vincent Guittotb5b48602015-02-27 16:54:08 +01008997 used = div_u64(avg, total);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008998
Vincent Guittotb5b48602015-02-27 16:54:08 +01008999 if (likely(used < SCHED_CAPACITY_SCALE))
9000 return SCHED_CAPACITY_SCALE - used;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009001
Vincent Guittotb5b48602015-02-27 16:54:08 +01009002 return 1;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009003}
9004
Dietmar Eggemannbbb138b2015-09-26 18:19:54 +01009005void init_max_cpu_capacity(struct max_cpu_capacity *mcc)
9006{
9007 raw_spin_lock_init(&mcc->lock);
9008 mcc->val = 0;
9009 mcc->cpu = -1;
9010}
9011
Nicolas Pitreced549f2014-05-26 18:19:38 -04009012static void update_cpu_capacity(struct sched_domain *sd, int cpu)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009013{
Morten Rasmussen8cd56012015-08-14 17:23:10 +01009014 unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009015 struct sched_group *sdg = sd->groups;
Dietmar Eggemannbbb138b2015-09-26 18:19:54 +01009016 struct max_cpu_capacity *mcc;
9017 unsigned long max_capacity;
9018 int max_cap_cpu;
9019 unsigned long flags;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009020
Dietmar Eggemann29db1b42017-07-13 09:48:42 +01009021 capacity *= arch_scale_max_freq_capacity(sd, cpu);
9022 capacity >>= SCHED_CAPACITY_SHIFT;
9023
Pavankumar Kondeticf0babd2018-05-08 14:12:50 +05309024 capacity = min(capacity, thermal_cap(cpu));
9025 cpu_rq(cpu)->cpu_capacity_orig = capacity;
9026
Dietmar Eggemannbbb138b2015-09-26 18:19:54 +01009027 mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
9028
9029 raw_spin_lock_irqsave(&mcc->lock, flags);
9030 max_capacity = mcc->val;
9031 max_cap_cpu = mcc->cpu;
9032
9033 if ((max_capacity > capacity && max_cap_cpu == cpu) ||
9034 (max_capacity < capacity)) {
9035 mcc->val = capacity;
9036 mcc->cpu = cpu;
9037#ifdef CONFIG_SCHED_DEBUG
9038 raw_spin_unlock_irqrestore(&mcc->lock, flags);
Caesar Wangc1fdb252016-08-23 11:47:02 +01009039 printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
9040 cpu, capacity);
Dietmar Eggemannbbb138b2015-09-26 18:19:54 +01009041 goto skip_unlock;
9042#endif
9043 }
9044 raw_spin_unlock_irqrestore(&mcc->lock, flags);
9045
9046skip_unlock: __attribute__ ((unused));
Leo Yanc9507ba2016-12-22 07:58:00 -08009047
Nicolas Pitreced549f2014-05-26 18:19:38 -04009048 capacity *= scale_rt_capacity(cpu);
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04009049 capacity >>= SCHED_CAPACITY_SHIFT;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009050
Nicolas Pitreced549f2014-05-26 18:19:38 -04009051 if (!capacity)
9052 capacity = 1;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009053
Nicolas Pitreced549f2014-05-26 18:19:38 -04009054 cpu_rq(cpu)->cpu_capacity = capacity;
Pavankumar Kondetiedd112f2018-04-13 11:38:54 +05309055 if (!sd->child) {
9056 sdg->sgc->capacity = capacity;
9057 sdg->sgc->max_capacity = capacity;
9058 sdg->sgc->min_capacity = capacity;
9059 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009060}
9061
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04009062void update_group_capacity(struct sched_domain *sd, int cpu)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009063{
9064 struct sched_domain *child = sd->child;
9065 struct sched_group *group, *sdg = sd->groups;
Morten Rasmussen3d8cb902016-10-14 14:41:09 +01009066 unsigned long capacity, max_capacity, min_capacity;
Vincent Guittot4ec44122011-12-12 20:21:08 +01009067 unsigned long interval;
9068
9069 interval = msecs_to_jiffies(sd->balance_interval);
9070 interval = clamp(interval, 1UL, max_load_balance_interval);
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04009071 sdg->sgc->next_update = jiffies + interval;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009072
Pavankumar Kondetiedd112f2018-04-13 11:38:54 +05309073 /*
9074 * When there is only 1 CPU in the sched group of a higher
9075 * level sched domain (sd->child != NULL), the load balance
9076 * does not happen for the last level sched domain. Check
9077 * this condition and update the CPU capacity accordingly.
9078 */
9079 if (cpumask_weight(sched_group_cpus(sdg)) == 1) {
Nicolas Pitreced549f2014-05-26 18:19:38 -04009080 update_cpu_capacity(sd, cpu);
Pavankumar Kondetiedd112f2018-04-13 11:38:54 +05309081 if (!child)
9082 return;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009083 }
9084
Vincent Guittotdc7ff762015-03-03 11:35:03 +01009085 capacity = 0;
Morten Rasmussen5cdeb5f2016-02-25 12:43:49 +00009086 max_capacity = 0;
Morten Rasmussen3d8cb902016-10-14 14:41:09 +01009087 min_capacity = ULONG_MAX;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009088
Peter Zijlstra74a5ce22012-05-23 18:00:43 +02009089 if (child->flags & SD_OVERLAP) {
9090 /*
9091 * SD_OVERLAP domains cannot assume that child groups
9092 * span the current group.
9093 */
9094
Peter Zijlstra863bffc2013-08-28 11:44:39 +02009095 for_each_cpu(cpu, sched_group_cpus(sdg)) {
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04009096 struct sched_group_capacity *sgc;
Srikar Dronamraju9abf24d2013-11-12 22:11:26 +05309097 struct rq *rq = cpu_rq(cpu);
Peter Zijlstra863bffc2013-08-28 11:44:39 +02009098
Olav Haugan3f2cb302016-05-31 14:34:46 -07009099 if (cpumask_test_cpu(cpu, cpu_isolated_mask))
9100 continue;
Srikar Dronamraju9abf24d2013-11-12 22:11:26 +05309101 /*
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04009102 * build_sched_domains() -> init_sched_groups_capacity()
Srikar Dronamraju9abf24d2013-11-12 22:11:26 +05309103 * gets here before we've attached the domains to the
9104 * runqueues.
9105 *
Nicolas Pitreced549f2014-05-26 18:19:38 -04009106 * Use capacity_of(), which is set irrespective of domains
9107 * in update_cpu_capacity().
Srikar Dronamraju9abf24d2013-11-12 22:11:26 +05309108 *
Vincent Guittotdc7ff762015-03-03 11:35:03 +01009109 * This avoids capacity from being 0 and
Srikar Dronamraju9abf24d2013-11-12 22:11:26 +05309110 * causing divide-by-zero issues on boot.
Srikar Dronamraju9abf24d2013-11-12 22:11:26 +05309111 */
9112 if (unlikely(!rq->sd)) {
Nicolas Pitreced549f2014-05-26 18:19:38 -04009113 capacity += capacity_of(cpu);
Morten Rasmussen5cdeb5f2016-02-25 12:43:49 +00009114 } else {
9115 sgc = rq->sd->groups->sgc;
9116 capacity += sgc->capacity;
Srikar Dronamraju9abf24d2013-11-12 22:11:26 +05309117 }
9118
Morten Rasmussen5cdeb5f2016-02-25 12:43:49 +00009119 max_capacity = max(capacity, max_capacity);
Morten Rasmussen3d8cb902016-10-14 14:41:09 +01009120 min_capacity = min(capacity, min_capacity);
Peter Zijlstra863bffc2013-08-28 11:44:39 +02009121 }
Peter Zijlstra74a5ce22012-05-23 18:00:43 +02009122 } else {
9123 /*
9124 * !SD_OVERLAP domains can assume that child groups
9125 * span the current group.
Byungchul Park97a71422015-07-05 18:33:48 +09009126 */
Peter Zijlstra74a5ce22012-05-23 18:00:43 +02009127
9128 group = child->groups;
9129 do {
Morten Rasmussen5cdeb5f2016-02-25 12:43:49 +00009130 struct sched_group_capacity *sgc = group->sgc;
Olav Haugan3f2cb302016-05-31 14:34:46 -07009131 cpumask_t *cpus = sched_group_cpus(group);
9132
9133 /* Revisit this later. This won't work for MT domain */
Channagoud Kadabi8810e5f2017-02-17 16:01:05 -08009134 if (!cpu_isolated(cpumask_first(cpus))) {
9135 capacity += sgc->capacity;
9136 max_capacity = max(sgc->max_capacity, max_capacity);
Kyle Yane2486b72017-08-25 14:36:53 -07009137 min_capacity = min(sgc->min_capacity, min_capacity);
Channagoud Kadabi8810e5f2017-02-17 16:01:05 -08009138 }
Peter Zijlstra74a5ce22012-05-23 18:00:43 +02009139 group = group->next;
9140 } while (group != child->groups);
9141 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009142
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04009143 sdg->sgc->capacity = capacity;
Morten Rasmussen5cdeb5f2016-02-25 12:43:49 +00009144 sdg->sgc->max_capacity = max_capacity;
Morten Rasmussen3d8cb902016-10-14 14:41:09 +01009145 sdg->sgc->min_capacity = min_capacity;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009146}
9147
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10009148/*
Vincent Guittotea678212015-02-27 16:54:11 +01009149 * Check whether the capacity of the rq has been noticeably reduced by side
9150 * activity. The imbalance_pct is used for the threshold.
9151 * Return true is the capacity is reduced
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10009152 */
9153static inline int
Vincent Guittotea678212015-02-27 16:54:11 +01009154check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10009155{
Vincent Guittotea678212015-02-27 16:54:11 +01009156 return ((rq->cpu_capacity * sd->imbalance_pct) <
9157 (rq->cpu_capacity_orig * 100));
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10009158}
9159
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02009160/*
9161 * Group imbalance indicates (and tries to solve) the problem where balancing
9162 * groups is inadequate due to tsk_cpus_allowed() constraints.
9163 *
9164 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
9165 * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
9166 * Something like:
9167 *
9168 * { 0 1 2 3 } { 4 5 6 7 }
9169 * * * * *
9170 *
9171 * If we were to balance group-wise we'd place two tasks in the first group and
9172 * two tasks in the second group. Clearly this is undesired as it will overload
9173 * cpu 3 and leave one of the cpus in the second group unused.
9174 *
9175 * The current solution to this issue is detecting the skew in the first group
Peter Zijlstra62633222013-08-19 12:41:09 +02009176 * by noticing the lower domain failed to reach balance and had difficulty
9177 * moving tasks due to affinity constraints.
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02009178 *
9179 * When this is so detected; this group becomes a candidate for busiest; see
Kamalesh Babulaled1b7732013-10-13 23:06:15 +05309180 * update_sd_pick_busiest(). And calculate_imbalance() and
Peter Zijlstra62633222013-08-19 12:41:09 +02009181 * find_busiest_group() avoid some of the usual balance conditions to allow it
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02009182 * to create an effective group imbalance.
9183 *
9184 * This is a somewhat tricky proposition since the next run might not find the
9185 * group imbalance and decide the groups need to be balanced again. A most
9186 * subtle and fragile situation.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009187 */
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02009188
Peter Zijlstra62633222013-08-19 12:41:09 +02009189static inline int sg_imbalanced(struct sched_group *group)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009190{
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04009191 return group->sgc->imbalance;
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02009192}
9193
Peter Zijlstrab37d9312013-08-28 11:50:34 +02009194/*
Vincent Guittotea678212015-02-27 16:54:11 +01009195 * group_has_capacity returns true if the group has spare capacity that could
9196 * be used by some tasks.
9197 * We consider that a group has spare capacity if the * number of task is
Dietmar Eggemann9e91d612015-08-14 17:23:12 +01009198 * smaller than the number of CPUs or if the utilization is lower than the
9199 * available capacity for CFS tasks.
Vincent Guittotea678212015-02-27 16:54:11 +01009200 * For the latter, we use a threshold to stabilize the state, to take into
9201 * account the variance of the tasks' load and to return true if the available
9202 * capacity in meaningful for the load balancer.
9203 * As an example, an available capacity of 1% can appear but it doesn't make
9204 * any benefit for the load balance.
Peter Zijlstrab37d9312013-08-28 11:50:34 +02009205 */
Vincent Guittotea678212015-02-27 16:54:11 +01009206static inline bool
9207group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
Peter Zijlstrab37d9312013-08-28 11:50:34 +02009208{
Vincent Guittotea678212015-02-27 16:54:11 +01009209 if (sgs->sum_nr_running < sgs->group_weight)
9210 return true;
Peter Zijlstrab37d9312013-08-28 11:50:34 +02009211
Vincent Guittotea678212015-02-27 16:54:11 +01009212 if ((sgs->group_capacity * 100) >
Dietmar Eggemann9e91d612015-08-14 17:23:12 +01009213 (sgs->group_util * env->sd->imbalance_pct))
Vincent Guittotea678212015-02-27 16:54:11 +01009214 return true;
Peter Zijlstrab37d9312013-08-28 11:50:34 +02009215
Vincent Guittotea678212015-02-27 16:54:11 +01009216 return false;
Peter Zijlstrab37d9312013-08-28 11:50:34 +02009217}
9218
Vincent Guittotea678212015-02-27 16:54:11 +01009219/*
9220 * group_is_overloaded returns true if the group has more tasks than it can
9221 * handle.
9222 * group_is_overloaded is not equals to !group_has_capacity because a group
9223 * with the exact right number of tasks, has no more spare capacity but is not
9224 * overloaded so both group_has_capacity and group_is_overloaded return
9225 * false.
9226 */
9227static inline bool
9228group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
Rik van Rielcaeb1782014-07-28 14:16:28 -04009229{
Vincent Guittotea678212015-02-27 16:54:11 +01009230 if (sgs->sum_nr_running <= sgs->group_weight)
9231 return false;
9232
9233 if ((sgs->group_capacity * 100) <
Dietmar Eggemann9e91d612015-08-14 17:23:12 +01009234 (sgs->group_util * env->sd->imbalance_pct))
Vincent Guittotea678212015-02-27 16:54:11 +01009235 return true;
9236
9237 return false;
9238}
9239
Morten Rasmussenf95e8de2016-02-25 12:51:35 +00009240
9241/*
9242 * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
9243 * per-cpu capacity than sched_group ref.
9244 */
9245static inline bool
9246group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
9247{
Leo Yanc9507ba2016-12-22 07:58:00 -08009248 return sg->sgc->max_capacity < ref->sgc->max_capacity;
Morten Rasmussenf95e8de2016-02-25 12:51:35 +00009249}
9250
Leo Yan79a89f92015-09-15 18:56:45 +08009251static inline enum
9252group_type group_classify(struct sched_group *group,
9253 struct sg_lb_stats *sgs)
Vincent Guittotea678212015-02-27 16:54:11 +01009254{
9255 if (sgs->group_no_capacity)
Rik van Rielcaeb1782014-07-28 14:16:28 -04009256 return group_overloaded;
9257
9258 if (sg_imbalanced(group))
9259 return group_imbalanced;
9260
Morten Rasmussen4c6a8242016-02-25 12:47:54 +00009261 if (sgs->group_misfit_task)
9262 return group_misfit_task;
9263
Rik van Rielcaeb1782014-07-28 14:16:28 -04009264 return group_other;
9265}
9266
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05309267#ifdef CONFIG_NO_HZ_COMMON
9268/*
9269 * idle load balancing data
9270 * - used by the nohz balance, but we want it available here
9271 * so that we can see which CPUs have no tick.
9272 */
9273static struct {
9274 cpumask_var_t idle_cpus_mask;
9275 atomic_t nr_cpus;
9276 unsigned long next_balance; /* in jiffy units */
9277} nohz ____cacheline_aligned;
9278
9279static inline void update_cpu_stats_if_tickless(struct rq *rq)
9280{
9281 /* only called from update_sg_lb_stats when irqs are disabled */
9282 if (cpumask_test_cpu(rq->cpu, nohz.idle_cpus_mask)) {
9283 /* rate limit updates to once-per-jiffie at most */
9284 if (READ_ONCE(jiffies) <= rq->last_load_update_tick)
9285 return;
9286
9287 raw_spin_lock(&rq->lock);
9288 update_rq_clock(rq);
9289 cpu_load_update_idle(rq);
9290 update_cfs_rq_load_avg(rq->clock_task, &rq->cfs, false);
9291 raw_spin_unlock(&rq->lock);
9292 }
9293}
9294
9295#else
9296static inline void update_cpu_stats_if_tickless(struct rq *rq) { }
9297#endif
9298
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009299/**
9300 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
9301 * @env: The load balancing environment.
9302 * @group: sched_group whose statistics are to be updated.
9303 * @load_idx: Load index of sched_domain of this_cpu for load calc.
9304 * @local_group: Does group contain this_cpu.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009305 * @sgs: variable to hold the statistics for this group.
Masanari Iidacd3bd4e2014-07-28 12:38:06 +09009306 * @overload: Indicate more than one runnable task for any CPU.
Morten Rasmussena562dfc2015-05-09 16:49:57 +01009307 * @overutilized: Indicate overutilization for any CPU.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009308 */
9309static inline void update_sg_lb_stats(struct lb_env *env,
9310 struct sched_group *group, int load_idx,
Tim Chen4486edd2014-06-23 12:16:49 -07009311 int local_group, struct sg_lb_stats *sgs,
Morten Rasmussena562dfc2015-05-09 16:49:57 +01009312 bool *overload, bool *overutilized)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009313{
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02009314 unsigned long load;
Waiman Longa426f992015-11-25 14:09:38 -05009315 int i, nr_running;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009316
Peter Zijlstrab72ff132013-08-28 10:32:32 +02009317 memset(sgs, 0, sizeof(*sgs));
9318
Michael Wangb94031302012-07-12 16:10:13 +08009319 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009320 struct rq *rq = cpu_rq(i);
9321
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009322 trace_sched_cpu_load_lb(cpu_rq(i), idle_cpu(i),
9323 sched_irqload(i),
Puja Gupta487dec62017-06-27 10:13:50 -07009324 power_cost(i, 0));
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009325
Olav Haugan3f2cb302016-05-31 14:34:46 -07009326 if (cpu_isolated(i))
9327 continue;
9328
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +05309329 /* if we are entering idle and there are CPUs with
9330 * their tick stopped, do an update for them
9331 */
9332 if (env->idle == CPU_NEWLY_IDLE)
9333 update_cpu_stats_if_tickless(rq);
9334
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009335 /* Bias balancing toward cpus of our domain */
Peter Zijlstra62633222013-08-19 12:41:09 +02009336 if (local_group)
Peter Zijlstra04f733b2012-05-11 00:12:02 +02009337 load = target_load(i, load_idx);
Peter Zijlstra62633222013-08-19 12:41:09 +02009338 else
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009339 load = source_load(i, load_idx);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009340
9341 sgs->group_load += load;
Dietmar Eggemann9e91d612015-08-14 17:23:12 +01009342 sgs->group_util += cpu_util(i);
Vincent Guittot65fdac02014-08-26 13:06:46 +02009343 sgs->sum_nr_running += rq->cfs.h_nr_running;
Tim Chen4486edd2014-06-23 12:16:49 -07009344
Waiman Longa426f992015-11-25 14:09:38 -05009345 nr_running = rq->nr_running;
9346 if (nr_running > 1)
Tim Chen4486edd2014-06-23 12:16:49 -07009347 *overload = true;
9348
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01009349#ifdef CONFIG_NUMA_BALANCING
9350 sgs->nr_numa_running += rq->nr_numa_running;
9351 sgs->nr_preferred_running += rq->nr_preferred_running;
9352#endif
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009353 sgs->sum_weighted_load += weighted_cpuload(i);
Waiman Longa426f992015-11-25 14:09:38 -05009354 /*
9355 * No need to call idle_cpu() if nr_running is not 0
9356 */
9357 if (!nr_running && idle_cpu(i))
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07009358 sgs->idle_cpus++;
Morten Rasmussena562dfc2015-05-09 16:49:57 +01009359
Leo Yane52c5092016-12-22 23:58:49 +08009360 if (cpu_overutilized(i))
Morten Rasmussena562dfc2015-05-09 16:49:57 +01009361 *overutilized = true;
Leo Yane52c5092016-12-22 23:58:49 +08009362
9363 if (!sgs->group_misfit_task && rq->misfit_task)
9364 sgs->group_misfit_task = capacity_of(i);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009365 }
9366
Olav Haugan3f2cb302016-05-31 14:34:46 -07009367 /* Isolated CPU has no weight */
9368 if (!group->group_weight) {
9369 sgs->group_capacity = 0;
9370 sgs->avg_load = 0;
9371 sgs->group_no_capacity = 1;
9372 sgs->group_type = group_other;
9373 sgs->group_weight = group->group_weight;
9374 } else {
9375 /* Adjust by relative CPU capacity of the group */
9376 sgs->group_capacity = group->sgc->capacity;
9377 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) /
9378 sgs->group_capacity;
9379
9380 sgs->group_weight = group->group_weight;
9381
9382 sgs->group_no_capacity = group_is_overloaded(env, sgs);
9383 sgs->group_type = group_classify(group, sgs);
9384 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009385
Suresh Siddhadd5feea2010-02-23 16:13:52 -08009386 if (sgs->sum_nr_running)
Peter Zijlstra38d0f772013-08-15 19:47:56 +02009387 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009388}
9389
9390/**
Michael Neuling532cb4c2010-06-08 14:57:02 +10009391 * update_sd_pick_busiest - return 1 on busiest group
Randy Dunlapcd968912012-06-08 13:18:33 -07009392 * @env: The load balancing environment.
Michael Neuling532cb4c2010-06-08 14:57:02 +10009393 * @sds: sched_domain statistics
9394 * @sg: sched_group candidate to be checked for being the busiest
Michael Neulingb6b12292010-06-10 12:06:21 +10009395 * @sgs: sched_group statistics
Michael Neuling532cb4c2010-06-08 14:57:02 +10009396 *
9397 * Determine if @sg is a busier group than the previously selected
9398 * busiest group.
Yacine Belkadie69f6182013-07-12 20:45:47 +02009399 *
9400 * Return: %true if @sg is a busier group than the previously selected
9401 * busiest group. %false otherwise.
Michael Neuling532cb4c2010-06-08 14:57:02 +10009402 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009403static bool update_sd_pick_busiest(struct lb_env *env,
Michael Neuling532cb4c2010-06-08 14:57:02 +10009404 struct sd_lb_stats *sds,
9405 struct sched_group *sg,
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009406 struct sg_lb_stats *sgs)
Michael Neuling532cb4c2010-06-08 14:57:02 +10009407{
Rik van Rielcaeb1782014-07-28 14:16:28 -04009408 struct sg_lb_stats *busiest = &sds->busiest_stat;
Michael Neuling532cb4c2010-06-08 14:57:02 +10009409
Rik van Rielcaeb1782014-07-28 14:16:28 -04009410 if (sgs->group_type > busiest->group_type)
Michael Neuling532cb4c2010-06-08 14:57:02 +10009411 return true;
9412
Rik van Rielcaeb1782014-07-28 14:16:28 -04009413 if (sgs->group_type < busiest->group_type)
9414 return false;
9415
Morten Rasmussenf95e8de2016-02-25 12:51:35 +00009416 /*
9417 * Candidate sg doesn't face any serious load-balance problems
9418 * so don't pick it if the local sg is already filled up.
9419 */
9420 if (sgs->group_type == group_other &&
9421 !group_has_capacity(env, &sds->local_stat))
9422 return false;
9423
Rik van Rielcaeb1782014-07-28 14:16:28 -04009424 if (sgs->avg_load <= busiest->avg_load)
9425 return false;
9426
Morten Rasmussen942295e2016-10-14 14:41:10 +01009427 if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
9428 goto asym_packing;
9429
Morten Rasmussenb19cdb92016-02-25 12:51:35 +00009430 /*
Morten Rasmussen942295e2016-10-14 14:41:10 +01009431 * Candidate sg has no more than one task per CPU and
9432 * has higher per-CPU capacity. Migrating tasks to less
9433 * capable CPUs may harm throughput. Maximize throughput,
9434 * power/energy consequences are not considered.
Morten Rasmussenb19cdb92016-02-25 12:51:35 +00009435 */
Kyle Yane2486b72017-08-25 14:36:53 -07009436
Morten Rasmussenf95e8de2016-02-25 12:51:35 +00009437 /*
9438 * Candiate sg has no more than one task per cpu and has higher
9439 * per-cpu capacity. No reason to pull tasks to less capable cpus.
9440 */
9441 if (sgs->sum_nr_running <= sgs->group_weight &&
9442 group_smaller_cpu_capacity(sds->local, sg))
9443 return false;
9444
Morten Rasmussen942295e2016-10-14 14:41:10 +01009445asym_packing:
Rik van Rielcaeb1782014-07-28 14:16:28 -04009446 /* This is the busiest node in its class. */
9447 if (!(env->sd->flags & SD_ASYM_PACKING))
Michael Neuling532cb4c2010-06-08 14:57:02 +10009448 return true;
9449
Srikar Dronamraju1f621e02016-04-06 18:47:40 +05309450 /* No ASYM_PACKING if target cpu is already busy */
9451 if (env->idle == CPU_NOT_IDLE)
9452 return true;
Michael Neuling532cb4c2010-06-08 14:57:02 +10009453 /*
9454 * ASYM_PACKING needs to move all the work to the lowest
9455 * numbered CPUs in the group, therefore mark all groups
9456 * higher than ourself as busy.
9457 */
Rik van Rielcaeb1782014-07-28 14:16:28 -04009458 if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
Michael Neuling532cb4c2010-06-08 14:57:02 +10009459 if (!sds->busiest)
9460 return true;
9461
Srikar Dronamraju1f621e02016-04-06 18:47:40 +05309462 /* Prefer to move from highest possible cpu's work */
9463 if (group_first_cpu(sds->busiest) < group_first_cpu(sg))
Michael Neuling532cb4c2010-06-08 14:57:02 +10009464 return true;
9465 }
9466
9467 return false;
9468}
9469
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01009470#ifdef CONFIG_NUMA_BALANCING
9471static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
9472{
9473 if (sgs->sum_nr_running > sgs->nr_numa_running)
9474 return regular;
9475 if (sgs->sum_nr_running > sgs->nr_preferred_running)
9476 return remote;
9477 return all;
9478}
9479
9480static inline enum fbq_type fbq_classify_rq(struct rq *rq)
9481{
9482 if (rq->nr_running > rq->nr_numa_running)
9483 return regular;
9484 if (rq->nr_running > rq->nr_preferred_running)
9485 return remote;
9486 return all;
9487}
9488#else
9489static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
9490{
9491 return all;
9492}
9493
9494static inline enum fbq_type fbq_classify_rq(struct rq *rq)
9495{
9496 return regular;
9497}
9498#endif /* CONFIG_NUMA_BALANCING */
9499
Dietmar Eggemann06654992015-07-30 16:53:30 +01009500#define lb_sd_parent(sd) \
9501 (sd->parent && sd->parent->groups != sd->parent->groups->next)
9502
Michael Neuling532cb4c2010-06-08 14:57:02 +10009503/**
Hui Kang461819a2011-10-11 23:00:59 -04009504 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
Randy Dunlapcd968912012-06-08 13:18:33 -07009505 * @env: The load balancing environment.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009506 * @sds: variable to hold the statistics for this sched_domain.
9507 */
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01009508static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009509{
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009510 struct sched_domain *child = env->sd->child;
9511 struct sched_group *sg = env->sd->groups;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009512 struct sg_lb_stats tmp_sgs;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009513 int load_idx, prefer_sibling = 0;
Morten Rasmussena562dfc2015-05-09 16:49:57 +01009514 bool overload = false, overutilized = false;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009515
9516 if (child && child->flags & SD_PREFER_SIBLING)
9517 prefer_sibling = 1;
9518
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009519 load_idx = get_sd_load_idx(env->sd, env->idle);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009520
9521 do {
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009522 struct sg_lb_stats *sgs = &tmp_sgs;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009523 int local_group;
9524
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009525 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009526 if (local_group) {
9527 sds->local = sg;
9528 sgs = &sds->local_stat;
Peter Zijlstrab72ff132013-08-28 10:32:32 +02009529
9530 if (env->idle != CPU_NEWLY_IDLE ||
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04009531 time_after_eq(jiffies, sg->sgc->next_update))
9532 update_group_capacity(env->sd, env->dst_cpu);
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009533 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009534
Tim Chen4486edd2014-06-23 12:16:49 -07009535 update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
Morten Rasmussena562dfc2015-05-09 16:49:57 +01009536 &overload, &overutilized);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009537
Peter Zijlstrab72ff132013-08-28 10:32:32 +02009538 if (local_group)
9539 goto next_group;
9540
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009541 /*
9542 * In case the child domain prefers tasks go to siblings
Vincent Guittotea678212015-02-27 16:54:11 +01009543 * first, lower the sg capacity so that we'll try
Nikhil Rao75dd3212010-10-15 13:12:30 -07009544 * and move all the excess tasks away. We lower the capacity
9545 * of a group only if the local group has the capacity to fit
Vincent Guittotea678212015-02-27 16:54:11 +01009546 * these excess tasks. The extra check prevents the case where
9547 * you always pull from the heaviest group when it is already
9548 * under-utilized (possible with a large weight task outweighs
9549 * the tasks on the system).
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009550 */
Peter Zijlstrab72ff132013-08-28 10:32:32 +02009551 if (prefer_sibling && sds->local &&
Vincent Guittotea678212015-02-27 16:54:11 +01009552 group_has_capacity(env, &sds->local_stat) &&
9553 (sgs->sum_nr_running > 1)) {
9554 sgs->group_no_capacity = 1;
Leo Yan79a89f92015-09-15 18:56:45 +08009555 sgs->group_type = group_classify(sg, sgs);
Wanpeng Licb0b9f22014-11-05 07:44:50 +08009556 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009557
Morten Rasmussenf95e8de2016-02-25 12:51:35 +00009558 /*
9559 * Ignore task groups with misfit tasks if local group has no
9560 * capacity or if per-cpu capacity isn't higher.
9561 */
9562 if (sgs->group_type == group_misfit_task && sds->local &&
9563 (!group_has_capacity(env, &sds->local_stat) ||
9564 !group_smaller_cpu_capacity(sg, sds->local)))
9565 sgs->group_type = group_other;
9566
Peter Zijlstrab72ff132013-08-28 10:32:32 +02009567 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
Michael Neuling532cb4c2010-06-08 14:57:02 +10009568 sds->busiest = sg;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009569 sds->busiest_stat = *sgs;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009570 env->busiest_nr_running = sgs->sum_nr_running;
9571 env->busiest_grp_capacity = sgs->group_capacity;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009572 }
9573
Peter Zijlstrab72ff132013-08-28 10:32:32 +02009574next_group:
9575 /* Now, start updating sd_lb_stats */
9576 sds->total_load += sgs->group_load;
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04009577 sds->total_capacity += sgs->group_capacity;
Peter Zijlstrab72ff132013-08-28 10:32:32 +02009578
Michael Neuling532cb4c2010-06-08 14:57:02 +10009579 sg = sg->next;
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009580 } while (sg != env->sd->groups);
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01009581
9582 if (env->sd->flags & SD_NUMA)
9583 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
Tim Chen4486edd2014-06-23 12:16:49 -07009584
Morten Rasmussen94beeae2015-07-02 17:16:34 +01009585 env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
9586
Dietmar Eggemann06654992015-07-30 16:53:30 +01009587 if (!lb_sd_parent(env->sd)) {
Tim Chen4486edd2014-06-23 12:16:49 -07009588 /* update overload indicator if we are at root domain */
9589 if (env->dst_rq->rd->overload != overload)
9590 env->dst_rq->rd->overload = overload;
Tim Chen4486edd2014-06-23 12:16:49 -07009591
Morten Rasmussena562dfc2015-05-09 16:49:57 +01009592 /* Update over-utilization (tipping point, U >= 0) indicator */
Patrick Bellasi8e45d942016-02-10 09:24:36 +00009593 if (env->dst_rq->rd->overutilized != overutilized) {
Morten Rasmussena562dfc2015-05-09 16:49:57 +01009594 env->dst_rq->rd->overutilized = overutilized;
Patrick Bellasi8e45d942016-02-10 09:24:36 +00009595 trace_sched_overutilized(overutilized);
9596 }
Morten Rasmussena562dfc2015-05-09 16:49:57 +01009597 } else {
Patrick Bellasi8e45d942016-02-10 09:24:36 +00009598 if (!env->dst_rq->rd->overutilized && overutilized) {
Morten Rasmussena562dfc2015-05-09 16:49:57 +01009599 env->dst_rq->rd->overutilized = true;
Patrick Bellasi8e45d942016-02-10 09:24:36 +00009600 trace_sched_overutilized(true);
9601 }
Michael Neuling532cb4c2010-06-08 14:57:02 +10009602 }
9603
9604}
9605
Michael Neuling532cb4c2010-06-08 14:57:02 +10009606/**
9607 * check_asym_packing - Check to see if the group is packed into the
9608 * sched doman.
9609 *
9610 * This is primarily intended to used at the sibling level. Some
9611 * cores like POWER7 prefer to use lower numbered SMT threads. In the
9612 * case of POWER7, it can move to lower SMT modes only when higher
9613 * threads are idle. When in lower SMT modes, the threads will
9614 * perform better since they share less core resources. Hence when we
9615 * have idle threads, we want them to be the higher ones.
9616 *
9617 * This packing function is run on idle threads. It checks to see if
9618 * the busiest CPU in this domain (core in the P7 case) has a higher
9619 * CPU number than the packing function is being run on. Here we are
9620 * assuming lower CPU number will be equivalent to lower a SMT thread
9621 * number.
9622 *
Yacine Belkadie69f6182013-07-12 20:45:47 +02009623 * Return: 1 when packing is required and a task should be moved to
Michael Neulingb6b12292010-06-10 12:06:21 +10009624 * this CPU. The amount of the imbalance is returned in *imbalance.
9625 *
Randy Dunlapcd968912012-06-08 13:18:33 -07009626 * @env: The load balancing environment.
Michael Neuling532cb4c2010-06-08 14:57:02 +10009627 * @sds: Statistics of the sched_domain which is to be packed
Michael Neuling532cb4c2010-06-08 14:57:02 +10009628 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009629static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
Michael Neuling532cb4c2010-06-08 14:57:02 +10009630{
9631 int busiest_cpu;
9632
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009633 if (!(env->sd->flags & SD_ASYM_PACKING))
Michael Neuling532cb4c2010-06-08 14:57:02 +10009634 return 0;
9635
Srikar Dronamraju1f621e02016-04-06 18:47:40 +05309636 if (env->idle == CPU_NOT_IDLE)
9637 return 0;
9638
Michael Neuling532cb4c2010-06-08 14:57:02 +10009639 if (!sds->busiest)
9640 return 0;
9641
9642 busiest_cpu = group_first_cpu(sds->busiest);
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009643 if (env->dst_cpu > busiest_cpu)
Michael Neuling532cb4c2010-06-08 14:57:02 +10009644 return 0;
9645
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009646 env->imbalance = DIV_ROUND_CLOSEST(
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04009647 sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04009648 SCHED_CAPACITY_SCALE);
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009649
Michael Neuling532cb4c2010-06-08 14:57:02 +10009650 return 1;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009651}
9652
9653/**
9654 * fix_small_imbalance - Calculate the minor imbalance that exists
9655 * amongst the groups of a sched_domain, during
9656 * load balancing.
Randy Dunlapcd968912012-06-08 13:18:33 -07009657 * @env: The load balancing environment.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009658 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009659 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009660static inline
9661void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009662{
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04009663 unsigned long tmp, capa_now = 0, capa_move = 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009664 unsigned int imbn = 2;
Suresh Siddhadd5feea2010-02-23 16:13:52 -08009665 unsigned long scaled_busy_load_per_task;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009666 struct sg_lb_stats *local, *busiest;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009667
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009668 local = &sds->local_stat;
9669 busiest = &sds->busiest_stat;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009670
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009671 if (!local->sum_nr_running)
9672 local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
9673 else if (busiest->load_per_task > local->load_per_task)
9674 imbn = 1;
Suresh Siddhadd5feea2010-02-23 16:13:52 -08009675
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009676 scaled_busy_load_per_task =
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04009677 (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04009678 busiest->group_capacity;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009679
Vladimir Davydov3029ede2013-09-15 17:49:14 +04009680 if (busiest->avg_load + scaled_busy_load_per_task >=
9681 local->avg_load + (scaled_busy_load_per_task * imbn)) {
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009682 env->imbalance = busiest->load_per_task;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009683 return;
9684 }
9685
9686 /*
9687 * OK, we don't have enough imbalance to justify moving tasks,
Nicolas Pitreced549f2014-05-26 18:19:38 -04009688 * however we may be able to increase total CPU capacity used by
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009689 * moving them.
9690 */
9691
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04009692 capa_now += busiest->group_capacity *
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009693 min(busiest->load_per_task, busiest->avg_load);
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04009694 capa_now += local->group_capacity *
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009695 min(local->load_per_task, local->avg_load);
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04009696 capa_now /= SCHED_CAPACITY_SCALE;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009697
9698 /* Amount of load we'd subtract */
Vincent Guittota2cd4262014-03-11 17:26:06 +01009699 if (busiest->avg_load > scaled_busy_load_per_task) {
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04009700 capa_move += busiest->group_capacity *
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009701 min(busiest->load_per_task,
Vincent Guittota2cd4262014-03-11 17:26:06 +01009702 busiest->avg_load - scaled_busy_load_per_task);
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009703 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009704
9705 /* Amount of load we'd add */
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04009706 if (busiest->avg_load * busiest->group_capacity <
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04009707 busiest->load_per_task * SCHED_CAPACITY_SCALE) {
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04009708 tmp = (busiest->avg_load * busiest->group_capacity) /
9709 local->group_capacity;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009710 } else {
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04009711 tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04009712 local->group_capacity;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009713 }
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04009714 capa_move += local->group_capacity *
Peter Zijlstra3ae11c92013-08-15 20:37:48 +02009715 min(local->load_per_task, local->avg_load + tmp);
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04009716 capa_move /= SCHED_CAPACITY_SCALE;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009717
9718 /* Move if we gain throughput */
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04009719 if (capa_move > capa_now)
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009720 env->imbalance = busiest->load_per_task;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009721}
9722
9723/**
9724 * calculate_imbalance - Calculate the amount of imbalance present within the
9725 * groups of a given sched_domain during load balance.
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009726 * @env: load balance environment
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009727 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009728 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009729static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009730{
Suresh Siddhadd5feea2010-02-23 16:13:52 -08009731 unsigned long max_pull, load_above_capacity = ~0UL;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009732 struct sg_lb_stats *local, *busiest;
Suresh Siddhadd5feea2010-02-23 16:13:52 -08009733
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009734 local = &sds->local_stat;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009735 busiest = &sds->busiest_stat;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009736
Rik van Rielcaeb1782014-07-28 14:16:28 -04009737 if (busiest->group_type == group_imbalanced) {
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02009738 /*
9739 * In the group_imb case we cannot rely on group-wide averages
9740 * to ensure cpu-load equilibrium, look at wider averages. XXX
9741 */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009742 busiest->load_per_task =
9743 min(busiest->load_per_task, sds->avg_load);
Suresh Siddhadd5feea2010-02-23 16:13:52 -08009744 }
9745
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009746 /*
Dietmar Eggemann885e5422016-04-29 20:32:39 +01009747 * Avg load of busiest sg can be less and avg load of local sg can
9748 * be greater than avg load across all sgs of sd because avg load
9749 * factors in sg capacity and sgs with smaller group_type are
9750 * skipped when updating the busiest sg:
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009751 */
Vladimir Davydovb1885552013-09-15 17:49:13 +04009752 if (busiest->avg_load <= sds->avg_load ||
9753 local->avg_load >= sds->avg_load) {
Morten Rasmussenf95e8de2016-02-25 12:51:35 +00009754 /* Misfitting tasks should be migrated in any case */
9755 if (busiest->group_type == group_misfit_task) {
9756 env->imbalance = busiest->group_misfit_task;
9757 return;
9758 }
9759
9760 /*
9761 * Busiest group is overloaded, local is not, use the spare
9762 * cycles to maximize throughput
9763 */
9764 if (busiest->group_type == group_overloaded &&
9765 local->group_type <= group_misfit_task) {
9766 env->imbalance = busiest->load_per_task;
9767 return;
9768 }
9769
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009770 env->imbalance = 0;
9771 return fix_small_imbalance(env, sds);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009772 }
9773
Peter Zijlstra9a5d9ba2014-07-29 17:15:11 +02009774 /*
9775 * If there aren't any idle cpus, avoid creating some.
9776 */
9777 if (busiest->group_type == group_overloaded &&
9778 local->group_type == group_overloaded) {
Peter Zijlstra1be0eb22016-05-06 12:21:23 +02009779 load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
Morten Rasmussencfa10332016-04-29 20:32:40 +01009780 if (load_above_capacity > busiest->group_capacity) {
Vincent Guittotea678212015-02-27 16:54:11 +01009781 load_above_capacity -= busiest->group_capacity;
Dietmar Eggemann26656212016-08-10 11:27:27 +01009782 load_above_capacity *= scale_load_down(NICE_0_LOAD);
Morten Rasmussencfa10332016-04-29 20:32:40 +01009783 load_above_capacity /= busiest->group_capacity;
9784 } else
Vincent Guittotea678212015-02-27 16:54:11 +01009785 load_above_capacity = ~0UL;
Suresh Siddhadd5feea2010-02-23 16:13:52 -08009786 }
9787
9788 /*
9789 * We're trying to get all the cpus to the average_load, so we don't
9790 * want to push ourselves above the average load, nor do we wish to
9791 * reduce the max loaded cpu below the average load. At the same time,
Dietmar Eggemann0a9b23c2016-04-29 20:32:38 +01009792 * we also don't want to reduce the group load below the group
9793 * capacity. Thus we look for the minimum possible imbalance.
Suresh Siddhadd5feea2010-02-23 16:13:52 -08009794 */
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02009795 max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009796
9797 /* How much load to actually move to equalise the imbalance */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009798 env->imbalance = min(
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04009799 max_pull * busiest->group_capacity,
9800 (sds->avg_load - local->avg_load) * local->group_capacity
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04009801 ) / SCHED_CAPACITY_SCALE;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009802
Morten Rasmussenf95e8de2016-02-25 12:51:35 +00009803 /* Boost imbalance to allow misfit task to be balanced. */
9804 if (busiest->group_type == group_misfit_task)
9805 env->imbalance = max_t(long, env->imbalance,
9806 busiest->group_misfit_task);
9807
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009808 /*
9809 * if *imbalance is less than the average load per runnable task
Lucas De Marchi25985ed2011-03-30 22:57:33 -03009810 * there is no guarantee that any tasks will be moved so we'll have
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009811 * a think about bumping its value to force at least one task to be
9812 * moved
9813 */
Pavankumar Kondeti21910c22018-08-28 13:37:09 +05309814 if (env->imbalance < busiest->load_per_task) {
9815 /*
9816 * The busiest group is overloaded so it could use help
9817 * from the other groups. If the local group has idle CPUs
9818 * and it is not overloaded and has no imbalance with in
9819 * the group, allow the load balance by bumping the
9820 * imbalance.
9821 */
9822 if (busiest->group_type == group_overloaded &&
9823 local->group_type <= group_misfit_task &&
9824 env->idle != CPU_NOT_IDLE) {
9825 env->imbalance = busiest->load_per_task;
9826 return;
9827 }
9828
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009829 return fix_small_imbalance(env, sds);
Pavankumar Kondeti21910c22018-08-28 13:37:09 +05309830 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009831}
Nikhil Raofab47622010-10-15 13:12:29 -07009832
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009833/******* find_busiest_group() helpers end here *********************/
9834
9835/**
9836 * find_busiest_group - Returns the busiest group within the sched_domain
Dietmar Eggemann0a9b23c2016-04-29 20:32:38 +01009837 * if there is an imbalance.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009838 *
9839 * Also calculates the amount of weighted load which should be moved
9840 * to restore balance.
9841 *
Randy Dunlapcd968912012-06-08 13:18:33 -07009842 * @env: The load balancing environment.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009843 *
Yacine Belkadie69f6182013-07-12 20:45:47 +02009844 * Return: - The busiest group if imbalance exists.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009845 */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009846static struct sched_group *find_busiest_group(struct lb_env *env)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009847{
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009848 struct sg_lb_stats *local, *busiest;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009849 struct sd_lb_stats sds;
9850
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02009851 init_sd_lb_stats(&sds);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009852
9853 /*
9854 * Compute the various statistics relavent for load balancing at
9855 * this level.
9856 */
Joonsoo Kim23f0d202013-08-06 17:36:42 +09009857 update_sd_lb_stats(env, &sds);
Dietmar Eggemann53065e82015-05-10 15:17:32 +01009858
Joonwoo Park5be62152017-02-09 14:45:57 -08009859 if (energy_aware() && !env->dst_rq->rd->overutilized) {
9860 int cpu_local, cpu_busiest;
Pavankumar Kondeti79830a22017-11-20 11:34:10 +05309861 unsigned long energy_local, energy_busiest;
Joonwoo Park5be62152017-02-09 14:45:57 -08009862
9863 if (env->idle != CPU_NEWLY_IDLE)
9864 goto out_balanced;
9865
9866 if (!sds.local || !sds.busiest)
9867 goto out_balanced;
9868
9869 cpu_local = group_first_cpu(sds.local);
9870 cpu_busiest = group_first_cpu(sds.busiest);
9871
Pavankumar Kondeti79830a22017-11-20 11:34:10 +05309872 /* TODO: don't assume same energy cpus are in same domain */
9873 energy_local = cpu_max_power_cost(cpu_local);
9874 energy_busiest = cpu_max_power_cost(cpu_busiest);
9875 if (energy_local > energy_busiest) {
Joonwoo Park5be62152017-02-09 14:45:57 -08009876 goto out_balanced;
Pavankumar Kondeti79830a22017-11-20 11:34:10 +05309877 } else if (energy_local == energy_busiest) {
Joonwoo Park5be62152017-02-09 14:45:57 -08009878 if (cpu_rq(cpu_busiest)->nr_running < 2)
9879 goto out_balanced;
Joonwoo Park5be62152017-02-09 14:45:57 -08009880 }
9881 }
Dietmar Eggemann53065e82015-05-10 15:17:32 +01009882
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009883 local = &sds.local_stat;
9884 busiest = &sds.busiest_stat;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009885
Vincent Guittotea678212015-02-27 16:54:11 +01009886 /* ASYM feature bypasses nice load balance check */
Srikar Dronamraju1f621e02016-04-06 18:47:40 +05309887 if (check_asym_packing(env, &sds))
Michael Neuling532cb4c2010-06-08 14:57:02 +10009888 return sds.busiest;
9889
Peter Zijlstracc57aa82011-02-21 18:55:32 +01009890 /* There is no busy sibling group to pull tasks from */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009891 if (!sds.busiest || busiest->sum_nr_running == 0)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009892 goto out_balanced;
9893
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07009894 if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE)
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009895 goto force_balance;
9896
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04009897 sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
9898 / sds.total_capacity;
Ken Chenb0432d82011-04-07 17:23:22 -07009899
Peter Zijlstra866ab432011-02-21 18:56:47 +01009900 /*
9901 * If the busiest group is imbalanced the below checks don't
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02009902 * work because they assume all things are equal, which typically
Peter Zijlstra866ab432011-02-21 18:56:47 +01009903 * isn't true due to cpus_allowed constraints and the like.
9904 */
Rik van Rielcaeb1782014-07-28 14:16:28 -04009905 if (busiest->group_type == group_imbalanced)
Peter Zijlstra866ab432011-02-21 18:56:47 +01009906 goto force_balance;
9907
tip-bot for Jacob Shin3d9aec72017-10-10 03:59:06 -07009908 /*
9909 * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
9910 * capacities from resulting in underutilization due to avg_load.
9911 */
9912 if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
Vincent Guittotea678212015-02-27 16:54:11 +01009913 busiest->group_no_capacity)
Nikhil Raofab47622010-10-15 13:12:29 -07009914 goto force_balance;
9915
Morten Rasmussenf95e8de2016-02-25 12:51:35 +00009916 /* Misfitting tasks should be dealt with regardless of the avg load */
9917 if (busiest->group_type == group_misfit_task) {
9918 goto force_balance;
9919 }
9920
Peter Zijlstracc57aa82011-02-21 18:55:32 +01009921 /*
Zhihui Zhang9c58c792014-09-20 21:24:36 -04009922 * If the local group is busier than the selected busiest group
Peter Zijlstracc57aa82011-02-21 18:55:32 +01009923 * don't try and pull any tasks.
9924 */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009925 if (local->avg_load >= busiest->avg_load)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009926 goto out_balanced;
9927
Peter Zijlstracc57aa82011-02-21 18:55:32 +01009928 /*
9929 * Don't pull any tasks if this group is already above the domain
9930 * average load.
9931 */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009932 if (local->avg_load >= sds.avg_load)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009933 goto out_balanced;
9934
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009935 if (env->idle == CPU_IDLE) {
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07009936 /*
Vincent Guittot43f4d662014-10-01 15:38:55 +02009937 * This cpu is idle. If the busiest group is not overloaded
9938 * and there is no imbalance between this and busiest group
9939 * wrt idle cpus, it is balanced. The imbalance becomes
9940 * significant if the diff is greater than 1 otherwise we
9941 * might end up to just move the imbalance on another group
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07009942 */
Vincent Guittot43f4d662014-10-01 15:38:55 +02009943 if ((busiest->group_type != group_overloaded) &&
Morten Rasmussenf95e8de2016-02-25 12:51:35 +00009944 (local->idle_cpus <= (busiest->idle_cpus + 1)) &&
9945 !group_smaller_cpu_capacity(sds.busiest, sds.local))
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07009946 goto out_balanced;
Peter Zijlstrac186faf2011-02-21 18:52:53 +01009947 } else {
9948 /*
9949 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
9950 * imbalance_pct to be conservative.
9951 */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009952 if (100 * busiest->avg_load <=
9953 env->sd->imbalance_pct * local->avg_load)
Peter Zijlstrac186faf2011-02-21 18:52:53 +01009954 goto out_balanced;
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07009955 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009956
Nikhil Raofab47622010-10-15 13:12:29 -07009957force_balance:
Morten Rasmussenf95e8de2016-02-25 12:51:35 +00009958 env->busiest_group_type = busiest->group_type;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009959 /* Looks like there is an imbalance. Compute it */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009960 calculate_imbalance(env, &sds);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009961 return sds.busiest;
9962
9963out_balanced:
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009964 env->imbalance = 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009965 return NULL;
9966}
9967
9968/*
9969 * find_busiest_queue - find the busiest runqueue among the cpus in group.
9970 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009971static struct rq *find_busiest_queue(struct lb_env *env,
Michael Wangb94031302012-07-12 16:10:13 +08009972 struct sched_group *group)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009973{
9974 struct rq *busiest = NULL, *rq;
Nicolas Pitreced549f2014-05-26 18:19:38 -04009975 unsigned long busiest_load = 0, busiest_capacity = 1;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009976 int i;
9977
Peter Zijlstra6906a402013-08-19 15:20:21 +02009978 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
Vincent Guittotea678212015-02-27 16:54:11 +01009979 unsigned long capacity, wl;
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01009980 enum fbq_type rt;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009981
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01009982 rq = cpu_rq(i);
9983 rt = fbq_classify_rq(rq);
9984
9985 /*
9986 * We classify groups/runqueues into three groups:
9987 * - regular: there are !numa tasks
9988 * - remote: there are numa tasks that run on the 'wrong' node
9989 * - all: there is no distinction
9990 *
9991 * In order to avoid migrating ideally placed numa tasks,
9992 * ignore those when there's better options.
9993 *
9994 * If we ignore the actual busiest queue to migrate another
9995 * task, the next balance pass can still reduce the busiest
9996 * queue by moving tasks around inside the node.
9997 *
9998 * If we cannot move enough load due to this classification
9999 * the next pass will adjust the group classification and
10000 * allow migration of more tasks.
10001 *
10002 * Both cases only affect the total convergence complexity.
10003 */
10004 if (rt > env->fbq_type)
10005 continue;
10006
Nicolas Pitreced549f2014-05-26 18:19:38 -040010007 capacity = capacity_of(i);
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +100010008
Chris Redpath8b40f5e2018-05-30 13:16:41 +010010009 /*
10010 * For ASYM_CPUCAPACITY domains, don't pick a cpu that could
10011 * eventually lead to active_balancing high->low capacity.
10012 * Higher per-cpu capacity is considered better than balancing
10013 * average load.
10014 */
10015 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
10016 capacity_of(env->dst_cpu) < capacity &&
Pavankumar Kondeti916e7032019-03-08 10:04:40 +053010017 (rq->nr_running == 1 || (rq->nr_running == 2 &&
10018 task_util(rq->curr) < sched_small_task_threshold)))
Chris Redpath8b40f5e2018-05-30 13:16:41 +010010019 continue;
10020
Thomas Gleixner6e40f5b2010-02-16 16:48:56 +010010021 wl = weighted_cpuload(i);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010022
Thomas Gleixner6e40f5b2010-02-16 16:48:56 +010010023 /*
10024 * When comparing with imbalance, use weighted_cpuload()
Nicolas Pitreced549f2014-05-26 18:19:38 -040010025 * which is not scaled with the cpu capacity.
Thomas Gleixner6e40f5b2010-02-16 16:48:56 +010010026 */
Vincent Guittotea678212015-02-27 16:54:11 +010010027
10028 if (rq->nr_running == 1 && wl > env->imbalance &&
Morten Rasmussenf95e8de2016-02-25 12:51:35 +000010029 !check_cpu_capacity(rq, env->sd) &&
10030 env->busiest_group_type != group_misfit_task)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010031 continue;
10032
Thomas Gleixner6e40f5b2010-02-16 16:48:56 +010010033 /*
Leo Yana1dd2712016-12-22 23:58:50 +080010034 * After enable energy awared scheduling, it has higher
10035 * priority to migrate misfit task rather than from most
10036 * loaded CPU; E.g. one CPU with single misfit task and
10037 * other CPUs with multiple lower load tasks, we should
10038 * firstly make sure the misfit task can be migrated onto
10039 * higher capacity CPU.
10040 */
10041 if (energy_aware() &&
10042 capacity_orig_of(i) < capacity_orig_of(env->dst_cpu) &&
Syed Rameez Mustafae21dd3c2017-03-07 11:25:39 -080010043 rq->misfit_task &&
Leo Yana1dd2712016-12-22 23:58:50 +080010044 env->busiest_group_type == group_misfit_task) {
10045 busiest_load = wl;
10046 busiest_capacity = capacity;
10047 busiest = rq;
10048 break;
10049 }
10050
10051 /*
Thomas Gleixner6e40f5b2010-02-16 16:48:56 +010010052 * For the load comparisons with the other cpu's, consider
Nicolas Pitreced549f2014-05-26 18:19:38 -040010053 * the weighted_cpuload() scaled with the cpu capacity, so
10054 * that the load can be moved away from the cpu that is
10055 * potentially running at a lower capacity.
Joonsoo Kim95a79b82013-08-06 17:36:41 +090010056 *
Nicolas Pitreced549f2014-05-26 18:19:38 -040010057 * Thus we're looking for max(wl_i / capacity_i), crosswise
Joonsoo Kim95a79b82013-08-06 17:36:41 +090010058 * multiplication to rid ourselves of the division works out
Nicolas Pitreced549f2014-05-26 18:19:38 -040010059 * to: wl_i * capacity_j > wl_j * capacity_i; where j is
10060 * our previous maximum.
Thomas Gleixner6e40f5b2010-02-16 16:48:56 +010010061 */
Nicolas Pitreced549f2014-05-26 18:19:38 -040010062 if (wl * busiest_capacity > busiest_load * capacity) {
Joonsoo Kim95a79b82013-08-06 17:36:41 +090010063 busiest_load = wl;
Nicolas Pitreced549f2014-05-26 18:19:38 -040010064 busiest_capacity = capacity;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010065 busiest = rq;
10066 }
10067 }
10068
10069 return busiest;
10070}
10071
10072/*
10073 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
10074 * so long as it is large enough.
10075 */
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010076#define MAX_PINNED_INTERVAL 16
10077#define NEED_ACTIVE_BALANCE_THRESHOLD 10
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010078
Peter Zijlstrabd939f42012-05-02 14:20:37 +020010079static int need_active_balance(struct lb_env *env)
Peter Zijlstra1af3ed32009-12-23 15:10:31 +010010080{
Peter Zijlstrabd939f42012-05-02 14:20:37 +020010081 struct sched_domain *sd = env->sd;
10082
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -070010083 if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE)
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010084 return 1;
10085
Peter Zijlstrabd939f42012-05-02 14:20:37 +020010086 if (env->idle == CPU_NEWLY_IDLE) {
Michael Neuling532cb4c2010-06-08 14:57:02 +100010087
10088 /*
10089 * ASYM_PACKING needs to force migrate tasks from busy but
10090 * higher numbered CPUs in order to pack all tasks in the
10091 * lowest numbered CPUs.
10092 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +020010093 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
Michael Neuling532cb4c2010-06-08 14:57:02 +100010094 return 1;
Peter Zijlstra1af3ed32009-12-23 15:10:31 +010010095 }
10096
Vincent Guittot1aaf90a2015-02-27 16:54:14 +010010097 /*
10098 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
10099 * It's worth migrating the task if the src_cpu's capacity is reduced
10100 * because of other sched_class or IRQs if more capacity stays
10101 * available on dst_cpu.
Maria Yu6f701bb2019-04-26 15:20:18 +080010102 * Avoid pulling the CFS task if it is the only task running.
Vincent Guittot1aaf90a2015-02-27 16:54:14 +010010103 */
10104 if ((env->idle != CPU_NOT_IDLE) &&
Maria Yu6f701bb2019-04-26 15:20:18 +080010105 (env->src_rq->nr_running > 1) &&
Vincent Guittot1aaf90a2015-02-27 16:54:14 +010010106 (env->src_rq->cfs.h_nr_running == 1)) {
10107 if ((check_cpu_capacity(env->src_rq, sd)) &&
10108 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
10109 return 1;
10110 }
10111
Leo Yan9cf7f362016-12-22 23:58:51 +080010112 if ((env->idle != CPU_NOT_IDLE) &&
10113 (capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu)) &&
Syed Rameez Mustafa20acfe72017-01-30 09:35:46 +053010114 env->src_rq->misfit_task)
Leo Yan9cf7f362016-12-22 23:58:51 +080010115 return 1;
10116
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010117 return unlikely(sd->nr_balance_failed >
10118 sd->cache_nice_tries + NEED_ACTIVE_BALANCE_THRESHOLD);
Peter Zijlstra1af3ed32009-12-23 15:10:31 +010010119}
10120
Olav Haugand67250b2016-11-01 17:30:36 -070010121static int group_balance_cpu_not_isolated(struct sched_group *sg)
10122{
10123 cpumask_t cpus;
10124
10125 cpumask_and(&cpus, sched_group_cpus(sg), sched_group_mask(sg));
10126 cpumask_andnot(&cpus, &cpus, cpu_isolated_mask);
10127 return cpumask_first(&cpus);
10128}
10129
Tejun Heo969c7922010-05-06 18:49:21 +020010130static int active_load_balance_cpu_stop(void *data);
10131
Joonsoo Kim23f0d202013-08-06 17:36:42 +090010132static int should_we_balance(struct lb_env *env)
10133{
10134 struct sched_group *sg = env->sd->groups;
10135 struct cpumask *sg_cpus, *sg_mask;
10136 int cpu, balance_cpu = -1;
10137
10138 /*
10139 * In the newly idle case, we will allow all the cpu's
10140 * to do the newly idle load balance.
10141 */
10142 if (env->idle == CPU_NEWLY_IDLE)
10143 return 1;
10144
10145 sg_cpus = sched_group_cpus(sg);
10146 sg_mask = sched_group_mask(sg);
10147 /* Try to find first idle cpu */
10148 for_each_cpu_and(cpu, sg_cpus, env->cpus) {
Olav Haugand67250b2016-11-01 17:30:36 -070010149 if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu) ||
10150 cpu_isolated(cpu))
Joonsoo Kim23f0d202013-08-06 17:36:42 +090010151 continue;
10152
10153 balance_cpu = cpu;
10154 break;
10155 }
10156
10157 if (balance_cpu == -1)
Olav Haugand67250b2016-11-01 17:30:36 -070010158 balance_cpu = group_balance_cpu_not_isolated(sg);
Joonsoo Kim23f0d202013-08-06 17:36:42 +090010159
10160 /*
10161 * First idle cpu or the first cpu(busiest) in this sched group
10162 * is eligible for doing load balancing at this and above domains.
10163 */
Joonsoo Kimb0cff9d2013-09-10 15:54:49 +090010164 return balance_cpu == env->dst_cpu;
Joonsoo Kim23f0d202013-08-06 17:36:42 +090010165}
10166
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010167/*
10168 * Check this_cpu to ensure it is balanced within domain. Attempt to move
10169 * tasks if there is an imbalance.
10170 */
10171static int load_balance(int this_cpu, struct rq *this_rq,
10172 struct sched_domain *sd, enum cpu_idle_type idle,
Joonsoo Kim23f0d202013-08-06 17:36:42 +090010173 int *continue_balancing)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010174{
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010175 int ld_moved = 0, cur_ld_moved, active_balance = 0;
Peter Zijlstra62633222013-08-19 12:41:09 +020010176 struct sched_domain *sd_parent = sd->parent;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010177 struct sched_group *group = NULL;
10178 struct rq *busiest = NULL;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010179 unsigned long flags;
Christoph Lameter4ba29682014-08-26 19:12:21 -050010180 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010181
Peter Zijlstra8e45cb52012-02-22 12:47:19 +010010182 struct lb_env env = {
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -070010183 .sd = sd,
10184 .dst_cpu = this_cpu,
10185 .dst_rq = this_rq,
10186 .dst_grpmask = sched_group_cpus(sd->groups),
10187 .idle = idle,
10188 .loop_break = sched_nr_migrate_break,
10189 .cpus = cpus,
10190 .fbq_type = all,
10191 .tasks = LIST_HEAD_INIT(env.tasks),
10192 .imbalance = 0,
10193 .flags = 0,
10194 .loop = 0,
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010195 .busiest_nr_running = 0,
10196 .busiest_grp_capacity = 0,
Peter Zijlstra8e45cb52012-02-22 12:47:19 +010010197 };
10198
Joonsoo Kimcfc03112013-04-23 17:27:39 +090010199 /*
10200 * For NEWLY_IDLE load_balancing, we don't need to consider
10201 * other cpus in our group
10202 */
Joonsoo Kime02e60c2013-04-23 17:27:42 +090010203 if (idle == CPU_NEWLY_IDLE)
Joonsoo Kimcfc03112013-04-23 17:27:39 +090010204 env.dst_grpmask = NULL;
Joonsoo Kimcfc03112013-04-23 17:27:39 +090010205
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010206 cpumask_copy(cpus, cpu_active_mask);
10207
Josh Poimboeufae928822016-06-17 12:43:24 -050010208 schedstat_inc(sd->lb_count[idle]);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010209
10210redo:
Joonsoo Kim23f0d202013-08-06 17:36:42 +090010211 if (!should_we_balance(&env)) {
10212 *continue_balancing = 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010213 goto out_balanced;
Joonsoo Kim23f0d202013-08-06 17:36:42 +090010214 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010215
Joonsoo Kim23f0d202013-08-06 17:36:42 +090010216 group = find_busiest_group(&env);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010217 if (!group) {
Josh Poimboeufae928822016-06-17 12:43:24 -050010218 schedstat_inc(sd->lb_nobusyg[idle]);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010219 goto out_balanced;
10220 }
10221
Michael Wangb94031302012-07-12 16:10:13 +080010222 busiest = find_busiest_queue(&env, group);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010223 if (!busiest) {
Josh Poimboeufae928822016-06-17 12:43:24 -050010224 schedstat_inc(sd->lb_nobusyq[idle]);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010225 goto out_balanced;
10226 }
10227
Michael Wang78feefc2012-08-06 16:41:59 +080010228 BUG_ON(busiest == env.dst_rq);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010229
Josh Poimboeufae928822016-06-17 12:43:24 -050010230 schedstat_add(sd->lb_imbalance[idle], env.imbalance);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010231
Vincent Guittot1aaf90a2015-02-27 16:54:14 +010010232 env.src_cpu = busiest->cpu;
10233 env.src_rq = busiest;
10234
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010235 ld_moved = 0;
10236 if (busiest->nr_running > 1) {
10237 /*
10238 * Attempt to move tasks. If find_busiest_group has found
10239 * an imbalance but busiest->nr_running <= 1, the group is
10240 * still unbalanced. ld_moved simply stays zero, so it is
10241 * correctly treated as an imbalance.
10242 */
Peter Zijlstra8e45cb52012-02-22 12:47:19 +010010243 env.flags |= LBF_ALL_PINNED;
Peter Zijlstrac82513e2012-04-26 13:12:27 +020010244 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
Peter Zijlstra8e45cb52012-02-22 12:47:19 +010010245
Peter Zijlstra5d6523e2012-03-10 00:07:36 +010010246more_balance:
Kirill Tkhai163122b2014-08-20 13:48:29 +040010247 raw_spin_lock_irqsave(&busiest->lock, flags);
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +053010248
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010249 /* The world might have changed. Validate assumptions */
10250 if (busiest->nr_running <= 1) {
10251 raw_spin_unlock_irqrestore(&busiest->lock, flags);
10252 env.flags &= ~LBF_ALL_PINNED;
10253 goto no_move;
10254 }
10255
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +053010256 /*
10257 * cur_ld_moved - load moved in current iteration
10258 * ld_moved - cumulative load moved across iterations
10259 */
Kirill Tkhai163122b2014-08-20 13:48:29 +040010260 cur_ld_moved = detach_tasks(&env);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010261
10262 /*
Kirill Tkhai163122b2014-08-20 13:48:29 +040010263 * We've detached some tasks from busiest_rq. Every
10264 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
10265 * unlock busiest->lock, and we are able to be sure
10266 * that nobody can manipulate the tasks in parallel.
10267 * See task_rq_lock() family for the details.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010268 */
Kirill Tkhai163122b2014-08-20 13:48:29 +040010269
10270 raw_spin_unlock(&busiest->lock);
10271
10272 if (cur_ld_moved) {
10273 attach_tasks(&env);
10274 ld_moved += cur_ld_moved;
10275 }
10276
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010277 local_irq_restore(flags);
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +053010278
Joonsoo Kimf1cd0852013-04-23 17:27:37 +090010279 if (env.flags & LBF_NEED_BREAK) {
10280 env.flags &= ~LBF_NEED_BREAK;
10281 goto more_balance;
10282 }
10283
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +053010284 /*
10285 * Revisit (affine) tasks on src_cpu that couldn't be moved to
10286 * us and move them to an alternate dst_cpu in our sched_group
10287 * where they can run. The upper limit on how many times we
10288 * iterate on same src_cpu is dependent on number of cpus in our
10289 * sched_group.
10290 *
10291 * This changes load balance semantics a bit on who can move
10292 * load to a given_cpu. In addition to the given_cpu itself
10293 * (or a ilb_cpu acting on its behalf where given_cpu is
10294 * nohz-idle), we now have balance_cpu in a position to move
10295 * load to given_cpu. In rare situations, this may cause
10296 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
10297 * _independently_ and at _same_ time to move some load to
10298 * given_cpu) causing exceess load to be moved to given_cpu.
10299 * This however should not happen so much in practice and
10300 * moreover subsequent load balance cycles should correct the
10301 * excess load moved.
10302 */
Peter Zijlstra62633222013-08-19 12:41:09 +020010303 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +053010304
Vladimir Davydov7aff2e32013-09-15 21:30:13 +040010305 /* Prevent to re-select dst_cpu via env's cpus */
10306 cpumask_clear_cpu(env.dst_cpu, env.cpus);
10307
Michael Wang78feefc2012-08-06 16:41:59 +080010308 env.dst_rq = cpu_rq(env.new_dst_cpu);
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +053010309 env.dst_cpu = env.new_dst_cpu;
Peter Zijlstra62633222013-08-19 12:41:09 +020010310 env.flags &= ~LBF_DST_PINNED;
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +053010311 env.loop = 0;
10312 env.loop_break = sched_nr_migrate_break;
Joonsoo Kime02e60c2013-04-23 17:27:42 +090010313
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +053010314 /*
10315 * Go back to "more_balance" rather than "redo" since we
10316 * need to continue with same src_cpu.
10317 */
10318 goto more_balance;
10319 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010320
Peter Zijlstra62633222013-08-19 12:41:09 +020010321 /*
10322 * We failed to reach balance because of affinity.
10323 */
10324 if (sd_parent) {
Nicolas Pitre63b2ca32014-05-26 18:19:37 -040010325 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
Peter Zijlstra62633222013-08-19 12:41:09 +020010326
Vincent Guittotafdeee02014-08-26 13:06:44 +020010327 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
Peter Zijlstra62633222013-08-19 12:41:09 +020010328 *group_imbalance = 1;
Peter Zijlstra62633222013-08-19 12:41:09 +020010329 }
10330
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010331 /* All tasks on this runqueue were pinned by CPU affinity */
Peter Zijlstra8e45cb52012-02-22 12:47:19 +010010332 if (unlikely(env.flags & LBF_ALL_PINNED)) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010333 cpumask_clear_cpu(cpu_of(busiest), cpus);
Prashanth Nageshappabbf18b12012-06-19 17:52:07 +053010334 if (!cpumask_empty(cpus)) {
10335 env.loop = 0;
10336 env.loop_break = sched_nr_migrate_break;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010337 goto redo;
Prashanth Nageshappabbf18b12012-06-19 17:52:07 +053010338 }
Vincent Guittotafdeee02014-08-26 13:06:44 +020010339 goto out_all_pinned;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010340 }
10341 }
10342
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010343no_move:
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010344 if (!ld_moved) {
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -070010345 if (!(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE))
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010346 schedstat_inc(sd->lb_failed[idle]);
Venkatesh Pallipadi58b26c42010-09-10 18:19:17 -070010347 /*
10348 * Increment the failure counter only on periodic balance.
10349 * We do not want newidle balance, which can be very
10350 * frequent, pollute the failure counter causing
10351 * excessive cache_hot migrations and active balances.
10352 */
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010353 if (idle != CPU_NEWLY_IDLE &&
Channagoud Kadabi8810e5f2017-02-17 16:01:05 -080010354 !(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE)) {
Morten Rasmussen94beeae2015-07-02 17:16:34 +010010355 if (env.src_grp_nr_running > 1)
10356 sd->nr_balance_failed++;
Channagoud Kadabi8810e5f2017-02-17 16:01:05 -080010357 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010358
Peter Zijlstrabd939f42012-05-02 14:20:37 +020010359 if (need_active_balance(&env)) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010360 raw_spin_lock_irqsave(&busiest->lock, flags);
10361
Pavankumar Kondetie4d0b6b2018-03-07 12:05:49 +053010362 /*
10363 * The CPUs are marked as reserved if tasks
10364 * are pushed/pulled from other CPUs. In that case,
10365 * bail out from the load balancer.
10366 */
10367 if (is_reserved(this_cpu) ||
10368 is_reserved(cpu_of(busiest))) {
10369 raw_spin_unlock_irqrestore(&busiest->lock,
10370 flags);
10371 *continue_balancing = 0;
10372 goto out;
10373 }
10374
Tejun Heo969c7922010-05-06 18:49:21 +020010375 /* don't kick the active_load_balance_cpu_stop,
10376 * if the curr task on busiest cpu can't be
10377 * moved to this_cpu
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010378 */
10379 if (!cpumask_test_cpu(this_cpu,
Peter Zijlstrafa17b502011-06-16 12:23:22 +020010380 tsk_cpus_allowed(busiest->curr))) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010381 raw_spin_unlock_irqrestore(&busiest->lock,
10382 flags);
Peter Zijlstra8e45cb52012-02-22 12:47:19 +010010383 env.flags |= LBF_ALL_PINNED;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010384 goto out_one_pinned;
10385 }
10386
Tejun Heo969c7922010-05-06 18:49:21 +020010387 /*
10388 * ->active_balance synchronizes accesses to
10389 * ->active_balance_work. Once set, it's cleared
10390 * only after active load balance is finished.
10391 */
Olav Haugand67250b2016-11-01 17:30:36 -070010392 if (!busiest->active_balance &&
10393 !cpu_isolated(cpu_of(busiest))) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010394 busiest->active_balance = 1;
10395 busiest->push_cpu = this_cpu;
10396 active_balance = 1;
Maria Yue96bc192019-04-03 18:26:21 +080010397 mark_reserved(this_cpu);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010398 }
10399 raw_spin_unlock_irqrestore(&busiest->lock, flags);
Tejun Heo969c7922010-05-06 18:49:21 +020010400
Peter Zijlstrabd939f42012-05-02 14:20:37 +020010401 if (active_balance) {
Tejun Heo969c7922010-05-06 18:49:21 +020010402 stop_one_cpu_nowait(cpu_of(busiest),
10403 active_load_balance_cpu_stop, busiest,
10404 &busiest->active_balance_work);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010405 *continue_balancing = 0;
Peter Zijlstrabd939f42012-05-02 14:20:37 +020010406 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010407
Srikar Dronamrajud02c0712016-03-23 17:54:44 +053010408 /* We've kicked active balancing, force task migration. */
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010409 sd->nr_balance_failed = sd->cache_nice_tries +
10410 NEED_ACTIVE_BALANCE_THRESHOLD - 1;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010411 }
Puja Gupta487dec62017-06-27 10:13:50 -070010412 } else
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010413 sd->nr_balance_failed = 0;
10414
10415 if (likely(!active_balance)) {
10416 /* We were unbalanced, so reset the balancing interval */
10417 sd->balance_interval = sd->min_interval;
10418 } else {
10419 /*
10420 * If we've begun active balancing, start to back off. This
10421 * case may not be covered by the all_pinned logic if there
10422 * is only 1 task on the busy runqueue (because we don't call
Kirill Tkhai163122b2014-08-20 13:48:29 +040010423 * detach_tasks).
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010424 */
10425 if (sd->balance_interval < sd->max_interval)
10426 sd->balance_interval *= 2;
10427 }
10428
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010429 goto out;
10430
10431out_balanced:
Vincent Guittotafdeee02014-08-26 13:06:44 +020010432 /*
10433 * We reach balance although we may have faced some affinity
Vincent Guittotb5fd7a12019-07-01 17:47:02 +020010434 * constraints. Clear the imbalance flag only if other tasks got
10435 * a chance to move and fix the imbalance.
Vincent Guittotafdeee02014-08-26 13:06:44 +020010436 */
Vincent Guittotb5fd7a12019-07-01 17:47:02 +020010437 if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
Vincent Guittotafdeee02014-08-26 13:06:44 +020010438 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
10439
10440 if (*group_imbalance)
10441 *group_imbalance = 0;
10442 }
10443
10444out_all_pinned:
10445 /*
10446 * We reach balance because all tasks are pinned at this level so
10447 * we can't migrate them. Let the imbalance flag set so parent level
10448 * can try to migrate them.
10449 */
Josh Poimboeufae928822016-06-17 12:43:24 -050010450 schedstat_inc(sd->lb_balanced[idle]);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010451
10452 sd->nr_balance_failed = 0;
10453
10454out_one_pinned:
Valentin Schneidere1f78c12018-09-26 16:12:07 +010010455 ld_moved = 0;
10456
10457 /*
10458 * idle_balance() disregards balance intervals, so we could repeatedly
10459 * reach this code, which would lead to balance_interval skyrocketting
10460 * in a short amount of time. Skip the balance_interval increase logic
10461 * to avoid that.
10462 */
10463 if (env.idle == CPU_NEWLY_IDLE)
10464 goto out;
10465
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010466 /* tune up the balancing interval */
Peter Zijlstra8e45cb52012-02-22 12:47:19 +010010467 if (((env.flags & LBF_ALL_PINNED) &&
Peter Zijlstra5b54b562011-09-22 15:23:13 +020010468 sd->balance_interval < MAX_PINNED_INTERVAL) ||
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010469 (sd->balance_interval < sd->max_interval))
10470 sd->balance_interval *= 2;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010471out:
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010472 trace_sched_load_balance(this_cpu, idle, *continue_balancing,
10473 group ? group->cpumask[0] : 0,
10474 busiest ? busiest->nr_running : 0,
10475 env.imbalance, env.flags, ld_moved,
10476 sd->balance_interval);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010477 return ld_moved;
10478}
10479
Jason Low52a08ef2014-05-08 17:49:22 -070010480static inline unsigned long
10481get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
10482{
10483 unsigned long interval = sd->balance_interval;
10484
10485 if (cpu_busy)
10486 interval *= sd->busy_factor;
10487
10488 /* scale ms to jiffies */
10489 interval = msecs_to_jiffies(interval);
10490 interval = clamp(interval, 1UL, max_load_balance_interval);
10491
10492 return interval;
10493}
10494
10495static inline void
Leo Yan31851a92016-08-05 14:31:29 +080010496update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
Jason Low52a08ef2014-05-08 17:49:22 -070010497{
10498 unsigned long interval, next;
10499
Leo Yan31851a92016-08-05 14:31:29 +080010500 /* used by idle balance, so cpu_busy = 0 */
10501 interval = get_sd_balance_interval(sd, 0);
Jason Low52a08ef2014-05-08 17:49:22 -070010502 next = sd->last_balance + interval;
10503
10504 if (time_after(*next_balance, next))
10505 *next_balance = next;
10506}
10507
Pavankumar Kondetie39358b2019-03-22 11:40:00 +053010508#ifdef CONFIG_SCHED_WALT
10509static inline bool min_cap_cluster_has_misfit_task(void)
10510{
10511 int cpu;
10512
10513 for_each_possible_cpu(cpu) {
10514 if (!is_min_capacity_cpu(cpu))
10515 break;
10516 if (cpu_rq(cpu)->walt_stats.nr_big_tasks)
10517 return true;
10518 }
10519
10520 return false;
10521}
10522#else
10523static inline bool min_cap_cluster_has_misfit_task(void)
10524{
10525 return false;
10526}
10527#endif
10528
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010529/*
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010530 * idle_balance is called by schedule() if this_cpu is about to become
10531 * idle. Attempts to pull tasks from other CPUs.
10532 */
Peter Zijlstra6e831252014-02-11 16:11:48 +010010533static int idle_balance(struct rq *this_rq)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010534{
Jason Low52a08ef2014-05-08 17:49:22 -070010535 unsigned long next_balance = jiffies + HZ;
10536 int this_cpu = this_rq->cpu;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010537 struct sched_domain *sd;
10538 int pulled_task = 0;
Jason Low9bd721c2013-09-13 11:26:52 -070010539 u64 curr_cost = 0;
Pavankumar Kondetie39358b2019-03-22 11:40:00 +053010540 bool force_lb = false;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010541
Olav Haugan3f2cb302016-05-31 14:34:46 -070010542 if (cpu_isolated(this_cpu))
10543 return 0;
10544
Peter Zijlstra6e831252014-02-11 16:11:48 +010010545 /*
Pavankumar Kondetie39358b2019-03-22 11:40:00 +053010546 * Force higher capacity CPUs doing load balance, when the lower
10547 * capacity CPUs has some misfit tasks.
10548 */
10549 if (!is_min_capacity_cpu(this_cpu) && min_cap_cluster_has_misfit_task())
10550 force_lb = true;
10551
10552 /*
Peter Zijlstra6e831252014-02-11 16:11:48 +010010553 * We must set idle_stamp _before_ calling idle_balance(), such that we
10554 * measure the duration of idle_balance() as idle time.
10555 */
10556 this_rq->idle_stamp = rq_clock(this_rq);
10557
Pavankumar Kondetie39358b2019-03-22 11:40:00 +053010558 if (!energy_aware() && !force_lb &&
Dietmar Eggemann785367f2016-01-13 15:49:44 +000010559 (this_rq->avg_idle < sysctl_sched_migration_cost ||
10560 !this_rq->rd->overload)) {
Jason Low52a08ef2014-05-08 17:49:22 -070010561 rcu_read_lock();
10562 sd = rcu_dereference_check_sched_domain(this_rq->sd);
10563 if (sd)
Leo Yan31851a92016-08-05 14:31:29 +080010564 update_next_balance(sd, &next_balance);
Jason Low52a08ef2014-05-08 17:49:22 -070010565 rcu_read_unlock();
10566
Peter Zijlstra6e831252014-02-11 16:11:48 +010010567 goto out;
Jason Low52a08ef2014-05-08 17:49:22 -070010568 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010569
Peter Zijlstraf492e122009-12-23 15:29:42 +010010570 raw_spin_unlock(&this_rq->lock);
10571
Paul Turner48a16752012-10-04 13:18:31 +020010572 update_blocked_averages(this_cpu);
Peter Zijlstradce840a2011-04-07 14:09:50 +020010573 rcu_read_lock();
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010574 for_each_domain(this_cpu, sd) {
Joonsoo Kim23f0d202013-08-06 17:36:42 +090010575 int continue_balancing = 1;
Jason Low9bd721c2013-09-13 11:26:52 -070010576 u64 t0, domain_cost;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010577
10578 if (!(sd->flags & SD_LOAD_BALANCE))
10579 continue;
10580
Pavankumar Kondetie39358b2019-03-22 11:40:00 +053010581 if (!force_lb &&
10582 this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
Leo Yan31851a92016-08-05 14:31:29 +080010583 update_next_balance(sd, &next_balance);
Jason Low9bd721c2013-09-13 11:26:52 -070010584 break;
Jason Low52a08ef2014-05-08 17:49:22 -070010585 }
Jason Low9bd721c2013-09-13 11:26:52 -070010586
Peter Zijlstraf492e122009-12-23 15:29:42 +010010587 if (sd->flags & SD_BALANCE_NEWIDLE) {
Jason Low9bd721c2013-09-13 11:26:52 -070010588 t0 = sched_clock_cpu(this_cpu);
10589
Peter Zijlstraf492e122009-12-23 15:29:42 +010010590 pulled_task = load_balance(this_cpu, this_rq,
Joonsoo Kim23f0d202013-08-06 17:36:42 +090010591 sd, CPU_NEWLY_IDLE,
10592 &continue_balancing);
Jason Low9bd721c2013-09-13 11:26:52 -070010593
10594 domain_cost = sched_clock_cpu(this_cpu) - t0;
10595 if (domain_cost > sd->max_newidle_lb_cost)
10596 sd->max_newidle_lb_cost = domain_cost;
10597
10598 curr_cost += domain_cost;
Peter Zijlstraf492e122009-12-23 15:29:42 +010010599 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010600
Leo Yan31851a92016-08-05 14:31:29 +080010601 update_next_balance(sd, &next_balance);
Jason Low39a4d9c2014-04-23 18:30:35 -070010602
10603 /*
10604 * Stop searching for tasks to pull if there are
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010605 * now runnable tasks on the balance rq or if
10606 * continue_balancing has been unset (only possible
10607 * due to active migration).
Jason Low39a4d9c2014-04-23 18:30:35 -070010608 */
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010609 if (pulled_task || this_rq->nr_running > 0 ||
10610 !continue_balancing)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010611 break;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010612 }
Peter Zijlstradce840a2011-04-07 14:09:50 +020010613 rcu_read_unlock();
Peter Zijlstraf492e122009-12-23 15:29:42 +010010614
10615 raw_spin_lock(&this_rq->lock);
10616
Jason Low0e5b5332014-04-28 15:45:54 -070010617 if (curr_cost > this_rq->max_idle_balance_cost)
10618 this_rq->max_idle_balance_cost = curr_cost;
10619
Daniel Lezcanoe5fc6612014-01-17 10:04:02 +010010620 /*
Jason Low0e5b5332014-04-28 15:45:54 -070010621 * While browsing the domains, we released the rq lock, a task could
10622 * have been enqueued in the meantime. Since we're not going idle,
10623 * pretend we pulled a task.
Daniel Lezcanoe5fc6612014-01-17 10:04:02 +010010624 */
Jason Low0e5b5332014-04-28 15:45:54 -070010625 if (this_rq->cfs.h_nr_running && !pulled_task)
Peter Zijlstra6e831252014-02-11 16:11:48 +010010626 pulled_task = 1;
Daniel Lezcanoe5fc6612014-01-17 10:04:02 +010010627
Peter Zijlstra6e831252014-02-11 16:11:48 +010010628out:
Jason Low52a08ef2014-05-08 17:49:22 -070010629 /* Move the next balance forward */
10630 if (time_after(this_rq->next_balance, next_balance))
10631 this_rq->next_balance = next_balance;
10632
Kirill Tkhaie4aa3582014-03-06 13:31:55 +040010633 /* Is there a task of a high priority class? */
Kirill Tkhai46383642014-03-15 02:15:07 +040010634 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
Kirill Tkhaie4aa3582014-03-06 13:31:55 +040010635 pulled_task = -1;
10636
Dietmar Eggemann38c6ade2015-10-20 13:04:41 +010010637 if (pulled_task)
Peter Zijlstra6e831252014-02-11 16:11:48 +010010638 this_rq->idle_stamp = 0;
10639
Daniel Lezcano3c4017c2014-01-17 10:04:03 +010010640 return pulled_task;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010641}
10642
10643/*
Tejun Heo969c7922010-05-06 18:49:21 +020010644 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
10645 * running tasks off the busiest CPU onto idle CPUs. It requires at
10646 * least 1 task to be running on each physical CPU where possible, and
10647 * avoids physical / logical imbalances.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010648 */
Tejun Heo969c7922010-05-06 18:49:21 +020010649static int active_load_balance_cpu_stop(void *data)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010650{
Tejun Heo969c7922010-05-06 18:49:21 +020010651 struct rq *busiest_rq = data;
10652 int busiest_cpu = cpu_of(busiest_rq);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010653 int target_cpu = busiest_rq->push_cpu;
Tejun Heo969c7922010-05-06 18:49:21 +020010654 struct rq *target_rq = cpu_rq(target_cpu);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010655 struct sched_domain *sd = NULL;
Kirill Tkhaie5673f22014-08-20 13:48:01 +040010656 struct task_struct *p = NULL;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010657 struct task_struct *push_task;
10658 int push_task_detached = 0;
10659 struct lb_env env = {
10660 .sd = sd,
10661 .dst_cpu = target_cpu,
10662 .dst_rq = target_rq,
10663 .src_cpu = busiest_rq->cpu,
10664 .src_rq = busiest_rq,
10665 .idle = CPU_IDLE,
10666 .busiest_nr_running = 0,
10667 .busiest_grp_capacity = 0,
10668 .flags = 0,
10669 .loop = 0,
10670 };
10671 bool moved = false;
Tejun Heo969c7922010-05-06 18:49:21 +020010672
10673 raw_spin_lock_irq(&busiest_rq->lock);
10674
10675 /* make sure the requested cpu hasn't gone down in the meantime */
10676 if (unlikely(busiest_cpu != smp_processor_id() ||
10677 !busiest_rq->active_balance))
10678 goto out_unlock;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010679
10680 /* Is there any task to move? */
10681 if (busiest_rq->nr_running <= 1)
Tejun Heo969c7922010-05-06 18:49:21 +020010682 goto out_unlock;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010683
10684 /*
10685 * This condition is "impossible", if it occurs
10686 * we need to fix it. Originally reported by
10687 * Bjorn Helgaas on a 128-cpu setup.
10688 */
10689 BUG_ON(busiest_rq == target_rq);
10690
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010691 push_task = busiest_rq->push_task;
10692 target_cpu = busiest_rq->push_cpu;
10693 if (push_task) {
10694 if (task_on_rq_queued(push_task) &&
10695 push_task->state == TASK_RUNNING &&
10696 task_cpu(push_task) == busiest_cpu &&
10697 cpu_online(target_cpu)) {
10698 detach_task(push_task, &env);
10699 push_task_detached = 1;
10700 moved = true;
10701 }
10702 goto out_unlock;
10703 }
10704
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010705 /* Search for an sd spanning us and the target CPU. */
Peter Zijlstradce840a2011-04-07 14:09:50 +020010706 rcu_read_lock();
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010707 for_each_domain(target_cpu, sd) {
10708 if ((sd->flags & SD_LOAD_BALANCE) &&
10709 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
10710 break;
10711 }
10712
10713 if (likely(sd)) {
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010714 env.sd = sd;
Josh Poimboeufae928822016-06-17 12:43:24 -050010715 schedstat_inc(sd->alb_count);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010716
Kirill Tkhaie5673f22014-08-20 13:48:01 +040010717 p = detach_one_task(&env);
Srikar Dronamrajud02c0712016-03-23 17:54:44 +053010718 if (p) {
Josh Poimboeufae928822016-06-17 12:43:24 -050010719 schedstat_inc(sd->alb_pushed);
Srikar Dronamrajud02c0712016-03-23 17:54:44 +053010720 /* Active balancing done, reset the failure counter. */
10721 sd->nr_balance_failed = 0;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010722 moved = true;
Srikar Dronamrajud02c0712016-03-23 17:54:44 +053010723 } else {
Josh Poimboeufae928822016-06-17 12:43:24 -050010724 schedstat_inc(sd->alb_failed);
Srikar Dronamrajud02c0712016-03-23 17:54:44 +053010725 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010726 }
Peter Zijlstradce840a2011-04-07 14:09:50 +020010727 rcu_read_unlock();
Tejun Heo969c7922010-05-06 18:49:21 +020010728out_unlock:
10729 busiest_rq->active_balance = 0;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010730 push_task = busiest_rq->push_task;
10731 target_cpu = busiest_rq->push_cpu;
Maria Yue96bc192019-04-03 18:26:21 +080010732 clear_reserved(target_cpu);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010733
10734 if (push_task)
10735 busiest_rq->push_task = NULL;
10736
Kirill Tkhaie5673f22014-08-20 13:48:01 +040010737 raw_spin_unlock(&busiest_rq->lock);
10738
Syed Rameez Mustafaebc437b2016-12-13 15:57:19 -080010739 if (push_task) {
10740 if (push_task_detached)
10741 attach_one_task(target_rq, push_task);
10742 put_task_struct(push_task);
Syed Rameez Mustafaebc437b2016-12-13 15:57:19 -080010743 }
10744
Kirill Tkhaie5673f22014-08-20 13:48:01 +040010745 if (p)
10746 attach_one_task(target_rq, p);
10747
10748 local_irq_enable();
10749
Tejun Heo969c7922010-05-06 18:49:21 +020010750 return 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010751}
10752
Mike Galbraithd987fc72011-12-05 10:01:47 +010010753static inline int on_null_domain(struct rq *rq)
10754{
10755 return unlikely(!rcu_dereference_sched(rq->sd));
10756}
10757
Frederic Weisbecker3451d022011-08-10 23:21:01 +020010758#ifdef CONFIG_NO_HZ_COMMON
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010759/*
10760 * idle load balancing details
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010761 * - When one of the busy CPUs notice that there may be an idle rebalancing
10762 * needed, they will kick the idle load balancer, which then does idle
10763 * load balancing for all the idle CPUs.
10764 */
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010765static inline int find_new_ilb(int type)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010766{
Joonwoo Parkb1b4aed2016-12-12 13:55:57 -080010767 int ilb = nr_cpu_ids;
10768 struct sched_domain *sd;
10769 int cpu = raw_smp_processor_id();
10770 struct rq *rq = cpu_rq(cpu);
10771 cpumask_t cpumask;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010772
Joonwoo Parkb1b4aed2016-12-12 13:55:57 -080010773 rcu_read_lock();
10774 sd = rcu_dereference_check_sched_domain(rq->sd);
10775 if (sd) {
Maria Yu0cd30992018-08-15 19:31:38 +080010776 if (energy_aware() && rq->misfit_task)
10777 cpumask_andnot(&cpumask, nohz.idle_cpus_mask,
10778 sched_domain_span(sd));
10779 else
10780 cpumask_and(&cpumask, nohz.idle_cpus_mask,
10781 sched_domain_span(sd));
Puja Gupta9c7d6442017-08-03 14:21:57 -070010782 cpumask_andnot(&cpumask, &cpumask,
10783 cpu_isolated_mask);
Joonwoo Parkb1b4aed2016-12-12 13:55:57 -080010784 ilb = cpumask_first(&cpumask);
10785 }
10786 rcu_read_unlock();
10787
10788 if (sd && (ilb >= nr_cpu_ids || !idle_cpu(ilb))) {
10789 if (!energy_aware() ||
10790 (capacity_orig_of(cpu) ==
10791 cpu_rq(cpu)->rd->max_cpu_capacity.val ||
Maria Yu0cd30992018-08-15 19:31:38 +080010792 (cpu_overutilized(cpu) && rq->nr_running > 1))) {
Puja Gupta9c7d6442017-08-03 14:21:57 -070010793 cpumask_andnot(&cpumask, nohz.idle_cpus_mask,
10794 cpu_isolated_mask);
10795 ilb = cpumask_first(&cpumask);
10796 }
Joonwoo Parkb1b4aed2016-12-12 13:55:57 -080010797 }
10798
Suresh Siddha786d6dc72011-12-01 17:07:35 -080010799 if (ilb < nr_cpu_ids && idle_cpu(ilb))
10800 return ilb;
10801
10802 return nr_cpu_ids;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010803}
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010804
10805/*
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010806 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
10807 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
10808 * CPU (if there is one).
10809 */
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010810static void nohz_balancer_kick(int type)
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010811{
10812 int ilb_cpu;
10813
10814 nohz.next_balance++;
10815
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010816 ilb_cpu = find_new_ilb(type);
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010817
Suresh Siddha0b005cf2011-12-01 17:07:34 -080010818 if (ilb_cpu >= nr_cpu_ids)
10819 return;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010820
Suresh Siddhacd490c52011-12-06 11:26:34 -080010821 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
Suresh Siddha1c792db2011-12-01 17:07:32 -080010822 return;
10823 /*
10824 * Use smp_send_reschedule() instead of resched_cpu().
10825 * This way we generate a sched IPI on the target cpu which
10826 * is idle. And the softirq performing nohz idle load balance
10827 * will be run before returning from the IPI.
10828 */
10829 smp_send_reschedule(ilb_cpu);
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010830 return;
10831}
10832
Thomas Gleixner20a5c8c2016-03-10 12:54:20 +010010833void nohz_balance_exit_idle(unsigned int cpu)
Suresh Siddha71325962012-01-19 18:28:57 -080010834{
10835 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
Mike Galbraithd987fc72011-12-05 10:01:47 +010010836 /*
10837 * Completely isolated CPUs don't ever set, so we must test.
10838 */
Puja Gupta9c7d6442017-08-03 14:21:57 -070010839 if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
10840 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
10841 atomic_dec(&nohz.nr_cpus);
10842 }
Suresh Siddha71325962012-01-19 18:28:57 -080010843 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
10844 }
10845}
10846
Suresh Siddha69e1e812011-12-01 17:07:33 -080010847static inline void set_cpu_sd_state_busy(void)
10848{
10849 struct sched_domain *sd;
Preeti U Murthy37dc6b52013-10-30 08:42:52 +053010850 int cpu = smp_processor_id();
Suresh Siddha69e1e812011-12-01 17:07:33 -080010851
Suresh Siddha69e1e812011-12-01 17:07:33 -080010852 rcu_read_lock();
Peter Zijlstra0e369d72016-05-09 10:38:01 +020010853 sd = rcu_dereference(per_cpu(sd_llc, cpu));
Vincent Guittot25f55d92013-04-23 16:59:02 +020010854
10855 if (!sd || !sd->nohz_idle)
10856 goto unlock;
10857 sd->nohz_idle = 0;
10858
Peter Zijlstra0e369d72016-05-09 10:38:01 +020010859 atomic_inc(&sd->shared->nr_busy_cpus);
Vincent Guittot25f55d92013-04-23 16:59:02 +020010860unlock:
Suresh Siddha69e1e812011-12-01 17:07:33 -080010861 rcu_read_unlock();
10862}
10863
10864void set_cpu_sd_state_idle(void)
10865{
10866 struct sched_domain *sd;
Preeti U Murthy37dc6b52013-10-30 08:42:52 +053010867 int cpu = smp_processor_id();
Suresh Siddha69e1e812011-12-01 17:07:33 -080010868
Suresh Siddha69e1e812011-12-01 17:07:33 -080010869 rcu_read_lock();
Peter Zijlstra0e369d72016-05-09 10:38:01 +020010870 sd = rcu_dereference(per_cpu(sd_llc, cpu));
Vincent Guittot25f55d92013-04-23 16:59:02 +020010871
10872 if (!sd || sd->nohz_idle)
10873 goto unlock;
10874 sd->nohz_idle = 1;
10875
Peter Zijlstra0e369d72016-05-09 10:38:01 +020010876 atomic_dec(&sd->shared->nr_busy_cpus);
Vincent Guittot25f55d92013-04-23 16:59:02 +020010877unlock:
Suresh Siddha69e1e812011-12-01 17:07:33 -080010878 rcu_read_unlock();
10879}
10880
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010881/*
Alex Shic1cc0172012-09-10 15:10:58 +080010882 * This routine will record that the cpu is going idle with tick stopped.
Suresh Siddha0b005cf2011-12-01 17:07:34 -080010883 * This info will be used in performing idle load balancing in the future.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010884 */
Alex Shic1cc0172012-09-10 15:10:58 +080010885void nohz_balance_enter_idle(int cpu)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010886{
Suresh Siddha71325962012-01-19 18:28:57 -080010887 /*
10888 * If this cpu is going down, then nothing needs to be done.
10889 */
10890 if (!cpu_active(cpu))
10891 return;
10892
Alex Shic1cc0172012-09-10 15:10:58 +080010893 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
10894 return;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010895
Mike Galbraithd987fc72011-12-05 10:01:47 +010010896 /*
10897 * If we're a completely isolated CPU, we don't play.
10898 */
Olav Haugan3f2cb302016-05-31 14:34:46 -070010899 if (on_null_domain(cpu_rq(cpu)) || cpu_isolated(cpu))
Mike Galbraithd987fc72011-12-05 10:01:47 +010010900 return;
10901
Alex Shic1cc0172012-09-10 15:10:58 +080010902 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
10903 atomic_inc(&nohz.nr_cpus);
10904 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010905}
10906#endif
10907
10908static DEFINE_SPINLOCK(balancing);
10909
Peter Zijlstra49c022e2011-04-05 10:14:25 +020010910/*
10911 * Scale the max load_balance interval with the number of CPUs in the system.
10912 * This trades load-balance latency on larger machines for less cross talk.
10913 */
Peter Zijlstra029632f2011-10-25 10:00:11 +020010914void update_max_interval(void)
Peter Zijlstra49c022e2011-04-05 10:14:25 +020010915{
Olav Haugan3f2cb302016-05-31 14:34:46 -070010916 cpumask_t avail_mask;
10917 unsigned int available_cpus;
10918
10919 cpumask_andnot(&avail_mask, cpu_online_mask, cpu_isolated_mask);
10920 available_cpus = cpumask_weight(&avail_mask);
10921
10922 max_load_balance_interval = HZ*available_cpus/10;
Peter Zijlstra49c022e2011-04-05 10:14:25 +020010923}
10924
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010925/*
10926 * It checks each scheduling domain to see if it is due to be balanced,
10927 * and initiates a balancing operation if so.
10928 *
Libinb9b08532013-04-01 19:14:01 +080010929 * Balancing parameters are set up in init_sched_domains.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010930 */
Daniel Lezcanof7ed0a82014-01-06 12:34:43 +010010931static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010932{
Joonsoo Kim23f0d202013-08-06 17:36:42 +090010933 int continue_balancing = 1;
Daniel Lezcanof7ed0a82014-01-06 12:34:43 +010010934 int cpu = rq->cpu;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010935 unsigned long interval;
Peter Zijlstra04f733b2012-05-11 00:12:02 +020010936 struct sched_domain *sd;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010937 /* Earliest time when we have to do rebalance again */
10938 unsigned long next_balance = jiffies + 60*HZ;
10939 int update_next_balance = 0;
Jason Lowf48627e2013-09-13 11:26:53 -070010940 int need_serialize, need_decay = 0;
10941 u64 max_cost = 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010942
Paul Turner48a16752012-10-04 13:18:31 +020010943 update_blocked_averages(cpu);
Peter Zijlstra2069dd72010-11-15 15:47:00 -080010944
Peter Zijlstradce840a2011-04-07 14:09:50 +020010945 rcu_read_lock();
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010946 for_each_domain(cpu, sd) {
Jason Lowf48627e2013-09-13 11:26:53 -070010947 /*
10948 * Decay the newidle max times here because this is a regular
10949 * visit to all the domains. Decay ~1% per second.
10950 */
10951 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
10952 sd->max_newidle_lb_cost =
10953 (sd->max_newidle_lb_cost * 253) / 256;
10954 sd->next_decay_max_lb_cost = jiffies + HZ;
10955 need_decay = 1;
10956 }
10957 max_cost += sd->max_newidle_lb_cost;
10958
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010959 if (!(sd->flags & SD_LOAD_BALANCE))
10960 continue;
10961
Jason Lowf48627e2013-09-13 11:26:53 -070010962 /*
10963 * Stop the load balance at this level. There is another
10964 * CPU in our sched group which is doing load balancing more
10965 * actively.
10966 */
10967 if (!continue_balancing) {
10968 if (need_decay)
10969 continue;
10970 break;
10971 }
10972
Jason Low52a08ef2014-05-08 17:49:22 -070010973 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010974
10975 need_serialize = sd->flags & SD_SERIALIZE;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010976 if (need_serialize) {
10977 if (!spin_trylock(&balancing))
10978 goto out;
10979 }
10980
10981 if (time_after_eq(jiffies, sd->last_balance + interval)) {
Joonsoo Kim23f0d202013-08-06 17:36:42 +090010982 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010983 /*
Peter Zijlstra62633222013-08-19 12:41:09 +020010984 * The LBF_DST_PINNED logic could have changed
Joonsoo Kimde5eb2d2013-04-23 17:27:38 +090010985 * env->dst_cpu, so we can't know our idle
10986 * state even if we migrated tasks. Update it.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010987 */
Joonsoo Kimde5eb2d2013-04-23 17:27:38 +090010988 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010989 }
10990 sd->last_balance = jiffies;
Jason Low52a08ef2014-05-08 17:49:22 -070010991 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010992 }
10993 if (need_serialize)
10994 spin_unlock(&balancing);
10995out:
10996 if (time_after(next_balance, sd->last_balance + interval)) {
10997 next_balance = sd->last_balance + interval;
10998 update_next_balance = 1;
10999 }
Jason Lowf48627e2013-09-13 11:26:53 -070011000 }
11001 if (need_decay) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010011002 /*
Jason Lowf48627e2013-09-13 11:26:53 -070011003 * Ensure the rq-wide value also decays but keep it at a
11004 * reasonable floor to avoid funnies with rq->avg_idle.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010011005 */
Jason Lowf48627e2013-09-13 11:26:53 -070011006 rq->max_idle_balance_cost =
11007 max((u64)sysctl_sched_migration_cost, max_cost);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010011008 }
Peter Zijlstradce840a2011-04-07 14:09:50 +020011009 rcu_read_unlock();
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010011010
11011 /*
11012 * next_balance will be updated only when there is a need.
11013 * When the cpu is attached to null domain for ex, it will not be
11014 * updated.
11015 */
Vincent Guittotc5afb6a2015-08-03 11:55:50 +020011016 if (likely(update_next_balance)) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010011017 rq->next_balance = next_balance;
Vincent Guittotc5afb6a2015-08-03 11:55:50 +020011018
11019#ifdef CONFIG_NO_HZ_COMMON
11020 /*
11021 * If this CPU has been elected to perform the nohz idle
11022 * balance. Other idle CPUs have already rebalanced with
11023 * nohz_idle_balance() and nohz.next_balance has been
11024 * updated accordingly. This CPU is now running the idle load
11025 * balance for itself and we need to update the
11026 * nohz.next_balance accordingly.
11027 */
11028 if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
11029 nohz.next_balance = rq->next_balance;
11030#endif
11031 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010011032}
11033
Frederic Weisbecker3451d022011-08-10 23:21:01 +020011034#ifdef CONFIG_NO_HZ_COMMON
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070011035/*
Frederic Weisbecker3451d022011-08-10 23:21:01 +020011036 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070011037 * rebalancing for all the cpus for whom scheduler ticks are stopped.
11038 */
Daniel Lezcano208cb162014-01-06 12:34:44 +010011039static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070011040{
Daniel Lezcano208cb162014-01-06 12:34:44 +010011041 int this_cpu = this_rq->cpu;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070011042 struct rq *rq;
11043 int balance_cpu;
Vincent Guittotc5afb6a2015-08-03 11:55:50 +020011044 /* Earliest time when we have to do rebalance again */
11045 unsigned long next_balance = jiffies + 60*HZ;
11046 int update_next_balance = 0;
Olav Haugand67250b2016-11-01 17:30:36 -070011047 cpumask_t cpus;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070011048
Suresh Siddha1c792db2011-12-01 17:07:32 -080011049 if (idle != CPU_IDLE ||
11050 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
11051 goto end;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070011052
Olav Haugand67250b2016-11-01 17:30:36 -070011053 cpumask_andnot(&cpus, nohz.idle_cpus_mask, cpu_isolated_mask);
11054
11055 for_each_cpu(balance_cpu, &cpus) {
Suresh Siddha8a6d42d2011-12-06 11:19:37 -080011056 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070011057 continue;
11058
11059 /*
11060 * If this cpu gets work to do, stop the load balancing
11061 * work being done for other cpus. Next load
11062 * balancing owner will pick it up.
11063 */
Suresh Siddha1c792db2011-12-01 17:07:32 -080011064 if (need_resched())
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070011065 break;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070011066
Vincent Guittot5ed4f1d2012-09-13 06:11:26 +020011067 rq = cpu_rq(balance_cpu);
11068
Tim Chened61bbc2014-05-20 14:39:27 -070011069 /*
11070 * If time for next balance is due,
11071 * do the balance.
11072 */
11073 if (time_after_eq(jiffies, rq->next_balance)) {
11074 raw_spin_lock_irq(&rq->lock);
11075 update_rq_clock(rq);
Frederic Weisbeckercee1afc2016-04-13 15:56:50 +020011076 cpu_load_update_idle(rq);
Tim Chened61bbc2014-05-20 14:39:27 -070011077 raw_spin_unlock_irq(&rq->lock);
11078 rebalance_domains(rq, CPU_IDLE);
11079 }
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070011080
Vincent Guittotc5afb6a2015-08-03 11:55:50 +020011081 if (time_after(next_balance, rq->next_balance)) {
11082 next_balance = rq->next_balance;
11083 update_next_balance = 1;
11084 }
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070011085 }
Vincent Guittotc5afb6a2015-08-03 11:55:50 +020011086
11087 /*
11088 * next_balance will be updated only when there is a need.
11089 * When the CPU is attached to null domain for ex, it will not be
11090 * updated.
11091 */
11092 if (likely(update_next_balance))
11093 nohz.next_balance = next_balance;
Suresh Siddha1c792db2011-12-01 17:07:32 -080011094end:
11095 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070011096}
11097
11098/*
Suresh Siddha0b005cf2011-12-01 17:07:34 -080011099 * Current heuristic for kicking the idle load balancer in the presence
Vincent Guittot1aaf90a2015-02-27 16:54:14 +010011100 * of an idle cpu in the system.
Suresh Siddha0b005cf2011-12-01 17:07:34 -080011101 * - This rq has more than one task.
Vincent Guittot1aaf90a2015-02-27 16:54:14 +010011102 * - This rq has at least one CFS task and the capacity of the CPU is
11103 * significantly reduced because of RT tasks or IRQs.
11104 * - At parent of LLC scheduler domain level, this cpu's scheduler group has
11105 * multiple busy cpu.
Suresh Siddha0b005cf2011-12-01 17:07:34 -080011106 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
11107 * domain span are idle.
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070011108 */
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070011109static inline bool nohz_kick_needed(struct rq *rq, int *type)
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070011110{
11111 unsigned long now = jiffies;
Peter Zijlstra0e369d72016-05-09 10:38:01 +020011112 struct sched_domain_shared *sds;
Suresh Siddha0b005cf2011-12-01 17:07:34 -080011113 struct sched_domain *sd;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070011114 int nr_busy;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070011115 int cpu = rq->cpu;
Vincent Guittot1aaf90a2015-02-27 16:54:14 +010011116 bool kick = false;
Puja Gupta9c7d6442017-08-03 14:21:57 -070011117 cpumask_t cpumask;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070011118
Daniel Lezcano4a725622014-01-06 12:34:39 +010011119 if (unlikely(rq->idle_balance))
Vincent Guittot1aaf90a2015-02-27 16:54:14 +010011120 return false;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070011121
Suresh Siddha1c792db2011-12-01 17:07:32 -080011122 /*
11123 * We may be recently in ticked or tickless idle mode. At the first
11124 * busy tick after returning from idle, we will update the busy stats.
11125 */
Suresh Siddha69e1e812011-12-01 17:07:33 -080011126 set_cpu_sd_state_busy();
Alex Shic1cc0172012-09-10 15:10:58 +080011127 nohz_balance_exit_idle(cpu);
Suresh Siddha0b005cf2011-12-01 17:07:34 -080011128
11129 /*
11130 * None are in tickless mode and hence no need for NOHZ idle load
11131 * balancing.
11132 */
Puja Gupta9c7d6442017-08-03 14:21:57 -070011133 cpumask_andnot(&cpumask, nohz.idle_cpus_mask, cpu_isolated_mask);
11134 if (cpumask_empty(&cpumask))
Vincent Guittot1aaf90a2015-02-27 16:54:14 +010011135 return false;
Suresh Siddha1c792db2011-12-01 17:07:32 -080011136
11137 if (time_before(now, nohz.next_balance))
Vincent Guittot1aaf90a2015-02-27 16:54:14 +010011138 return false;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070011139
Morten Rasmussenf69e2dc2015-02-03 13:54:11 +000011140 if (rq->nr_running >= 2 &&
11141 (!energy_aware() || cpu_overutilized(cpu)))
Vincent Guittot1aaf90a2015-02-27 16:54:14 +010011142 return true;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070011143
Leo Yan0db9eac2017-03-27 15:00:14 +010011144 /* Do idle load balance if there have misfit task */
Joonwoo Park7c3a60e2017-03-07 18:08:24 -080011145 if (energy_aware())
Kyle Yane2486b72017-08-25 14:36:53 -070011146 return rq->misfit_task;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070011147
Peter Zijlstra067491b2011-12-07 14:32:08 +010011148 rcu_read_lock();
Peter Zijlstra0e369d72016-05-09 10:38:01 +020011149 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
Morten Rasmussenf69e2dc2015-02-03 13:54:11 +000011150 if (sds && !energy_aware()) {
Peter Zijlstra0e369d72016-05-09 10:38:01 +020011151 /*
11152 * XXX: write a coherent comment on why we do this.
11153 * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
11154 */
11155 nr_busy = atomic_read(&sds->nr_busy_cpus);
Vincent Guittot1aaf90a2015-02-27 16:54:14 +010011156 if (nr_busy > 1) {
11157 kick = true;
11158 goto unlock;
11159 }
11160
11161 }
11162
11163 sd = rcu_dereference(rq->sd);
11164 if (sd) {
11165 if ((rq->cfs.h_nr_running >= 1) &&
11166 check_cpu_capacity(rq, sd)) {
11167 kick = true;
11168 goto unlock;
11169 }
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070011170 }
Preeti U Murthy37dc6b52013-10-30 08:42:52 +053011171
11172 sd = rcu_dereference(per_cpu(sd_asym, cpu));
Puja Gupta9c7d6442017-08-03 14:21:57 -070011173 if (sd && (cpumask_first_and(&cpumask, sched_domain_span(sd)) < cpu)) {
Vincent Guittot1aaf90a2015-02-27 16:54:14 +010011174 kick = true;
11175 goto unlock;
11176 }
Preeti U Murthy37dc6b52013-10-30 08:42:52 +053011177
Vincent Guittot1aaf90a2015-02-27 16:54:14 +010011178unlock:
Peter Zijlstra067491b2011-12-07 14:32:08 +010011179 rcu_read_unlock();
Vincent Guittot1aaf90a2015-02-27 16:54:14 +010011180 return kick;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070011181}
11182#else
Daniel Lezcano208cb162014-01-06 12:34:44 +010011183static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070011184#endif
11185
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010011186/*
11187 * run_rebalance_domains is triggered when needed from the scheduler tick.
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070011188 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010011189 */
Emese Revfy0766f782016-06-20 20:42:34 +020011190static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010011191{
Daniel Lezcano208cb162014-01-06 12:34:44 +010011192 struct rq *this_rq = this_rq();
Suresh Siddha6eb57e02011-10-03 15:09:01 -070011193 enum cpu_idle_type idle = this_rq->idle_balance ?
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010011194 CPU_IDLE : CPU_NOT_IDLE;
11195
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010011196 /*
Puja Gupta9c7d6442017-08-03 14:21:57 -070011197 * Since core isolation doesn't update nohz.idle_cpus_mask, there
11198 * is a possibility this nohz kicked cpu could be isolated. Hence
11199 * return if the cpu is isolated.
11200 */
11201 if (cpu_isolated(this_rq->cpu))
11202 return;
11203 /*
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070011204 * If this cpu has a pending nohz_balance_kick, then do the
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010011205 * balancing on behalf of the other idle cpus whose ticks are
Preeti U Murthyd4573c32015-03-26 18:32:44 +053011206 * stopped. Do nohz_idle_balance *before* rebalance_domains to
11207 * give the idle cpus a chance to load balance. Else we may
11208 * load balance only within the local sched_domain hierarchy
11209 * and abort nohz_idle_balance altogether if we pull some load.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010011210 */
Daniel Lezcano208cb162014-01-06 12:34:44 +010011211 nohz_idle_balance(this_rq, idle);
Preeti U Murthyd4573c32015-03-26 18:32:44 +053011212 rebalance_domains(this_rq, idle);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010011213}
11214
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010011215/*
11216 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010011217 */
Daniel Lezcano7caff662014-01-06 12:34:38 +010011218void trigger_load_balance(struct rq *rq)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010011219{
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070011220 int type = NOHZ_KICK_ANY;
11221
Olav Haugan3f2cb302016-05-31 14:34:46 -070011222 /* Don't need to rebalance while attached to NULL domain or
11223 * cpu is isolated.
11224 */
11225 if (unlikely(on_null_domain(rq)) || cpu_isolated(cpu_of(rq)))
Daniel Lezcanoc7260992014-01-06 12:34:45 +010011226 return;
11227
11228 if (time_after_eq(jiffies, rq->next_balance))
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010011229 raise_softirq(SCHED_SOFTIRQ);
Frederic Weisbecker3451d022011-08-10 23:21:01 +020011230#ifdef CONFIG_NO_HZ_COMMON
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070011231 if (nohz_kick_needed(rq, &type))
11232 nohz_balancer_kick(type);
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070011233#endif
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010011234}
11235
Christian Ehrhardt0bcdcf22009-11-30 12:16:46 +010011236static void rq_online_fair(struct rq *rq)
11237{
11238 update_sysctl();
Kirill Tkhai0e59bda2014-06-25 12:19:42 +040011239
11240 update_runtime_enabled(rq);
Christian Ehrhardt0bcdcf22009-11-30 12:16:46 +010011241}
11242
11243static void rq_offline_fair(struct rq *rq)
11244{
11245 update_sysctl();
Peter Boonstoppela4c96ae2012-08-09 15:34:47 -070011246
11247 /* Ensure any throttled groups are reachable by pick_next_task */
11248 unthrottle_offline_cfs_rqs(rq);
Christian Ehrhardt0bcdcf22009-11-30 12:16:46 +010011249}
11250
Dhaval Giani55e12e52008-06-24 23:39:43 +053011251#endif /* CONFIG_SMP */
Peter Williamse1d14842007-10-24 18:23:51 +020011252
Pavankumar Kondeti35213222018-04-03 15:10:44 +053011253#ifdef CONFIG_SCHED_WALT
11254static inline void
11255walt_update_misfit_task(struct rq *rq, struct task_struct *curr)
11256{
11257 bool misfit = rq->misfit_task;
11258
11259 if (curr->misfit != misfit) {
11260 walt_fixup_nr_big_tasks(rq, curr, 1, misfit);
11261 curr->misfit = misfit;
11262 }
11263}
11264#else
11265static inline void
11266walt_update_misfit_task(struct rq *rq, struct task_struct *curr)
11267{
11268}
11269#endif
11270
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011271/*
11272 * scheduler tick hitting a task of our scheduling class:
11273 */
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +010011274static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011275{
11276 struct cfs_rq *cfs_rq;
11277 struct sched_entity *se = &curr->se;
11278
11279 for_each_sched_entity(se) {
11280 cfs_rq = cfs_rq_of(se);
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +010011281 entity_tick(cfs_rq, se, queued);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011282 }
Ben Segall18bf2802012-10-04 12:51:20 +020011283
Srikar Dronamrajub52da862015-10-02 07:48:25 +053011284 if (static_branch_unlikely(&sched_numa_balancing))
Peter Zijlstracbee9f82012-10-25 14:16:43 +020011285 task_tick_numa(rq, curr);
Morten Rasmussena562dfc2015-05-09 16:49:57 +010011286
Patrick Bellasi2178e842016-07-22 11:35:59 +010011287#ifdef CONFIG_SMP
Patrick Bellasi8e45d942016-02-10 09:24:36 +000011288 if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
Morten Rasmussena562dfc2015-05-09 16:49:57 +010011289 rq->rd->overutilized = true;
Patrick Bellasi8e45d942016-02-10 09:24:36 +000011290 trace_sched_overutilized(true);
11291 }
Morten Rasmussen4c6a8242016-02-25 12:47:54 +000011292
Pavankumar Kondeti35213222018-04-03 15:10:44 +053011293 rq->misfit_task = !task_fits_max(curr, rq->cpu);
Patrick Bellasi2178e842016-07-22 11:35:59 +010011294#endif
Pavankumar Kondeti35213222018-04-03 15:10:44 +053011295 walt_update_misfit_task(rq, curr);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011296}
11297
11298/*
Peter Zijlstracd29fe62009-11-27 17:32:46 +010011299 * called on fork with the child task as argument from the parent's context
11300 * - child not yet on the tasklist
11301 * - preemption disabled
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011302 */
Peter Zijlstracd29fe62009-11-27 17:32:46 +010011303static void task_fork_fair(struct task_struct *p)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011304{
Daisuke Nishimura4fc420c2011-12-15 14:36:55 +090011305 struct cfs_rq *cfs_rq;
11306 struct sched_entity *se = &p->se, *curr;
Peter Zijlstracd29fe62009-11-27 17:32:46 +010011307 struct rq *rq = this_rq();
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011308
Peter Zijlstrae210bff2016-06-16 18:51:48 +020011309 raw_spin_lock(&rq->lock);
Peter Zijlstra861d0342010-08-19 13:31:43 +020011310 update_rq_clock(rq);
11311
Daisuke Nishimura4fc420c2011-12-15 14:36:55 +090011312 cfs_rq = task_cfs_rq(current);
11313 curr = cfs_rq->curr;
Peter Zijlstrae210bff2016-06-16 18:51:48 +020011314 if (curr) {
11315 update_curr(cfs_rq);
Mike Galbraithb5d9d732009-09-08 11:12:28 +020011316 se->vruntime = curr->vruntime;
Peter Zijlstrae210bff2016-06-16 18:51:48 +020011317 }
Peter Zijlstraaeb73b02007-10-15 17:00:05 +020011318 place_entity(cfs_rq, se, 1);
Peter Zijlstra4d78e7b2007-10-15 17:00:04 +020011319
Peter Zijlstracd29fe62009-11-27 17:32:46 +010011320 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
Dmitry Adamushko87fefa32007-10-15 17:00:08 +020011321 /*
Ingo Molnaredcb60a2007-10-15 17:00:08 +020011322 * Upon rescheduling, sched_class::put_prev_task() will place
11323 * 'current' within the tree based on its new key value.
11324 */
Peter Zijlstra4d78e7b2007-10-15 17:00:04 +020011325 swap(curr->vruntime, se->vruntime);
Kirill Tkhai88751252014-06-29 00:03:57 +040011326 resched_curr(rq);
Peter Zijlstra4d78e7b2007-10-15 17:00:04 +020011327 }
11328
Peter Zijlstra88ec22d2009-12-16 18:04:41 +010011329 se->vruntime -= cfs_rq->min_vruntime;
Peter Zijlstrae210bff2016-06-16 18:51:48 +020011330 raw_spin_unlock(&rq->lock);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011331}
11332
Steven Rostedtcb469842008-01-25 21:08:22 +010011333/*
11334 * Priority of the task has changed. Check to see if we preempt
11335 * the current task.
11336 */
Peter Zijlstrada7a7352011-01-17 17:03:27 +010011337static void
11338prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
Steven Rostedtcb469842008-01-25 21:08:22 +010011339{
Kirill Tkhaida0c1e62014-08-20 13:47:32 +040011340 if (!task_on_rq_queued(p))
Peter Zijlstrada7a7352011-01-17 17:03:27 +010011341 return;
11342
Steven Rostedtcb469842008-01-25 21:08:22 +010011343 /*
11344 * Reschedule if we are currently running on this runqueue and
11345 * our priority decreased, or if we are not currently running on
11346 * this runqueue and our priority is higher than the current's
11347 */
Peter Zijlstrada7a7352011-01-17 17:03:27 +010011348 if (rq->curr == p) {
Steven Rostedtcb469842008-01-25 21:08:22 +010011349 if (p->prio > oldprio)
Kirill Tkhai88751252014-06-29 00:03:57 +040011350 resched_curr(rq);
Steven Rostedtcb469842008-01-25 21:08:22 +010011351 } else
Peter Zijlstra15afe092008-09-20 23:38:02 +020011352 check_preempt_curr(rq, p, 0);
Steven Rostedtcb469842008-01-25 21:08:22 +010011353}
11354
Byungchul Parkdaa59402015-08-20 20:22:00 +090011355static inline bool vruntime_normalized(struct task_struct *p)
11356{
11357 struct sched_entity *se = &p->se;
11358
11359 /*
11360 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
11361 * the dequeue_entity(.flags=0) will already have normalized the
11362 * vruntime.
11363 */
11364 if (p->on_rq)
11365 return true;
11366
11367 /*
11368 * When !on_rq, vruntime of the task has usually NOT been normalized.
11369 * But there are some cases where it has already been normalized:
11370 *
11371 * - A forked child which is waiting for being woken up by
11372 * wake_up_new_task().
11373 * - A task which has been woken up by try_to_wake_up() and
11374 * waiting for actually being woken up by sched_ttwu_pending().
11375 */
Steve Mucklefa7a13e2018-08-31 15:42:17 -070011376 if (!se->sum_exec_runtime ||
11377 (p->state == TASK_WAKING && p->sched_remote_wakeup))
Byungchul Parkdaa59402015-08-20 20:22:00 +090011378 return true;
11379
11380 return false;
11381}
11382
Vincent Guittot96956e22016-11-08 10:53:44 +010011383#ifdef CONFIG_FAIR_GROUP_SCHED
11384/*
11385 * Propagate the changes of the sched_entity across the tg tree to make it
11386 * visible to the root
11387 */
11388static void propagate_entity_cfs_rq(struct sched_entity *se)
11389{
11390 struct cfs_rq *cfs_rq;
11391
11392 /* Start to propagate at parent */
11393 se = se->parent;
11394
11395 for_each_sched_entity(se) {
11396 cfs_rq = cfs_rq_of(se);
11397
11398 if (cfs_rq_throttled(cfs_rq))
11399 break;
11400
11401 update_load_avg(se, UPDATE_TG);
11402 }
11403}
11404#else
11405static void propagate_entity_cfs_rq(struct sched_entity *se) { }
11406#endif
11407
Vincent Guittot793cfff2016-11-08 10:53:42 +010011408static void detach_entity_cfs_rq(struct sched_entity *se)
Peter Zijlstrada7a7352011-01-17 17:03:27 +010011409{
Peter Zijlstrada7a7352011-01-17 17:03:27 +010011410 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11411
Yuyang Du9d89c252015-07-15 08:04:37 +080011412 /* Catch up with the cfs_rq and remove our load when we leave */
Vincent Guittot96956e22016-11-08 10:53:44 +010011413 update_load_avg(se, 0);
Byungchul Parka05e8c52015-08-20 20:21:56 +090011414 detach_entity_load_avg(cfs_rq, se);
Peter Zijlstra7c3edd22016-07-13 10:56:25 +020011415 update_tg_load_avg(cfs_rq, false);
Vincent Guittot96956e22016-11-08 10:53:44 +010011416 propagate_entity_cfs_rq(se);
Peter Zijlstrada7a7352011-01-17 17:03:27 +010011417}
11418
Vincent Guittot793cfff2016-11-08 10:53:42 +010011419static void attach_entity_cfs_rq(struct sched_entity *se)
Steven Rostedtcb469842008-01-25 21:08:22 +010011420{
Byungchul Parkdaa59402015-08-20 20:22:00 +090011421 struct cfs_rq *cfs_rq = cfs_rq_of(se);
Byungchul Park7855a352015-08-10 18:02:55 +090011422
11423#ifdef CONFIG_FAIR_GROUP_SCHED
Michael wangeb7a59b2014-02-20 11:14:53 +080011424 /*
11425 * Since the real-depth could have been changed (only FAIR
11426 * class maintain depth value), reset depth properly.
11427 */
11428 se->depth = se->parent ? se->parent->depth + 1 : 0;
11429#endif
Byungchul Park7855a352015-08-10 18:02:55 +090011430
Vincent Guittot793cfff2016-11-08 10:53:42 +010011431 /* Synchronize entity with its cfs_rq */
Vincent Guittot96956e22016-11-08 10:53:44 +010011432 update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
Byungchul Parkdaa59402015-08-20 20:22:00 +090011433 attach_entity_load_avg(cfs_rq, se);
Peter Zijlstra7c3edd22016-07-13 10:56:25 +020011434 update_tg_load_avg(cfs_rq, false);
Vincent Guittot96956e22016-11-08 10:53:44 +010011435 propagate_entity_cfs_rq(se);
Vincent Guittot793cfff2016-11-08 10:53:42 +010011436}
11437
Peter Zijlstra029632f2011-10-25 10:00:11 +020011438static void detach_task_cfs_rq(struct task_struct *p)
11439{
11440 struct sched_entity *se = &p->se;
11441 struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstra029632f2011-10-25 10:00:11 +020011442
Peter Zijlstra029632f2011-10-25 10:00:11 +020011443 if (!vruntime_normalized(p)) {
11444 /*
11445 * Fix up our vruntime so that the current sleep doesn't
11446 * cause 'unlimited' sleep bonus.
11447 */
11448 place_entity(cfs_rq, se, 0);
11449 se->vruntime -= cfs_rq->min_vruntime;
11450 }
11451
Vincent Guittot793cfff2016-11-08 10:53:42 +010011452 detach_entity_cfs_rq(se);
Peter Zijlstra029632f2011-10-25 10:00:11 +020011453}
Peter Zijlstrafed14d42012-02-11 06:05:00 +010011454
11455static void attach_task_cfs_rq(struct task_struct *p)
Peter Zijlstra029632f2011-10-25 10:00:11 +020011456{
11457 struct sched_entity *se = &p->se;
Paul Turner0ac9b1c2013-10-16 11:16:27 -070011458 struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstra029632f2011-10-25 10:00:11 +020011459
Vincent Guittot793cfff2016-11-08 10:53:42 +010011460 attach_entity_cfs_rq(se);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011461
11462 if (!vruntime_normalized(p))
Steven Rostedtcb469842008-01-25 21:08:22 +010011463 se->vruntime += cfs_rq->min_vruntime;
11464}
11465
11466static void switched_from_fair(struct rq *rq, struct task_struct *p)
11467{
11468 detach_task_cfs_rq(p);
11469}
11470
11471static void switched_to_fair(struct rq *rq, struct task_struct *p)
11472{
11473 attach_task_cfs_rq(p);
11474
11475 if (task_on_rq_queued(p)) {
11476 /*
11477 * We were most likely switched from sched_rt, so
11478 * kick off the schedule if running, otherwise just see
11479 * if we can still preempt the current task.
11480 */
11481 if (rq->curr == p)
11482 resched_curr(rq);
11483 else
11484 check_preempt_curr(rq, p, 0);
11485 }
11486}
11487
11488/* Account for a task changing its policy or group.
11489 *
11490 * This routine is mostly called to set cfs_rq->curr field when a task
11491 * migrates between groups/classes.
11492 */
11493static void set_curr_task_fair(struct rq *rq)
11494{
11495 struct sched_entity *se = &rq->curr->se;
11496
11497 for_each_sched_entity(se) {
11498 struct cfs_rq *cfs_rq = cfs_rq_of(se);
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +020011499
11500 set_next_entity(cfs_rq, se);
11501 /* ensure bandwidth has been allocated on our new cfs_rq */
11502 account_cfs_rq_runtime(cfs_rq, 0);
Peter Zijlstrada7a7352011-01-17 17:03:27 +010011503 }
11504}
11505
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +020011506void init_cfs_rq(struct cfs_rq *cfs_rq)
11507{
11508 cfs_rq->tasks_timeline = RB_ROOT;
11509 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
11510#ifndef CONFIG_64BIT
Peter Zijlstrada7a7352011-01-17 17:03:27 +010011511 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +020011512#endif
11513#ifdef CONFIG_SMP
Vincent Guittot96956e22016-11-08 10:53:44 +010011514#ifdef CONFIG_FAIR_GROUP_SCHED
11515 cfs_rq->propagate_avg = 0;
11516#endif
Ingo Molnarc3b64f12007-08-09 11:16:51 +020011517 atomic_long_set(&cfs_rq->removed_load_avg, 0);
11518 atomic_long_set(&cfs_rq->removed_util_avg, 0);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011519#endif
11520}
11521
11522#ifdef CONFIG_FAIR_GROUP_SCHED
11523static void task_set_group_fair(struct task_struct *p)
11524{
11525 struct sched_entity *se = &p->se;
11526
11527 set_task_rq(p, task_cpu(p));
11528 se->depth = se->parent ? se->parent->depth + 1 : 0;
11529}
11530
11531static void task_move_group_fair(struct task_struct *p)
11532{
11533 detach_task_cfs_rq(p);
11534 set_task_rq(p, task_cpu(p));
11535
11536#ifdef CONFIG_SMP
11537 /* Tell se's cfs_rq has been changed -- migrated */
Dmitry Adamushkod02e5ed2007-10-15 17:00:07 +020011538 p->se.avg.last_update_time = 0;
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +020011539#endif
11540 attach_task_cfs_rq(p);
11541}
11542
11543static void task_change_group_fair(struct task_struct *p, int type)
11544{
11545 switch (type) {
11546 case TASK_SET_GROUP:
11547 task_set_group_fair(p);
11548 break;
11549
11550 case TASK_MOVE_GROUP:
11551 task_move_group_fair(p);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011552 break;
11553 }
11554}
11555
11556void free_fair_sched_group(struct task_group *tg)
11557{
11558 int i;
11559
Peter Zijlstra15afe092008-09-20 23:38:02 +020011560 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011561
11562 for_each_possible_cpu(i) {
11563 if (tg->cfs_rq)
11564 kfree(tg->cfs_rq[i]);
11565 if (tg->se)
11566 kfree(tg->se[i]);
11567 }
11568
11569 kfree(tg->cfs_rq);
11570 kfree(tg->se);
11571}
11572
11573int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
11574{
11575 struct sched_entity *se;
Peter Zijlstra810b3812008-02-29 15:21:01 -050011576 struct cfs_rq *cfs_rq;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011577 int i;
11578
11579 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
11580 if (!tg->cfs_rq)
11581 goto err;
11582 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
11583 if (!tg->se)
11584 goto err;
11585
11586 tg->shares = NICE_0_LOAD;
11587
11588 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
11589
11590 for_each_possible_cpu(i) {
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011591 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
11592 GFP_KERNEL, cpu_to_node(i));
11593 if (!cfs_rq)
11594 goto err;
11595
11596 se = kzalloc_node(sizeof(struct sched_entity),
11597 GFP_KERNEL, cpu_to_node(i));
11598 if (!se)
11599 goto err_free_rq;
11600
11601 init_cfs_rq(cfs_rq);
11602 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
11603 init_entity_runnable_average(se);
11604 }
11605
11606 return 1;
11607
11608err_free_rq:
11609 kfree(cfs_rq);
11610err:
11611 return 0;
11612}
11613
11614void online_fair_sched_group(struct task_group *tg)
11615{
Ingo Molnar2e09bf52007-10-15 17:00:05 +020011616 struct sched_entity *se;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011617 struct rq *rq;
11618 int i;
11619
11620 for_each_possible_cpu(i) {
11621 rq = cpu_rq(i);
11622 se = tg->se[i];
11623
11624 raw_spin_lock_irq(&rq->lock);
11625 post_init_entity_util_avg(se);
11626 sync_throttle(tg, i);
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +020011627 raw_spin_unlock_irq(&rq->lock);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011628 }
Peter Zijlstra810b3812008-02-29 15:21:01 -050011629}
11630
11631void unregister_fair_sched_group(struct task_group *tg)
Paul Turnerec12cb72011-07-21 09:43:30 -070011632{
11633 unsigned long flags;
11634 struct rq *rq;
11635 int cpu;
11636
11637 for_each_possible_cpu(cpu) {
11638 if (tg->se[cpu])
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011639 remove_entity_load_avg(tg->se[cpu]);
11640
Peter Zijlstra810b3812008-02-29 15:21:01 -050011641 /*
Peter Zijlstrab2b5ce02010-10-15 15:24:15 +020011642 * Only empty task groups can be destroyed; so we can speculatively
Peter Zijlstra810b3812008-02-29 15:21:01 -050011643 * check on_list without danger of it being re-added.
Peter Zijlstrab2b5ce02010-10-15 15:24:15 +020011644 */
11645 if (!tg->cfs_rq[cpu]->on_list)
11646 continue;
11647
11648 rq = cpu_rq(cpu);
11649
11650 raw_spin_lock_irqsave(&rq->lock, flags);
11651 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
11652 raw_spin_unlock_irqrestore(&rq->lock, flags);
11653 }
11654}
11655
11656void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
Peter Zijlstra88ec22d2009-12-16 18:04:41 +010011657 struct sched_entity *se, int cpu,
Peter Zijlstrab2b5ce02010-10-15 15:24:15 +020011658 struct sched_entity *parent)
11659{
11660 struct rq *rq = cpu_rq(cpu);
Frederic Weisbecker71b1da42013-04-12 01:50:59 +020011661
11662 cfs_rq->tg = tg;
11663 cfs_rq->rq = rq;
Linus Torvalds17bc14b2012-12-14 07:20:43 -080011664 init_cfs_rq_runtime(cfs_rq);
Peter Zijlstra029632f2011-10-25 10:00:11 +020011665
11666 tg->cfs_rq[cpu] = cfs_rq;
11667 tg->se[cpu] = se;
11668
11669 /* se could be NULL for root_task_group */
11670 if (!se)
11671 return;
11672
11673 if (!parent) {
11674 se->cfs_rq = &rq->cfs;
11675 se->depth = 0;
11676 } else {
11677 se->cfs_rq = parent->my_q;
Peter Zijlstra6fe1f342016-01-21 22:24:16 +010011678 se->depth = parent->depth + 1;
Peter Zijlstra029632f2011-10-25 10:00:11 +020011679 }
11680
11681 se->my_q = cfs_rq;
Peter Zijlstra810b3812008-02-29 15:21:01 -050011682 /* guarantee group entities always have weight */
H Hartley Sweeten6d686f42010-01-13 20:21:52 -070011683 update_load_set(&se->load, NICE_0_LOAD);
Peter Williams0d721ce2009-09-21 01:31:53 +000011684 se->parent = parent;
11685}
Peter Williams0d721ce2009-09-21 01:31:53 +000011686
11687static DEFINE_MUTEX(shares_mutex);
11688
11689int sched_group_set_shares(struct task_group *tg, unsigned long shares)
11690{
11691 int i;
Peter Williams0d721ce2009-09-21 01:31:53 +000011692 unsigned long flags;
Zhu Yanhaia59f4e02013-01-08 12:56:52 +080011693
Peter Williams0d721ce2009-09-21 01:31:53 +000011694 /*
11695 * We can't change the weight of the root cgroup.
11696 */
11697 if (!tg->se[0])
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011698 return -EINVAL;
11699
11700 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
Peter Zijlstra029632f2011-10-25 10:00:11 +020011701
Ingo Molnar5522d5d2007-10-15 17:00:12 +020011702 mutex_lock(&shares_mutex);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011703 if (tg->shares == shares)
11704 goto done;
11705
Mike Galbraithd95f4122011-02-01 09:50:51 -050011706 tg->shares = shares;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011707 for_each_possible_cpu(i) {
11708 struct rq *rq = cpu_rq(i);
11709 struct sched_entity *se;
11710
11711 se = tg->se[i];
11712 /* Propagate contribution to hierarchy */
Peter Williams681f3e62007-10-24 18:23:51 +020011713 raw_spin_lock_irqsave(&rq->lock, flags);
Li Zefan4ce72a22008-10-22 15:25:26 +080011714
Paul Turner0a74bef2012-10-04 13:18:30 +020011715 /* Possible calls to update_curr() need rq clock */
Alex Shi141965c2013-06-26 13:05:39 +080011716 update_rq_clock(rq);
Vincent Guittot6960f772016-12-21 16:50:26 +010011717 for_each_sched_entity(se) {
11718 update_load_avg(se, UPDATE_TG);
11719 update_cfs_shares(se);
11720 }
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011721 raw_spin_unlock_irqrestore(&rq->lock, flags);
11722 }
11723
11724done:
11725 mutex_unlock(&shares_mutex);
11726 return 0;
11727}
11728#else /* CONFIG_FAIR_GROUP_SCHED */
11729
11730void free_fair_sched_group(struct task_group *tg) { }
11731
11732int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
11733{
11734 return 1;
11735}
11736
Peter Zijlstra8663e242016-06-22 14:58:02 +020011737void online_fair_sched_group(struct task_group *tg) { }
11738
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011739void unregister_fair_sched_group(struct task_group *tg) { }
11740
11741#endif /* CONFIG_FAIR_GROUP_SCHED */
11742
11743
11744static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
11745{
11746 struct sched_entity *se = &task->se;
11747 unsigned int rr_interval = 0;
11748
11749 /*
11750 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
11751 * idle runqueue:
11752 */
11753 if (rq->cfs.load.weight)
11754 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
11755
11756 return rr_interval;
Li Zefan4ce72a22008-10-22 15:25:26 +080011757}
11758
Christian Ehrhardt0bcdcf22009-11-30 12:16:46 +010011759/*
11760 * All the scheduling class methods:
Peter Zijlstra88ec22d2009-12-16 18:04:41 +010011761 */
11762const struct sched_class fair_sched_class = {
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011763 .next = &idle_sched_class,
11764 .enqueue_task = enqueue_task_fair,
11765 .dequeue_task = dequeue_task_fair,
11766 .yield_task = yield_task_fair,
Peter Zijlstracd29fe62009-11-27 17:32:46 +010011767 .yield_to_task = yield_to_task_fair,
Steven Rostedtcb469842008-01-25 21:08:22 +010011768
11769 .check_preempt_curr = check_preempt_wakeup,
Peter Zijlstrada7a7352011-01-17 17:03:27 +010011770
Steven Rostedtcb469842008-01-25 21:08:22 +010011771 .pick_next_task = pick_next_task_fair,
Peter Zijlstra810b3812008-02-29 15:21:01 -050011772 .put_prev_task = put_prev_task_fair,
Peter Williams0d721ce2009-09-21 01:31:53 +000011773
11774#ifdef CONFIG_SMP
Peter Zijlstra810b3812008-02-29 15:21:01 -050011775 .select_task_rq = select_task_rq_fair,
Peter Zijlstrab2b5ce02010-10-15 15:24:15 +020011776 .migrate_task_rq = migrate_task_rq_fair,
Yuyang Du12695572015-07-15 08:04:40 +080011777
Peter Zijlstrac5b28032015-05-15 17:43:35 +020011778 .rq_online = rq_online_fair,
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011779 .rq_offline = rq_offline_fair,
11780
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011781 .task_dead = task_dead_fair,
11782 .set_cpus_allowed = set_cpus_allowed_common,
11783#endif
11784
Stanislaw Gruszka6e998912014-11-12 16:58:44 +010011785 .set_curr_task = set_curr_task_fair,
11786 .task_tick = task_tick_fair,
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011787 .task_fork = task_fork_fair,
11788
11789 .prio_changed = prio_changed_fair,
11790 .switched_from = switched_from_fair,
11791 .switched_to = switched_to_fair,
11792
Peter Zijlstra029632f2011-10-25 10:00:11 +020011793 .get_rr_interval = get_rr_interval_fair,
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011794
11795 .update_curr = update_curr_fair,
11796
11797#ifdef CONFIG_FAIR_GROUP_SCHED
Vincent Guittotea86cb42016-06-17 13:38:55 +020011798 .task_change_group = task_change_group_fair,
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011799#endif
Joonwoo Parkf7d6cd42017-01-17 15:19:43 -080011800#ifdef CONFIG_SCHED_WALT
Pavankumar Kondeti39c695e2017-07-20 16:05:51 +053011801 .fixup_walt_sched_stats = walt_fixup_sched_stats_fair,
Pavankumar Kondeti351d3fa2019-09-04 11:28:58 +053011802 .fixup_cumulative_runnable_avg =
11803 walt_fixup_cumulative_runnable_avg_fair,
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070011804#endif
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011805};
11806
11807#ifdef CONFIG_SCHED_DEBUG
11808void print_cfs_stats(struct seq_file *m, int cpu)
11809{
11810 struct cfs_rq *cfs_rq;
11811
11812 rcu_read_lock();
11813 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
11814 print_cfs_rq(m, cpu, cfs_rq);
11815 rcu_read_unlock();
11816}
Srikar Dronamraju397f2372015-06-25 22:51:43 +053011817
11818#ifdef CONFIG_NUMA_BALANCING
11819void show_numa_stats(struct task_struct *p, struct seq_file *m)
11820{
11821 int node;
11822 unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
11823
11824 for_each_online_node(node) {
11825 if (p->numa_faults) {
11826 tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
11827 tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
11828 }
11829 if (p->numa_group) {
11830 gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
11831 gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
11832 }
11833 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
11834 }
11835}
11836#endif /* CONFIG_NUMA_BALANCING */
11837#endif /* CONFIG_SCHED_DEBUG */
Peter Zijlstra029632f2011-10-25 10:00:11 +020011838
11839__init void init_sched_fair_class(void)
11840{
11841#ifdef CONFIG_SMP
11842 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
11843
Frederic Weisbecker3451d022011-08-10 23:21:01 +020011844#ifdef CONFIG_NO_HZ_COMMON
Diwakar Tundlam554ceca2012-03-07 14:44:26 -080011845 nohz.next_balance = jiffies;
Peter Zijlstra029632f2011-10-25 10:00:11 +020011846 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
Peter Zijlstra029632f2011-10-25 10:00:11 +020011847#endif
11848#endif /* SMP */
11849
11850}
Vikram Mulukutlad056dbc2017-02-07 18:58:07 -080011851
Joonwoo Parkf7d6cd42017-01-17 15:19:43 -080011852/* WALT sched implementation begins here */
Pavankumar Kondetid4127502017-07-20 08:56:15 +053011853#ifdef CONFIG_SCHED_WALT
Joonwoo Parkf7d6cd42017-01-17 15:19:43 -080011854
Pavankumar Kondeti39c695e2017-07-20 16:05:51 +053011855#ifdef CONFIG_CFS_BANDWIDTH
11856
11857static void walt_init_cfs_rq_stats(struct cfs_rq *cfs_rq)
11858{
11859 cfs_rq->walt_stats.nr_big_tasks = 0;
11860 cfs_rq->walt_stats.cumulative_runnable_avg = 0;
11861 cfs_rq->walt_stats.pred_demands_sum = 0;
11862}
11863
11864static void walt_inc_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p)
11865{
11866 inc_nr_big_task(&cfs_rq->walt_stats, p);
11867 fixup_cumulative_runnable_avg(&cfs_rq->walt_stats, p->ravg.demand,
11868 p->ravg.pred_demand);
11869}
11870
11871static void walt_dec_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p)
11872{
11873 dec_nr_big_task(&cfs_rq->walt_stats, p);
11874 fixup_cumulative_runnable_avg(&cfs_rq->walt_stats, -(s64)p->ravg.demand,
11875 -(s64)p->ravg.pred_demand);
11876}
11877
11878static void walt_inc_throttled_cfs_rq_stats(struct walt_sched_stats *stats,
11879 struct cfs_rq *tcfs_rq)
11880{
11881 struct rq *rq = rq_of(tcfs_rq);
11882
11883 stats->nr_big_tasks += tcfs_rq->walt_stats.nr_big_tasks;
11884 fixup_cumulative_runnable_avg(stats,
11885 tcfs_rq->walt_stats.cumulative_runnable_avg,
11886 tcfs_rq->walt_stats.pred_demands_sum);
11887
11888 if (stats == &rq->walt_stats)
11889 walt_fixup_cum_window_demand(rq,
11890 tcfs_rq->walt_stats.cumulative_runnable_avg);
11891
11892}
11893
11894static void walt_dec_throttled_cfs_rq_stats(struct walt_sched_stats *stats,
11895 struct cfs_rq *tcfs_rq)
11896{
11897 struct rq *rq = rq_of(tcfs_rq);
11898
11899 stats->nr_big_tasks -= tcfs_rq->walt_stats.nr_big_tasks;
11900 fixup_cumulative_runnable_avg(stats,
11901 -tcfs_rq->walt_stats.cumulative_runnable_avg,
11902 -tcfs_rq->walt_stats.pred_demands_sum);
11903
11904 /*
11905 * We remove the throttled cfs_rq's tasks's contribution from the
11906 * cumulative window demand so that the same can be added
11907 * unconditionally when the cfs_rq is unthrottled.
11908 */
11909 if (stats == &rq->walt_stats)
11910 walt_fixup_cum_window_demand(rq,
11911 -tcfs_rq->walt_stats.cumulative_runnable_avg);
11912}
11913
11914static void walt_fixup_sched_stats_fair(struct rq *rq, struct task_struct *p,
11915 u32 new_task_load, u32 new_pred_demand)
11916{
11917 struct cfs_rq *cfs_rq;
11918 struct sched_entity *se = &p->se;
11919 s64 task_load_delta = (s64)new_task_load - task_load(p);
11920 s64 pred_demand_delta = PRED_DEMAND_DELTA;
11921
11922 for_each_sched_entity(se) {
11923 cfs_rq = cfs_rq_of(se);
11924
11925 fixup_cumulative_runnable_avg(&cfs_rq->walt_stats,
11926 task_load_delta,
11927 pred_demand_delta);
11928 if (cfs_rq_throttled(cfs_rq))
11929 break;
11930 }
11931
11932 /* Fix up rq->walt_stats only if we didn't find any throttled cfs_rq */
11933 if (!se) {
11934 fixup_cumulative_runnable_avg(&rq->walt_stats,
11935 task_load_delta,
11936 pred_demand_delta);
11937 walt_fixup_cum_window_demand(rq, task_load_delta);
11938 }
11939}
11940
11941static void walt_fixup_nr_big_tasks(struct rq *rq, struct task_struct *p,
11942 int delta, bool inc)
11943{
11944 struct cfs_rq *cfs_rq;
11945 struct sched_entity *se = &p->se;
11946
11947 for_each_sched_entity(se) {
11948 cfs_rq = cfs_rq_of(se);
11949
11950 cfs_rq->walt_stats.nr_big_tasks += inc ? delta : -delta;
11951 BUG_ON(cfs_rq->walt_stats.nr_big_tasks < 0);
11952
11953 if (cfs_rq_throttled(cfs_rq))
11954 break;
11955 }
11956
11957 /* Fix up rq->walt_stats only if we didn't find any throttled cfs_rq */
11958 if (!se)
11959 walt_adjust_nr_big_tasks(rq, delta, inc);
11960}
11961
11962/*
11963 * Check if task is part of a hierarchy where some cfs_rq does not have any
11964 * runtime left.
11965 *
11966 * We can't rely on throttled_hierarchy() to do this test, as
11967 * cfs_rq->throttle_count will not be updated yet when this function is called
11968 * from scheduler_tick()
11969 */
11970static int task_will_be_throttled(struct task_struct *p)
11971{
11972 struct sched_entity *se = &p->se;
11973 struct cfs_rq *cfs_rq;
11974
11975 if (!cfs_bandwidth_used())
11976 return 0;
11977
11978 for_each_sched_entity(se) {
11979 cfs_rq = cfs_rq_of(se);
11980 if (!cfs_rq->runtime_enabled)
11981 continue;
11982 if (cfs_rq->runtime_remaining <= 0)
11983 return 1;
11984 }
11985
11986 return 0;
11987}
11988
11989#else /* CONFIG_CFS_BANDWIDTH */
11990
11991static void walt_fixup_sched_stats_fair(struct rq *rq, struct task_struct *p,
11992 u32 new_task_load, u32 new_pred_demand)
11993{
11994 fixup_walt_sched_stats_common(rq, p, new_task_load, new_pred_demand);
11995}
11996
11997static void walt_fixup_nr_big_tasks(struct rq *rq, struct task_struct *p,
11998 int delta, bool inc)
11999{
12000 walt_adjust_nr_big_tasks(rq, delta, inc);
12001}
12002
12003static int task_will_be_throttled(struct task_struct *p)
12004{
12005 return false;
12006}
12007
12008#endif /* CONFIG_CFS_BANDWIDTH */
12009
Joonwoo Parke77a2012016-12-06 18:12:43 -080012010static inline int
12011kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
12012{
12013 unsigned long flags;
12014 int rc = 0;
12015
12016 /* Invoke active balance to force migrate currently running task */
12017 raw_spin_lock_irqsave(&rq->lock, flags);
12018 if (!rq->active_balance) {
12019 rq->active_balance = 1;
12020 rq->push_cpu = new_cpu;
12021 get_task_struct(p);
12022 rq->push_task = p;
12023 rc = 1;
12024 }
12025 raw_spin_unlock_irqrestore(&rq->lock, flags);
12026
12027 return rc;
12028}
12029
Pavankumar Kondeti4e13d112018-01-25 01:12:08 +053012030#ifdef CONFIG_SCHED_WALT
12031struct walt_rotate_work {
12032 struct work_struct w;
12033 struct task_struct *src_task;
12034 struct task_struct *dst_task;
12035 int src_cpu;
12036 int dst_cpu;
12037};
12038
12039static DEFINE_PER_CPU(struct walt_rotate_work, walt_rotate_works);
12040
12041static void walt_rotate_work_func(struct work_struct *work)
12042{
12043 struct walt_rotate_work *wr = container_of(work,
12044 struct walt_rotate_work, w);
12045
12046 migrate_swap(wr->src_task, wr->dst_task);
12047
12048 put_task_struct(wr->src_task);
12049 put_task_struct(wr->dst_task);
12050
12051 clear_reserved(wr->src_cpu);
12052 clear_reserved(wr->dst_cpu);
12053}
12054
12055void walt_rotate_work_init(void)
12056{
12057 int i;
12058
12059 for_each_possible_cpu(i) {
12060 struct walt_rotate_work *wr = &per_cpu(walt_rotate_works, i);
12061
12062 INIT_WORK(&wr->w, walt_rotate_work_func);
12063 }
12064}
12065
12066#define WALT_ROTATION_THRESHOLD_NS 16000000
12067static void walt_check_for_rotation(struct rq *src_rq)
12068{
12069 u64 wc, wait, max_wait = 0, run, max_run = 0;
12070 int deserved_cpu = nr_cpu_ids, dst_cpu = nr_cpu_ids;
12071 int i, src_cpu = cpu_of(src_rq);
12072 struct rq *dst_rq;
12073 struct walt_rotate_work *wr = NULL;
12074
12075 if (!walt_rotation_enabled)
12076 return;
12077
12078 if (got_boost_kick())
12079 return;
12080
12081 if (is_max_capacity_cpu(src_cpu))
12082 return;
12083
Pavankumar Kondetifaa04442018-06-25 16:13:39 +053012084 wc = sched_ktime_clock();
Pavankumar Kondeti4e13d112018-01-25 01:12:08 +053012085 for_each_possible_cpu(i) {
12086 struct rq *rq = cpu_rq(i);
12087
12088 if (is_max_capacity_cpu(i))
12089 break;
12090
12091 if (is_reserved(i))
12092 continue;
12093
12094 if (!rq->misfit_task || rq->curr->sched_class !=
12095 &fair_sched_class)
12096 continue;
12097
12098 wait = wc - rq->curr->last_enqueued_ts;
12099 if (wait > max_wait) {
12100 max_wait = wait;
12101 deserved_cpu = i;
12102 }
12103 }
12104
12105 if (deserved_cpu != src_cpu)
12106 return;
12107
12108 for_each_possible_cpu(i) {
12109 struct rq *rq = cpu_rq(i);
12110
12111 if (!is_max_capacity_cpu(i))
12112 continue;
12113
12114 if (is_reserved(i))
12115 continue;
12116
12117 if (rq->curr->sched_class != &fair_sched_class)
12118 continue;
12119
12120 if (rq->nr_running > 1)
12121 continue;
12122
12123 run = wc - rq->curr->last_enqueued_ts;
12124
12125 if (run < WALT_ROTATION_THRESHOLD_NS)
12126 continue;
12127
12128 if (run > max_run) {
12129 max_run = run;
12130 dst_cpu = i;
12131 }
12132 }
12133
12134 if (dst_cpu == nr_cpu_ids)
12135 return;
12136
12137 dst_rq = cpu_rq(dst_cpu);
12138
12139 double_rq_lock(src_rq, dst_rq);
12140 if (dst_rq->curr->sched_class == &fair_sched_class) {
12141 get_task_struct(src_rq->curr);
12142 get_task_struct(dst_rq->curr);
12143
12144 mark_reserved(src_cpu);
12145 mark_reserved(dst_cpu);
12146 wr = &per_cpu(walt_rotate_works, src_cpu);
12147
12148 wr->src_task = src_rq->curr;
12149 wr->dst_task = dst_rq->curr;
12150
12151 wr->src_cpu = src_cpu;
12152 wr->dst_cpu = dst_cpu;
12153 }
12154 double_rq_unlock(src_rq, dst_rq);
12155
12156 if (wr)
12157 queue_work_on(src_cpu, system_highpri_wq, &wr->w);
12158}
12159#else
12160static inline void walt_check_for_rotation(struct rq *rq)
12161{
12162}
12163#endif
12164
Puja Gupta300812c2017-07-18 17:40:54 -070012165static DEFINE_RAW_SPINLOCK(migration_lock);
Joonwoo Parke77a2012016-12-06 18:12:43 -080012166void check_for_migration(struct rq *rq, struct task_struct *p)
12167{
12168 int new_cpu;
12169 int active_balance;
12170 int cpu = task_cpu(p);
12171
12172 if (rq->misfit_task) {
12173 if (rq->curr->state != TASK_RUNNING ||
12174 rq->curr->nr_cpus_allowed == 1)
12175 return;
12176
Pavankumar Kondeti39c695e2017-07-20 16:05:51 +053012177 if (task_will_be_throttled(p))
12178 return;
12179
Puja Gupta300812c2017-07-18 17:40:54 -070012180 raw_spin_lock(&migration_lock);
Pavankumar Kondetic57119b2017-07-12 18:54:54 +053012181 rcu_read_lock();
Pavankumar Kondeti25ba1fa2018-01-31 16:36:39 +053012182 new_cpu = select_energy_cpu_brute(p, cpu, 0);
Pavankumar Kondetic57119b2017-07-12 18:54:54 +053012183 rcu_read_unlock();
Joonwoo Park6ee025c2017-05-17 10:50:19 -070012184 if (capacity_orig_of(new_cpu) > capacity_orig_of(cpu)) {
Joonwoo Parke77a2012016-12-06 18:12:43 -080012185 active_balance = kick_active_balance(rq, p, new_cpu);
12186 if (active_balance) {
12187 mark_reserved(new_cpu);
Puja Gupta300812c2017-07-18 17:40:54 -070012188 raw_spin_unlock(&migration_lock);
Joonwoo Parke77a2012016-12-06 18:12:43 -080012189 stop_one_cpu_nowait(cpu,
12190 active_load_balance_cpu_stop, rq,
12191 &rq->active_balance_work);
Puja Gupta300812c2017-07-18 17:40:54 -070012192 return;
Joonwoo Parke77a2012016-12-06 18:12:43 -080012193 }
Pavankumar Kondeti4e13d112018-01-25 01:12:08 +053012194 } else {
12195 walt_check_for_rotation(rq);
Joonwoo Parke77a2012016-12-06 18:12:43 -080012196 }
Puja Gupta300812c2017-07-18 17:40:54 -070012197 raw_spin_unlock(&migration_lock);
Joonwoo Parke77a2012016-12-06 18:12:43 -080012198 }
12199}
12200
Puja Gupta487dec62017-06-27 10:13:50 -070012201#endif /* CONFIG_SCHED_WALT */