Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001/*
2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3 *
4 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5 *
6 * Interactivity improvements by Mike Galbraith
7 * (C) 2007 Mike Galbraith <efault@gmx.de>
8 *
9 * Various enhancements by Dmitry Adamushko.
10 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11 *
12 * Group scheduling enhancements by Srivatsa Vaddagiri
13 * Copyright IBM Corporation, 2007
14 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15 *
16 * Scaled math optimizations by Thomas Gleixner
17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
Peter Zijlstra21805082007-08-25 18:41:53 +020018 *
19 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
Peter Zijlstra90eec102015-11-16 11:08:45 +010020 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020021 */
22
Christian Ehrhardt1983a922009-11-30 12:16:47 +010023#include <linux/sched.h>
Mel Gormancb251762016-02-05 09:08:36 +000024#include <linux/latencytop.h>
Sisir Koppaka3436ae12011-03-26 18:22:55 +053025#include <linux/cpumask.h>
Nicolas Pitre83a0a962014-09-04 11:32:10 -040026#include <linux/cpuidle.h>
Peter Zijlstra029632f2011-10-25 10:00:11 +020027#include <linux/slab.h>
28#include <linux/profile.h>
29#include <linux/interrupt.h>
Peter Zijlstracbee9f82012-10-25 14:16:43 +020030#include <linux/mempolicy.h>
Mel Gormane14808b2012-11-19 10:59:15 +000031#include <linux/migrate.h>
Peter Zijlstracbee9f82012-10-25 14:16:43 +020032#include <linux/task_work.h>
Peter Zijlstra029632f2011-10-25 10:00:11 +020033
Peter Zijlstra029632f2011-10-25 10:00:11 +020034#include "sched.h"
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070035#include <trace/events/sched.h>
Arjan van de Ven97455122008-01-25 21:08:34 +010036
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020037/*
Peter Zijlstra21805082007-08-25 18:41:53 +020038 * Targeted preemption latency for CPU-bound tasks:
Takuya Yoshikawa864616e2010-10-14 16:09:13 +090039 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020040 *
Peter Zijlstra21805082007-08-25 18:41:53 +020041 * NOTE: this latency value is not the same as the concept of
Ingo Molnard274a4c2007-10-15 17:00:14 +020042 * 'timeslice length' - timeslices in CFS are of variable length
43 * and are not a persistent property of the task, unlike in traditional,
44 * time-slice based scheduling concepts.
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020045 *
Ingo Molnard274a4c2007-10-15 17:00:14 +020046 * (to see the precise effective timeslice length of your workload,
47 * run vmstat and monitor the context-switches (cs) field)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020048 */
Mike Galbraith21406922010-03-11 17:17:15 +010049unsigned int sysctl_sched_latency = 6000000ULL;
50unsigned int normalized_sysctl_sched_latency = 6000000ULL;
Ingo Molnar2bd8e6d2007-10-15 17:00:02 +020051
52/*
Christian Ehrhardt1983a922009-11-30 12:16:47 +010053 * The initial scaling and re-scaling of tunables is configurable
54 * (default: SCHED_TUNABLESCALING_LOG, i.e. *(1 + ilog(ncpus)))
55 *
56 * Options are:
57 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
58 * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *(1 + ilog(ncpus))
59 * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
60 */
61enum sched_tunable_scaling sysctl_sched_tunable_scaling
62 = SCHED_TUNABLESCALING_LOG;
63
64/*
Peter Zijlstrab2be5e92007-11-09 22:39:37 +010065 * Minimal preemption granularity for CPU-bound tasks:
Takuya Yoshikawa864616e2010-10-14 16:09:13 +090066 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
Peter Zijlstrab2be5e92007-11-09 22:39:37 +010067 */
Ingo Molnar0bf377b2010-09-12 08:14:52 +020068unsigned int sysctl_sched_min_granularity = 750000ULL;
69unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
Peter Zijlstrab2be5e92007-11-09 22:39:37 +010070
71/*
72 * This value is kept at sysctl_sched_latency / sysctl_sched_min_granularity (default: 6ms / 0.75ms = 8)
73 */
Ingo Molnar0bf377b2010-09-12 08:14:52 +020074static unsigned int sched_nr_latency = 8;
Peter Zijlstrab2be5e92007-11-09 22:39:37 +010075
76/*
Mike Galbraith2bba22c2009-09-09 15:41:37 +020077 * If set, the child runs first after a fork. If set to 0 (default) the
Ingo Molnar2bd8e6d2007-10-15 17:00:02 +020078 * parent will (try to) run first.
79 */
Mike Galbraith2bba22c2009-09-09 15:41:37 +020080unsigned int sysctl_sched_child_runs_first __read_mostly;
Peter Zijlstra21805082007-08-25 18:41:53 +020081
82/*
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070083 * Controls whether, when SD_SHARE_PKG_RESOURCES is set, all
84 * tasks go to idle CPUs when woken. If this is off, note that the
85 * per-task flag PF_WAKE_UP_IDLE can still cause a task to go to an
86 * idle CPU upon being woken.
87 */
88unsigned int __read_mostly sysctl_sched_wake_to_idle;
89
90/*
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020091 * SCHED_OTHER wake-up granularity.
Mike Galbraith172e0822009-09-09 15:41:37 +020092 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020093 *
94 * This option delays the preemption effects of decoupled workloads
95 * and reduces their over-scheduling. Synchronous workloads will still
96 * have immediate wakeup/sleep latencies.
97 */
Mike Galbraith172e0822009-09-09 15:41:37 +020098unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
Christian Ehrhardt0bcdcf22009-11-30 12:16:46 +010099unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200100
Ingo Molnarda84d962007-10-15 17:00:18 +0200101const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
102
Paul Turnera7a4f8a2010-11-15 15:47:06 -0800103/*
104 * The exponential sliding window over which load is averaged for shares
105 * distribution.
106 * (default: 10msec)
107 */
108unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
109
Paul Turnerec12cb72011-07-21 09:43:30 -0700110#ifdef CONFIG_CFS_BANDWIDTH
111/*
112 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
113 * each time a cfs_rq requests quota.
114 *
115 * Note: if the slice exceeds the remaining runtime (either because it has
116 * been consumed or because the quota is specified to be smaller than the
117 * slice), only the remaining available time is issued.
118 *
119 * default: 5 msec, units: microseconds
120 */
121unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
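/*
 * For example, with a 20ms quota per 100ms period and the default 5ms
 * slice, a busy cfs_rq drains the global pool in at most four requests
 * per period; a smaller slice spreads the quota more evenly across CPUs
 * at the cost of more frequent refills.
 */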
122#endif
123
Morten Rasmussen32731632016-07-25 14:34:26 +0100124/*
125 * The margin used when comparing utilization with CPU capacity:
126 * util * margin < capacity * 1024
127 */
128unsigned int capacity_margin = 1280; /* ~20% */
129
Paul Gortmaker85276322013-04-19 15:10:50 -0400130static inline void update_load_add(struct load_weight *lw, unsigned long inc)
131{
132 lw->weight += inc;
133 lw->inv_weight = 0;
134}
135
136static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
137{
138 lw->weight -= dec;
139 lw->inv_weight = 0;
140}
141
142static inline void update_load_set(struct load_weight *lw, unsigned long w)
143{
144 lw->weight = w;
145 lw->inv_weight = 0;
146}
147
Peter Zijlstra029632f2011-10-25 10:00:11 +0200148/*
149 * Increase the granularity value when there are more CPUs,
150 * because with more CPUs the 'effective latency' as visible
151 * to users decreases. But the relationship is not linear,
152 * so pick a second-best guess by going with the log2 of the
153 * number of CPUs.
154 *
155 * This idea comes from the SD scheduler of Con Kolivas:
156 */
Nicholas Mc Guire58ac93e2015-05-15 21:05:42 +0200157static unsigned int get_update_sysctl_factor(void)
Peter Zijlstra029632f2011-10-25 10:00:11 +0200158{
Nicholas Mc Guire58ac93e2015-05-15 21:05:42 +0200159 unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
Peter Zijlstra029632f2011-10-25 10:00:11 +0200160 unsigned int factor;
161
162 switch (sysctl_sched_tunable_scaling) {
163 case SCHED_TUNABLESCALING_NONE:
164 factor = 1;
165 break;
166 case SCHED_TUNABLESCALING_LINEAR:
167 factor = cpus;
168 break;
169 case SCHED_TUNABLESCALING_LOG:
170 default:
171 factor = 1 + ilog2(cpus);
172 break;
173 }
174
175 return factor;
176}
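/*
 * For example, on a box with 16 online CPUs (clamped to 8 above):
 * SCHED_TUNABLESCALING_NONE yields factor 1, SCHED_TUNABLESCALING_LOG
 * yields 1 + ilog2(8) = 4 and SCHED_TUNABLESCALING_LINEAR yields 8,
 * scaling the default 6ms sched_latency to 6ms, 24ms or 48ms respectively.
 */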
177
178static void update_sysctl(void)
179{
180 unsigned int factor = get_update_sysctl_factor();
181
182#define SET_SYSCTL(name) \
183 (sysctl_##name = (factor) * normalized_sysctl_##name)
184 SET_SYSCTL(sched_min_granularity);
185 SET_SYSCTL(sched_latency);
186 SET_SYSCTL(sched_wakeup_granularity);
187#undef SET_SYSCTL
188}
189
190void sched_init_granularity(void)
191{
192 update_sysctl();
193}
194
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100195#define WMULT_CONST (~0U)
Peter Zijlstra029632f2011-10-25 10:00:11 +0200196#define WMULT_SHIFT 32
197
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100198static void __update_inv_weight(struct load_weight *lw)
Peter Zijlstra029632f2011-10-25 10:00:11 +0200199{
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100200 unsigned long w;
Peter Zijlstra029632f2011-10-25 10:00:11 +0200201
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100202 if (likely(lw->inv_weight))
203 return;
204
205 w = scale_load_down(lw->weight);
206
207 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
208 lw->inv_weight = 1;
209 else if (unlikely(!w))
210 lw->inv_weight = WMULT_CONST;
Peter Zijlstra029632f2011-10-25 10:00:11 +0200211 else
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100212 lw->inv_weight = WMULT_CONST / w;
213}
Peter Zijlstra029632f2011-10-25 10:00:11 +0200214
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100215/*
216 * delta_exec * weight / lw.weight
217 * OR
218 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
219 *
Yuyang Du1c3de5e2016-03-30 07:07:51 +0800220 * Either weight := NICE_0_LOAD and lw is an element of sched_prio_to_wmult[], in which case
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100221 * we're guaranteed shift stays positive because inv_weight is guaranteed to
222 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
223 *
224 * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
225 * weight/lw.weight <= 1, and therefore our shift will also be positive.
226 */
227static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
228{
229 u64 fact = scale_load_down(weight);
230 int shift = WMULT_SHIFT;
Peter Zijlstra029632f2011-10-25 10:00:11 +0200231
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100232 __update_inv_weight(lw);
233
234 if (unlikely(fact >> 32)) {
235 while (fact >> 32) {
236 fact >>= 1;
237 shift--;
238 }
Peter Zijlstra029632f2011-10-25 10:00:11 +0200239 }
240
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100241 /* hint to use a 32x32->64 mul */
242 fact = (u64)(u32)fact * lw->inv_weight;
Peter Zijlstra029632f2011-10-25 10:00:11 +0200243
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100244 while (fact >> 32) {
245 fact >>= 1;
246 shift--;
247 }
248
249 return mul_u64_u32_shr(delta_exec, fact, shift);
Peter Zijlstra029632f2011-10-25 10:00:11 +0200250}
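/*
 * Worked example of the fixed-point math above, assuming NICE_0_LOAD
 * scales down to 1024: with delta_exec = 4000000 (4ms), weight = 1024 and
 * lw->weight = 2048 (two nice-0 entities),
 *
 *   inv_weight = WMULT_CONST / 2048       ~= 2097151
 *   fact       = 1024 * 2097151           ~= 2^31
 *   result     = (4000000 * fact) >> 32   ~= 2000000 (2ms)
 *
 * i.e. 4ms * 1024 / 2048 = 2ms, computed without a 64-bit division.
 */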
251
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -0700252#ifdef CONFIG_SMP
253static int active_load_balance_cpu_stop(void *data);
254#endif
Peter Zijlstra029632f2011-10-25 10:00:11 +0200255
256const struct sched_class fair_sched_class;
Peter Zijlstraa4c2f002008-10-17 19:27:03 +0200257
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200258/**************************************************************
259 * CFS operations on generic schedulable entities:
260 */
261
262#ifdef CONFIG_FAIR_GROUP_SCHED
263
264/* cpu runqueue to which this cfs_rq is attached */
265static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
266{
267 return cfs_rq->rq;
268}
269
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200270/* An entity is a task if it doesn't "own" a runqueue */
271#define entity_is_task(se) (!se->my_q)
272
Peter Zijlstra8f488942009-07-24 12:25:30 +0200273static inline struct task_struct *task_of(struct sched_entity *se)
274{
Peter Zijlstra9148a3a2016-09-20 22:34:51 +0200275 SCHED_WARN_ON(!entity_is_task(se));
Peter Zijlstra8f488942009-07-24 12:25:30 +0200276 return container_of(se, struct task_struct, se);
277}
278
Peter Zijlstrab7581492008-04-19 19:45:00 +0200279/* Walk up scheduling entities hierarchy */
280#define for_each_sched_entity(se) \
281 for (; se; se = se->parent)
282
283static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
284{
285 return p->se.cfs_rq;
286}
287
288/* runqueue on which this entity is (to be) queued */
289static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
290{
291 return se->cfs_rq;
292}
293
294/* runqueue "owned" by this group */
295static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
296{
297 return grp->my_q;
298}
299
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -0800300static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
301{
302 if (!cfs_rq->on_list) {
Paul Turner67e86252010-11-15 15:47:05 -0800303 /*
304 * Ensure we either appear before our parent (if already
305 * enqueued) or force our parent to appear after us when it is
306 * enqueued. The fact that we always enqueue bottom-up
307 * reduces this to two cases.
308 */
309 if (cfs_rq->tg->parent &&
310 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
311 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -0800312 &rq_of(cfs_rq)->leaf_cfs_rq_list);
Paul Turner67e86252010-11-15 15:47:05 -0800313 } else {
314 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
315 &rq_of(cfs_rq)->leaf_cfs_rq_list);
316 }
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -0800317
318 cfs_rq->on_list = 1;
319 }
320}
321
322static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
323{
324 if (cfs_rq->on_list) {
325 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
326 cfs_rq->on_list = 0;
327 }
328}
329
Peter Zijlstrab7581492008-04-19 19:45:00 +0200330/* Iterate through all leaf cfs_rqs on a runqueue */
331#define for_each_leaf_cfs_rq(rq, cfs_rq) \
332 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
333
334/* Do the two (enqueued) entities belong to the same group? */
Peter Zijlstrafed14d42012-02-11 06:05:00 +0100335static inline struct cfs_rq *
Peter Zijlstrab7581492008-04-19 19:45:00 +0200336is_same_group(struct sched_entity *se, struct sched_entity *pse)
337{
338 if (se->cfs_rq == pse->cfs_rq)
Peter Zijlstrafed14d42012-02-11 06:05:00 +0100339 return se->cfs_rq;
Peter Zijlstrab7581492008-04-19 19:45:00 +0200340
Peter Zijlstrafed14d42012-02-11 06:05:00 +0100341 return NULL;
Peter Zijlstrab7581492008-04-19 19:45:00 +0200342}
343
344static inline struct sched_entity *parent_entity(struct sched_entity *se)
345{
346 return se->parent;
347}
348
Peter Zijlstra464b7522008-10-24 11:06:15 +0200349static void
350find_matching_se(struct sched_entity **se, struct sched_entity **pse)
351{
352 int se_depth, pse_depth;
353
354 /*
355 * A preemption test can be made between sibling entities that are in the
356 * same cfs_rq, i.e. that have a common parent. Walk up the hierarchy of
357 * both tasks until we find ancestors that are siblings of a common
358 * parent.
359 */
360
361 /* First walk up until both entities are at same depth */
Peter Zijlstrafed14d42012-02-11 06:05:00 +0100362 se_depth = (*se)->depth;
363 pse_depth = (*pse)->depth;
Peter Zijlstra464b7522008-10-24 11:06:15 +0200364
365 while (se_depth > pse_depth) {
366 se_depth--;
367 *se = parent_entity(*se);
368 }
369
370 while (pse_depth > se_depth) {
371 pse_depth--;
372 *pse = parent_entity(*pse);
373 }
374
375 while (!is_same_group(*se, *pse)) {
376 *se = parent_entity(*se);
377 *pse = parent_entity(*pse);
378 }
379}
380
Peter Zijlstra8f488942009-07-24 12:25:30 +0200381#else /* !CONFIG_FAIR_GROUP_SCHED */
382
383static inline struct task_struct *task_of(struct sched_entity *se)
384{
385 return container_of(se, struct task_struct, se);
386}
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200387
388static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
389{
390 return container_of(cfs_rq, struct rq, cfs);
391}
392
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200393#define entity_is_task(se) 1
394
Peter Zijlstrab7581492008-04-19 19:45:00 +0200395#define for_each_sched_entity(se) \
396 for (; se; se = NULL)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200397
Peter Zijlstrab7581492008-04-19 19:45:00 +0200398static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200399{
Peter Zijlstrab7581492008-04-19 19:45:00 +0200400 return &task_rq(p)->cfs;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200401}
402
Peter Zijlstrab7581492008-04-19 19:45:00 +0200403static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
404{
405 struct task_struct *p = task_of(se);
406 struct rq *rq = task_rq(p);
407
408 return &rq->cfs;
409}
410
411/* runqueue "owned" by this group */
412static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
413{
414 return NULL;
415}
416
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -0800417static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
418{
419}
420
421static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
422{
423}
424
Peter Zijlstrab7581492008-04-19 19:45:00 +0200425#define for_each_leaf_cfs_rq(rq, cfs_rq) \
426 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
427
Peter Zijlstrab7581492008-04-19 19:45:00 +0200428static inline struct sched_entity *parent_entity(struct sched_entity *se)
429{
430 return NULL;
431}
432
Peter Zijlstra464b7522008-10-24 11:06:15 +0200433static inline void
434find_matching_se(struct sched_entity **se, struct sched_entity **pse)
435{
436}
437
Peter Zijlstrab7581492008-04-19 19:45:00 +0200438#endif /* CONFIG_FAIR_GROUP_SCHED */
439
Peter Zijlstra6c16a6d2012-03-21 13:07:16 -0700440static __always_inline
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100441void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200442
443/**************************************************************
444 * Scheduling class tree data structure manipulation methods:
445 */
446
Andrei Epure1bf08232013-03-12 21:12:24 +0200447static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
Peter Zijlstra02e04312007-10-15 17:00:07 +0200448{
Andrei Epure1bf08232013-03-12 21:12:24 +0200449 s64 delta = (s64)(vruntime - max_vruntime);
Peter Zijlstra368059a2007-10-15 17:00:11 +0200450 if (delta > 0)
Andrei Epure1bf08232013-03-12 21:12:24 +0200451 max_vruntime = vruntime;
Peter Zijlstra02e04312007-10-15 17:00:07 +0200452
Andrei Epure1bf08232013-03-12 21:12:24 +0200453 return max_vruntime;
Peter Zijlstra02e04312007-10-15 17:00:07 +0200454}
455
Ingo Molnar0702e3e2007-10-15 17:00:14 +0200456static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
Peter Zijlstrab0ffd242007-10-15 17:00:12 +0200457{
458 s64 delta = (s64)(vruntime - min_vruntime);
459 if (delta < 0)
460 min_vruntime = vruntime;
461
462 return min_vruntime;
463}
464
Fabio Checconi54fdc582009-07-16 12:32:27 +0200465static inline int entity_before(struct sched_entity *a,
466 struct sched_entity *b)
467{
468 return (s64)(a->vruntime - b->vruntime) < 0;
469}
470
Peter Zijlstra1af5f732008-10-24 11:06:13 +0200471static void update_min_vruntime(struct cfs_rq *cfs_rq)
472{
Peter Zijlstrab60205c2016-09-20 21:58:12 +0200473 struct sched_entity *curr = cfs_rq->curr;
474
Peter Zijlstra1af5f732008-10-24 11:06:13 +0200475 u64 vruntime = cfs_rq->min_vruntime;
476
Peter Zijlstrab60205c2016-09-20 21:58:12 +0200477 if (curr) {
478 if (curr->on_rq)
479 vruntime = curr->vruntime;
480 else
481 curr = NULL;
482 }
Peter Zijlstra1af5f732008-10-24 11:06:13 +0200483
484 if (cfs_rq->rb_leftmost) {
485 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
486 struct sched_entity,
487 run_node);
488
Peter Zijlstrab60205c2016-09-20 21:58:12 +0200489 if (!curr)
Peter Zijlstra1af5f732008-10-24 11:06:13 +0200490 vruntime = se->vruntime;
491 else
492 vruntime = min_vruntime(vruntime, se->vruntime);
493 }
494
Andrei Epure1bf08232013-03-12 21:12:24 +0200495 /* ensure we never gain time by being placed backwards. */
Peter Zijlstra1af5f732008-10-24 11:06:13 +0200496 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
Peter Zijlstra3fe16982011-04-05 17:23:48 +0200497#ifndef CONFIG_64BIT
498 smp_wmb();
499 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
500#endif
Peter Zijlstra1af5f732008-10-24 11:06:13 +0200501}
502
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200503/*
504 * Enqueue an entity into the rb-tree:
505 */
Ingo Molnar0702e3e2007-10-15 17:00:14 +0200506static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200507{
508 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
509 struct rb_node *parent = NULL;
510 struct sched_entity *entry;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200511 int leftmost = 1;
512
513 /*
514 * Find the right place in the rbtree:
515 */
516 while (*link) {
517 parent = *link;
518 entry = rb_entry(parent, struct sched_entity, run_node);
519 /*
520 * We don't care about collisions. Nodes with
521 * the same key stay together.
522 */
Stephan Baerwolf2bd2d6f2011-07-20 14:46:59 +0200523 if (entity_before(se, entry)) {
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200524 link = &parent->rb_left;
525 } else {
526 link = &parent->rb_right;
527 leftmost = 0;
528 }
529 }
530
531 /*
532 * Maintain a cache of leftmost tree entries (it is frequently
533 * used):
534 */
Peter Zijlstra1af5f732008-10-24 11:06:13 +0200535 if (leftmost)
Ingo Molnar57cb4992007-10-15 17:00:11 +0200536 cfs_rq->rb_leftmost = &se->run_node;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200537
538 rb_link_node(&se->run_node, parent, link);
539 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200540}
541
Ingo Molnar0702e3e2007-10-15 17:00:14 +0200542static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200543{
Peter Zijlstra3fe69742008-03-14 20:55:51 +0100544 if (cfs_rq->rb_leftmost == &se->run_node) {
545 struct rb_node *next_node;
Peter Zijlstra3fe69742008-03-14 20:55:51 +0100546
547 next_node = rb_next(&se->run_node);
548 cfs_rq->rb_leftmost = next_node;
Peter Zijlstra3fe69742008-03-14 20:55:51 +0100549 }
Ingo Molnare9acbff2007-10-15 17:00:04 +0200550
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200551 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200552}
553
Peter Zijlstra029632f2011-10-25 10:00:11 +0200554struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200555{
Peter Zijlstraf4b67552008-11-04 21:25:07 +0100556 struct rb_node *left = cfs_rq->rb_leftmost;
557
558 if (!left)
559 return NULL;
560
561 return rb_entry(left, struct sched_entity, run_node);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200562}
563
Rik van Rielac53db52011-02-01 09:51:03 -0500564static struct sched_entity *__pick_next_entity(struct sched_entity *se)
565{
566 struct rb_node *next = rb_next(&se->run_node);
567
568 if (!next)
569 return NULL;
570
571 return rb_entry(next, struct sched_entity, run_node);
572}
573
574#ifdef CONFIG_SCHED_DEBUG
Peter Zijlstra029632f2011-10-25 10:00:11 +0200575struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
Peter Zijlstraaeb73b02007-10-15 17:00:05 +0200576{
Ingo Molnar7eee3e62008-02-22 10:32:21 +0100577 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
Peter Zijlstraaeb73b02007-10-15 17:00:05 +0200578
Balbir Singh70eee742008-02-22 13:25:53 +0530579 if (!last)
580 return NULL;
Ingo Molnar7eee3e62008-02-22 10:32:21 +0100581
582 return rb_entry(last, struct sched_entity, run_node);
Peter Zijlstraaeb73b02007-10-15 17:00:05 +0200583}
584
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200585/**************************************************************
586 * Scheduling class statistics methods:
587 */
588
Christian Ehrhardtacb4a842009-11-30 12:16:48 +0100589int sched_proc_update_handler(struct ctl_table *table, int write,
Alexey Dobriyan8d65af72009-09-23 15:57:19 -0700590 void __user *buffer, size_t *lenp,
Peter Zijlstrab2be5e92007-11-09 22:39:37 +0100591 loff_t *ppos)
592{
Alexey Dobriyan8d65af72009-09-23 15:57:19 -0700593 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
Nicholas Mc Guire58ac93e2015-05-15 21:05:42 +0200594 unsigned int factor = get_update_sysctl_factor();
Peter Zijlstrab2be5e92007-11-09 22:39:37 +0100595
596 if (ret || !write)
597 return ret;
598
599 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
600 sysctl_sched_min_granularity);
601
Christian Ehrhardtacb4a842009-11-30 12:16:48 +0100602#define WRT_SYSCTL(name) \
603 (normalized_sysctl_##name = sysctl_##name / (factor))
604 WRT_SYSCTL(sched_min_granularity);
605 WRT_SYSCTL(sched_latency);
606 WRT_SYSCTL(sched_wakeup_granularity);
Christian Ehrhardtacb4a842009-11-30 12:16:48 +0100607#undef WRT_SYSCTL
608
Peter Zijlstrab2be5e92007-11-09 22:39:37 +0100609 return 0;
610}
611#endif
Ingo Molnar647e7ca2007-10-15 17:00:13 +0200612
613/*
Peter Zijlstraf9c0b092008-10-17 19:27:04 +0200614 * delta /= w
Peter Zijlstraa7be37a2008-06-27 13:41:11 +0200615 */
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100616static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
Peter Zijlstraa7be37a2008-06-27 13:41:11 +0200617{
Peter Zijlstraf9c0b092008-10-17 19:27:04 +0200618 if (unlikely(se->load.weight != NICE_0_LOAD))
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100619 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
Peter Zijlstraa7be37a2008-06-27 13:41:11 +0200620
621 return delta;
622}
623
624/*
Ingo Molnar647e7ca2007-10-15 17:00:13 +0200625 * The idea is to set a period in which each task runs once.
626 *
Borislav Petkov532b1852012-08-08 16:16:04 +0200627 * When there are too many tasks (more than sched_nr_latency) we have to stretch
Ingo Molnar647e7ca2007-10-15 17:00:13 +0200628 * this period because otherwise the slices get too small.
629 *
630 * p = (nr <= nl) ? l : l*nr/nl
631 */
Peter Zijlstra4d78e7b2007-10-15 17:00:04 +0200632static u64 __sched_period(unsigned long nr_running)
633{
Boqun Feng8e2b0bf2015-07-02 22:25:52 +0800634 if (unlikely(nr_running > sched_nr_latency))
635 return nr_running * sysctl_sched_min_granularity;
636 else
637 return sysctl_sched_latency;
Peter Zijlstra4d78e7b2007-10-15 17:00:04 +0200638}
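/*
 * For example, with the default (unscaled) tunables, sched_latency = 6ms,
 * sched_min_granularity = 0.75ms and sched_nr_latency = 8: 4 runnable
 * tasks share a 6ms period, while 16 runnable tasks stretch the period
 * to 16 * 0.75ms = 12ms.
 */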
639
Ingo Molnar647e7ca2007-10-15 17:00:13 +0200640/*
641 * We calculate the wall-time slice from the period by taking a part
642 * proportional to the weight.
643 *
Peter Zijlstraf9c0b092008-10-17 19:27:04 +0200644 * s = p*P[w/rw]
Ingo Molnar647e7ca2007-10-15 17:00:13 +0200645 */
Peter Zijlstra6d0f0ebd2007-10-15 17:00:05 +0200646static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
Peter Zijlstra21805082007-08-25 18:41:53 +0200647{
Mike Galbraith0a582442009-01-02 12:16:42 +0100648 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
Peter Zijlstraf9c0b092008-10-17 19:27:04 +0200649
Mike Galbraith0a582442009-01-02 12:16:42 +0100650 for_each_sched_entity(se) {
Lin Ming6272d682009-01-15 17:17:15 +0100651 struct load_weight *load;
Christian Engelmayer3104bf02009-06-16 10:35:12 +0200652 struct load_weight lw;
Lin Ming6272d682009-01-15 17:17:15 +0100653
654 cfs_rq = cfs_rq_of(se);
655 load = &cfs_rq->load;
Peter Zijlstraf9c0b092008-10-17 19:27:04 +0200656
Mike Galbraith0a582442009-01-02 12:16:42 +0100657 if (unlikely(!se->on_rq)) {
Christian Engelmayer3104bf02009-06-16 10:35:12 +0200658 lw = cfs_rq->load;
Mike Galbraith0a582442009-01-02 12:16:42 +0100659
660 update_load_add(&lw, se->load.weight);
661 load = &lw;
662 }
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100663 slice = __calc_delta(slice, se->load.weight, load);
Mike Galbraith0a582442009-01-02 12:16:42 +0100664 }
665 return slice;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200666}
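/*
 * For example, on a single cfs_rq with a 6ms period: a nice-0 entity
 * (weight 1024) queued next to an entity of weight 2048 gets a wall-time
 * slice of 6ms * 1024 / 3072 = 2ms, while the heavier entity gets 4ms.
 */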
667
Ingo Molnar647e7ca2007-10-15 17:00:13 +0200668/*
Andrei Epure660cc002013-03-11 12:03:20 +0200669 * We calculate the vruntime slice of a to-be-inserted task.
Ingo Molnar647e7ca2007-10-15 17:00:13 +0200670 *
Peter Zijlstraf9c0b092008-10-17 19:27:04 +0200671 * vs = s/w
Ingo Molnar647e7ca2007-10-15 17:00:13 +0200672 */
Peter Zijlstraf9c0b092008-10-17 19:27:04 +0200673static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnar647e7ca2007-10-15 17:00:13 +0200674{
Peter Zijlstraf9c0b092008-10-17 19:27:04 +0200675 return calc_delta_fair(sched_slice(cfs_rq, se), se);
Peter Zijlstraa7be37a2008-06-27 13:41:11 +0200676}
677
Alex Shia75cdaa2013-06-20 10:18:47 +0800678#ifdef CONFIG_SMP
Morten Rasmussen772bd008c2016-06-22 18:03:13 +0100679static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
Mel Gormanfb13c7e2013-10-07 11:29:17 +0100680static unsigned long task_h_load(struct task_struct *p);
681
Yuyang Du9d89c252015-07-15 08:04:37 +0800682/*
683 * We choose a half-life close to 1 scheduling period.
Leo Yan84fb5a12015-09-15 18:57:37 +0800684 * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
685 * dependent on this value.
Yuyang Du9d89c252015-07-15 08:04:37 +0800686 */
687#define LOAD_AVG_PERIOD 32
688#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
Leo Yan84fb5a12015-09-15 18:57:37 +0800689#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
Alex Shia75cdaa2013-06-20 10:18:47 +0800690
Yuyang Du540247f2015-07-15 08:04:39 +0800691/* Give a new sched_entity initial runnable values that make its load look heavy during its infancy */
692void init_entity_runnable_average(struct sched_entity *se)
Alex Shia75cdaa2013-06-20 10:18:47 +0800693{
Yuyang Du540247f2015-07-15 08:04:39 +0800694 struct sched_avg *sa = &se->avg;
Alex Shia75cdaa2013-06-20 10:18:47 +0800695
Yuyang Du9d89c252015-07-15 08:04:37 +0800696 sa->last_update_time = 0;
697 /*
698 * sched_avg's period_contrib should be strictly less than 1024, so
699 * we give it 1023 to make sure it is almost a full period (1024us), and
700 * will definitely be updated (after enqueue).
701 */
702 sa->period_contrib = 1023;
Vincent Guittotb5a9b342016-10-19 14:45:23 +0200703 /*
704 * Tasks are initialized with full load to be seen as heavy tasks until
705 * they get a chance to stabilize to their real load level.
706 * Group entities are initialized with zero load to reflect the fact that
707 * nothing has been attached to the task group yet.
708 */
709 if (entity_is_task(se))
710 sa->load_avg = scale_load_down(se->load.weight);
Yuyang Du9d89c252015-07-15 08:04:37 +0800711 sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
Yuyang Du2b8c41d2016-03-30 04:30:56 +0800712 /*
713 * At this point, util_avg won't be used in select_task_rq_fair anyway
714 */
715 sa->util_avg = 0;
716 sa->util_sum = 0;
Yuyang Du9d89c252015-07-15 08:04:37 +0800717 /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
Alex Shia75cdaa2013-06-20 10:18:47 +0800718}
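/*
 * For example, a freshly forked nice-0 task starts with load_avg equal to
 * its scaled-down weight (1024) and load_sum = 1024 * LOAD_AVG_MAX, i.e.
 * it is treated as if it had been runnable for the whole averaging
 * history; util_avg stays 0 here and is set up later by
 * post_init_entity_util_avg().
 */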
Yuyang Du7ea241a2015-07-15 08:04:42 +0800719
Peter Zijlstra7dc603c2016-06-16 13:29:28 +0200720static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
721static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
Peter Zijlstra3d30544f2016-06-21 14:27:50 +0200722static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force);
Peter Zijlstra7dc603c2016-06-16 13:29:28 +0200723static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
724
Yuyang Du2b8c41d2016-03-30 04:30:56 +0800725/*
726 * With new tasks being created, their initial util_avgs are extrapolated
727 * based on the cfs_rq's current util_avg:
728 *
729 * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
730 *
731 * However, in many cases, the above util_avg does not give a desired
732 * value. Moreover, the sum of the util_avgs may be divergent, such
733 * as when the series is a harmonic series.
734 *
735 * To solve this problem, we also cap the util_avg of successive tasks to
736 * only 1/2 of the remaining utilization budget:
737 *
738 * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
739 *
740 * where n denotes the nth task.
741 *
742 * For example, the simplest series from the beginning would look like:
743 *
744 * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
745 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
746 *
747 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
748 * if util_avg > util_avg_cap.
749 */
750void post_init_entity_util_avg(struct sched_entity *se)
751{
752 struct cfs_rq *cfs_rq = cfs_rq_of(se);
753 struct sched_avg *sa = &se->avg;
Yuyang Du172895e2016-04-05 12:12:27 +0800754 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
Peter Zijlstra7dc603c2016-06-16 13:29:28 +0200755 u64 now = cfs_rq_clock_task(cfs_rq);
Yuyang Du2b8c41d2016-03-30 04:30:56 +0800756
757 if (cap > 0) {
758 if (cfs_rq->avg.util_avg != 0) {
759 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
760 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
761
762 if (sa->util_avg > cap)
763 sa->util_avg = cap;
764 } else {
765 sa->util_avg = cap;
766 }
767 sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
768 }
Peter Zijlstra7dc603c2016-06-16 13:29:28 +0200769
770 if (entity_is_task(se)) {
771 struct task_struct *p = task_of(se);
772 if (p->sched_class != &fair_sched_class) {
773 /*
774 * For !fair tasks do:
775 *
776 update_cfs_rq_load_avg(now, cfs_rq, false);
777 attach_entity_load_avg(cfs_rq, se);
778 switched_from_fair(rq, p);
779 *
780 * such that the next switched_to_fair() has the
781 * expected state.
782 */
783 se->avg.last_update_time = now;
784 return;
785 }
786 }
787
Peter Zijlstra7c3edd22016-07-13 10:56:25 +0200788 update_cfs_rq_load_avg(now, cfs_rq, false);
Peter Zijlstra7dc603c2016-06-16 13:29:28 +0200789 attach_entity_load_avg(cfs_rq, se);
Peter Zijlstra7c3edd22016-07-13 10:56:25 +0200790 update_tg_load_avg(cfs_rq, false);
Yuyang Du2b8c41d2016-03-30 04:30:56 +0800791}
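/*
 * For example, if cfs_rq->avg.util_avg is already 512, the cap above is
 * (1024 - 512) / 2 = 256, so however large the extrapolated value is, the
 * new task starts with util_avg at most 256 and util_sum = 256 *
 * LOAD_AVG_MAX; on an idle cfs_rq (util_avg == 0) it gets the full cap of
 * 512.
 */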
792
Peter Zijlstra7dc603c2016-06-16 13:29:28 +0200793#else /* !CONFIG_SMP */
Yuyang Du540247f2015-07-15 08:04:39 +0800794void init_entity_runnable_average(struct sched_entity *se)
Alex Shia75cdaa2013-06-20 10:18:47 +0800795{
796}
Yuyang Du2b8c41d2016-03-30 04:30:56 +0800797void post_init_entity_util_avg(struct sched_entity *se)
798{
799}
Peter Zijlstra3d30544f2016-06-21 14:27:50 +0200800static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
801{
802}
Peter Zijlstra7dc603c2016-06-16 13:29:28 +0200803#endif /* CONFIG_SMP */
Alex Shia75cdaa2013-06-20 10:18:47 +0800804
Peter Zijlstraa7be37a2008-06-27 13:41:11 +0200805/*
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100806 * Update the current task's runtime statistics.
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200807 */
Ingo Molnarb7cc0892007-08-09 11:16:47 +0200808static void update_curr(struct cfs_rq *cfs_rq)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200809{
Ingo Molnar429d43b2007-10-15 17:00:03 +0200810 struct sched_entity *curr = cfs_rq->curr;
Frederic Weisbecker78becc22013-04-12 01:51:02 +0200811 u64 now = rq_clock_task(rq_of(cfs_rq));
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100812 u64 delta_exec;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200813
814 if (unlikely(!curr))
815 return;
816
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100817 delta_exec = now - curr->exec_start;
818 if (unlikely((s64)delta_exec <= 0))
Peter Zijlstra34f28ec2008-12-16 08:45:31 +0100819 return;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200820
Ingo Molnar8ebc91d2007-10-15 17:00:03 +0200821 curr->exec_start = now;
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +0100822
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100823 schedstat_set(curr->statistics.exec_max,
824 max(delta_exec, curr->statistics.exec_max));
825
826 curr->sum_exec_runtime += delta_exec;
Josh Poimboeufae928822016-06-17 12:43:24 -0500827 schedstat_add(cfs_rq->exec_clock, delta_exec);
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100828
829 curr->vruntime += calc_delta_fair(delta_exec, curr);
830 update_min_vruntime(cfs_rq);
831
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +0100832 if (entity_is_task(curr)) {
833 struct task_struct *curtask = task_of(curr);
834
Ingo Molnarf977bb42009-09-13 18:15:54 +0200835 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +0100836 cpuacct_charge(curtask, delta_exec);
Frank Mayharf06febc2008-09-12 09:54:39 -0700837 account_group_exec_runtime(curtask, delta_exec);
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +0100838 }
Paul Turnerec12cb72011-07-21 09:43:30 -0700839
840 account_cfs_rq_runtime(cfs_rq, delta_exec);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200841}
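/*
 * For example, a nice-0 task that runs for 3ms advances its vruntime by
 * 3ms, while an entity with twice the nice-0 weight advances its vruntime
 * by only 1.5ms for the same runtime, since calc_delta_fair() scales
 * delta_exec by NICE_0_LOAD / se->load.weight.
 */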
842
Stanislaw Gruszka6e998912014-11-12 16:58:44 +0100843static void update_curr_fair(struct rq *rq)
844{
845 update_curr(cfs_rq_of(&rq->curr->se));
846}
847
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200848static inline void
Ingo Molnar5870db52007-08-09 11:16:47 +0200849update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200850{
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500851 u64 wait_start, prev_wait_start;
852
853 if (!schedstat_enabled())
854 return;
855
856 wait_start = rq_clock(rq_of(cfs_rq));
857 prev_wait_start = schedstat_val(se->statistics.wait_start);
Joonwoo Park3ea94de2015-11-12 19:38:54 -0800858
859 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500860 likely(wait_start > prev_wait_start))
861 wait_start -= prev_wait_start;
Joonwoo Park3ea94de2015-11-12 19:38:54 -0800862
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500863 schedstat_set(se->statistics.wait_start, wait_start);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200864}
865
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500866static inline void
Joonwoo Park3ea94de2015-11-12 19:38:54 -0800867update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
868{
869 struct task_struct *p;
Mel Gormancb251762016-02-05 09:08:36 +0000870 u64 delta;
871
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500872 if (!schedstat_enabled())
873 return;
874
875 delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
Joonwoo Park3ea94de2015-11-12 19:38:54 -0800876
877 if (entity_is_task(se)) {
878 p = task_of(se);
879 if (task_on_rq_migrating(p)) {
880 /*
881 * Preserve migrating task's wait time so wait_start
882 * time stamp can be adjusted to accumulate wait time
883 * prior to migration.
884 */
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500885 schedstat_set(se->statistics.wait_start, delta);
Joonwoo Park3ea94de2015-11-12 19:38:54 -0800886 return;
887 }
888 trace_sched_stat_wait(p, delta);
889 }
890
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500891 schedstat_set(se->statistics.wait_max,
892 max(schedstat_val(se->statistics.wait_max), delta));
893 schedstat_inc(se->statistics.wait_count);
894 schedstat_add(se->statistics.wait_sum, delta);
895 schedstat_set(se->statistics.wait_start, 0);
Joonwoo Park3ea94de2015-11-12 19:38:54 -0800896}
Joonwoo Park3ea94de2015-11-12 19:38:54 -0800897
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500898static inline void
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -0500899update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
900{
901 struct task_struct *tsk = NULL;
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500902 u64 sleep_start, block_start;
903
904 if (!schedstat_enabled())
905 return;
906
907 sleep_start = schedstat_val(se->statistics.sleep_start);
908 block_start = schedstat_val(se->statistics.block_start);
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -0500909
910 if (entity_is_task(se))
911 tsk = task_of(se);
912
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500913 if (sleep_start) {
914 u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -0500915
916 if ((s64)delta < 0)
917 delta = 0;
918
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500919 if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
920 schedstat_set(se->statistics.sleep_max, delta);
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -0500921
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500922 schedstat_set(se->statistics.sleep_start, 0);
923 schedstat_add(se->statistics.sum_sleep_runtime, delta);
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -0500924
925 if (tsk) {
926 account_scheduler_latency(tsk, delta >> 10, 1);
927 trace_sched_stat_sleep(tsk, delta);
928 }
929 }
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500930 if (block_start) {
931 u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -0500932
933 if ((s64)delta < 0)
934 delta = 0;
935
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500936 if (unlikely(delta > schedstat_val(se->statistics.block_max)))
937 schedstat_set(se->statistics.block_max, delta);
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -0500938
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500939 schedstat_set(se->statistics.block_start, 0);
940 schedstat_add(se->statistics.sum_sleep_runtime, delta);
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -0500941
942 if (tsk) {
943 if (tsk->in_iowait) {
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500944 schedstat_add(se->statistics.iowait_sum, delta);
945 schedstat_inc(se->statistics.iowait_count);
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -0500946 trace_sched_stat_iowait(tsk, delta);
947 }
948
949 trace_sched_stat_blocked(tsk, delta);
Riley Andrews4c873b42015-10-02 00:39:53 -0700950 trace_sched_blocked_reason(tsk);
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -0500951
952 /*
953 * Blocking time is in units of nanosecs, so shift by
954 * 20 to get a milliseconds-range estimation of the
955 * amount of time that the task spent sleeping:
956 */
957 if (unlikely(prof_on == SLEEP_PROFILING)) {
958 profile_hits(SLEEP_PROFILING,
959 (void *)get_wchan(tsk),
960 delta >> 20);
961 }
962 account_scheduler_latency(tsk, delta >> 10, 0);
963 }
964 }
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200965}
966
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200967/*
968 * Task is being enqueued - update stats:
969 */
Mel Gormancb251762016-02-05 09:08:36 +0000970static inline void
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -0500971update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200972{
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500973 if (!schedstat_enabled())
974 return;
975
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200976 /*
977 * Are we enqueueing a waiting task? (for current tasks
978 * a dequeue/enqueue event is a NOP)
979 */
Ingo Molnar429d43b2007-10-15 17:00:03 +0200980 if (se != cfs_rq->curr)
Ingo Molnar5870db52007-08-09 11:16:47 +0200981 update_stats_wait_start(cfs_rq, se);
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -0500982
983 if (flags & ENQUEUE_WAKEUP)
984 update_stats_enqueue_sleeper(cfs_rq, se);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200985}
986
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200987static inline void
Mel Gormancb251762016-02-05 09:08:36 +0000988update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200989{
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -0500990
991 if (!schedstat_enabled())
992 return;
993
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200994 /*
995 * Mark the end of the wait period if dequeueing a
996 * waiting task:
997 */
Ingo Molnar429d43b2007-10-15 17:00:03 +0200998 if (se != cfs_rq->curr)
Ingo Molnar9ef0a962007-08-09 11:16:47 +0200999 update_stats_wait_end(cfs_rq, se);
Mel Gormancb251762016-02-05 09:08:36 +00001000
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05001001 if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
1002 struct task_struct *tsk = task_of(se);
Mel Gormancb251762016-02-05 09:08:36 +00001003
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05001004 if (tsk->state & TASK_INTERRUPTIBLE)
1005 schedstat_set(se->statistics.sleep_start,
1006 rq_clock(rq_of(cfs_rq)));
1007 if (tsk->state & TASK_UNINTERRUPTIBLE)
1008 schedstat_set(se->statistics.block_start,
1009 rq_clock(rq_of(cfs_rq)));
Mel Gormancb251762016-02-05 09:08:36 +00001010 }
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001011}
1012
1013/*
1014 * We are picking a new current task - update its stats:
1015 */
1016static inline void
Ingo Molnar79303e92007-08-09 11:16:47 +02001017update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001018{
1019 /*
1020 * We are starting a new run period:
1021 */
Frederic Weisbecker78becc22013-04-12 01:51:02 +02001022 se->exec_start = rq_clock_task(rq_of(cfs_rq));
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001023}
1024
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001025/**************************************************
1026 * Scheduling class queueing methods:
1027 */
1028
Peter Zijlstracbee9f82012-10-25 14:16:43 +02001029#ifdef CONFIG_NUMA_BALANCING
1030/*
Mel Gorman598f0ec2013-10-07 11:28:55 +01001031 * Approximate time to scan a full NUMA task in ms. The task scan period is
1032 * calculated based on the task's virtual memory size and
1033 * numa_balancing_scan_size.
Peter Zijlstracbee9f82012-10-25 14:16:43 +02001034 */
Mel Gorman598f0ec2013-10-07 11:28:55 +01001035unsigned int sysctl_numa_balancing_scan_period_min = 1000;
1036unsigned int sysctl_numa_balancing_scan_period_max = 60000;
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02001037
1038/* Portion of address space to scan in MB */
1039unsigned int sysctl_numa_balancing_scan_size = 256;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02001040
Peter Zijlstra4b96a29b2012-10-25 14:16:47 +02001041/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
1042unsigned int sysctl_numa_balancing_scan_delay = 1000;
1043
Mel Gorman598f0ec2013-10-07 11:28:55 +01001044static unsigned int task_nr_scan_windows(struct task_struct *p)
1045{
1046 unsigned long rss = 0;
1047 unsigned long nr_scan_pages;
1048
1049 /*
1050 * Calculations are based on RSS because non-present and empty pages are
1051 * skipped by the PTE scanner, and NUMA hinting faults should only be
1052 * trapped on resident pages.
1053 */
1054 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
1055 rss = get_mm_rss(p->mm);
1056 if (!rss)
1057 rss = nr_scan_pages;
1058
1059 rss = round_up(rss, nr_scan_pages);
1060 return rss / nr_scan_pages;
1061}
1062
1063/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
1064#define MAX_SCAN_WINDOW 2560
1065
1066static unsigned int task_scan_min(struct task_struct *p)
1067{
Jason Low316c1608d2015-04-28 13:00:20 -07001068 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
Mel Gorman598f0ec2013-10-07 11:28:55 +01001069 unsigned int scan, floor;
1070 unsigned int windows = 1;
1071
Kirill Tkhai64192652014-10-16 14:39:37 +04001072 if (scan_size < MAX_SCAN_WINDOW)
1073 windows = MAX_SCAN_WINDOW / scan_size;
Mel Gorman598f0ec2013-10-07 11:28:55 +01001074 floor = 1000 / windows;
1075
1076 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1077 return max_t(unsigned int, floor, scan);
1078}
1079
1080static unsigned int task_scan_max(struct task_struct *p)
1081{
1082 unsigned int smin = task_scan_min(p);
1083 unsigned int smax;
1084
1085 /* Watch for smax dropping below smin due to the floor in task_scan_min() */
1086 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1087 return max(smin, smax);
1088}
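/*
 * For example, with the defaults (scan_size = 256MB, scan_period_min =
 * 1000ms, scan_period_max = 60000ms), a task with 1GB of resident memory
 * spans 4 scan windows: task_scan_min() returns 1000ms / 4 = 250ms (above
 * the 100ms floor) and task_scan_max() returns 60000ms / 4 = 15000ms.
 */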
1089
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01001090static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1091{
1092 rq->nr_numa_running += (p->numa_preferred_nid != -1);
1093 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1094}
1095
1096static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1097{
1098 rq->nr_numa_running -= (p->numa_preferred_nid != -1);
1099 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1100}
1101
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01001102struct numa_group {
1103 atomic_t refcount;
1104
1105 spinlock_t lock; /* nr_tasks, tasks */
1106 int nr_tasks;
Mel Gormane29cf082013-10-07 11:29:22 +01001107 pid_t gid;
Rik van Riel4142c3e2016-01-25 17:07:39 -05001108 int active_nodes;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01001109
1110 struct rcu_head rcu;
Mel Gorman989348b2013-10-07 11:29:40 +01001111 unsigned long total_faults;
Rik van Riel4142c3e2016-01-25 17:07:39 -05001112 unsigned long max_faults_cpu;
Rik van Riel7e2703e2014-01-27 17:03:45 -05001113 /*
1114 * Faults_cpu is used to decide whether memory should move
1115 * towards the CPU. As a consequence, these stats are weighted
1116 * more by CPU use than by memory faults.
1117 */
Rik van Riel50ec8a42014-01-27 17:03:42 -05001118 unsigned long *faults_cpu;
Mel Gorman989348b2013-10-07 11:29:40 +01001119 unsigned long faults[0];
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01001120};
1121
Rik van Rielbe1e4e72014-01-27 17:03:48 -05001122/* Shared or private faults. */
1123#define NR_NUMA_HINT_FAULT_TYPES 2
1124
1125/* Memory and CPU locality */
1126#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1127
1128/* Averaged statistics, and temporary buffers. */
1129#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1130
Mel Gormane29cf082013-10-07 11:29:22 +01001131pid_t task_numa_group_id(struct task_struct *p)
1132{
1133 return p->numa_group ? p->numa_group->gid : 0;
1134}
1135
Iulia Manda44dba3d2014-10-31 02:13:31 +02001136/*
1137 * The averaged statistics, shared & private, memory & cpu,
1138 * occupy the first half of the array. The second half of the
1139 * array is for current counters, which are averaged into the
1140 * first set by task_numa_placement.
1141 */
1142static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
Mel Gormanac8e8952013-10-07 11:29:03 +01001143{
Iulia Manda44dba3d2014-10-31 02:13:31 +02001144 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
Mel Gormanac8e8952013-10-07 11:29:03 +01001145}
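/*
 * For example, on a 2-node system (and assuming NUMA_MEM == 0,
 * NUMA_CPU == 1) the averaged memory faults occupy indices 0-3 and the
 * averaged CPU faults indices 4-7, with shared (priv == 0) and private
 * (priv == 1) counters interleaved; the NUMA_MEMBUF/NUMA_CPUBUF current
 * counters fill the second half of the array.
 */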
1146
1147static inline unsigned long task_faults(struct task_struct *p, int nid)
1148{
Iulia Manda44dba3d2014-10-31 02:13:31 +02001149 if (!p->numa_faults)
Mel Gormanac8e8952013-10-07 11:29:03 +01001150 return 0;
1151
Iulia Manda44dba3d2014-10-31 02:13:31 +02001152 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1153 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
Mel Gormanac8e8952013-10-07 11:29:03 +01001154}
1155
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001156static inline unsigned long group_faults(struct task_struct *p, int nid)
1157{
1158 if (!p->numa_group)
1159 return 0;
1160
Iulia Manda44dba3d2014-10-31 02:13:31 +02001161 return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1162 p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001163}
1164
Rik van Riel20e07de2014-01-27 17:03:43 -05001165static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1166{
Iulia Manda44dba3d2014-10-31 02:13:31 +02001167 return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
1168 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
Rik van Riel20e07de2014-01-27 17:03:43 -05001169}
1170
Rik van Riel4142c3e2016-01-25 17:07:39 -05001171/*
1172 * A node triggering more than 1/3 as many NUMA faults as the maximum is
1173 * considered part of a numa group's pseudo-interleaving set. Migrations
1174 * between these nodes are slowed down, to allow things to settle down.
1175 */
1176#define ACTIVE_NODE_FRACTION 3
1177
1178static bool numa_is_active_node(int nid, struct numa_group *ng)
1179{
1180 return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1181}
1182
Rik van Riel6c6b1192014-10-17 03:29:52 -04001183/* Handle placement on systems where not all nodes are directly connected. */
1184static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1185 int maxdist, bool task)
1186{
1187 unsigned long score = 0;
1188 int node;
1189
1190 /*
1191 * All nodes are directly connected, and the same distance
1192 * from each other. No need for fancy placement algorithms.
1193 */
1194 if (sched_numa_topology_type == NUMA_DIRECT)
1195 return 0;
1196
1197 /*
1198 * This code is called for each node, introducing N^2 complexity,
1199 * which should be ok given the number of nodes rarely exceeds 8.
1200 */
1201 for_each_online_node(node) {
1202 unsigned long faults;
1203 int dist = node_distance(nid, node);
1204
1205 /*
1206 * The furthest away nodes in the system are not interesting
1207 * for placement; nid was already counted.
1208 */
1209 if (dist == sched_max_numa_distance || node == nid)
1210 continue;
1211
1212 /*
1213 * On systems with a backplane NUMA topology, compare groups
1214 * of nodes, and move tasks towards the group with the most
1215 * memory accesses. When comparing two nodes at distance
1216 * "hoplimit", only nodes closer by than "hoplimit" are part
1217 * of each group. Skip other nodes.
1218 */
1219 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1220 dist > maxdist)
1221 continue;
1222
1223 /* Add up the faults from nearby nodes. */
1224 if (task)
1225 faults = task_faults(p, node);
1226 else
1227 faults = group_faults(p, node);
1228
1229 /*
1230 * On systems with a glueless mesh NUMA topology, there are
1231 * no fixed "groups of nodes". Instead, nodes that are not
1232 * directly connected bounce traffic through intermediate
1233 * nodes; a numa_group can occupy any set of nodes.
1234 * The further away a node is, the less the faults count.
1235 * This seems to result in good task placement.
1236 */
1237 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1238 faults *= (sched_max_numa_distance - dist);
1239 faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1240 }
1241
1242 score += faults;
1243 }
1244
1245 return score;
1246}
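/*
 * For example, on a glueless mesh with sched_max_numa_distance = 40 and
 * LOCAL_DISTANCE = 10, a node at distance 20 from @nid contributes
 * (40 - 20) / (40 - 10) = 2/3 of its faults to the score, while a node at
 * the maximum distance is skipped and contributes nothing.
 */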
1247
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001248/*
1249 * These return the fraction of accesses done by a particular task, or
1250 * task group, on a particular numa node. The group weight is given a
1251 * larger multiplier, in order to group tasks together that are almost
1252 * evenly spread out between numa nodes.
1253 */
Rik van Riel7bd95322014-10-17 03:29:51 -04001254static inline unsigned long task_weight(struct task_struct *p, int nid,
1255 int dist)
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001256{
Rik van Riel7bd95322014-10-17 03:29:51 -04001257 unsigned long faults, total_faults;
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001258
Iulia Manda44dba3d2014-10-31 02:13:31 +02001259 if (!p->numa_faults)
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001260 return 0;
1261
1262 total_faults = p->total_numa_faults;
1263
1264 if (!total_faults)
1265 return 0;
1266
Rik van Riel7bd95322014-10-17 03:29:51 -04001267 faults = task_faults(p, nid);
Rik van Riel6c6b1192014-10-17 03:29:52 -04001268 faults += score_nearby_nodes(p, nid, dist, true);
1269
Rik van Riel7bd95322014-10-17 03:29:51 -04001270 return 1000 * faults / total_faults;
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001271}
1272
Rik van Riel7bd95322014-10-17 03:29:51 -04001273static inline unsigned long group_weight(struct task_struct *p, int nid,
1274 int dist)
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001275{
Rik van Riel7bd95322014-10-17 03:29:51 -04001276 unsigned long faults, total_faults;
1277
1278 if (!p->numa_group)
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001279 return 0;
1280
Rik van Riel7bd95322014-10-17 03:29:51 -04001281 total_faults = p->numa_group->total_faults;
1282
1283 if (!total_faults)
1284 return 0;
1285
1286 faults = group_faults(p, nid);
Rik van Riel6c6b1192014-10-17 03:29:52 -04001287 faults += score_nearby_nodes(p, nid, dist, false);
1288
Rik van Riel7bd95322014-10-17 03:29:51 -04001289 return 1000 * faults / total_faults;
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001290}
1291
Rik van Riel10f39042014-01-27 17:03:44 -05001292bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1293 int src_nid, int dst_cpu)
1294{
1295 struct numa_group *ng = p->numa_group;
1296 int dst_nid = cpu_to_node(dst_cpu);
1297 int last_cpupid, this_cpupid;
1298
1299 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1300
1301 /*
1302 * Multi-stage node selection is used in conjunction with a periodic
1303 * migration fault to build a temporal task<->page relation. By using
1304 * a two-stage filter we remove short/unlikely relations.
1305 *
1306 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1307 * a task's usage of a particular page (n_p) per total usage of this
1308 * page (n_t) (in a given time-span) to a probability.
1309 *
1310 * Our periodic faults will sample this probability and getting the
1311 * same result twice in a row, given these samples are fully
1312 * independent, is then given by P(n)^2, provided our sample period
1313 * is sufficiently short compared to the usage pattern.
1314 *
1315 * This quadratic squishes small probabilities, making it less likely we
1316 * act on an unlikely task<->page relation.
1317 */
1318 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1319 if (!cpupid_pid_unset(last_cpupid) &&
1320 cpupid_to_nid(last_cpupid) != dst_nid)
1321 return false;
1322
1323 /* Always allow migrate on private faults */
1324 if (cpupid_match_pid(p, last_cpupid))
1325 return true;
1326
1327 /* A shared fault, but p->numa_group has not been set up yet. */
1328 if (!ng)
1329 return true;
1330
1331 /*
Rik van Riel4142c3e2016-01-25 17:07:39 -05001332 * Destination node is much more heavily used than the source
1333 * node? Allow migration.
Rik van Riel10f39042014-01-27 17:03:44 -05001334 */
Rik van Riel4142c3e2016-01-25 17:07:39 -05001335 if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1336 ACTIVE_NODE_FRACTION)
Rik van Riel10f39042014-01-27 17:03:44 -05001337 return true;
1338
1339 /*
Rik van Riel4142c3e2016-01-25 17:07:39 -05001340 * Distribute memory according to CPU & memory use on each node,
1341 * with 3/4 hysteresis to avoid unnecessary memory migrations:
1342 *
1343 * faults_cpu(dst) 3 faults_cpu(src)
1344 * --------------- * - > ---------------
1345 * faults_mem(dst) 4 faults_mem(src)
Rik van Riel10f39042014-01-27 17:03:44 -05001346 */
Rik van Riel4142c3e2016-01-25 17:07:39 -05001347 return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
1348 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
Rik van Riel10f39042014-01-27 17:03:44 -05001349}
1350
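/*
 * Rough example of the 3/4 hysteresis above, with made-up numbers: if the
 * group has faults_cpu(dst) = 40, faults_cpu(src) = 20 and 100 memory
 * faults on each node, the cross-multiplied check is
 * 40 * 100 * 3 > 20 * 100 * 4, i.e. 12000 > 8000, so the page is allowed
 * to follow the CPU activity to dst_nid.
 */
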
Mel Gormane6628d52013-10-07 11:29:02 +01001351static unsigned long weighted_cpuload(const int cpu);
Mel Gorman58d081b2013-10-07 11:29:10 +01001352static unsigned long source_load(int cpu, int type);
1353static unsigned long target_load(int cpu, int type);
Nicolas Pitreced549f2014-05-26 18:19:38 -04001354static unsigned long capacity_of(int cpu);
Mel Gorman58d081b2013-10-07 11:29:10 +01001355static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
Mel Gormane6628d52013-10-07 11:29:02 +01001356
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001357/* Cached statistics for all CPUs within a node */
Mel Gorman58d081b2013-10-07 11:29:10 +01001358struct numa_stats {
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001359 unsigned long nr_running;
Mel Gorman58d081b2013-10-07 11:29:10 +01001360 unsigned long load;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001361
1362 /* Total compute capacity of CPUs on a node */
Nicolas Pitre5ef20ca2014-05-26 18:19:34 -04001363 unsigned long compute_capacity;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001364
1365 /* Approximate capacity in terms of runnable tasks on a node */
Nicolas Pitre5ef20ca2014-05-26 18:19:34 -04001366 unsigned long task_capacity;
Nicolas Pitre1b6a7492014-05-26 18:19:35 -04001367 int has_free_capacity;
Mel Gorman58d081b2013-10-07 11:29:10 +01001368};
Mel Gormane6628d52013-10-07 11:29:02 +01001369
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001370/*
1371 * XXX borrowed from update_sg_lb_stats
1372 */
1373static void update_numa_stats(struct numa_stats *ns, int nid)
1374{
Rik van Riel83d7f242014-08-04 13:23:28 -04001375 int smt, cpu, cpus = 0;
1376 unsigned long capacity;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001377
1378 memset(ns, 0, sizeof(*ns));
1379 for_each_cpu(cpu, cpumask_of_node(nid)) {
1380 struct rq *rq = cpu_rq(cpu);
1381
1382 ns->nr_running += rq->nr_running;
1383 ns->load += weighted_cpuload(cpu);
Nicolas Pitreced549f2014-05-26 18:19:38 -04001384 ns->compute_capacity += capacity_of(cpu);
Peter Zijlstra5eca82a2013-11-06 18:47:57 +01001385
1386 cpus++;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001387 }
1388
Peter Zijlstra5eca82a2013-11-06 18:47:57 +01001389 /*
1390 * If we raced with hotplug and there are no CPUs left in our mask
1391	 * the @ns structure stays zeroed and task_numa_compare() will
1392 * not find this node attractive.
1393 *
Nicolas Pitre1b6a7492014-05-26 18:19:35 -04001394 * We'll either bail at !has_free_capacity, or we'll detect a huge
1395 * imbalance and bail there.
Peter Zijlstra5eca82a2013-11-06 18:47:57 +01001396 */
1397 if (!cpus)
1398 return;
1399
Rik van Riel83d7f242014-08-04 13:23:28 -04001400 /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1401 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1402 capacity = cpus / smt; /* cores */
1403
1404 ns->task_capacity = min_t(unsigned, capacity,
1405 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
Nicolas Pitre1b6a7492014-05-26 18:19:35 -04001406 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001407}
1408
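/*
 * Rough example of the capacity estimate above: assuming a node with four
 * non-SMT CPUs, each reporting capacity_of() == SCHED_CAPACITY_SCALE
 * (typically 1024), compute_capacity = 4096, smt = DIV_ROUND_UP(4096, 4096)
 * = 1, capacity = 4 / 1 = 4 cores and task_capacity = min(4, 4) = 4, so the
 * node has_free_capacity while fewer than four tasks are running there.
 */
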
Mel Gorman58d081b2013-10-07 11:29:10 +01001409struct task_numa_env {
1410 struct task_struct *p;
1411
1412 int src_cpu, src_nid;
1413 int dst_cpu, dst_nid;
1414
1415 struct numa_stats src_stats, dst_stats;
1416
Wanpeng Li40ea2b42013-12-05 19:10:17 +08001417 int imbalance_pct;
Rik van Riel7bd95322014-10-17 03:29:51 -04001418 int dist;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001419
1420 struct task_struct *best_task;
1421 long best_imp;
Mel Gorman58d081b2013-10-07 11:29:10 +01001422 int best_cpu;
1423};
1424
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001425static void task_numa_assign(struct task_numa_env *env,
1426 struct task_struct *p, long imp)
1427{
1428 if (env->best_task)
1429 put_task_struct(env->best_task);
Oleg Nesterovbac78572016-05-18 21:57:33 +02001430 if (p)
1431 get_task_struct(p);
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001432
1433 env->best_task = p;
1434 env->best_imp = imp;
1435 env->best_cpu = env->dst_cpu;
1436}
1437
Rik van Riel28a21742014-06-23 11:46:13 -04001438static bool load_too_imbalanced(long src_load, long dst_load,
Rik van Riele63da032014-05-14 13:22:21 -04001439 struct task_numa_env *env)
1440{
Rik van Riele4991b22015-05-27 15:04:27 -04001441 long imb, old_imb;
1442 long orig_src_load, orig_dst_load;
Rik van Riel28a21742014-06-23 11:46:13 -04001443 long src_capacity, dst_capacity;
1444
1445 /*
1446 * The load is corrected for the CPU capacity available on each node.
1447 *
1448 * src_load dst_load
1449 * ------------ vs ---------
1450 * src_capacity dst_capacity
1451 */
1452 src_capacity = env->src_stats.compute_capacity;
1453 dst_capacity = env->dst_stats.compute_capacity;
Rik van Riele63da032014-05-14 13:22:21 -04001454
1455 /* We care about the slope of the imbalance, not the direction. */
Rik van Riele4991b22015-05-27 15:04:27 -04001456 if (dst_load < src_load)
1457 swap(dst_load, src_load);
Rik van Riele63da032014-05-14 13:22:21 -04001458
1459 /* Is the difference below the threshold? */
Rik van Riele4991b22015-05-27 15:04:27 -04001460 imb = dst_load * src_capacity * 100 -
1461 src_load * dst_capacity * env->imbalance_pct;
Rik van Riele63da032014-05-14 13:22:21 -04001462 if (imb <= 0)
1463 return false;
1464
1465 /*
1466 * The imbalance is above the allowed threshold.
Rik van Riele4991b22015-05-27 15:04:27 -04001467 * Compare it with the old imbalance.
Rik van Riele63da032014-05-14 13:22:21 -04001468 */
Rik van Riel28a21742014-06-23 11:46:13 -04001469 orig_src_load = env->src_stats.load;
Rik van Riele4991b22015-05-27 15:04:27 -04001470 orig_dst_load = env->dst_stats.load;
Rik van Riel28a21742014-06-23 11:46:13 -04001471
Rik van Riele4991b22015-05-27 15:04:27 -04001472 if (orig_dst_load < orig_src_load)
1473 swap(orig_dst_load, orig_src_load);
Rik van Riele63da032014-05-14 13:22:21 -04001474
Rik van Riele4991b22015-05-27 15:04:27 -04001475 old_imb = orig_dst_load * src_capacity * 100 -
1476 orig_src_load * dst_capacity * env->imbalance_pct;
1477
1478 /* Would this change make things worse? */
1479 return (imb > old_imb);
Rik van Riele63da032014-05-14 13:22:21 -04001480}
1481
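/*
 * Rough example of the check above, with made-up numbers: with equal node
 * capacities and imbalance_pct = 112 (the value seeded in task_numa_env),
 * src_load = 100 and dst_load = 120 give, ignoring the common capacity
 * factor, imb = 120 * 100 - 100 * 112 = 800 > 0; the move is then only
 * tolerated if the pre-move imbalance (old_imb) was already at least as
 * large.
 */
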
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001482/*
1483 * This checks if the overall compute and NUMA accesses of the system would
1484 * be improved if the source task was migrated to the target dst_cpu, taking
1485 * into account that it might be best if the task running on the dst_cpu
1486 * were exchanged with the source task.
1487 */
Rik van Riel887c2902013-10-07 11:29:31 +01001488static void task_numa_compare(struct task_numa_env *env,
1489 long taskimp, long groupimp)
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001490{
1491 struct rq *src_rq = cpu_rq(env->src_cpu);
1492 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1493 struct task_struct *cur;
Rik van Riel28a21742014-06-23 11:46:13 -04001494 long src_load, dst_load;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001495 long load;
Rik van Riel1c5d3eb2014-06-23 11:46:15 -04001496 long imp = env->p->numa_group ? groupimp : taskimp;
Rik van Riel0132c3e2014-06-23 11:46:16 -04001497 long moveimp = imp;
Rik van Riel7bd95322014-10-17 03:29:51 -04001498 int dist = env->dist;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001499
1500 rcu_read_lock();
Oleg Nesterovbac78572016-05-18 21:57:33 +02001501 cur = task_rcu_dereference(&dst_rq->curr);
1502 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001503 cur = NULL;
1504
1505 /*
Peter Zijlstra7af68332014-11-10 10:54:35 +01001506 * Because we have preemption enabled we can get migrated around and
1507	 * end up trying to select ourselves (current == env->p) as a swap candidate.
1508 */
1509 if (cur == env->p)
1510 goto unlock;
1511
1512 /*
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001513 * "imp" is the fault differential for the source task between the
1514 * source and destination node. Calculate the total differential for
1515 * the source task and potential destination task. The more negative
1516	 * the value is, the more remote accesses would be expected to
1517 * be incurred if the tasks were swapped.
1518 */
1519 if (cur) {
1520 /* Skip this swap candidate if cannot move to the source cpu */
1521 if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1522 goto unlock;
1523
Rik van Riel887c2902013-10-07 11:29:31 +01001524 /*
1525 * If dst and source tasks are in the same NUMA group, or not
Rik van Rielca28aa532013-10-07 11:29:32 +01001526 * in any group then look only at task weights.
Rik van Riel887c2902013-10-07 11:29:31 +01001527 */
Rik van Rielca28aa532013-10-07 11:29:32 +01001528 if (cur->numa_group == env->p->numa_group) {
Rik van Riel7bd95322014-10-17 03:29:51 -04001529 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1530 task_weight(cur, env->dst_nid, dist);
Rik van Rielca28aa532013-10-07 11:29:32 +01001531 /*
1532 * Add some hysteresis to prevent swapping the
1533 * tasks within a group over tiny differences.
1534 */
1535 if (cur->numa_group)
1536 imp -= imp/16;
Rik van Riel887c2902013-10-07 11:29:31 +01001537 } else {
Rik van Rielca28aa532013-10-07 11:29:32 +01001538 /*
1539 * Compare the group weights. If a task is all by
1540 * itself (not part of a group), use the task weight
1541 * instead.
1542 */
Rik van Rielca28aa532013-10-07 11:29:32 +01001543 if (cur->numa_group)
Rik van Riel7bd95322014-10-17 03:29:51 -04001544 imp += group_weight(cur, env->src_nid, dist) -
1545 group_weight(cur, env->dst_nid, dist);
Rik van Rielca28aa532013-10-07 11:29:32 +01001546 else
Rik van Riel7bd95322014-10-17 03:29:51 -04001547 imp += task_weight(cur, env->src_nid, dist) -
1548 task_weight(cur, env->dst_nid, dist);
Rik van Riel887c2902013-10-07 11:29:31 +01001549 }
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001550 }
1551
Rik van Riel0132c3e2014-06-23 11:46:16 -04001552 if (imp <= env->best_imp && moveimp <= env->best_imp)
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001553 goto unlock;
1554
1555 if (!cur) {
1556 /* Is there capacity at our destination? */
Rik van Rielb932c032014-08-04 13:23:27 -04001557 if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
Nicolas Pitre1b6a7492014-05-26 18:19:35 -04001558 !env->dst_stats.has_free_capacity)
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001559 goto unlock;
1560
1561 goto balance;
1562 }
1563
1564 /* Balance doesn't matter much if we're running a task per cpu */
Rik van Riel0132c3e2014-06-23 11:46:16 -04001565 if (imp > env->best_imp && src_rq->nr_running == 1 &&
1566 dst_rq->nr_running == 1)
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001567 goto assign;
1568
1569 /*
1570 * In the overloaded case, try and keep the load balanced.
1571 */
1572balance:
Peter Zijlstrae720fff2014-07-11 16:01:53 +02001573 load = task_h_load(env->p);
1574 dst_load = env->dst_stats.load + load;
1575 src_load = env->src_stats.load - load;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001576
Rik van Riel0132c3e2014-06-23 11:46:16 -04001577 if (moveimp > imp && moveimp > env->best_imp) {
1578 /*
1579		 * If the improvement from just moving env->p (without swapping
1580		 * it with cur) is better, check if such a move is
1581 * possible. Store a slightly smaller score than moveimp,
1582 * so an actually idle CPU will win.
1583 */
1584 if (!load_too_imbalanced(src_load, dst_load, env)) {
1585 imp = moveimp - 1;
1586 cur = NULL;
1587 goto assign;
1588 }
1589 }
1590
1591 if (imp <= env->best_imp)
1592 goto unlock;
1593
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001594 if (cur) {
Peter Zijlstrae720fff2014-07-11 16:01:53 +02001595 load = task_h_load(cur);
1596 dst_load -= load;
1597 src_load += load;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001598 }
1599
Rik van Riel28a21742014-06-23 11:46:13 -04001600 if (load_too_imbalanced(src_load, dst_load, env))
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001601 goto unlock;
1602
Rik van Rielba7e5a22014-09-04 16:35:30 -04001603 /*
1604 * One idle CPU per node is evaluated for a task numa move.
1605 * Call select_idle_sibling to maybe find a better one.
1606 */
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02001607 if (!cur) {
1608 /*
1609		 * select_idle_sibling() uses a per-cpu cpumask that can
1610		 * also be used from IRQ context, so disable IRQs around it.
1611 */
1612 local_irq_disable();
Morten Rasmussen772bd008c2016-06-22 18:03:13 +01001613 env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1614 env->dst_cpu);
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02001615 local_irq_enable();
1616 }
Rik van Rielba7e5a22014-09-04 16:35:30 -04001617
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001618assign:
1619 task_numa_assign(env, cur, imp);
1620unlock:
1621 rcu_read_unlock();
1622}
1623
Rik van Riel887c2902013-10-07 11:29:31 +01001624static void task_numa_find_cpu(struct task_numa_env *env,
1625 long taskimp, long groupimp)
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001626{
1627 int cpu;
1628
1629 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1630 /* Skip this CPU if the source task cannot migrate */
1631 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1632 continue;
1633
1634 env->dst_cpu = cpu;
Rik van Riel887c2902013-10-07 11:29:31 +01001635 task_numa_compare(env, taskimp, groupimp);
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001636 }
1637}
1638
Rik van Riel6f9aad02015-05-28 09:52:49 -04001639/* Only move tasks to a NUMA node less busy than the current node. */
1640static bool numa_has_capacity(struct task_numa_env *env)
1641{
1642 struct numa_stats *src = &env->src_stats;
1643 struct numa_stats *dst = &env->dst_stats;
1644
1645 if (src->has_free_capacity && !dst->has_free_capacity)
1646 return false;
1647
1648 /*
1649 * Only consider a task move if the source has a higher load
1650 * than the destination, corrected for CPU capacity on each node.
1651 *
1652 * src->load dst->load
1653 * --------------------- vs ---------------------
1654 * src->compute_capacity dst->compute_capacity
1655 */
Srikar Dronamraju44dcb042015-06-16 17:26:00 +05301656 if (src->load * dst->compute_capacity * env->imbalance_pct >
1657
1658 dst->load * src->compute_capacity * 100)
Rik van Riel6f9aad02015-05-28 09:52:49 -04001659 return true;
1660
1661 return false;
1662}
1663
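/*
 * Rough example of the ratio test above: with equal compute capacity on
 * both nodes and imbalance_pct = 112, it reduces to
 * src->load * 112 > dst->load * 100, so a move is still considered as long
 * as the source node is no more than roughly 11% less loaded than the
 * destination.
 */
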
Mel Gorman58d081b2013-10-07 11:29:10 +01001664static int task_numa_migrate(struct task_struct *p)
Mel Gormane6628d52013-10-07 11:29:02 +01001665{
Mel Gorman58d081b2013-10-07 11:29:10 +01001666 struct task_numa_env env = {
1667 .p = p,
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001668
Mel Gorman58d081b2013-10-07 11:29:10 +01001669 .src_cpu = task_cpu(p),
Ingo Molnarb32e86b2013-10-07 11:29:30 +01001670 .src_nid = task_node(p),
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001671
1672 .imbalance_pct = 112,
1673
1674 .best_task = NULL,
1675 .best_imp = 0,
Rik van Riel4142c3e2016-01-25 17:07:39 -05001676 .best_cpu = -1,
Mel Gorman58d081b2013-10-07 11:29:10 +01001677 };
1678 struct sched_domain *sd;
Rik van Riel887c2902013-10-07 11:29:31 +01001679 unsigned long taskweight, groupweight;
Rik van Riel7bd95322014-10-17 03:29:51 -04001680 int nid, ret, dist;
Rik van Riel887c2902013-10-07 11:29:31 +01001681 long taskimp, groupimp;
Mel Gormane6628d52013-10-07 11:29:02 +01001682
Mel Gorman58d081b2013-10-07 11:29:10 +01001683 /*
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001684 * Pick the lowest SD_NUMA domain, as that would have the smallest
1685 * imbalance and would be the first to start moving tasks about.
1686 *
1687 * And we want to avoid any moving of tasks about, as that would create
1688 * random movement of tasks -- counter the numa conditions we're trying
1689 * to satisfy here.
Mel Gorman58d081b2013-10-07 11:29:10 +01001690 */
Mel Gormane6628d52013-10-07 11:29:02 +01001691 rcu_read_lock();
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001692 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
Rik van Riel46a73e82013-11-11 19:29:25 -05001693 if (sd)
1694 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
Mel Gormane6628d52013-10-07 11:29:02 +01001695 rcu_read_unlock();
1696
Rik van Riel46a73e82013-11-11 19:29:25 -05001697 /*
1698 * Cpusets can break the scheduler domain tree into smaller
1699 * balance domains, some of which do not cross NUMA boundaries.
1700 * Tasks that are "trapped" in such domains cannot be migrated
1701 * elsewhere, so there is no point in (re)trying.
1702 */
1703 if (unlikely(!sd)) {
Wanpeng Lide1b3012013-12-12 15:23:24 +08001704 p->numa_preferred_nid = task_node(p);
Rik van Riel46a73e82013-11-11 19:29:25 -05001705 return -EINVAL;
1706 }
1707
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001708 env.dst_nid = p->numa_preferred_nid;
Rik van Riel7bd95322014-10-17 03:29:51 -04001709 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1710 taskweight = task_weight(p, env.src_nid, dist);
1711 groupweight = group_weight(p, env.src_nid, dist);
1712 update_numa_stats(&env.src_stats, env.src_nid);
1713 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1714 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001715 update_numa_stats(&env.dst_stats, env.dst_nid);
Mel Gorman58d081b2013-10-07 11:29:10 +01001716
Rik van Riela43455a2014-06-04 16:09:42 -04001717 /* Try to find a spot on the preferred nid. */
Rik van Riel6f9aad02015-05-28 09:52:49 -04001718 if (numa_has_capacity(&env))
1719 task_numa_find_cpu(&env, taskimp, groupimp);
Rik van Riele1dda8a2013-10-07 11:29:19 +01001720
Rik van Riel9de05d42014-10-09 17:27:47 -04001721 /*
1722 * Look at other nodes in these cases:
1723 * - there is no space available on the preferred_nid
1724 * - the task is part of a numa_group that is interleaved across
1725 * multiple NUMA nodes; in order to better consolidate the group,
1726 * we need to check other locations.
1727 */
Rik van Riel4142c3e2016-01-25 17:07:39 -05001728 if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001729 for_each_online_node(nid) {
1730 if (nid == env.src_nid || nid == p->numa_preferred_nid)
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001731 continue;
1732
Rik van Riel7bd95322014-10-17 03:29:51 -04001733 dist = node_distance(env.src_nid, env.dst_nid);
Rik van Riel6c6b1192014-10-17 03:29:52 -04001734 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1735 dist != env.dist) {
1736 taskweight = task_weight(p, env.src_nid, dist);
1737 groupweight = group_weight(p, env.src_nid, dist);
1738 }
Rik van Riel7bd95322014-10-17 03:29:51 -04001739
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001740 /* Only consider nodes where both task and groups benefit */
Rik van Riel7bd95322014-10-17 03:29:51 -04001741 taskimp = task_weight(p, nid, dist) - taskweight;
1742 groupimp = group_weight(p, nid, dist) - groupweight;
Rik van Riel887c2902013-10-07 11:29:31 +01001743 if (taskimp < 0 && groupimp < 0)
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001744 continue;
1745
Rik van Riel7bd95322014-10-17 03:29:51 -04001746 env.dist = dist;
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001747 env.dst_nid = nid;
1748 update_numa_stats(&env.dst_stats, env.dst_nid);
Rik van Riel6f9aad02015-05-28 09:52:49 -04001749 if (numa_has_capacity(&env))
1750 task_numa_find_cpu(&env, taskimp, groupimp);
Mel Gorman58d081b2013-10-07 11:29:10 +01001751 }
1752 }
1753
Rik van Riel68d1b022014-04-11 13:00:29 -04001754 /*
1755 * If the task is part of a workload that spans multiple NUMA nodes,
1756 * and is migrating into one of the workload's active nodes, remember
1757 * this node as the task's preferred numa node, so the workload can
1758 * settle down.
1759 * A task that migrated to a second choice node will be better off
1760 * trying for a better one later. Do not set the preferred node here.
1761 */
Rik van Rieldb015da2014-06-23 11:41:34 -04001762 if (p->numa_group) {
Rik van Riel4142c3e2016-01-25 17:07:39 -05001763 struct numa_group *ng = p->numa_group;
1764
Rik van Rieldb015da2014-06-23 11:41:34 -04001765 if (env.best_cpu == -1)
1766 nid = env.src_nid;
1767 else
1768 nid = env.dst_nid;
1769
Rik van Riel4142c3e2016-01-25 17:07:39 -05001770 if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
Rik van Rieldb015da2014-06-23 11:41:34 -04001771 sched_setnuma(p, env.dst_nid);
1772 }
1773
1774 /* No better CPU than the current one was found. */
1775 if (env.best_cpu == -1)
1776 return -EAGAIN;
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01001777
Rik van Riel04bb2f92013-10-07 11:29:36 +01001778 /*
1779 * Reset the scan period if the task is being rescheduled on an
1780 * alternative node to recheck if the tasks is now properly placed.
1781 */
1782 p->numa_scan_period = task_scan_min(p);
1783
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001784 if (env.best_task == NULL) {
Mel Gorman286549d2014-01-21 15:51:03 -08001785 ret = migrate_task_to(p, env.best_cpu);
1786 if (ret != 0)
1787 trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001788 return ret;
1789 }
1790
1791 ret = migrate_swap(p, env.best_task);
Mel Gorman286549d2014-01-21 15:51:03 -08001792 if (ret != 0)
1793 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001794 put_task_struct(env.best_task);
1795 return ret;
Mel Gormane6628d52013-10-07 11:29:02 +01001796}
1797
Mel Gorman6b9a7462013-10-07 11:29:11 +01001798/* Attempt to migrate a task to a CPU on the preferred node. */
1799static void numa_migrate_preferred(struct task_struct *p)
1800{
Rik van Riel5085e2a2014-04-11 13:00:28 -04001801 unsigned long interval = HZ;
1802
Rik van Riel2739d3e2013-10-07 11:29:41 +01001803 /* This task has no NUMA fault statistics yet */
Iulia Manda44dba3d2014-10-31 02:13:31 +02001804 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
Rik van Riel2739d3e2013-10-07 11:29:41 +01001805 return;
1806
1807 /* Periodically retry migrating the task to the preferred node */
Rik van Riel5085e2a2014-04-11 13:00:28 -04001808 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1809 p->numa_migrate_retry = jiffies + interval;
Rik van Riel2739d3e2013-10-07 11:29:41 +01001810
Mel Gorman6b9a7462013-10-07 11:29:11 +01001811 /* Success if task is already running on preferred CPU */
Wanpeng Lide1b3012013-12-12 15:23:24 +08001812 if (task_node(p) == p->numa_preferred_nid)
Mel Gorman6b9a7462013-10-07 11:29:11 +01001813 return;
1814
Mel Gorman6b9a7462013-10-07 11:29:11 +01001815 /* Otherwise, try migrate to a CPU on the preferred node */
Rik van Riel2739d3e2013-10-07 11:29:41 +01001816 task_numa_migrate(p);
Mel Gorman6b9a7462013-10-07 11:29:11 +01001817}
1818
Rik van Riel04bb2f92013-10-07 11:29:36 +01001819/*
Rik van Riel4142c3e2016-01-25 17:07:39 -05001820 * Find out how many nodes the workload is actively running on. Do this by
Rik van Riel20e07de2014-01-27 17:03:43 -05001821 * tracking the nodes from which NUMA hinting faults are triggered. This can
1822 * be different from the set of nodes where the workload's memory is currently
1823 * located.
Rik van Riel20e07de2014-01-27 17:03:43 -05001824 */
Rik van Riel4142c3e2016-01-25 17:07:39 -05001825static void numa_group_count_active_nodes(struct numa_group *numa_group)
Rik van Riel20e07de2014-01-27 17:03:43 -05001826{
1827 unsigned long faults, max_faults = 0;
Rik van Riel4142c3e2016-01-25 17:07:39 -05001828 int nid, active_nodes = 0;
Rik van Riel20e07de2014-01-27 17:03:43 -05001829
1830 for_each_online_node(nid) {
1831 faults = group_faults_cpu(numa_group, nid);
1832 if (faults > max_faults)
1833 max_faults = faults;
1834 }
1835
1836 for_each_online_node(nid) {
1837 faults = group_faults_cpu(numa_group, nid);
Rik van Riel4142c3e2016-01-25 17:07:39 -05001838 if (faults * ACTIVE_NODE_FRACTION > max_faults)
1839 active_nodes++;
Rik van Riel20e07de2014-01-27 17:03:43 -05001840 }
Rik van Riel4142c3e2016-01-25 17:07:39 -05001841
1842 numa_group->max_faults_cpu = max_faults;
1843 numa_group->active_nodes = active_nodes;
Rik van Riel20e07de2014-01-27 17:03:43 -05001844}
1845
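/*
 * Put differently, a node counts as active for the group once its CPU
 * faults exceed max_faults / ACTIVE_NODE_FRACTION; e.g. if that fraction
 * were 3, any node seeing more than a third of the busiest node's faults
 * would be counted.
 */
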
1846/*
Rik van Riel04bb2f92013-10-07 11:29:36 +01001847 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1848 * increments. The more local the fault statistics are, the higher the scan
Rik van Riela22b4b02014-06-23 11:41:35 -04001849 * period will be for the next scan window. If the local/(local+remote) ratio
1850 * is below NUMA_PERIOD_THRESHOLD (the ratio is scaled to 1..NUMA_PERIOD_SLOTS)
1851 * the scan period will decrease. Aim for 70% local accesses.
Rik van Riel04bb2f92013-10-07 11:29:36 +01001852 */
1853#define NUMA_PERIOD_SLOTS 10
Rik van Riela22b4b02014-06-23 11:41:35 -04001854#define NUMA_PERIOD_THRESHOLD 7
Rik van Riel04bb2f92013-10-07 11:29:36 +01001855
1856/*
1857 * Increase the scan period (slow down scanning) if the majority of
1858 * our memory is already on our local node, or if the majority of
1859 * the page accesses are shared with other processes.
1860 * Otherwise, decrease the scan period.
1861 */
1862static void update_task_scan_period(struct task_struct *p,
1863 unsigned long shared, unsigned long private)
1864{
1865 unsigned int period_slot;
1866 int ratio;
1867 int diff;
1868
1869 unsigned long remote = p->numa_faults_locality[0];
1870 unsigned long local = p->numa_faults_locality[1];
1871
1872 /*
1873	 * If there were no recorded hinting faults then either the task is
1874	 * completely idle or all activity is in areas that are not of interest
Mel Gorman074c2382015-03-25 15:55:42 -07001875 * to automatic numa balancing. Related to that, if there were failed
1876	 * migrations then it implies we are migrating too quickly or the local
1877	 * node is overloaded. In either case, scan slower.
Rik van Riel04bb2f92013-10-07 11:29:36 +01001878 */
Mel Gorman074c2382015-03-25 15:55:42 -07001879 if (local + shared == 0 || p->numa_faults_locality[2]) {
Rik van Riel04bb2f92013-10-07 11:29:36 +01001880 p->numa_scan_period = min(p->numa_scan_period_max,
1881 p->numa_scan_period << 1);
1882
1883 p->mm->numa_next_scan = jiffies +
1884 msecs_to_jiffies(p->numa_scan_period);
1885
1886 return;
1887 }
1888
1889 /*
1890 * Prepare to scale scan period relative to the current period.
1891 * == NUMA_PERIOD_THRESHOLD scan period stays the same
1892 * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1893 * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1894 */
1895 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1896 ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1897 if (ratio >= NUMA_PERIOD_THRESHOLD) {
1898 int slot = ratio - NUMA_PERIOD_THRESHOLD;
1899 if (!slot)
1900 slot = 1;
1901 diff = slot * period_slot;
1902 } else {
1903 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1904
1905 /*
1906 * Scale scan rate increases based on sharing. There is an
1907 * inverse relationship between the degree of sharing and
1908 * the adjustment made to the scanning period. Broadly
1909 * speaking the intent is that there is little point
1910 * scanning faster if shared accesses dominate as it may
1911 * simply bounce migrations uselessly
1912 */
Yasuaki Ishimatsu2847c902014-10-22 16:04:35 +09001913 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
Rik van Riel04bb2f92013-10-07 11:29:36 +01001914 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1915 }
1916
1917 p->numa_scan_period = clamp(p->numa_scan_period + diff,
1918 task_scan_min(p), task_scan_max(p));
1919 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1920}
1921
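/*
 * Worked example of the slot arithmetic above, with made-up fault counts:
 * with NUMA_PERIOD_SLOTS = 10, NUMA_PERIOD_THRESHOLD = 7, local = 900 and
 * remote = 100, ratio = (900 * 10) / 1000 = 9, slot = 9 - 7 = 2 and
 * diff = 2 * period_slot, so the scan period grows (scanning slows down)
 * because most accesses are already local.
 */
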
Rik van Riel7e2703e2014-01-27 17:03:45 -05001922/*
1923 * Get the fraction of time the task has been running since the last
1924 * NUMA placement cycle. The scheduler keeps similar statistics, but
1925 * decays those on a 32ms period, which is orders of magnitude off
1926 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
1927 * stats only if the task is so new there are no NUMA statistics yet.
1928 */
1929static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1930{
1931 u64 runtime, delta, now;
1932 /* Use the start of this time slice to avoid calculations. */
1933 now = p->se.exec_start;
1934 runtime = p->se.sum_exec_runtime;
1935
1936 if (p->last_task_numa_placement) {
1937 delta = runtime - p->last_sum_exec_runtime;
1938 *period = now - p->last_task_numa_placement;
1939 } else {
Yuyang Du9d89c252015-07-15 08:04:37 +08001940 delta = p->se.avg.load_sum / p->se.load.weight;
1941 *period = LOAD_AVG_MAX;
Rik van Riel7e2703e2014-01-27 17:03:45 -05001942 }
1943
1944 p->last_sum_exec_runtime = runtime;
1945 p->last_task_numa_placement = now;
1946
1947 return delta;
1948}
1949
Rik van Riel54009412014-10-17 03:29:53 -04001950/*
1951 * Determine the preferred nid for a task in a numa_group. This needs to
1952 * be done in a way that produces consistent results with group_weight,
1953 * otherwise workloads might not converge.
1954 */
1955static int preferred_group_nid(struct task_struct *p, int nid)
1956{
1957 nodemask_t nodes;
1958 int dist;
1959
1960 /* Direct connections between all NUMA nodes. */
1961 if (sched_numa_topology_type == NUMA_DIRECT)
1962 return nid;
1963
1964 /*
1965 * On a system with glueless mesh NUMA topology, group_weight
1966 * scores nodes according to the number of NUMA hinting faults on
1967 * both the node itself, and on nearby nodes.
1968 */
1969 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1970 unsigned long score, max_score = 0;
1971 int node, max_node = nid;
1972
1973 dist = sched_max_numa_distance;
1974
1975 for_each_online_node(node) {
1976 score = group_weight(p, node, dist);
1977 if (score > max_score) {
1978 max_score = score;
1979 max_node = node;
1980 }
1981 }
1982 return max_node;
1983 }
1984
1985 /*
1986 * Finding the preferred nid in a system with NUMA backplane
1987 * interconnect topology is more involved. The goal is to locate
1988 * tasks from numa_groups near each other in the system, and
1989 * untangle workloads from different sides of the system. This requires
1990 * searching down the hierarchy of node groups, recursively searching
1991 * inside the highest scoring group of nodes. The nodemask tricks
1992 * keep the complexity of the search down.
1993 */
1994 nodes = node_online_map;
1995 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
1996 unsigned long max_faults = 0;
Jan Beulich81907472015-01-23 08:25:38 +00001997 nodemask_t max_group = NODE_MASK_NONE;
Rik van Riel54009412014-10-17 03:29:53 -04001998 int a, b;
1999
2000 /* Are there nodes at this distance from each other? */
2001 if (!find_numa_distance(dist))
2002 continue;
2003
2004 for_each_node_mask(a, nodes) {
2005 unsigned long faults = 0;
2006 nodemask_t this_group;
2007 nodes_clear(this_group);
2008
2009 /* Sum group's NUMA faults; includes a==b case. */
2010 for_each_node_mask(b, nodes) {
2011 if (node_distance(a, b) < dist) {
2012 faults += group_faults(p, b);
2013 node_set(b, this_group);
2014 node_clear(b, nodes);
2015 }
2016 }
2017
2018 /* Remember the top group. */
2019 if (faults > max_faults) {
2020 max_faults = faults;
2021 max_group = this_group;
2022 /*
2023 * subtle: at the smallest distance there is
2024 * just one node left in each "group", the
2025 * winner is the preferred nid.
2026 */
2027 nid = a;
2028 }
2029 }
2030 /* Next round, evaluate the nodes within max_group. */
Jan Beulich890a5402015-02-09 12:30:00 +01002031 if (!max_faults)
2032 break;
Rik van Riel54009412014-10-17 03:29:53 -04002033 nodes = max_group;
2034 }
2035 return nid;
2036}
2037
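/*
 * Sketch of the backplane search above on a made-up topology: four nodes
 * {0,1,2,3} where 0-1 and 2-3 are close pairs joined by a slower backplane.
 * At the largest distance the loop lumps {0,1} and {2,3} into two candidate
 * groups, keeps whichever pair has more group faults, then repeats at the
 * smaller distance; on the final pass each "group" is a single node and
 * nid ends up on the winner.
 */
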
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002038static void task_numa_placement(struct task_struct *p)
2039{
Mel Gorman83e1d2c2013-10-07 11:29:27 +01002040 int seq, nid, max_nid = -1, max_group_nid = -1;
2041 unsigned long max_faults = 0, max_group_faults = 0;
Rik van Riel04bb2f92013-10-07 11:29:36 +01002042 unsigned long fault_types[2] = { 0, 0 };
Rik van Riel7e2703e2014-01-27 17:03:45 -05002043 unsigned long total_faults;
2044 u64 runtime, period;
Mel Gorman7dbd13e2013-10-07 11:29:29 +01002045 spinlock_t *group_lock = NULL;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002046
Jason Low7e5a2c12015-04-30 17:28:14 -07002047 /*
2048 * The p->mm->numa_scan_seq field gets updated without
2049 * exclusive access. Use READ_ONCE() here to ensure
2050 * that the field is read in a single access:
2051 */
Jason Low316c1608d2015-04-28 13:00:20 -07002052 seq = READ_ONCE(p->mm->numa_scan_seq);
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002053 if (p->numa_scan_seq == seq)
2054 return;
2055 p->numa_scan_seq = seq;
Mel Gorman598f0ec2013-10-07 11:28:55 +01002056 p->numa_scan_period_max = task_scan_max(p);
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002057
Rik van Riel7e2703e2014-01-27 17:03:45 -05002058 total_faults = p->numa_faults_locality[0] +
2059 p->numa_faults_locality[1];
2060 runtime = numa_get_avg_runtime(p, &period);
2061
Mel Gorman7dbd13e2013-10-07 11:29:29 +01002062 /* If the task is part of a group prevent parallel updates to group stats */
2063 if (p->numa_group) {
2064 group_lock = &p->numa_group->lock;
Mike Galbraith60e69ee2014-04-07 10:55:15 +02002065 spin_lock_irq(group_lock);
Mel Gorman7dbd13e2013-10-07 11:29:29 +01002066 }
2067
Mel Gorman688b7582013-10-07 11:28:58 +01002068 /* Find the node with the highest number of faults */
2069 for_each_online_node(nid) {
Iulia Manda44dba3d2014-10-31 02:13:31 +02002070 /* Keep track of the offsets in numa_faults array */
2071 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
Mel Gorman83e1d2c2013-10-07 11:29:27 +01002072 unsigned long faults = 0, group_faults = 0;
Iulia Manda44dba3d2014-10-31 02:13:31 +02002073 int priv;
Mel Gorman745d6142013-10-07 11:28:59 +01002074
Rik van Rielbe1e4e72014-01-27 17:03:48 -05002075 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
Rik van Riel7e2703e2014-01-27 17:03:45 -05002076 long diff, f_diff, f_weight;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002077
Iulia Manda44dba3d2014-10-31 02:13:31 +02002078 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2079 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2080 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2081 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
Mel Gorman745d6142013-10-07 11:28:59 +01002082
Mel Gormanac8e8952013-10-07 11:29:03 +01002083 /* Decay existing window, copy faults since last scan */
Iulia Manda44dba3d2014-10-31 02:13:31 +02002084 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2085 fault_types[priv] += p->numa_faults[membuf_idx];
2086 p->numa_faults[membuf_idx] = 0;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01002087
Rik van Riel7e2703e2014-01-27 17:03:45 -05002088 /*
2089 * Normalize the faults_from, so all tasks in a group
2090 * count according to CPU use, instead of by the raw
2091 * number of faults. Tasks with little runtime have
2092 * little over-all impact on throughput, and thus their
2093 * faults are less important.
2094 */
2095 f_weight = div64_u64(runtime << 16, period + 1);
Iulia Manda44dba3d2014-10-31 02:13:31 +02002096 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
Rik van Riel7e2703e2014-01-27 17:03:45 -05002097 (total_faults + 1);
Iulia Manda44dba3d2014-10-31 02:13:31 +02002098 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2099 p->numa_faults[cpubuf_idx] = 0;
Rik van Riel50ec8a42014-01-27 17:03:42 -05002100
Iulia Manda44dba3d2014-10-31 02:13:31 +02002101 p->numa_faults[mem_idx] += diff;
2102 p->numa_faults[cpu_idx] += f_diff;
2103 faults += p->numa_faults[mem_idx];
Mel Gorman83e1d2c2013-10-07 11:29:27 +01002104 p->total_numa_faults += diff;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002105 if (p->numa_group) {
Iulia Manda44dba3d2014-10-31 02:13:31 +02002106 /*
2107 * safe because we can only change our own group
2108 *
2109 * mem_idx represents the offset for a given
2110 * nid and priv in a specific region because it
2111 * is at the beginning of the numa_faults array.
2112 */
2113 p->numa_group->faults[mem_idx] += diff;
2114 p->numa_group->faults_cpu[mem_idx] += f_diff;
Mel Gorman989348b2013-10-07 11:29:40 +01002115 p->numa_group->total_faults += diff;
Iulia Manda44dba3d2014-10-31 02:13:31 +02002116 group_faults += p->numa_group->faults[mem_idx];
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002117 }
Mel Gormanac8e8952013-10-07 11:29:03 +01002118 }
2119
Mel Gorman688b7582013-10-07 11:28:58 +01002120 if (faults > max_faults) {
2121 max_faults = faults;
2122 max_nid = nid;
2123 }
Mel Gorman83e1d2c2013-10-07 11:29:27 +01002124
2125 if (group_faults > max_group_faults) {
2126 max_group_faults = group_faults;
2127 max_group_nid = nid;
2128 }
2129 }
2130
Rik van Riel04bb2f92013-10-07 11:29:36 +01002131 update_task_scan_period(p, fault_types[0], fault_types[1]);
2132
Mel Gorman7dbd13e2013-10-07 11:29:29 +01002133 if (p->numa_group) {
Rik van Riel4142c3e2016-01-25 17:07:39 -05002134 numa_group_count_active_nodes(p->numa_group);
Mike Galbraith60e69ee2014-04-07 10:55:15 +02002135 spin_unlock_irq(group_lock);
Rik van Riel54009412014-10-17 03:29:53 -04002136 max_nid = preferred_group_nid(p, max_group_nid);
Mel Gorman688b7582013-10-07 11:28:58 +01002137 }
2138
Rik van Rielbb97fc32014-06-04 16:33:15 -04002139 if (max_faults) {
2140 /* Set the new preferred node */
2141 if (max_nid != p->numa_preferred_nid)
2142 sched_setnuma(p, max_nid);
2143
2144 if (task_node(p) != p->numa_preferred_nid)
2145 numa_migrate_preferred(p);
Mel Gorman3a7053b2013-10-07 11:29:00 +01002146 }
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002147}
2148
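/*
 * Note on the decay above: each scan window applies
 * diff = membuf - mem / 2, so the long-term counters keep half of their
 * old value plus the new window; at a steady fault rate they settle at
 * roughly twice the per-window buffer totals.
 */
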
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002149static inline int get_numa_group(struct numa_group *grp)
2150{
2151 return atomic_inc_not_zero(&grp->refcount);
2152}
2153
2154static inline void put_numa_group(struct numa_group *grp)
2155{
2156 if (atomic_dec_and_test(&grp->refcount))
2157 kfree_rcu(grp, rcu);
2158}
2159
Mel Gorman3e6a9412013-10-07 11:29:35 +01002160static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2161 int *priv)
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002162{
2163 struct numa_group *grp, *my_grp;
2164 struct task_struct *tsk;
2165 bool join = false;
2166 int cpu = cpupid_to_cpu(cpupid);
2167 int i;
2168
2169 if (unlikely(!p->numa_group)) {
2170 unsigned int size = sizeof(struct numa_group) +
Rik van Riel50ec8a42014-01-27 17:03:42 -05002171 4*nr_node_ids*sizeof(unsigned long);
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002172
2173 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2174 if (!grp)
2175 return;
2176
2177 atomic_set(&grp->refcount, 1);
Rik van Riel4142c3e2016-01-25 17:07:39 -05002178 grp->active_nodes = 1;
2179 grp->max_faults_cpu = 0;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002180 spin_lock_init(&grp->lock);
Mel Gormane29cf082013-10-07 11:29:22 +01002181 grp->gid = p->pid;
Rik van Riel50ec8a42014-01-27 17:03:42 -05002182 /* Second half of the array tracks nids where faults happen */
Rik van Rielbe1e4e72014-01-27 17:03:48 -05002183 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
2184 nr_node_ids;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002185
Rik van Rielbe1e4e72014-01-27 17:03:48 -05002186 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
Iulia Manda44dba3d2014-10-31 02:13:31 +02002187 grp->faults[i] = p->numa_faults[i];
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002188
Mel Gorman989348b2013-10-07 11:29:40 +01002189 grp->total_faults = p->total_numa_faults;
Mel Gorman83e1d2c2013-10-07 11:29:27 +01002190
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002191 grp->nr_tasks++;
2192 rcu_assign_pointer(p->numa_group, grp);
2193 }
2194
2195 rcu_read_lock();
Jason Low316c1608d2015-04-28 13:00:20 -07002196 tsk = READ_ONCE(cpu_rq(cpu)->curr);
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002197
2198 if (!cpupid_match_pid(tsk, cpupid))
Peter Zijlstra33547812013-10-09 10:24:48 +02002199 goto no_join;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002200
2201 grp = rcu_dereference(tsk->numa_group);
2202 if (!grp)
Peter Zijlstra33547812013-10-09 10:24:48 +02002203 goto no_join;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002204
2205 my_grp = p->numa_group;
2206 if (grp == my_grp)
Peter Zijlstra33547812013-10-09 10:24:48 +02002207 goto no_join;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002208
2209 /*
2210 * Only join the other group if its bigger; if we're the bigger group,
2211 * the other task will join us.
2212 */
2213 if (my_grp->nr_tasks > grp->nr_tasks)
Peter Zijlstra33547812013-10-09 10:24:48 +02002214 goto no_join;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002215
2216 /*
2217 * Tie-break on the grp address.
2218 */
2219 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
Peter Zijlstra33547812013-10-09 10:24:48 +02002220 goto no_join;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002221
Rik van Rieldabe1d92013-10-07 11:29:34 +01002222 /* Always join threads in the same process. */
2223 if (tsk->mm == current->mm)
2224 join = true;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002225
Rik van Rieldabe1d92013-10-07 11:29:34 +01002226 /* Simple filter to avoid false positives due to PID collisions */
2227 if (flags & TNF_SHARED)
2228 join = true;
2229
Mel Gorman3e6a9412013-10-07 11:29:35 +01002230 /* Update priv based on whether false sharing was detected */
2231 *priv = !join;
2232
Rik van Rieldabe1d92013-10-07 11:29:34 +01002233 if (join && !get_numa_group(grp))
Peter Zijlstra33547812013-10-09 10:24:48 +02002234 goto no_join;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002235
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002236 rcu_read_unlock();
2237
2238 if (!join)
2239 return;
2240
Mike Galbraith60e69ee2014-04-07 10:55:15 +02002241 BUG_ON(irqs_disabled());
2242 double_lock_irq(&my_grp->lock, &grp->lock);
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002243
Rik van Rielbe1e4e72014-01-27 17:03:48 -05002244 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
Iulia Manda44dba3d2014-10-31 02:13:31 +02002245 my_grp->faults[i] -= p->numa_faults[i];
2246 grp->faults[i] += p->numa_faults[i];
Mel Gorman989348b2013-10-07 11:29:40 +01002247 }
2248 my_grp->total_faults -= p->total_numa_faults;
2249 grp->total_faults += p->total_numa_faults;
2250
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002251 my_grp->nr_tasks--;
2252 grp->nr_tasks++;
2253
2254 spin_unlock(&my_grp->lock);
Mike Galbraith60e69ee2014-04-07 10:55:15 +02002255 spin_unlock_irq(&grp->lock);
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002256
2257 rcu_assign_pointer(p->numa_group, grp);
2258
2259 put_numa_group(my_grp);
Peter Zijlstra33547812013-10-09 10:24:48 +02002260 return;
2261
2262no_join:
2263 rcu_read_unlock();
2264 return;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002265}
2266
2267void task_numa_free(struct task_struct *p)
2268{
2269 struct numa_group *grp = p->numa_group;
Iulia Manda44dba3d2014-10-31 02:13:31 +02002270 void *numa_faults = p->numa_faults;
Steven Rostedte9dd6852014-05-27 17:02:04 -04002271 unsigned long flags;
2272 int i;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002273
2274 if (grp) {
Steven Rostedte9dd6852014-05-27 17:02:04 -04002275 spin_lock_irqsave(&grp->lock, flags);
Rik van Rielbe1e4e72014-01-27 17:03:48 -05002276 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
Iulia Manda44dba3d2014-10-31 02:13:31 +02002277 grp->faults[i] -= p->numa_faults[i];
Mel Gorman989348b2013-10-07 11:29:40 +01002278 grp->total_faults -= p->total_numa_faults;
2279
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002280 grp->nr_tasks--;
Steven Rostedte9dd6852014-05-27 17:02:04 -04002281 spin_unlock_irqrestore(&grp->lock, flags);
Andreea-Cristina Bernat35b123e2014-08-22 17:50:43 +03002282 RCU_INIT_POINTER(p->numa_group, NULL);
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002283 put_numa_group(grp);
2284 }
2285
Iulia Manda44dba3d2014-10-31 02:13:31 +02002286 p->numa_faults = NULL;
Rik van Riel82727012013-10-07 11:29:28 +01002287 kfree(numa_faults);
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002288}
2289
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002290/*
2291 * Got a PROT_NONE fault for a page on @node.
2292 */
Rik van Riel58b46da2014-01-27 17:03:47 -05002293void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002294{
2295 struct task_struct *p = current;
Peter Zijlstra6688cc02013-10-07 11:29:24 +01002296 bool migrated = flags & TNF_MIGRATED;
Rik van Riel58b46da2014-01-27 17:03:47 -05002297 int cpu_node = task_node(current);
Rik van Riel792568e2014-04-11 13:00:27 -04002298 int local = !!(flags & TNF_FAULT_LOCAL);
Rik van Riel4142c3e2016-01-25 17:07:39 -05002299 struct numa_group *ng;
Mel Gormanac8e8952013-10-07 11:29:03 +01002300 int priv;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002301
Srikar Dronamraju2a595722015-08-11 21:54:21 +05302302 if (!static_branch_likely(&sched_numa_balancing))
Mel Gorman1a687c22012-11-22 11:16:36 +00002303 return;
2304
Mel Gorman9ff1d9f2013-10-07 11:29:04 +01002305 /* for example, ksmd faulting in a user's mm */
2306 if (!p->mm)
2307 return;
2308
Mel Gormanf809ca92013-10-07 11:28:57 +01002309 /* Allocate buffer to track faults on a per-node basis */
Iulia Manda44dba3d2014-10-31 02:13:31 +02002310 if (unlikely(!p->numa_faults)) {
2311 int size = sizeof(*p->numa_faults) *
Rik van Rielbe1e4e72014-01-27 17:03:48 -05002312 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
Mel Gormanf809ca92013-10-07 11:28:57 +01002313
Iulia Manda44dba3d2014-10-31 02:13:31 +02002314 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2315 if (!p->numa_faults)
Mel Gormanf809ca92013-10-07 11:28:57 +01002316 return;
Mel Gorman745d6142013-10-07 11:28:59 +01002317
Mel Gorman83e1d2c2013-10-07 11:29:27 +01002318 p->total_numa_faults = 0;
Rik van Riel04bb2f92013-10-07 11:29:36 +01002319 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
Mel Gormanf809ca92013-10-07 11:28:57 +01002320 }
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002321
Mel Gormanfb003b82012-11-15 09:01:14 +00002322 /*
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002323 * First accesses are treated as private, otherwise consider accesses
2324 * to be private if the accessing pid has not changed
2325 */
2326 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2327 priv = 1;
2328 } else {
2329 priv = cpupid_match_pid(p, last_cpupid);
Peter Zijlstra6688cc02013-10-07 11:29:24 +01002330 if (!priv && !(flags & TNF_NO_GROUP))
Mel Gorman3e6a9412013-10-07 11:29:35 +01002331 task_numa_group(p, last_cpupid, flags, &priv);
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002332 }
2333
Rik van Riel792568e2014-04-11 13:00:27 -04002334 /*
2335 * If a workload spans multiple NUMA nodes, a shared fault that
2336 * occurs wholly within the set of nodes that the workload is
2337 * actively using should be counted as local. This allows the
2338 * scan rate to slow down when a workload has settled down.
2339 */
Rik van Riel4142c3e2016-01-25 17:07:39 -05002340 ng = p->numa_group;
2341 if (!priv && !local && ng && ng->active_nodes > 1 &&
2342 numa_is_active_node(cpu_node, ng) &&
2343 numa_is_active_node(mem_node, ng))
Rik van Riel792568e2014-04-11 13:00:27 -04002344 local = 1;
2345
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002346 task_numa_placement(p);
Mel Gormanf809ca92013-10-07 11:28:57 +01002347
Rik van Riel2739d3e2013-10-07 11:29:41 +01002348 /*
2349	 * Retry migrating the task to its preferred node periodically, in
2350	 * case it previously failed, or the scheduler moved us.
2351 */
2352 if (time_after(jiffies, p->numa_migrate_retry))
Mel Gorman6b9a7462013-10-07 11:29:11 +01002353 numa_migrate_preferred(p);
2354
Ingo Molnarb32e86b2013-10-07 11:29:30 +01002355 if (migrated)
2356 p->numa_pages_migrated += pages;
Mel Gorman074c2382015-03-25 15:55:42 -07002357 if (flags & TNF_MIGRATE_FAIL)
2358 p->numa_faults_locality[2] += pages;
Ingo Molnarb32e86b2013-10-07 11:29:30 +01002359
Iulia Manda44dba3d2014-10-31 02:13:31 +02002360 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2361 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
Rik van Riel792568e2014-04-11 13:00:27 -04002362 p->numa_faults_locality[local] += pages;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002363}
2364
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002365static void reset_ptenuma_scan(struct task_struct *p)
2366{
Jason Low7e5a2c12015-04-30 17:28:14 -07002367 /*
2368 * We only did a read acquisition of the mmap sem, so
2369 * p->mm->numa_scan_seq is written to without exclusive access
2370 * and the update is not guaranteed to be atomic. That's not
2371 * much of an issue though, since this is just used for
2372 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2373 * expensive, to avoid any form of compiler optimizations:
2374 */
Jason Low316c1608d2015-04-28 13:00:20 -07002375 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002376 p->mm->numa_scan_offset = 0;
2377}
2378
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002379/*
2380 * The expensive part of numa migration is done from task_work context.
2381 * Triggered from task_tick_numa().
2382 */
2383void task_numa_work(struct callback_head *work)
2384{
2385 unsigned long migrate, next_scan, now = jiffies;
2386 struct task_struct *p = current;
2387 struct mm_struct *mm = p->mm;
Rik van Riel51170842015-11-05 15:56:23 -05002388 u64 runtime = p->se.sum_exec_runtime;
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002389 struct vm_area_struct *vma;
Mel Gorman9f406042012-11-14 18:34:32 +00002390 unsigned long start, end;
Mel Gorman598f0ec2013-10-07 11:28:55 +01002391 unsigned long nr_pte_updates = 0;
Rik van Riel4620f8c2015-09-11 09:00:27 -04002392 long pages, virtpages;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002393
Peter Zijlstra9148a3a2016-09-20 22:34:51 +02002394 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002395
2396 work->next = work; /* protect against double add */
2397 /*
2398 * Who cares about NUMA placement when they're dying.
2399 *
2400 * NOTE: make sure not to dereference p->mm before this check,
2401 * exit_task_work() happens _after_ exit_mm() so we could be called
2402 * without p->mm even though we still had it when we enqueued this
2403 * work.
2404 */
2405 if (p->flags & PF_EXITING)
2406 return;
2407
Mel Gorman930aa172013-10-07 11:29:37 +01002408 if (!mm->numa_next_scan) {
Mel Gorman7e8d16b2013-10-07 11:28:54 +01002409 mm->numa_next_scan = now +
2410 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
Mel Gormanb8593bf2012-11-21 01:18:23 +00002411 }
2412
2413 /*
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002414 * Enforce maximal scan/migration frequency..
2415 */
2416 migrate = mm->numa_next_scan;
2417 if (time_before(now, migrate))
2418 return;
2419
Mel Gorman598f0ec2013-10-07 11:28:55 +01002420 if (p->numa_scan_period == 0) {
2421 p->numa_scan_period_max = task_scan_max(p);
2422 p->numa_scan_period = task_scan_min(p);
2423 }
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002424
Mel Gormanfb003b82012-11-15 09:01:14 +00002425 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002426 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2427 return;
2428
Mel Gormane14808b2012-11-19 10:59:15 +00002429 /*
Peter Zijlstra19a78d12013-10-07 11:28:51 +01002430 * Delay this task enough that another task of this mm will likely win
2431 * the next time around.
2432 */
2433 p->node_stamp += 2 * TICK_NSEC;
2434
Mel Gorman9f406042012-11-14 18:34:32 +00002435 start = mm->numa_scan_offset;
2436 pages = sysctl_numa_balancing_scan_size;
2437 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
Rik van Riel4620f8c2015-09-11 09:00:27 -04002438 virtpages = pages * 8; /* Scan up to this much virtual space */
Mel Gorman9f406042012-11-14 18:34:32 +00002439 if (!pages)
2440 return;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002441
Rik van Riel4620f8c2015-09-11 09:00:27 -04002442
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002443 down_read(&mm->mmap_sem);
Mel Gorman9f406042012-11-14 18:34:32 +00002444 vma = find_vma(mm, start);
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002445 if (!vma) {
2446 reset_ptenuma_scan(p);
Mel Gorman9f406042012-11-14 18:34:32 +00002447 start = 0;
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002448 vma = mm->mmap;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002449 }
Mel Gorman9f406042012-11-14 18:34:32 +00002450 for (; vma; vma = vma->vm_next) {
Naoya Horiguchi6b79c572015-04-07 14:26:47 -07002451 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
Mel Gorman8e76d4e2015-06-10 11:15:00 -07002452 is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002453 continue;
Naoya Horiguchi6b79c572015-04-07 14:26:47 -07002454 }
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002455
Mel Gorman4591ce4f2013-10-07 11:29:13 +01002456 /*
2457 * Shared library pages mapped by multiple processes are not
2458 * migrated as it is expected they are cache replicated. Avoid
2459 * hinting faults in read-only file-backed mappings or the vdso
2460 * as migrating the pages will be of marginal benefit.
2461 */
2462 if (!vma->vm_mm ||
2463 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2464 continue;
2465
Mel Gorman3c67f472013-12-18 17:08:40 -08002466 /*
2467 * Skip inaccessible VMAs to avoid any confusion between
2468 * PROT_NONE and NUMA hinting ptes
2469 */
2470 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2471 continue;
2472
Mel Gorman9f406042012-11-14 18:34:32 +00002473 do {
2474 start = max(start, vma->vm_start);
2475 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2476 end = min(end, vma->vm_end);
Rik van Riel4620f8c2015-09-11 09:00:27 -04002477 nr_pte_updates = change_prot_numa(vma, start, end);
Mel Gorman598f0ec2013-10-07 11:28:55 +01002478
2479 /*
Rik van Riel4620f8c2015-09-11 09:00:27 -04002480			 * Try to scan sysctl_numa_balancing_scan_size worth of
2481 * hpages that have at least one present PTE that
2482 * is not already pte-numa. If the VMA contains
2483 * areas that are unused or already full of prot_numa
2484 * PTEs, scan up to virtpages, to skip through those
2485 * areas faster.
Mel Gorman598f0ec2013-10-07 11:28:55 +01002486 */
2487 if (nr_pte_updates)
2488 pages -= (end - start) >> PAGE_SHIFT;
Rik van Riel4620f8c2015-09-11 09:00:27 -04002489 virtpages -= (end - start) >> PAGE_SHIFT;
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002490
Mel Gorman9f406042012-11-14 18:34:32 +00002491 start = end;
Rik van Riel4620f8c2015-09-11 09:00:27 -04002492 if (pages <= 0 || virtpages <= 0)
Mel Gorman9f406042012-11-14 18:34:32 +00002493 goto out;
Rik van Riel3cf19622014-02-18 17:12:44 -05002494
2495 cond_resched();
Mel Gorman9f406042012-11-14 18:34:32 +00002496 } while (end != vma->vm_end);
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002497 }
2498
Mel Gorman9f406042012-11-14 18:34:32 +00002499out:
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002500 /*
Peter Zijlstrac69307d2013-10-07 11:28:41 +01002501 * It is possible to reach the end of the VMA list but the last few
2502	 * VMAs are not guaranteed to be migratable. If they are not, we
2503 * would find the !migratable VMA on the next scan but not reset the
2504 * scanner to the start so check it now.
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002505 */
2506 if (vma)
Mel Gorman9f406042012-11-14 18:34:32 +00002507 mm->numa_scan_offset = start;
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002508 else
2509 reset_ptenuma_scan(p);
2510 up_read(&mm->mmap_sem);
Rik van Riel51170842015-11-05 15:56:23 -05002511
2512 /*
2513 * Make sure tasks use at least 32x as much time to run other code
2514 * than they used here, to limit NUMA PTE scanning overhead to 3% max.
2515 * Usually update_task_scan_period slows down scanning enough; on an
2516 * overloaded system we need to limit overhead on a per task basis.
2517 */
2518 if (unlikely(p->se.sum_exec_runtime != runtime)) {
2519 u64 diff = p->se.sum_exec_runtime - runtime;
2520 p->node_stamp += 32 * diff;
2521 }
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002522}
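/*
 * Worked example of the 32x throttle above (illustrative numbers only):
 * if this invocation of task_numa_work() consumed 2ms of task runtime
 * (diff = 2ms), node_stamp is advanced by 32 * 2ms = 64ms. The task must
 * then accumulate roughly 64ms of other runtime before the next scan is
 * considered, bounding scan overhead to about 1/(1+32), i.e. ~3% of the
 * task's CPU time.
 */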
2523
2524/*
2525 * Drive the periodic memory faults.
2526 */
2527void task_tick_numa(struct rq *rq, struct task_struct *curr)
2528{
2529 struct callback_head *work = &curr->numa_work;
2530 u64 period, now;
2531
2532 /*
2533 * We don't care about NUMA placement if we don't have memory.
2534 */
2535 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
2536 return;
2537
2538 /*
2539 * Using runtime rather than walltime has the dual advantage that
2540 * we (mostly) drive the selection from busy threads and that the
2541 * task needs to have done some actual work before we bother with
2542 * NUMA placement.
2543 */
2544 now = curr->se.sum_exec_runtime;
2545 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2546
Rik van Riel25b3e5a2015-11-05 15:56:22 -05002547 if (now > curr->node_stamp + period) {
Peter Zijlstra4b96a29b2012-10-25 14:16:47 +02002548 if (!curr->node_stamp)
Mel Gorman598f0ec2013-10-07 11:28:55 +01002549 curr->numa_scan_period = task_scan_min(curr);
Peter Zijlstra19a78d12013-10-07 11:28:51 +01002550 curr->node_stamp += period;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002551
2552 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2553 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2554 task_work_add(curr, work, true);
2555 }
2556 }
2557}
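/*
 * For illustration (hypothetical numbers): with curr->numa_scan_period =
 * 1000 (ms), the check above requires period = 1000 * NSEC_PER_MSEC of
 * *task runtime* (sum_exec_runtime) beyond node_stamp before
 * task_numa_work() is queued. Because the clock is runtime rather than
 * walltime, tasks that mostly sleep never trigger hinting-fault scans.
 */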
2558#else
2559static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2560{
2561}
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01002562
2563static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2564{
2565}
2566
2567static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2568{
2569}
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002570#endif /* CONFIG_NUMA_BALANCING */
2571
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02002572static void
2573account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2574{
2575 update_load_add(&cfs_rq->load, se->load.weight);
Peter Zijlstrac09595f2008-06-27 13:41:14 +02002576 if (!parent_entity(se))
Peter Zijlstra029632f2011-10-25 10:00:11 +02002577 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
Peter Zijlstra367456c2012-02-20 21:49:09 +01002578#ifdef CONFIG_SMP
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01002579 if (entity_is_task(se)) {
2580 struct rq *rq = rq_of(cfs_rq);
2581
2582 account_numa_enqueue(rq, task_of(se));
2583 list_add(&se->group_node, &rq->cfs_tasks);
2584 }
Peter Zijlstra367456c2012-02-20 21:49:09 +01002585#endif
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02002586 cfs_rq->nr_running++;
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02002587}
2588
2589static void
2590account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2591{
2592 update_load_sub(&cfs_rq->load, se->load.weight);
Peter Zijlstrac09595f2008-06-27 13:41:14 +02002593 if (!parent_entity(se))
Peter Zijlstra029632f2011-10-25 10:00:11 +02002594 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
Tim Chenbfdb1982016-02-01 14:47:59 -08002595#ifdef CONFIG_SMP
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01002596 if (entity_is_task(se)) {
2597 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
Bharata B Raob87f1722008-09-25 09:53:54 +05302598 list_del_init(&se->group_node);
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01002599 }
Tim Chenbfdb1982016-02-01 14:47:59 -08002600#endif
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02002601 cfs_rq->nr_running--;
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02002602}
2603
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002604#ifdef CONFIG_FAIR_GROUP_SCHED
2605# ifdef CONFIG_SMP
Paul Turner6d5ab292011-01-21 20:45:01 -08002606static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002607{
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02002608 long tg_weight, load, shares;
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002609
Peter Zijlstraea1dc6f2016-06-24 16:11:02 +02002610 /*
2611 * This really should be: cfs_rq->avg.load_avg, but instead we use
2612 * cfs_rq->load.weight, which is its upper bound. This helps ramp up
2613 * the shares for small weight interactive tasks.
2614 */
2615 load = scale_load_down(cfs_rq->load.weight);
2616
2617 tg_weight = atomic_long_read(&tg->load_avg);
2618
2619 /* Ensure tg_weight >= load */
2620 tg_weight -= cfs_rq->tg_load_avg_contrib;
2621 tg_weight += load;
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002622
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002623 shares = (tg->shares * load);
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02002624 if (tg_weight)
2625 shares /= tg_weight;
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002626
2627 if (shares < MIN_SHARES)
2628 shares = MIN_SHARES;
2629 if (shares > tg->shares)
2630 shares = tg->shares;
2631
2632 return shares;
2633}
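/*
 * Illustrative arithmetic for calc_cfs_shares() above (numbers are made
 * up, and scale_load_down() is treated as the identity for simplicity):
 * with tg->shares = 1024, this cfs_rq contributing load = 512 and a
 * corrected group weight tg_weight = 2048, we get
 *	shares = 1024 * 512 / 2048 = 256,
 * which is then clamped to the range [MIN_SHARES, tg->shares].
 */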
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002634# else /* CONFIG_SMP */
Paul Turner6d5ab292011-01-21 20:45:01 -08002635static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002636{
2637 return tg->shares;
2638}
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002639# endif /* CONFIG_SMP */
Peter Zijlstraea1dc6f2016-06-24 16:11:02 +02002640
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002641static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2642 unsigned long weight)
2643{
Paul Turner19e5eeb2010-12-15 19:10:18 -08002644 if (se->on_rq) {
2645 /* commit outstanding execution time */
2646 if (cfs_rq->curr == se)
2647 update_curr(cfs_rq);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002648 account_entity_dequeue(cfs_rq, se);
Paul Turner19e5eeb2010-12-15 19:10:18 -08002649 }
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002650
2651 update_load_set(&se->load, weight);
2652
2653 if (se->on_rq)
2654 account_entity_enqueue(cfs_rq, se);
2655}
2656
Paul Turner82958362012-10-04 13:18:31 +02002657static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2658
Paul Turner6d5ab292011-01-21 20:45:01 -08002659static void update_cfs_shares(struct cfs_rq *cfs_rq)
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002660{
2661 struct task_group *tg;
2662 struct sched_entity *se;
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002663 long shares;
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002664
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002665 tg = cfs_rq->tg;
2666 se = tg->se[cpu_of(rq_of(cfs_rq))];
Paul Turner64660c82011-07-21 09:43:36 -07002667 if (!se || throttled_hierarchy(cfs_rq))
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002668 return;
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002669#ifndef CONFIG_SMP
2670 if (likely(se->load.weight == tg->shares))
2671 return;
2672#endif
Paul Turner6d5ab292011-01-21 20:45:01 -08002673 shares = calc_cfs_shares(cfs_rq, tg);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002674
2675 reweight_entity(cfs_rq_of(se), se, shares);
2676}
2677#else /* CONFIG_FAIR_GROUP_SCHED */
Paul Turner6d5ab292011-01-21 20:45:01 -08002678static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002679{
2680}
2681#endif /* CONFIG_FAIR_GROUP_SCHED */
2682
Alex Shi141965c2013-06-26 13:05:39 +08002683#ifdef CONFIG_SMP
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07002684u32 sched_get_wake_up_idle(struct task_struct *p)
2685{
2686 u32 enabled = p->flags & PF_WAKE_UP_IDLE;
2687
2688 return !!enabled;
2689}
2690
2691int sched_set_wake_up_idle(struct task_struct *p, int wake_up_idle)
2692{
2693 int enable = !!wake_up_idle;
2694
2695 if (enable)
2696 p->flags |= PF_WAKE_UP_IDLE;
2697 else
2698 p->flags &= ~PF_WAKE_UP_IDLE;
2699
2700 return 0;
2701}
2702
Paul Turner5b51f2f2012-10-04 13:18:32 +02002703/* Precomputed fixed inverse multiplies for multiplication by y^n */
2704static const u32 runnable_avg_yN_inv[] = {
2705 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
2706 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
2707 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
2708 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
2709 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
2710 0x85aac367, 0x82cd8698,
2711};
2712
2713/*
2714 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
2715 * over-estimates when re-combining.
2716 */
2717static const u32 runnable_avg_yN_sum[] = {
2718 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
2719 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
2720 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
2721};
2722
2723/*
Yuyang Du7b20b912016-05-03 05:54:27 +08002724 * Precomputed \Sum y^k { 1<=k<=n, where n%32=0 }. Values are rounded down
 2725 * to lower integers. See Documentation/scheduler/sched-avg.txt for how
 2726 * these were generated:
2727 */
2728static const u32 __accumulated_sum_N32[] = {
2729 0, 23371, 35056, 40899, 43820, 45281,
2730 46011, 46376, 46559, 46650, 46696, 46719,
2731};
2732
2733/*
Paul Turner9d85f212012-10-04 13:18:29 +02002734 * Approximate:
2735 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
2736 */
2737static __always_inline u64 decay_load(u64 val, u64 n)
2738{
Paul Turner5b51f2f2012-10-04 13:18:32 +02002739 unsigned int local_n;
2740
2741 if (!n)
2742 return val;
2743 else if (unlikely(n > LOAD_AVG_PERIOD * 63))
2744 return 0;
2745
2746 /* after bounds checking we can collapse to 32-bit */
2747 local_n = n;
2748
2749 /*
2750	 * As y^PERIOD = 1/2, we can decompose
Zhihui Zhang9c58c792014-09-20 21:24:36 -04002751	 * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
2752	 * and use a look-up table which covers y^n (n < PERIOD)
Paul Turner5b51f2f2012-10-04 13:18:32 +02002753	 *
2754	 * to achieve a constant-time decay_load().
2755 */
2756 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
2757 val >>= local_n / LOAD_AVG_PERIOD;
2758 local_n %= LOAD_AVG_PERIOD;
Paul Turner9d85f212012-10-04 13:18:29 +02002759 }
2760
Yuyang Du9d89c252015-07-15 08:04:37 +08002761 val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
2762 return val;
Paul Turner5b51f2f2012-10-04 13:18:32 +02002763}
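/*
 * Worked example for decay_load() (approximate values): decay_load(1024, 36)
 * first shifts by 36 / LOAD_AVG_PERIOD = 1, giving 512 (one full halving),
 * leaving local_n = 36 % 32 = 4. The remaining y^4 factor is applied via
 * runnable_avg_yN_inv[4] = 0xeac0c6e6 (~0.917 * 2^32), so the result is
 * roughly 512 * 0.917 ~= 469.
 */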
2764
2765/*
2766 * For updates fully spanning n periods, the contribution to runnable
2767 * average will be: \Sum 1024*y^n
2768 *
2769 * We can compute this reasonably efficiently by combining:
2770 * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
2771 */
2772static u32 __compute_runnable_contrib(u64 n)
2773{
2774 u32 contrib = 0;
2775
2776 if (likely(n <= LOAD_AVG_PERIOD))
2777 return runnable_avg_yN_sum[n];
2778 else if (unlikely(n >= LOAD_AVG_MAX_N))
2779 return LOAD_AVG_MAX;
2780
Yuyang Du7b20b912016-05-03 05:54:27 +08002781 /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
2782 contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
2783 n %= LOAD_AVG_PERIOD;
Paul Turner5b51f2f2012-10-04 13:18:32 +02002784 contrib = decay_load(contrib, n);
2785 return contrib + runnable_avg_yN_sum[n];
Paul Turner9d85f212012-10-04 13:18:29 +02002786}
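/*
 * Worked example for __compute_runnable_contrib() (approximate values):
 * for n = 40 periods, contrib starts as __accumulated_sum_N32[40/32] =
 * __accumulated_sum_N32[1] = 23371, is decayed by the remaining
 * n % 32 = 8 periods so that it covers the oldest 32 of the 40 periods
 * (23371 * y^8 ~= 19653), and the most recent 8 periods are then added
 * via runnable_avg_yN_sum[8] = 7437, giving roughly 27090 in total.
 */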
2787
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07002788#ifdef CONFIG_SCHED_HMP
2789
2790/* CPU selection flag */
2791#define SBC_FLAG_PREV_CPU 0x1
2792#define SBC_FLAG_BEST_CAP_CPU 0x2
2793#define SBC_FLAG_CPU_COST 0x4
2794#define SBC_FLAG_MIN_COST 0x8
2795#define SBC_FLAG_IDLE_LEAST_LOADED 0x10
2796#define SBC_FLAG_IDLE_CSTATE 0x20
2797#define SBC_FLAG_COST_CSTATE_TIE_BREAKER 0x40
2798#define SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER 0x80
2799#define SBC_FLAG_CSTATE_LOAD 0x100
2800#define SBC_FLAG_BEST_SIBLING 0x200
Pavankumar Kondeti72b49a32016-09-06 11:59:28 +05302801#define SBC_FLAG_WAKER_CPU 0x400
Srivatsa Vaddagirib36e6612016-09-09 19:38:03 +05302802#define SBC_FLAG_PACK_TASK 0x800
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07002803
2804/* Cluster selection flag */
2805#define SBC_FLAG_COLOC_CLUSTER 0x10000
2806#define SBC_FLAG_WAKER_CLUSTER 0x20000
2807#define SBC_FLAG_BACKUP_CLUSTER 0x40000
Joonwoo Park427060b2016-09-23 12:21:55 -07002808#define SBC_FLAG_BOOST_CLUSTER 0x80000
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07002809
2810struct cpu_select_env {
2811 struct task_struct *p;
2812 struct related_thread_group *rtg;
2813 u8 reason;
2814 u8 need_idle:1;
2815 u8 need_waker_cluster:1;
2816 u8 sync:1;
2817 u8 ignore_prev_cpu:1;
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07002818 enum sched_boost_policy boost_policy;
Srivatsa Vaddagirib36e6612016-09-09 19:38:03 +05302819 u8 pack_task:1;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07002820 int prev_cpu;
2821 DECLARE_BITMAP(candidate_list, NR_CPUS);
2822 DECLARE_BITMAP(backup_list, NR_CPUS);
2823 u64 task_load;
2824 u64 cpu_load;
2825 u32 sbc_best_flag;
2826 u32 sbc_best_cluster_flag;
2827};
2828
2829struct cluster_cpu_stats {
2830 int best_idle_cpu, least_loaded_cpu;
2831 int best_capacity_cpu, best_cpu, best_sibling_cpu;
2832 int min_cost, best_sibling_cpu_cost;
Joonwoo Park3e7f21b2016-09-23 12:55:54 -07002833 int best_cpu_wakeup_latency;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07002834 u64 min_load, best_load, best_sibling_cpu_load;
2835 s64 highest_spare_capacity;
2836};
2837
2838static int spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq)
2839{
2840 u64 total_load;
2841
2842 total_load = env->task_load + env->cpu_load;
2843
2844 if (total_load > sched_spill_load ||
2845 (rq->nr_running + 1) > sysctl_sched_spill_nr_run)
2846 return 1;
2847
2848 return 0;
2849}
2850
2851static int skip_cpu(int cpu, struct cpu_select_env *env)
2852{
2853 int tcpu = task_cpu(env->p);
2854 int skip = 0;
2855
2856 if (!env->reason)
2857 return 0;
2858
2859 if (is_reserved(cpu))
2860 return 1;
2861
2862 switch (env->reason) {
2863 case UP_MIGRATION:
2864 skip = !idle_cpu(cpu);
2865 break;
2866 case IRQLOAD_MIGRATION:
2867 /* Purposely fall through */
2868 default:
2869 skip = (cpu == tcpu);
2870 break;
2871 }
2872
2873 return skip;
2874}
2875
2876static inline int
2877acceptable_capacity(struct sched_cluster *cluster, struct cpu_select_env *env)
2878{
2879 int tcpu;
2880
2881 if (!env->reason)
2882 return 1;
2883
2884 tcpu = task_cpu(env->p);
2885 switch (env->reason) {
2886 case UP_MIGRATION:
2887 return cluster->capacity > cpu_capacity(tcpu);
2888
2889 case DOWN_MIGRATION:
2890 return cluster->capacity < cpu_capacity(tcpu);
2891
2892 default:
2893 break;
2894 }
2895
2896 return 1;
2897}
2898
2899static int
2900skip_cluster(struct sched_cluster *cluster, struct cpu_select_env *env)
2901{
2902 if (!test_bit(cluster->id, env->candidate_list))
2903 return 1;
2904
2905 if (!acceptable_capacity(cluster, env)) {
2906 __clear_bit(cluster->id, env->candidate_list);
2907 return 1;
2908 }
2909
2910 return 0;
2911}
2912
2913static struct sched_cluster *
2914select_least_power_cluster(struct cpu_select_env *env)
2915{
2916 struct sched_cluster *cluster;
2917
2918 if (env->rtg) {
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07002919 int cpu = cluster_first_cpu(env->rtg->preferred_cluster);
Joonwoo Park427060b2016-09-23 12:21:55 -07002920
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07002921 env->task_load = scale_load_to_cpu(task_load(env->p), cpu);
2922
2923 if (task_load_will_fit(env->p, env->task_load,
2924 cpu, env->boost_policy)) {
2925 env->sbc_best_cluster_flag |= SBC_FLAG_COLOC_CLUSTER;
2926
2927 if (env->boost_policy == SCHED_BOOST_NONE)
2928 return env->rtg->preferred_cluster;
2929
Joonwoo Park427060b2016-09-23 12:21:55 -07002930 for_each_sched_cluster(cluster) {
2931 if (cluster != env->rtg->preferred_cluster) {
2932 __set_bit(cluster->id,
2933 env->backup_list);
2934 __clear_bit(cluster->id,
2935 env->candidate_list);
2936 }
2937 }
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07002938
2939 return env->rtg->preferred_cluster;
Joonwoo Park427060b2016-09-23 12:21:55 -07002940 }
2941
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07002942 /*
2943 * Since the task load does not fit on the preferred
2944 * cluster anymore, pretend that the task does not
2945 * have any preferred cluster. This allows the waking
2946 * task to get the appropriate CPU it needs as per the
2947 * non co-location placement policy without having to
2948 * wait until the preferred cluster is updated.
2949 */
2950 env->rtg = NULL;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07002951 }
2952
2953 for_each_sched_cluster(cluster) {
2954 if (!skip_cluster(cluster, env)) {
2955 int cpu = cluster_first_cpu(cluster);
2956
2957 env->task_load = scale_load_to_cpu(task_load(env->p),
2958 cpu);
2959 if (task_load_will_fit(env->p, env->task_load, cpu,
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07002960 env->boost_policy))
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07002961 return cluster;
2962
2963 __set_bit(cluster->id, env->backup_list);
2964 __clear_bit(cluster->id, env->candidate_list);
2965 }
2966 }
2967
2968 return NULL;
2969}
2970
2971static struct sched_cluster *
2972next_candidate(const unsigned long *list, int start, int end)
2973{
2974 int cluster_id;
2975
2976 cluster_id = find_next_bit(list, end, start - 1 + 1);
2977 if (cluster_id >= end)
2978 return NULL;
2979
2980 return sched_cluster[cluster_id];
2981}
2982
2983static void
2984update_spare_capacity(struct cluster_cpu_stats *stats,
2985 struct cpu_select_env *env, int cpu, int capacity,
2986 u64 cpu_load)
2987{
2988 s64 spare_capacity = sched_ravg_window - cpu_load;
2989
2990 if (spare_capacity > 0 &&
2991 (spare_capacity > stats->highest_spare_capacity ||
2992 (spare_capacity == stats->highest_spare_capacity &&
2993 ((!env->need_waker_cluster &&
2994 capacity > cpu_capacity(stats->best_capacity_cpu)) ||
2995 (env->need_waker_cluster &&
2996 cpu_rq(cpu)->nr_running <
2997 cpu_rq(stats->best_capacity_cpu)->nr_running))))) {
2998 /*
2999		 * If the sync waker is the only runnable task on its CPU, that
 3000		 * CPU's cr_avg is 0, so there is a high chance of placing the
 3001		 * wakee on the waker's CPU, which likely causes preemption of
 3002		 * the waker. That can in turn lead to migration of the preempted
 3003		 * waker. Place the wakee on a genuinely idle CPU when possible,
 3004		 * by checking nr_running, to avoid such preemption.
3005 */
3006 stats->highest_spare_capacity = spare_capacity;
3007 stats->best_capacity_cpu = cpu;
3008 }
3009}
3010
3011static inline void find_backup_cluster(
3012struct cpu_select_env *env, struct cluster_cpu_stats *stats)
3013{
3014 struct sched_cluster *next = NULL;
3015 int i;
3016
3017 while (!bitmap_empty(env->backup_list, num_clusters)) {
3018 next = next_candidate(env->backup_list, 0, num_clusters);
3019 __clear_bit(next->id, env->backup_list);
3020 for_each_cpu_and(i, &env->p->cpus_allowed, &next->cpus) {
3021 trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i),
3022 sched_irqload(i), power_cost(i, task_load(env->p) +
3023 cpu_cravg_sync(i, env->sync)), 0);
3024
3025 update_spare_capacity(stats, env, i, next->capacity,
3026 cpu_load_sync(i, env->sync));
3027 }
3028 env->sbc_best_cluster_flag = SBC_FLAG_BACKUP_CLUSTER;
3029 }
3030}
3031
3032struct sched_cluster *
3033next_best_cluster(struct sched_cluster *cluster, struct cpu_select_env *env,
3034 struct cluster_cpu_stats *stats)
3035{
3036 struct sched_cluster *next = NULL;
3037
3038 __clear_bit(cluster->id, env->candidate_list);
3039
3040 if (env->rtg && preferred_cluster(cluster, env->p))
3041 return NULL;
3042
3043 do {
3044 if (bitmap_empty(env->candidate_list, num_clusters))
3045 return NULL;
3046
3047 next = next_candidate(env->candidate_list, 0, num_clusters);
3048 if (next) {
3049 if (next->min_power_cost > stats->min_cost) {
3050 clear_bit(next->id, env->candidate_list);
3051 next = NULL;
3052 continue;
3053 }
3054
3055 if (skip_cluster(next, env))
3056 next = NULL;
3057 }
3058 } while (!next);
3059
3060 env->task_load = scale_load_to_cpu(task_load(env->p),
3061 cluster_first_cpu(next));
3062 return next;
3063}
3064
3065#ifdef CONFIG_SCHED_HMP_CSTATE_AWARE
3066static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
3067 struct cpu_select_env *env, int cpu_cost)
3068{
Joonwoo Park3e7f21b2016-09-23 12:55:54 -07003069 int wakeup_latency;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003070 int prev_cpu = env->prev_cpu;
3071
Joonwoo Park3e7f21b2016-09-23 12:55:54 -07003072 wakeup_latency = cpu_rq(cpu)->wakeup_latency;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003073
3074 if (env->need_idle) {
3075 stats->min_cost = cpu_cost;
3076 if (idle_cpu(cpu)) {
Joonwoo Park3e7f21b2016-09-23 12:55:54 -07003077 if (wakeup_latency < stats->best_cpu_wakeup_latency ||
3078 (wakeup_latency == stats->best_cpu_wakeup_latency &&
3079 cpu == prev_cpu)) {
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003080 stats->best_idle_cpu = cpu;
Joonwoo Park3e7f21b2016-09-23 12:55:54 -07003081 stats->best_cpu_wakeup_latency = wakeup_latency;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003082 }
3083 } else {
3084 if (env->cpu_load < stats->min_load ||
3085 (env->cpu_load == stats->min_load &&
3086 cpu == prev_cpu)) {
3087 stats->least_loaded_cpu = cpu;
3088 stats->min_load = env->cpu_load;
3089 }
3090 }
3091
3092 return;
3093 }
3094
3095 if (cpu_cost < stats->min_cost) {
3096 stats->min_cost = cpu_cost;
Joonwoo Park3e7f21b2016-09-23 12:55:54 -07003097 stats->best_cpu_wakeup_latency = wakeup_latency;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003098 stats->best_load = env->cpu_load;
3099 stats->best_cpu = cpu;
3100 env->sbc_best_flag = SBC_FLAG_CPU_COST;
3101 return;
3102 }
3103
3104 /* CPU cost is the same. Start breaking the tie by C-state */
3105
Joonwoo Park3e7f21b2016-09-23 12:55:54 -07003106 if (wakeup_latency > stats->best_cpu_wakeup_latency)
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003107 return;
3108
Joonwoo Park3e7f21b2016-09-23 12:55:54 -07003109 if (wakeup_latency < stats->best_cpu_wakeup_latency) {
3110 stats->best_cpu_wakeup_latency = wakeup_latency;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003111 stats->best_load = env->cpu_load;
3112 stats->best_cpu = cpu;
3113 env->sbc_best_flag = SBC_FLAG_COST_CSTATE_TIE_BREAKER;
3114 return;
3115 }
3116
3117 /* C-state is the same. Use prev CPU to break the tie */
3118 if (cpu == prev_cpu) {
3119 stats->best_cpu = cpu;
3120 env->sbc_best_flag = SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER;
3121 return;
3122 }
3123
3124 if (stats->best_cpu != prev_cpu &&
Joonwoo Park3e7f21b2016-09-23 12:55:54 -07003125 ((wakeup_latency == 0 && env->cpu_load < stats->best_load) ||
3126 (wakeup_latency > 0 && env->cpu_load > stats->best_load))) {
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003127 stats->best_load = env->cpu_load;
3128 stats->best_cpu = cpu;
3129 env->sbc_best_flag = SBC_FLAG_CSTATE_LOAD;
3130 }
3131}
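/*
 * Summary of the selection order implemented above: a lower power cost
 * always wins; on equal cost, a shallower C-state (lower wakeup_latency)
 * wins; on equal C-state, the previous CPU wins; failing all that, load
 * breaks the tie -- lower load among running CPUs (wakeup_latency == 0)
 * and higher load among equally idle CPUs (wakeup_latency > 0).
 */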
3132#else /* CONFIG_SCHED_HMP_CSTATE_AWARE */
3133static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
3134 struct cpu_select_env *env, int cpu_cost)
3135{
3136 int prev_cpu = env->prev_cpu;
3137
3138 if (cpu != prev_cpu && cpus_share_cache(prev_cpu, cpu)) {
3139 if (stats->best_sibling_cpu_cost > cpu_cost ||
3140 (stats->best_sibling_cpu_cost == cpu_cost &&
3141 stats->best_sibling_cpu_load > env->cpu_load)) {
3142 stats->best_sibling_cpu_cost = cpu_cost;
3143 stats->best_sibling_cpu_load = env->cpu_load;
3144 stats->best_sibling_cpu = cpu;
3145 }
3146 }
3147
3148 if ((cpu_cost < stats->min_cost) ||
3149 ((stats->best_cpu != prev_cpu &&
3150 stats->min_load > env->cpu_load) || cpu == prev_cpu)) {
3151 if (env->need_idle) {
3152 if (idle_cpu(cpu)) {
3153 stats->min_cost = cpu_cost;
3154 stats->best_idle_cpu = cpu;
3155 }
3156 } else {
3157 stats->min_cost = cpu_cost;
3158 stats->min_load = env->cpu_load;
3159 stats->best_cpu = cpu;
3160 env->sbc_best_flag = SBC_FLAG_MIN_COST;
3161 }
3162 }
3163}
3164#endif /* CONFIG_SCHED_HMP_CSTATE_AWARE */
3165
3166static void update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
3167 struct cpu_select_env *env)
3168{
3169 int cpu_cost;
3170
Srivatsa Vaddagirib36e6612016-09-09 19:38:03 +05303171 /*
3172 * We try to find the least loaded *busy* CPU irrespective
3173 * of the power cost.
3174 */
3175 if (env->pack_task)
3176 cpu_cost = cpu_min_power_cost(cpu);
3177
3178 else
3179 cpu_cost = power_cost(cpu, task_load(env->p) +
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003180 cpu_cravg_sync(cpu, env->sync));
Srivatsa Vaddagirib36e6612016-09-09 19:38:03 +05303181
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003182 if (cpu_cost <= stats->min_cost)
3183 __update_cluster_stats(cpu, stats, env, cpu_cost);
3184}
3185
3186static void find_best_cpu_in_cluster(struct sched_cluster *c,
3187 struct cpu_select_env *env, struct cluster_cpu_stats *stats)
3188{
3189 int i;
3190 struct cpumask search_cpus;
3191
3192 cpumask_and(&search_cpus, tsk_cpus_allowed(env->p), &c->cpus);
Olav Haugan3f2cb302016-05-31 14:34:46 -07003193 cpumask_andnot(&search_cpus, &search_cpus, cpu_isolated_mask);
3194
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003195 if (env->ignore_prev_cpu)
3196 cpumask_clear_cpu(env->prev_cpu, &search_cpus);
3197
3198 for_each_cpu(i, &search_cpus) {
3199 env->cpu_load = cpu_load_sync(i, env->sync);
3200
3201 trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i),
3202 sched_irqload(i),
3203 power_cost(i, task_load(env->p) +
3204 cpu_cravg_sync(i, env->sync)), 0);
3205
3206 if (unlikely(!cpu_active(i)) || skip_cpu(i, env))
3207 continue;
3208
3209 update_spare_capacity(stats, env, i, c->capacity,
3210 env->cpu_load);
3211
Joonwoo Park427060b2016-09-23 12:21:55 -07003212 /*
3213		 * need_idle takes precedence over sched boost, but when both
 3214		 * are set, the idlest CPU among all clusters is selected
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07003215		 * when boost_policy = BOOST_ON_ALL, whereas the idlest CPU in the
 3216		 * big cluster is selected when boost_policy = BOOST_ON_BIG.
Joonwoo Park427060b2016-09-23 12:21:55 -07003217 */
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07003218 if ((!env->need_idle &&
3219 env->boost_policy != SCHED_BOOST_NONE) ||
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003220 env->need_waker_cluster ||
3221 sched_cpu_high_irqload(i) ||
3222 spill_threshold_crossed(env, cpu_rq(i)))
3223 continue;
3224
3225 update_cluster_stats(i, stats, env);
3226 }
3227}
3228
3229static inline void init_cluster_cpu_stats(struct cluster_cpu_stats *stats)
3230{
3231 stats->best_cpu = stats->best_idle_cpu = -1;
3232 stats->best_capacity_cpu = stats->best_sibling_cpu = -1;
3233 stats->min_cost = stats->best_sibling_cpu_cost = INT_MAX;
3234 stats->min_load = stats->best_sibling_cpu_load = ULLONG_MAX;
3235 stats->highest_spare_capacity = 0;
3236 stats->least_loaded_cpu = -1;
Joonwoo Park3e7f21b2016-09-23 12:55:54 -07003237 stats->best_cpu_wakeup_latency = INT_MAX;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003238 /* No need to initialize stats->best_load */
3239}
3240
3241/*
3242 * Should the task be woken to any available idle cpu?
 3243 *
 3244 * Waking tasks to an idle cpu has mixed implications for both performance and
 3245 * power. In many cases the scheduler cannot correctly estimate the impact of
 3246 * using idle cpus on either performance or power. PF_WAKE_UP_IDLE allows an
 3247 * external kernel module to pass a strong hint to the scheduler that the task
 3248 * in question should be woken to an idle cpu, generally to improve performance.
3249 */
3250static inline int wake_to_idle(struct task_struct *p)
3251{
3252 return (current->flags & PF_WAKE_UP_IDLE) ||
3253 (p->flags & PF_WAKE_UP_IDLE) || sysctl_sched_wake_to_idle;
3254}
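/*
 * Illustrative (hypothetical) usage of the PF_WAKE_UP_IDLE hint from a
 * kernel module that holds a reference to the task it wants to boost;
 * 'worker_task' is a made-up name, the helpers are defined earlier in
 * this file:
 *
 *	sched_set_wake_up_idle(worker_task, 1);	 prefer waking to idle CPUs
 *	...
 *	sched_set_wake_up_idle(worker_task, 0);	 restore default placement
 */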
3255
Srivatsa Vaddagirib36e6612016-09-09 19:38:03 +05303256static inline bool env_has_special_flags(struct cpu_select_env *env)
3257{
3258 if (env->need_idle || env->boost_policy != SCHED_BOOST_NONE ||
3259 env->reason)
3260 return true;
3261
3262 return false;
3263}
3264
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003265static inline bool
3266bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats)
3267{
3268 int prev_cpu;
3269 struct task_struct *task = env->p;
3270 struct sched_cluster *cluster;
3271
Srivatsa Vaddagirib36e6612016-09-09 19:38:03 +05303272 if (!task->ravg.mark_start || !sched_short_sleep_task_threshold)
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003273 return false;
3274
3275 prev_cpu = env->prev_cpu;
3276 if (!cpumask_test_cpu(prev_cpu, tsk_cpus_allowed(task)) ||
Olav Haugan3f2cb302016-05-31 14:34:46 -07003277 unlikely(!cpu_active(prev_cpu)) ||
3278 cpu_isolated(prev_cpu))
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003279 return false;
3280
3281 if (task->ravg.mark_start - task->last_cpu_selected_ts >=
3282 sched_long_cpu_selection_threshold)
3283 return false;
3284
3285 /*
3286	 * This function should be used by the task wakeup path only, as it
 3287	 * assumes p->last_switch_out_ts is the last sleep time.
 3288	 * p->last_switch_out_ts can denote the last preemption time as well
 3289	 * as the last sleep time.
3290 */
3291 if (task->ravg.mark_start - task->last_switch_out_ts >=
3292 sched_short_sleep_task_threshold)
3293 return false;
3294
3295 env->task_load = scale_load_to_cpu(task_load(task), prev_cpu);
3296 cluster = cpu_rq(prev_cpu)->cluster;
3297
3298 if (!task_load_will_fit(task, env->task_load, prev_cpu,
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07003299 sched_boost_policy())) {
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003300
3301 __set_bit(cluster->id, env->backup_list);
3302 __clear_bit(cluster->id, env->candidate_list);
3303 return false;
3304 }
3305
3306 env->cpu_load = cpu_load_sync(prev_cpu, env->sync);
3307 if (sched_cpu_high_irqload(prev_cpu) ||
3308 spill_threshold_crossed(env, cpu_rq(prev_cpu))) {
3309 update_spare_capacity(stats, env, prev_cpu,
3310 cluster->capacity, env->cpu_load);
3311 env->ignore_prev_cpu = 1;
3312 return false;
3313 }
3314
3315 return true;
3316}
3317
3318static inline bool
3319wake_to_waker_cluster(struct cpu_select_env *env)
3320{
Srivatsa Vaddagirib36e6612016-09-09 19:38:03 +05303321 return env->sync &&
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003322 task_load(current) > sched_big_waker_task_load &&
3323 task_load(env->p) < sched_small_wakee_task_load;
3324}
3325
Pavankumar Kondeti72b49a32016-09-06 11:59:28 +05303326static inline bool
3327bias_to_waker_cpu(struct task_struct *p, int cpu)
3328{
3329 return sysctl_sched_prefer_sync_wakee_to_waker &&
3330 cpu_rq(cpu)->nr_running == 1 &&
3331 cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) &&
3332 cpu_active(cpu) && !cpu_isolated(cpu);
3333}
3334
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003335static inline int
3336cluster_allowed(struct task_struct *p, struct sched_cluster *cluster)
3337{
3338 cpumask_t tmp_mask;
3339
3340 cpumask_and(&tmp_mask, &cluster->cpus, cpu_active_mask);
3341 cpumask_and(&tmp_mask, &tmp_mask, &p->cpus_allowed);
3342
3343 return !cpumask_empty(&tmp_mask);
3344}
3345
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003346/* return cheapest cpu that can fit this task */
3347static int select_best_cpu(struct task_struct *p, int target, int reason,
3348 int sync)
3349{
3350 struct sched_cluster *cluster, *pref_cluster = NULL;
3351 struct cluster_cpu_stats stats;
3352 struct related_thread_group *grp;
3353 unsigned int sbc_flag = 0;
Pavankumar Kondeti72b49a32016-09-06 11:59:28 +05303354 int cpu = raw_smp_processor_id();
Srivatsa Vaddagirib36e6612016-09-09 19:38:03 +05303355 bool special;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003356
3357 struct cpu_select_env env = {
3358 .p = p,
3359 .reason = reason,
3360 .need_idle = wake_to_idle(p),
3361 .need_waker_cluster = 0,
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003362 .sync = sync,
3363 .prev_cpu = target,
3364 .ignore_prev_cpu = 0,
3365 .rtg = NULL,
3366 .sbc_best_flag = 0,
3367 .sbc_best_cluster_flag = 0,
Srivatsa Vaddagirib36e6612016-09-09 19:38:03 +05303368 .pack_task = false,
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003369 };
3370
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07003371 env.boost_policy = task_sched_boost(p) ?
3372 sched_boost_policy() : SCHED_BOOST_NONE;
3373
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003374 bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS);
3375 bitmap_zero(env.backup_list, NR_CPUS);
3376
3377 init_cluster_cpu_stats(&stats);
Srivatsa Vaddagirib36e6612016-09-09 19:38:03 +05303378 special = env_has_special_flags(&env);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003379
3380 rcu_read_lock();
3381
3382 grp = task_related_thread_group(p);
3383
3384 if (grp && grp->preferred_cluster) {
3385 pref_cluster = grp->preferred_cluster;
3386 if (!cluster_allowed(p, pref_cluster))
3387 clear_bit(pref_cluster->id, env.candidate_list);
3388 else
3389 env.rtg = grp;
Srivatsa Vaddagirib36e6612016-09-09 19:38:03 +05303390 } else if (!special) {
Pavankumar Kondeti72b49a32016-09-06 11:59:28 +05303391 cluster = cpu_rq(cpu)->cluster;
3392 if (wake_to_waker_cluster(&env)) {
3393 if (bias_to_waker_cpu(p, cpu)) {
3394 target = cpu;
3395 sbc_flag = SBC_FLAG_WAKER_CLUSTER |
3396 SBC_FLAG_WAKER_CPU;
3397 goto out;
3398 } else if (cluster_allowed(p, cluster)) {
3399 env.need_waker_cluster = 1;
3400 bitmap_zero(env.candidate_list, NR_CPUS);
3401 __set_bit(cluster->id, env.candidate_list);
3402 env.sbc_best_cluster_flag =
3403 SBC_FLAG_WAKER_CLUSTER;
3404 }
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003405 } else if (bias_to_prev_cpu(&env, &stats)) {
3406 sbc_flag = SBC_FLAG_PREV_CPU;
3407 goto out;
3408 }
3409 }
3410
Srivatsa Vaddagirib36e6612016-09-09 19:38:03 +05303411 if (!special && is_short_burst_task(p)) {
3412 env.pack_task = true;
3413 sbc_flag = SBC_FLAG_PACK_TASK;
3414 }
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003415retry:
3416 cluster = select_least_power_cluster(&env);
3417
3418 if (!cluster)
3419 goto out;
3420
3421 /*
3422	 * 'cluster' now points to the minimum-power cluster which can satisfy
 3423	 * the task's performance goals. Walk down the cluster list starting
 3424	 * with that cluster. For non-small tasks, skip clusters that don't
 3425	 * have mostly_idle/idle cpus.
3426 */
3427
3428 do {
3429 find_best_cpu_in_cluster(cluster, &env, &stats);
3430
3431 } while ((cluster = next_best_cluster(cluster, &env, &stats)));
3432
3433 if (env.need_idle) {
3434 if (stats.best_idle_cpu >= 0) {
3435 target = stats.best_idle_cpu;
3436 sbc_flag |= SBC_FLAG_IDLE_CSTATE;
3437 } else if (stats.least_loaded_cpu >= 0) {
3438 target = stats.least_loaded_cpu;
3439 sbc_flag |= SBC_FLAG_IDLE_LEAST_LOADED;
3440 }
3441 } else if (stats.best_cpu >= 0) {
3442 if (stats.best_cpu != task_cpu(p) &&
3443 stats.min_cost == stats.best_sibling_cpu_cost) {
3444 stats.best_cpu = stats.best_sibling_cpu;
3445 sbc_flag |= SBC_FLAG_BEST_SIBLING;
3446 }
3447 sbc_flag |= env.sbc_best_flag;
3448 target = stats.best_cpu;
3449 } else {
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07003450 if (env.rtg && env.boost_policy == SCHED_BOOST_NONE) {
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003451 env.rtg = NULL;
3452 goto retry;
3453 }
3454
Joonwoo Park427060b2016-09-23 12:21:55 -07003455 /*
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07003456 * With boost_policy == SCHED_BOOST_ON_BIG, we reach here with
Joonwoo Park427060b2016-09-23 12:21:55 -07003457 * backup_list = little cluster, candidate_list = none and
3458 * stats->best_capacity_cpu points the best spare capacity
3459 * CPU among the CPUs in the big cluster.
3460 */
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07003461 if (env.boost_policy == SCHED_BOOST_ON_BIG &&
Joonwoo Park427060b2016-09-23 12:21:55 -07003462 stats.best_capacity_cpu >= 0)
3463 sbc_flag |= SBC_FLAG_BOOST_CLUSTER;
3464 else
3465 find_backup_cluster(&env, &stats);
3466
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003467 if (stats.best_capacity_cpu >= 0) {
3468 target = stats.best_capacity_cpu;
3469 sbc_flag |= SBC_FLAG_BEST_CAP_CPU;
3470 }
3471 }
3472 p->last_cpu_selected_ts = sched_ktime_clock();
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003473out:
Olav Haugan27960592016-10-17 17:05:54 -07003474 sbc_flag |= env.sbc_best_cluster_flag;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003475 rcu_read_unlock();
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07003476 trace_sched_task_load(p, sched_boost_policy() && task_sched_boost(p),
3477 env.reason, env.sync, env.need_idle, sbc_flag, target);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003478 return target;
3479}
3480
3481#ifdef CONFIG_CFS_BANDWIDTH
3482
3483static inline struct task_group *next_task_group(struct task_group *tg)
3484{
3485 tg = list_entry_rcu(tg->list.next, typeof(struct task_group), list);
3486
3487 return (&tg->list == &task_groups) ? NULL : tg;
3488}
3489
3490/* Iterate over all cfs_rq in a cpu */
3491#define for_each_cfs_rq(cfs_rq, tg, cpu) \
3492 for (tg = container_of(&task_groups, struct task_group, list); \
3493 ((tg = next_task_group(tg)) && (cfs_rq = tg->cfs_rq[cpu]));)
3494
3495void reset_cfs_rq_hmp_stats(int cpu, int reset_cra)
3496{
3497 struct task_group *tg;
3498 struct cfs_rq *cfs_rq;
3499
3500 rcu_read_lock();
3501
3502 for_each_cfs_rq(cfs_rq, tg, cpu)
3503 reset_hmp_stats(&cfs_rq->hmp_stats, reset_cra);
3504
3505 rcu_read_unlock();
3506}
3507
3508static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
3509
3510static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3511 struct task_struct *p, int change_cra);
3512static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3513 struct task_struct *p, int change_cra);
3514
3515/* Add a task's contribution to a cpu's HMP statistics */
3516void inc_hmp_sched_stats_fair(struct rq *rq,
3517 struct task_struct *p, int change_cra)
3518{
3519 struct cfs_rq *cfs_rq;
3520 struct sched_entity *se = &p->se;
3521
3522 /*
3523	 * Although the check below is not strictly required (as
 3524	 * inc/dec_nr_big_task and inc/dec_cumulative_runnable_avg, called
 3525	 * from inc_cfs_rq_hmp_stats(), have similar checks), we gain a bit of
 3526	 * efficiency by short-circuiting the for_each_sched_entity() loop when
 3527	 * sched_disable_window_stats is set.
3528 */
3529 if (sched_disable_window_stats)
3530 return;
3531
3532 for_each_sched_entity(se) {
3533 cfs_rq = cfs_rq_of(se);
3534 inc_cfs_rq_hmp_stats(cfs_rq, p, change_cra);
3535 if (cfs_rq_throttled(cfs_rq))
3536 break;
3537 }
3538
3539 /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */
3540 if (!se)
3541 inc_rq_hmp_stats(rq, p, change_cra);
3542}
3543
3544static void fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
3545 u32 new_task_load, u32 new_pred_demand)
3546{
3547 struct cfs_rq *cfs_rq;
3548 struct sched_entity *se = &p->se;
3549 s64 task_load_delta = (s64)new_task_load - task_load(p);
3550 s64 pred_demand_delta = PRED_DEMAND_DELTA;
3551
3552 for_each_sched_entity(se) {
3553 cfs_rq = cfs_rq_of(se);
3554
3555 fixup_cumulative_runnable_avg(&cfs_rq->hmp_stats, p,
3556 task_load_delta,
3557 pred_demand_delta);
3558 fixup_nr_big_tasks(&cfs_rq->hmp_stats, p, task_load_delta);
3559 if (cfs_rq_throttled(cfs_rq))
3560 break;
3561 }
3562
3563 /* Fix up rq->hmp_stats only if we didn't find any throttled cfs_rq */
3564 if (!se) {
3565 fixup_cumulative_runnable_avg(&rq->hmp_stats, p,
3566 task_load_delta,
3567 pred_demand_delta);
3568 fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta);
3569 }
3570}
3571
3572static int task_will_be_throttled(struct task_struct *p);
3573
3574#else /* CONFIG_CFS_BANDWIDTH */
3575
3576inline void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) { }
3577
3578static void
3579fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p,
3580 u32 new_task_load, u32 new_pred_demand)
3581{
3582 s64 task_load_delta = (s64)new_task_load - task_load(p);
3583 s64 pred_demand_delta = PRED_DEMAND_DELTA;
3584
3585 fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta,
3586 pred_demand_delta);
3587 fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta);
3588}
3589
3590static inline int task_will_be_throttled(struct task_struct *p)
3591{
3592 return 0;
3593}
3594
3595void inc_hmp_sched_stats_fair(struct rq *rq,
3596 struct task_struct *p, int change_cra)
3597{
3598 inc_nr_big_task(&rq->hmp_stats, p);
3599}
3600
3601#endif /* CONFIG_CFS_BANDWIDTH */
3602
3603/*
3604 * Reset balance_interval at all sched_domain levels of given cpu, so that it
3605 * honors kick.
3606 */
3607static inline void reset_balance_interval(int cpu)
3608{
3609 struct sched_domain *sd;
3610
3611 if (cpu >= nr_cpu_ids)
3612 return;
3613
3614 rcu_read_lock();
3615 for_each_domain(cpu, sd)
3616 sd->balance_interval = 0;
3617 rcu_read_unlock();
3618}
3619
3620/*
3621 * Check if a task is on the "wrong" cpu (i.e its current cpu is not the ideal
3622 * cpu as per its demand or priority)
3623 *
3624 * Returns reason why task needs to be migrated
3625 */
3626static inline int migration_needed(struct task_struct *p, int cpu)
3627{
3628 int nice;
3629 struct related_thread_group *grp;
3630
3631 if (p->state != TASK_RUNNING || p->nr_cpus_allowed == 1)
3632 return 0;
3633
3634 /* No need to migrate task that is about to be throttled */
3635 if (task_will_be_throttled(p))
3636 return 0;
3637
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07003638 if (sched_boost_policy() == SCHED_BOOST_ON_BIG &&
3639 cpu_capacity(cpu) != max_capacity && task_sched_boost(p))
3640 return UP_MIGRATION;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003641
3642 if (sched_cpu_high_irqload(cpu))
3643 return IRQLOAD_MIGRATION;
3644
3645 nice = task_nice(p);
3646 rcu_read_lock();
3647 grp = task_related_thread_group(p);
3648 if (!grp && (nice > SCHED_UPMIGRATE_MIN_NICE ||
3649 upmigrate_discouraged(p)) && cpu_capacity(cpu) > min_capacity) {
3650 rcu_read_unlock();
3651 return DOWN_MIGRATION;
3652 }
3653
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07003654 if (!task_will_fit(p, cpu)) {
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003655 rcu_read_unlock();
3656 return UP_MIGRATION;
3657 }
3658 rcu_read_unlock();
3659
3660 return 0;
3661}
3662
3663static inline int
3664kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
3665{
3666 unsigned long flags;
3667 int rc = 0;
3668
3669 /* Invoke active balance to force migrate currently running task */
3670 raw_spin_lock_irqsave(&rq->lock, flags);
3671 if (!rq->active_balance) {
3672 rq->active_balance = 1;
3673 rq->push_cpu = new_cpu;
3674 get_task_struct(p);
3675 rq->push_task = p;
3676 rc = 1;
3677 }
3678 raw_spin_unlock_irqrestore(&rq->lock, flags);
3679
3680 return rc;
3681}
3682
3683static DEFINE_RAW_SPINLOCK(migration_lock);
3684
3685/*
3686 * Check if currently running task should be migrated to a better cpu.
3687 *
3688 * Todo: Effect this via changes to nohz_balancer_kick() and load balance?
3689 */
3690void check_for_migration(struct rq *rq, struct task_struct *p)
3691{
3692 int cpu = cpu_of(rq), new_cpu;
3693 int active_balance = 0, reason;
3694
3695 reason = migration_needed(p, cpu);
3696 if (!reason)
3697 return;
3698
3699 raw_spin_lock(&migration_lock);
3700 new_cpu = select_best_cpu(p, cpu, reason, 0);
3701
3702 if (new_cpu != cpu) {
3703 active_balance = kick_active_balance(rq, p, new_cpu);
3704 if (active_balance)
3705 mark_reserved(new_cpu);
3706 }
3707
3708 raw_spin_unlock(&migration_lock);
3709
3710 if (active_balance)
3711 stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, rq,
3712 &rq->active_balance_work);
3713}
3714
3715#ifdef CONFIG_CFS_BANDWIDTH
3716
3717static void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq)
3718{
3719 cfs_rq->hmp_stats.nr_big_tasks = 0;
3720 cfs_rq->hmp_stats.cumulative_runnable_avg = 0;
3721 cfs_rq->hmp_stats.pred_demands_sum = 0;
3722}
3723
3724static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3725 struct task_struct *p, int change_cra)
3726{
3727 inc_nr_big_task(&cfs_rq->hmp_stats, p);
3728 if (change_cra)
3729 inc_cumulative_runnable_avg(&cfs_rq->hmp_stats, p);
3730}
3731
3732static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3733 struct task_struct *p, int change_cra)
3734{
3735 dec_nr_big_task(&cfs_rq->hmp_stats, p);
3736 if (change_cra)
3737 dec_cumulative_runnable_avg(&cfs_rq->hmp_stats, p);
3738}
3739
3740static void inc_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
3741 struct cfs_rq *cfs_rq)
3742{
3743 stats->nr_big_tasks += cfs_rq->hmp_stats.nr_big_tasks;
3744 stats->cumulative_runnable_avg +=
3745 cfs_rq->hmp_stats.cumulative_runnable_avg;
3746 stats->pred_demands_sum += cfs_rq->hmp_stats.pred_demands_sum;
3747}
3748
3749static void dec_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
3750 struct cfs_rq *cfs_rq)
3751{
3752 stats->nr_big_tasks -= cfs_rq->hmp_stats.nr_big_tasks;
3753 stats->cumulative_runnable_avg -=
3754 cfs_rq->hmp_stats.cumulative_runnable_avg;
3755 stats->pred_demands_sum -= cfs_rq->hmp_stats.pred_demands_sum;
3756
3757 BUG_ON(stats->nr_big_tasks < 0 ||
3758 (s64)stats->cumulative_runnable_avg < 0);
Olav Hauganb0cc9e12016-10-25 10:38:45 -07003759 BUG_ON((s64)stats->pred_demands_sum < 0);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003760}
3761
3762#else /* CONFIG_CFS_BANDWIDTH */
3763
3764static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3765 struct task_struct *p, int change_cra) { }
3766
3767static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3768 struct task_struct *p, int change_cra) { }
3769
3770#endif /* CONFIG_CFS_BANDWIDTH */
3771
3772#else /* CONFIG_SCHED_HMP */
3773
3774static inline void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) { }
3775
3776static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3777 struct task_struct *p, int change_cra) { }
3778
3779static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
3780 struct task_struct *p, int change_cra) { }
3781
3782static inline void inc_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
3783 struct cfs_rq *cfs_rq)
3784{
3785}
3786
3787static inline void dec_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats,
3788 struct cfs_rq *cfs_rq)
3789{
3790}
3791#endif /* CONFIG_SCHED_HMP */
3792
Peter Zijlstra54a21382015-09-07 15:05:42 +02003793#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
Dietmar Eggemanne0f5f3a2015-08-14 17:23:09 +01003794
Paul Turner9d85f212012-10-04 13:18:29 +02003795/*
3796 * We can represent the historical contribution to runnable average as the
3797 * coefficients of a geometric series. To do this we sub-divide our runnable
3798 * history into segments of approximately 1ms (1024us); label the segment that
3799 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
3800 *
3801 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
3802 * p0 p1 p2
3803 * (now) (~1ms ago) (~2ms ago)
3804 *
3805 * Let u_i denote the fraction of p_i that the entity was runnable.
3806 *
3807 * We then designate the fractions u_i as our co-efficients, yielding the
3808 * following representation of historical load:
3809 * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
3810 *
3811 * We choose y based on the width of a reasonable scheduling period, fixing:
3812 * y^32 = 0.5
3813 *
3814 * This means that the contribution to load ~32ms ago (u_32) will be weighted
3815 * approximately half as much as the contribution to load within the last ms
3816 * (u_0).
3817 *
3818 * When a period "rolls over" and we have new u_0`, multiplying the previous
3819 * sum again by y is sufficient to update:
3820 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
3821 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
3822 */
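/*
 * Concretely (approximate figures): with y^32 = 0.5, runnable time from
 * ~32ms ago is weighted by 1/2, from ~64ms ago by 1/4, from ~96ms ago by
 * 1/8, and so on. decay_load() truncates contributions older than
 * LOAD_AVG_PERIOD * 63 periods (roughly two seconds) to zero.
 */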
Yuyang Du9d89c252015-07-15 08:04:37 +08003823static __always_inline int
3824__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
Yuyang Du13962232015-07-15 08:04:41 +08003825 unsigned long weight, int running, struct cfs_rq *cfs_rq)
Paul Turner9d85f212012-10-04 13:18:29 +02003826{
Dietmar Eggemanne0f5f3a2015-08-14 17:23:09 +01003827 u64 delta, scaled_delta, periods;
Yuyang Du9d89c252015-07-15 08:04:37 +08003828 u32 contrib;
Peter Zijlstra6115c792015-09-07 15:09:15 +02003829 unsigned int delta_w, scaled_delta_w, decayed = 0;
Dietmar Eggemann6f2b0452015-09-07 14:57:22 +01003830 unsigned long scale_freq, scale_cpu;
Paul Turner9d85f212012-10-04 13:18:29 +02003831
Yuyang Du9d89c252015-07-15 08:04:37 +08003832 delta = now - sa->last_update_time;
Paul Turner9d85f212012-10-04 13:18:29 +02003833 /*
3834 * This should only happen when time goes backwards, which it
3835 * unfortunately does during sched clock init when we swap over to TSC.
3836 */
3837 if ((s64)delta < 0) {
Yuyang Du9d89c252015-07-15 08:04:37 +08003838 sa->last_update_time = now;
Paul Turner9d85f212012-10-04 13:18:29 +02003839 return 0;
3840 }
3841
3842 /*
3843 * Use 1024ns as the unit of measurement since it's a reasonable
3844 * approximation of 1us and fast to compute.
3845 */
3846 delta >>= 10;
3847 if (!delta)
3848 return 0;
Yuyang Du9d89c252015-07-15 08:04:37 +08003849 sa->last_update_time = now;
Paul Turner9d85f212012-10-04 13:18:29 +02003850
Dietmar Eggemann6f2b0452015-09-07 14:57:22 +01003851 scale_freq = arch_scale_freq_capacity(NULL, cpu);
3852 scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
3853
Paul Turner9d85f212012-10-04 13:18:29 +02003854 /* delta_w is the amount already accumulated against our next period */
Yuyang Du9d89c252015-07-15 08:04:37 +08003855 delta_w = sa->period_contrib;
Paul Turner9d85f212012-10-04 13:18:29 +02003856 if (delta + delta_w >= 1024) {
Paul Turner9d85f212012-10-04 13:18:29 +02003857 decayed = 1;
3858
Yuyang Du9d89c252015-07-15 08:04:37 +08003859 /* how much left for next period will start over, we don't know yet */
3860 sa->period_contrib = 0;
3861
Paul Turner9d85f212012-10-04 13:18:29 +02003862 /*
3863 * Now that we know we're crossing a period boundary, figure
3864 * out how much from delta we need to complete the current
3865 * period and accrue it.
3866 */
3867 delta_w = 1024 - delta_w;
Peter Zijlstra54a21382015-09-07 15:05:42 +02003868 scaled_delta_w = cap_scale(delta_w, scale_freq);
Yuyang Du13962232015-07-15 08:04:41 +08003869 if (weight) {
Dietmar Eggemanne0f5f3a2015-08-14 17:23:09 +01003870 sa->load_sum += weight * scaled_delta_w;
3871 if (cfs_rq) {
3872 cfs_rq->runnable_load_sum +=
3873 weight * scaled_delta_w;
3874 }
Yuyang Du13962232015-07-15 08:04:41 +08003875 }
Vincent Guittot36ee28e2015-02-27 16:54:04 +01003876 if (running)
Peter Zijlstra006cdf02015-09-09 09:06:17 +02003877 sa->util_sum += scaled_delta_w * scale_cpu;
Paul Turner9d85f212012-10-04 13:18:29 +02003878
Paul Turner5b51f2f2012-10-04 13:18:32 +02003879 delta -= delta_w;
Paul Turner9d85f212012-10-04 13:18:29 +02003880
Paul Turner5b51f2f2012-10-04 13:18:32 +02003881 /* Figure out how many additional periods this update spans */
3882 periods = delta / 1024;
3883 delta %= 1024;
3884
Yuyang Du9d89c252015-07-15 08:04:37 +08003885 sa->load_sum = decay_load(sa->load_sum, periods + 1);
Yuyang Du13962232015-07-15 08:04:41 +08003886 if (cfs_rq) {
3887 cfs_rq->runnable_load_sum =
3888 decay_load(cfs_rq->runnable_load_sum, periods + 1);
3889 }
Yuyang Du9d89c252015-07-15 08:04:37 +08003890 sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
Paul Turner5b51f2f2012-10-04 13:18:32 +02003891
3892 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
Yuyang Du9d89c252015-07-15 08:04:37 +08003893 contrib = __compute_runnable_contrib(periods);
Peter Zijlstra54a21382015-09-07 15:05:42 +02003894 contrib = cap_scale(contrib, scale_freq);
Yuyang Du13962232015-07-15 08:04:41 +08003895 if (weight) {
Yuyang Du9d89c252015-07-15 08:04:37 +08003896 sa->load_sum += weight * contrib;
Yuyang Du13962232015-07-15 08:04:41 +08003897 if (cfs_rq)
3898 cfs_rq->runnable_load_sum += weight * contrib;
3899 }
Vincent Guittot36ee28e2015-02-27 16:54:04 +01003900 if (running)
Peter Zijlstra006cdf02015-09-09 09:06:17 +02003901 sa->util_sum += contrib * scale_cpu;
Paul Turner9d85f212012-10-04 13:18:29 +02003902 }
3903
3904 /* Remainder of delta accrued against u_0` */
Peter Zijlstra54a21382015-09-07 15:05:42 +02003905 scaled_delta = cap_scale(delta, scale_freq);
Yuyang Du13962232015-07-15 08:04:41 +08003906 if (weight) {
Dietmar Eggemanne0f5f3a2015-08-14 17:23:09 +01003907 sa->load_sum += weight * scaled_delta;
Yuyang Du13962232015-07-15 08:04:41 +08003908 if (cfs_rq)
Dietmar Eggemanne0f5f3a2015-08-14 17:23:09 +01003909 cfs_rq->runnable_load_sum += weight * scaled_delta;
Yuyang Du13962232015-07-15 08:04:41 +08003910 }
Vincent Guittot36ee28e2015-02-27 16:54:04 +01003911 if (running)
Peter Zijlstra006cdf02015-09-09 09:06:17 +02003912 sa->util_sum += scaled_delta * scale_cpu;
Yuyang Du9d89c252015-07-15 08:04:37 +08003913
3914 sa->period_contrib += delta;
3915
3916 if (decayed) {
3917 sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
Yuyang Du13962232015-07-15 08:04:41 +08003918 if (cfs_rq) {
3919 cfs_rq->runnable_load_avg =
3920 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
3921 }
Peter Zijlstra006cdf02015-09-09 09:06:17 +02003922 sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
Yuyang Du9d89c252015-07-15 08:04:37 +08003923 }
Paul Turner9d85f212012-10-04 13:18:29 +02003924
3925 return decayed;
3926}
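/*
 * Worked example for __update_load_avg() above (illustrative numbers):
 * suppose sa->period_contrib = 200 and delta = 3000 (~3ms since the last
 * update). Then delta_w = 1024 - 200 = 824 closes out the current period,
 * the remaining 2176 spans periods = 2 full periods with a remainder of
 * 128, the existing sums are decayed by periods + 1 = 3, the 2 full
 * periods contribute __compute_runnable_contrib(2) = 1982 (before
 * frequency/capacity scaling), and the final 128 is accrued against the
 * new current period, leaving sa->period_contrib = 128.
 */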
3927
Paul Turnerc566e8e2012-10-04 13:18:30 +02003928#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra7c3edd22016-07-13 10:56:25 +02003929/**
3930 * update_tg_load_avg - update the tg's load avg
3931 * @cfs_rq: the cfs_rq whose avg changed
3932 * @force: update regardless of how small the difference
3933 *
3934 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
3935 * However, because tg->load_avg is a global value there are performance
3936 * considerations.
3937 *
3938 * In order to avoid having to look at the other cfs_rq's, we use a
3939 * differential update where we store the last value we propagated. This in
3940 * turn allows skipping updates if the differential is 'small'.
3941 *
3942 * Updating tg's load_avg is necessary before update_cfs_share() (which is
3943 * done) and effective_load() (which is not done because it is too costly).
Paul Turnerbb17f652012-10-04 13:18:31 +02003944 */
Yuyang Du9d89c252015-07-15 08:04:37 +08003945static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
Paul Turnerbb17f652012-10-04 13:18:31 +02003946{
Yuyang Du9d89c252015-07-15 08:04:37 +08003947 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
Paul Turnerbb17f652012-10-04 13:18:31 +02003948
Waiman Longaa0b7ae2015-12-02 13:41:50 -05003949 /*
3950 * No need to update load_avg for root_task_group as it is not used.
3951 */
3952 if (cfs_rq->tg == &root_task_group)
3953 return;
3954
Yuyang Du9d89c252015-07-15 08:04:37 +08003955 if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
3956 atomic_long_add(delta, &cfs_rq->tg->load_avg);
3957 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
Paul Turnerbb17f652012-10-04 13:18:31 +02003958 }
Paul Turner8165e142012-10-04 13:18:31 +02003959}
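/*
 * Example of the 1/64 filter above (made-up numbers): with
 * cfs_rq->tg_load_avg_contrib = 6400, the delta is propagated to the
 * global tg->load_avg only once |cfs_rq->avg.load_avg - 6400| exceeds
 * 6400 / 64 = 100, i.e. once the per-cpu average has moved by more than
 * ~1.5%, avoiding a contended atomic add for every tiny change.
 */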
Dietmar Eggemannf5f97392014-02-26 11:19:33 +00003960
Byungchul Parkad936d82015-10-24 01:16:19 +09003961/*
3962 * Called within set_task_rq() right before setting a task's cpu. The
3963 * caller only guarantees p->pi_lock is held; no other assumptions,
3964 * including the state of rq->lock, should be made.
3965 */
3966void set_task_rq_fair(struct sched_entity *se,
3967 struct cfs_rq *prev, struct cfs_rq *next)
3968{
3969 if (!sched_feat(ATTACH_AGE_LOAD))
3970 return;
3971
3972 /*
3973	 * We are supposed to update the task to "current" time, so that it is up to
 3974	 * date and ready to go to the new CPU/cfs_rq. But we have difficulty in
 3975	 * getting what the current time is, so simply throw away the out-of-date
 3976	 * time. This will result in the wakee task being less decayed, but giving
 3977	 * the wakee more load does not sound bad.
3978 */
3979 if (se->avg.last_update_time && prev) {
3980 u64 p_last_update_time;
3981 u64 n_last_update_time;
3982
3983#ifndef CONFIG_64BIT
3984 u64 p_last_update_time_copy;
3985 u64 n_last_update_time_copy;
3986
3987 do {
3988 p_last_update_time_copy = prev->load_last_update_time_copy;
3989 n_last_update_time_copy = next->load_last_update_time_copy;
3990
3991 smp_rmb();
3992
3993 p_last_update_time = prev->avg.last_update_time;
3994 n_last_update_time = next->avg.last_update_time;
3995
3996 } while (p_last_update_time != p_last_update_time_copy ||
3997 n_last_update_time != n_last_update_time_copy);
3998#else
3999 p_last_update_time = prev->avg.last_update_time;
4000 n_last_update_time = next->avg.last_update_time;
4001#endif
4002 __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
4003 &se->avg, 0, 0, NULL);
4004 se->avg.last_update_time = n_last_update_time;
4005 }
4006}
Peter Zijlstra6e831252014-02-11 16:11:48 +01004007#else /* CONFIG_FAIR_GROUP_SCHED */
Yuyang Du9d89c252015-07-15 08:04:37 +08004008static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
Peter Zijlstra6e831252014-02-11 16:11:48 +01004009#endif /* CONFIG_FAIR_GROUP_SCHED */
Paul Turnerc566e8e2012-10-04 13:18:30 +02004010
Steve Mucklea2c6c912016-03-24 15:26:07 -07004011static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
Yuyang Du9d89c252015-07-15 08:04:37 +08004012{
Rafael J. Wysocki58919e82016-08-16 22:14:55 +02004013 if (&this_rq()->cfs == cfs_rq) {
Steve Muckle21e96f82016-03-21 17:21:07 -07004014 /*
4015 * There are a few boundary cases this might miss but it should
4015		 * get called often enough that this should (hopefully) not be
 4016		 * a real problem. In addition, it only fires for the local
4018 * CPU, so if we enqueue remotely we'll miss an update, but
4019 * the next tick/schedule should update.
4020 *
4021 * It will not get called when we go idle, because the idle
4022 * thread is a different class (!fair), nor will the utilization
4023 * number include things like RT tasks.
4024 *
4025 * As is, the util number is not freq-invariant (we'd have to
4026 * implement arch_scale_freq_capacity() for that).
4027 *
4028 * See cpu_util().
4029 */
Rafael J. Wysocki12bde332016-08-10 03:11:17 +02004030 cpufreq_update_util(rq_of(cfs_rq), 0);
Steve Muckle21e96f82016-03-21 17:21:07 -07004031 }
Steve Mucklea2c6c912016-03-24 15:26:07 -07004032}
4033
Peter Zijlstra89741892016-06-16 10:50:40 +02004034/*
4035 * Unsigned subtract and clamp on underflow.
4036 *
4037 * Explicitly do a load-store to ensure the intermediate value never hits
4038 * memory. This allows lockless observations without ever seeing the negative
4039 * values.
4040 */
4041#define sub_positive(_ptr, _val) do { \
4042 typeof(_ptr) ptr = (_ptr); \
4043 typeof(*ptr) val = (_val); \
4044 typeof(*ptr) res, var = READ_ONCE(*ptr); \
4045 res = var - val; \
4046 if (res > var) \
4047 res = 0; \
4048 WRITE_ONCE(*ptr, res); \
4049} while (0)
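/*
 * Illustrative sketch (not part of the kernel build): what the clamp above
 * buys us for unsigned counters. With a plain "*ptr -= val" an over-subtract
 * would wrap around to a huge value; sub_positive() floors it at zero, and
 * the READ_ONCE/WRITE_ONCE pair keeps the wrapped intermediate invisible.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned long avg = 100;
	unsigned long val = 140;	/* stale removal can exceed the current sum */
	unsigned long res = avg - val;

	if (res > avg)			/* unsigned wrap-around detected */
		res = 0;

	printf("plain subtract: %lu, clamped: %lu\n", avg - val, res);
	return 0;
}
#endif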
4050
Peter Zijlstra3d30544f2016-06-21 14:27:50 +02004051/**
4052 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
4053 * @now: current time, as per cfs_rq_clock_task()
4054 * @cfs_rq: cfs_rq to update
4055 * @update_freq: should we call cfs_rq_util_change() or will the call do so
4056 *
4057 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
4058 * avg. The immediate corollary is that all (fair) tasks must be attached, see
4059 * post_init_entity_util_avg().
4060 *
	4061 * cfs_rq->avg is used for task_h_load() and update_cfs_shares() for example.
4062 *
Peter Zijlstra7c3edd22016-07-13 10:56:25 +02004063 * Returns true if the load decayed or we removed load.
4064 *
4065 * Since both these conditions indicate a changed cfs_rq->avg.load we should
4066 * call update_tg_load_avg() when this function returns true.
Peter Zijlstra3d30544f2016-06-21 14:27:50 +02004067 */
Steve Mucklea2c6c912016-03-24 15:26:07 -07004068static inline int
4069update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
4070{
4071 struct sched_avg *sa = &cfs_rq->avg;
4072 int decayed, removed_load = 0, removed_util = 0;
4073
4074 if (atomic_long_read(&cfs_rq->removed_load_avg)) {
4075 s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
Peter Zijlstra89741892016-06-16 10:50:40 +02004076 sub_positive(&sa->load_avg, r);
4077 sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
Steve Mucklea2c6c912016-03-24 15:26:07 -07004078 removed_load = 1;
4079 }
4080
4081 if (atomic_long_read(&cfs_rq->removed_util_avg)) {
4082 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
Peter Zijlstra89741892016-06-16 10:50:40 +02004083 sub_positive(&sa->util_avg, r);
4084 sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
Steve Mucklea2c6c912016-03-24 15:26:07 -07004085 removed_util = 1;
4086 }
4087
4088 decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
4089 scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
4090
4091#ifndef CONFIG_64BIT
4092 smp_wmb();
4093 cfs_rq->load_last_update_time_copy = sa->last_update_time;
4094#endif
4095
4096 if (update_freq && (decayed || removed_util))
4097 cfs_rq_util_change(cfs_rq);
Steve Muckle21e96f82016-03-21 17:21:07 -07004098
Steve Muckle41e0d372016-03-21 17:21:08 -07004099 return decayed || removed_load;
Yuyang Du9d89c252015-07-15 08:04:37 +08004100}
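/*
 * Illustrative sketch (not part of the kernel build): the removed_load_avg
 * handling above in miniature. Remote CPUs only add to an atomic "removed"
 * accumulator; the owning CPU drains it with an exchange-to-zero and folds
 * the total into its local average, so the remote side never needs the
 * owner's rq->lock. The names below are invented for the example.
 */
#if 0
#include <stdatomic.h>

struct demo_avg {
	long load_avg;		/* only touched by the owning CPU */
	atomic_long removed;	/* added to by remote CPUs */
};

/* remote side, e.g. a task leaving this runqueue from another CPU */
static void demo_remove(struct demo_avg *a, long load)
{
	atomic_fetch_add(&a->removed, load);
}

/* local side, done while the owner updates its averages */
static void demo_update(struct demo_avg *a)
{
	long r = atomic_exchange(&a->removed, 0);

	a->load_avg = a->load_avg > r ? a->load_avg - r : 0;
}
#endif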
4101
4102/* Update task and its cfs_rq load average */
4103static inline void update_load_avg(struct sched_entity *se, int update_tg)
Paul Turner9d85f212012-10-04 13:18:29 +02004104{
Paul Turner2dac7542012-10-04 13:18:30 +02004105 struct cfs_rq *cfs_rq = cfs_rq_of(se);
Yuyang Du9d89c252015-07-15 08:04:37 +08004106 u64 now = cfs_rq_clock_task(cfs_rq);
Rafael J. Wysocki34e2c552016-02-15 20:20:42 +01004107 struct rq *rq = rq_of(cfs_rq);
4108 int cpu = cpu_of(rq);
Paul Turner2dac7542012-10-04 13:18:30 +02004109
Paul Turnerf1b17282012-10-04 13:18:31 +02004110 /*
Yuyang Du9d89c252015-07-15 08:04:37 +08004111 * Track task load average for carrying it to the new CPU after migration, and
4112 * track group sched_entity load average for task_h_load calc in migration
Paul Turnerf1b17282012-10-04 13:18:31 +02004113 */
Yuyang Du9d89c252015-07-15 08:04:37 +08004114 __update_load_avg(now, cpu, &se->avg,
Byungchul Parka05e8c52015-08-20 20:21:56 +09004115 se->on_rq * scale_load_down(se->load.weight),
4116 cfs_rq->curr == se, NULL);
Paul Turnerf1b17282012-10-04 13:18:31 +02004117
Steve Mucklea2c6c912016-03-24 15:26:07 -07004118 if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg)
Yuyang Du9d89c252015-07-15 08:04:37 +08004119 update_tg_load_avg(cfs_rq, 0);
4120}
Paul Turner2dac7542012-10-04 13:18:30 +02004121
Peter Zijlstra3d30544f2016-06-21 14:27:50 +02004122/**
4123 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
4124 * @cfs_rq: cfs_rq to attach to
4125 * @se: sched_entity to attach
4126 *
4127 * Must call update_cfs_rq_load_avg() before this, since we rely on
4128 * cfs_rq->avg.last_update_time being current.
4129 */
Byungchul Parka05e8c52015-08-20 20:21:56 +09004130static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4131{
Peter Zijlstraa9280512015-09-11 16:10:59 +02004132 if (!sched_feat(ATTACH_AGE_LOAD))
4133 goto skip_aging;
4134
Byungchul Park6efdb102015-08-20 20:21:59 +09004135 /*
4136 * If we got migrated (either between CPUs or between cgroups) we'll
4137 * have aged the average right before clearing @last_update_time.
Peter Zijlstra7dc603c2016-06-16 13:29:28 +02004138 *
4139 * Or we're fresh through post_init_entity_util_avg().
Byungchul Park6efdb102015-08-20 20:21:59 +09004140 */
4141 if (se->avg.last_update_time) {
4142 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
4143 &se->avg, 0, 0, NULL);
4144
4145 /*
4146 * XXX: we could have just aged the entire load away if we've been
4147 * absent from the fair class for too long.
4148 */
4149 }
4150
Peter Zijlstraa9280512015-09-11 16:10:59 +02004151skip_aging:
Byungchul Parka05e8c52015-08-20 20:21:56 +09004152 se->avg.last_update_time = cfs_rq->avg.last_update_time;
4153 cfs_rq->avg.load_avg += se->avg.load_avg;
4154 cfs_rq->avg.load_sum += se->avg.load_sum;
4155 cfs_rq->avg.util_avg += se->avg.util_avg;
4156 cfs_rq->avg.util_sum += se->avg.util_sum;
Steve Mucklea2c6c912016-03-24 15:26:07 -07004157
4158 cfs_rq_util_change(cfs_rq);
Byungchul Parka05e8c52015-08-20 20:21:56 +09004159}
4160
Peter Zijlstra3d30544f2016-06-21 14:27:50 +02004161/**
4162 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
4163 * @cfs_rq: cfs_rq to detach from
4164 * @se: sched_entity to detach
4165 *
4166 * Must call update_cfs_rq_load_avg() before this, since we rely on
4167 * cfs_rq->avg.last_update_time being current.
4168 */
Byungchul Parka05e8c52015-08-20 20:21:56 +09004169static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4170{
4171 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
4172 &se->avg, se->on_rq * scale_load_down(se->load.weight),
4173 cfs_rq->curr == se, NULL);
4174
Peter Zijlstra89741892016-06-16 10:50:40 +02004175 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
4176 sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
4177 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
4178 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
Steve Mucklea2c6c912016-03-24 15:26:07 -07004179
4180 cfs_rq_util_change(cfs_rq);
Byungchul Parka05e8c52015-08-20 20:21:56 +09004181}
4182
Yuyang Du9d89c252015-07-15 08:04:37 +08004183/* Add the load generated by se into cfs_rq's load average */
4184static inline void
4185enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4186{
4187 struct sched_avg *sa = &se->avg;
4188 u64 now = cfs_rq_clock_task(cfs_rq);
Byungchul Parka05e8c52015-08-20 20:21:56 +09004189 int migrated, decayed;
Paul Turner9ee474f2012-10-04 13:18:30 +02004190
Byungchul Parka05e8c52015-08-20 20:21:56 +09004191 migrated = !sa->last_update_time;
4192 if (!migrated) {
Yuyang Du9d89c252015-07-15 08:04:37 +08004193 __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
Yuyang Du13962232015-07-15 08:04:41 +08004194 se->on_rq * scale_load_down(se->load.weight),
4195 cfs_rq->curr == se, NULL);
Yuyang Du9d89c252015-07-15 08:04:37 +08004196 }
4197
Steve Mucklea2c6c912016-03-24 15:26:07 -07004198 decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated);
Yuyang Du9d89c252015-07-15 08:04:37 +08004199
Yuyang Du13962232015-07-15 08:04:41 +08004200 cfs_rq->runnable_load_avg += sa->load_avg;
4201 cfs_rq->runnable_load_sum += sa->load_sum;
4202
Byungchul Parka05e8c52015-08-20 20:21:56 +09004203 if (migrated)
4204 attach_entity_load_avg(cfs_rq, se);
Yuyang Du9d89c252015-07-15 08:04:37 +08004205
4206 if (decayed || migrated)
4207 update_tg_load_avg(cfs_rq, 0);
Paul Turner9ee474f2012-10-04 13:18:30 +02004208}
4209
Yuyang Du13962232015-07-15 08:04:41 +08004210/* Remove the runnable load generated by se from cfs_rq's runnable load average */
4211static inline void
4212dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4213{
4214 update_load_avg(se, 1);
4215
4216 cfs_rq->runnable_load_avg =
4217 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
4218 cfs_rq->runnable_load_sum =
Byungchul Parka05e8c52015-08-20 20:21:56 +09004219 max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
Yuyang Du13962232015-07-15 08:04:41 +08004220}
4221
Yuyang Du0905f042015-12-17 07:34:27 +08004222#ifndef CONFIG_64BIT
4223static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
4224{
4225 u64 last_update_time_copy;
4226 u64 last_update_time;
4227
4228 do {
4229 last_update_time_copy = cfs_rq->load_last_update_time_copy;
4230 smp_rmb();
4231 last_update_time = cfs_rq->avg.last_update_time;
4232 } while (last_update_time != last_update_time_copy);
4233
4234 return last_update_time;
4235}
4236#else
4237static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
4238{
4239 return cfs_rq->avg.last_update_time;
4240}
4241#endif
4242
Paul Turner9ee474f2012-10-04 13:18:30 +02004243/*
Yuyang Du9d89c252015-07-15 08:04:37 +08004244 * The task first catches up with the cfs_rq, and then subtracts
	4245 * itself from the cfs_rq (the task must be off the queue now).
Paul Turner9ee474f2012-10-04 13:18:30 +02004246 */
Yuyang Du9d89c252015-07-15 08:04:37 +08004247void remove_entity_load_avg(struct sched_entity *se)
Paul Turner9ee474f2012-10-04 13:18:30 +02004248{
Yuyang Du9d89c252015-07-15 08:04:37 +08004249 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4250 u64 last_update_time;
Paul Turner9ee474f2012-10-04 13:18:30 +02004251
Yuyang Du0905f042015-12-17 07:34:27 +08004252 /*
Peter Zijlstra7dc603c2016-06-16 13:29:28 +02004253 * tasks cannot exit without having gone through wake_up_new_task() ->
4254 * post_init_entity_util_avg() which will have added things to the
4255 * cfs_rq, so we can remove unconditionally.
4256 *
4257 * Similarly for groups, they will have passed through
4258 * post_init_entity_util_avg() before unregister_sched_fair_group()
4259 * calls this.
Yuyang Du0905f042015-12-17 07:34:27 +08004260 */
Paul Turner9ee474f2012-10-04 13:18:30 +02004261
Yuyang Du0905f042015-12-17 07:34:27 +08004262 last_update_time = cfs_rq_last_update_time(cfs_rq);
Paul Turner9ee474f2012-10-04 13:18:30 +02004263
Yuyang Du13962232015-07-15 08:04:41 +08004264 __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
Yuyang Du9d89c252015-07-15 08:04:37 +08004265 atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
4266 atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
Paul Turner2dac7542012-10-04 13:18:30 +02004267}
Vincent Guittot642dbc32013-04-18 18:34:26 +02004268
Yuyang Du7ea241a2015-07-15 08:04:42 +08004269static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
4270{
4271 return cfs_rq->runnable_load_avg;
4272}
4273
4274static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
4275{
4276 return cfs_rq->avg.load_avg;
4277}
4278
Peter Zijlstra6e831252014-02-11 16:11:48 +01004279static int idle_balance(struct rq *this_rq);
4280
Peter Zijlstra38033c32014-01-23 20:32:21 +01004281#else /* CONFIG_SMP */
4282
Peter Zijlstra01011472016-06-17 11:20:46 +02004283static inline int
4284update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
4285{
4286 return 0;
4287}
4288
Rafael J. Wysocki536bd002016-05-06 14:58:43 +02004289static inline void update_load_avg(struct sched_entity *se, int not_used)
4290{
Rafael J. Wysocki12bde332016-08-10 03:11:17 +02004291 cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
Rafael J. Wysocki536bd002016-05-06 14:58:43 +02004292}
4293
Yuyang Du9d89c252015-07-15 08:04:37 +08004294static inline void
4295enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
Yuyang Du13962232015-07-15 08:04:41 +08004296static inline void
4297dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
Yuyang Du9d89c252015-07-15 08:04:37 +08004298static inline void remove_entity_load_avg(struct sched_entity *se) {}
Peter Zijlstra6e831252014-02-11 16:11:48 +01004299
Byungchul Parka05e8c52015-08-20 20:21:56 +09004300static inline void
4301attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4302static inline void
4303detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4304
Peter Zijlstra6e831252014-02-11 16:11:48 +01004305static inline int idle_balance(struct rq *rq)
4306{
4307 return 0;
4308}
4309
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07004310static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
4311 struct task_struct *p, int change_cra) { }
4312
4313static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq,
4314 struct task_struct *p, int change_cra) { }
4315
Peter Zijlstra38033c32014-01-23 20:32:21 +01004316#endif /* CONFIG_SMP */
Paul Turner9d85f212012-10-04 13:18:29 +02004317
Peter Zijlstraddc97292007-10-15 17:00:10 +02004318static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
4319{
4320#ifdef CONFIG_SCHED_DEBUG
4321 s64 d = se->vruntime - cfs_rq->min_vruntime;
4322
4323 if (d < 0)
4324 d = -d;
4325
4326 if (d > 3*sysctl_sched_latency)
Josh Poimboeufae928822016-06-17 12:43:24 -05004327 schedstat_inc(cfs_rq->nr_spread_over);
Peter Zijlstraddc97292007-10-15 17:00:10 +02004328#endif
4329}
4330
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004331static void
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02004332place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
4333{
Peter Zijlstra1af5f732008-10-24 11:06:13 +02004334 u64 vruntime = cfs_rq->min_vruntime;
Peter Zijlstra94dfb5e2007-10-15 17:00:05 +02004335
Peter Zijlstra2cb86002007-11-09 22:39:37 +01004336 /*
4337 * The 'current' period is already promised to the current tasks,
4338 * however the extra weight of the new task will slow them down a
	4339 * little; place the new task so that it fits in the slot that
4340 * stays open at the end.
4341 */
Peter Zijlstra94dfb5e2007-10-15 17:00:05 +02004342 if (initial && sched_feat(START_DEBIT))
Peter Zijlstraf9c0b092008-10-17 19:27:04 +02004343 vruntime += sched_vslice(cfs_rq, se);
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02004344
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02004345 /* sleeps up to a single latency don't count. */
Mike Galbraith5ca98802010-03-11 17:17:17 +01004346 if (!initial) {
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02004347 unsigned long thresh = sysctl_sched_latency;
Peter Zijlstraa7be37a2008-06-27 13:41:11 +02004348
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02004349 /*
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02004350 * Halve their sleep time's effect, to allow
4351 * for a gentler effect of sleepers:
4352 */
4353 if (sched_feat(GENTLE_FAIR_SLEEPERS))
4354 thresh >>= 1;
Ingo Molnar51e03042009-09-16 08:54:45 +02004355
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02004356 vruntime -= thresh;
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02004357 }
4358
Mike Galbraithb5d9d732009-09-08 11:12:28 +02004359 /* ensure we never gain time by being placed backwards. */
Viresh Kumar16c8f1c2012-11-08 13:33:46 +05304360 se->vruntime = max_vruntime(se->vruntime, vruntime);
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02004361}
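/*
 * Illustrative sketch (not part of the kernel build): the placement rules
 * above with made-up numbers, assuming a 6ms latency (before ilog(ncpus)
 * scaling). With GENTLE_FAIR_SLEEPERS a woken sleeper is credited at most
 * half a latency, and the final max_vruntime() means a task that slept only
 * briefly cannot end up ahead of where it already was.
 */
#if 0
#include <stdio.h>

static unsigned long long max_vrt(unsigned long long a, unsigned long long b)
{
	return a > b ? a : b;
}

int main(void)
{
	unsigned long long min_vruntime = 1000000000ULL;	/* arbitrary */
	unsigned long long latency = 6000000ULL;		/* 6ms, assumed */
	unsigned long long thresh = latency >> 1;		/* gentle sleepers */

	/* long sleeper with a tiny old vruntime: placed half a latency back */
	unsigned long long woken = max_vrt(10ULL, min_vruntime - thresh);
	/* short sleeper already ahead of that point: keeps its old vruntime */
	unsigned long long recent = max_vrt(min_vruntime + 50000ULL,
					    min_vruntime - thresh);

	printf("woken placed at %llu, recent sleeper stays at %llu\n",
	       woken, recent);
	return 0;
}
#endif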
4362
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004363static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
4364
Mel Gormancb251762016-02-05 09:08:36 +00004365static inline void check_schedstat_required(void)
4366{
4367#ifdef CONFIG_SCHEDSTATS
4368 if (schedstat_enabled())
4369 return;
4370
4371 /* Force schedstat enabled if a dependent tracepoint is active */
4372 if (trace_sched_stat_wait_enabled() ||
4373 trace_sched_stat_sleep_enabled() ||
4374 trace_sched_stat_iowait_enabled() ||
4375 trace_sched_stat_blocked_enabled() ||
4376 trace_sched_stat_runtime_enabled()) {
Josh Poimboeufeda8dca2016-06-13 02:32:09 -05004377 printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
Mel Gormancb251762016-02-05 09:08:36 +00004378 "stat_blocked and stat_runtime require the "
4379 "kernel parameter schedstats=enabled or "
4380 "kernel.sched_schedstats=1\n");
4381 }
4382#endif
4383}
4384
Peter Zijlstrab5179ac2016-05-11 16:10:34 +02004385
4386/*
4387 * MIGRATION
4388 *
4389 * dequeue
4390 * update_curr()
4391 * update_min_vruntime()
4392 * vruntime -= min_vruntime
4393 *
4394 * enqueue
4395 * update_curr()
4396 * update_min_vruntime()
4397 * vruntime += min_vruntime
4398 *
4399 * this way the vruntime transition between RQs is done when both
4400 * min_vruntime are up-to-date.
4401 *
4402 * WAKEUP (remote)
4403 *
Peter Zijlstra59efa0b2016-05-10 18:24:37 +02004404 * ->migrate_task_rq_fair() (p->state == TASK_WAKING)
Peter Zijlstrab5179ac2016-05-11 16:10:34 +02004405 * vruntime -= min_vruntime
4406 *
4407 * enqueue
4408 * update_curr()
4409 * update_min_vruntime()
4410 * vruntime += min_vruntime
4411 *
4412 * this way we don't have the most up-to-date min_vruntime on the originating
4413 * CPU and an up-to-date min_vruntime on the destination CPU.
4414 */
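/*
 * Illustrative sketch (not part of the kernel build): the vruntime
 * normalisation described above, with made-up numbers. Only the lag
 * relative to the source runqueue's min_vruntime survives the migration,
 * not the absolute vruntime.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned long long se_vruntime = 5000000ULL;
	unsigned long long src_min = 4000000ULL;	/* source rq min_vruntime */
	unsigned long long dst_min = 9000000ULL;	/* destination rq min_vruntime */

	se_vruntime -= src_min;		/* dequeue / migrate_task_rq_fair() */
	/* ... the task moves CPU; only the 1ms of lag is carried along ... */
	se_vruntime += dst_min;		/* enqueue on the new runqueue */

	printf("vruntime on destination: %llu\n", se_vruntime);	/* 10000000 */
	return 0;
}
#endif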
4415
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02004416static void
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01004417enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004418{
Peter Zijlstra2f950352016-05-11 19:27:56 +02004419 bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
4420 bool curr = cfs_rq->curr == se;
Peter Zijlstra3a47d512016-03-09 13:04:03 +01004421
Ingo Molnar53d3bc72016-05-11 08:25:53 +02004422 /*
Peter Zijlstra2f950352016-05-11 19:27:56 +02004423 * If we're the current task, we must renormalise before calling
4424 * update_curr().
Ingo Molnar53d3bc72016-05-11 08:25:53 +02004425 */
Peter Zijlstra2f950352016-05-11 19:27:56 +02004426 if (renorm && curr)
4427 se->vruntime += cfs_rq->min_vruntime;
4428
Ingo Molnarb7cc0892007-08-09 11:16:47 +02004429 update_curr(cfs_rq);
Peter Zijlstra2f950352016-05-11 19:27:56 +02004430
4431 /*
4432 * Otherwise, renormalise after, such that we're placed at the current
4433 * moment in time, instead of some random moment in the past. Being
4434 * placed in the past could significantly boost this task to the
4435 * fairness detriment of existing tasks.
4436 */
4437 if (renorm && !curr)
4438 se->vruntime += cfs_rq->min_vruntime;
4439
Yuyang Du9d89c252015-07-15 08:04:37 +08004440 enqueue_entity_load_avg(cfs_rq, se);
Linus Torvalds17bc14b2012-12-14 07:20:43 -08004441 account_entity_enqueue(cfs_rq, se);
4442 update_cfs_shares(cfs_rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004443
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -05004444 if (flags & ENQUEUE_WAKEUP)
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02004445 place_entity(cfs_rq, se, 0);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004446
Mel Gormancb251762016-02-05 09:08:36 +00004447 check_schedstat_required();
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05004448 update_stats_enqueue(cfs_rq, se, flags);
4449 check_spread(cfs_rq, se);
Peter Zijlstra2f950352016-05-11 19:27:56 +02004450 if (!curr)
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02004451 __enqueue_entity(cfs_rq, se);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08004452 se->on_rq = 1;
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -08004453
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004454 if (cfs_rq->nr_running == 1) {
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -08004455 list_add_leaf_cfs_rq(cfs_rq);
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004456 check_enqueue_throttle(cfs_rq);
4457 }
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004458}
4459
Rik van Riel2c13c9192011-02-01 09:48:37 -05004460static void __clear_buddies_last(struct sched_entity *se)
Peter Zijlstra2002c692008-11-11 11:52:33 +01004461{
Rik van Riel2c13c9192011-02-01 09:48:37 -05004462 for_each_sched_entity(se) {
4463 struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstraf1044792012-02-11 06:05:00 +01004464 if (cfs_rq->last != se)
Rik van Riel2c13c9192011-02-01 09:48:37 -05004465 break;
Peter Zijlstraf1044792012-02-11 06:05:00 +01004466
4467 cfs_rq->last = NULL;
Rik van Riel2c13c9192011-02-01 09:48:37 -05004468 }
4469}
Peter Zijlstra2002c692008-11-11 11:52:33 +01004470
Rik van Riel2c13c9192011-02-01 09:48:37 -05004471static void __clear_buddies_next(struct sched_entity *se)
4472{
4473 for_each_sched_entity(se) {
4474 struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstraf1044792012-02-11 06:05:00 +01004475 if (cfs_rq->next != se)
Rik van Riel2c13c9192011-02-01 09:48:37 -05004476 break;
Peter Zijlstraf1044792012-02-11 06:05:00 +01004477
4478 cfs_rq->next = NULL;
Rik van Riel2c13c9192011-02-01 09:48:37 -05004479 }
Peter Zijlstra2002c692008-11-11 11:52:33 +01004480}
4481
Rik van Rielac53db52011-02-01 09:51:03 -05004482static void __clear_buddies_skip(struct sched_entity *se)
4483{
4484 for_each_sched_entity(se) {
4485 struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstraf1044792012-02-11 06:05:00 +01004486 if (cfs_rq->skip != se)
Rik van Rielac53db52011-02-01 09:51:03 -05004487 break;
Peter Zijlstraf1044792012-02-11 06:05:00 +01004488
4489 cfs_rq->skip = NULL;
Rik van Rielac53db52011-02-01 09:51:03 -05004490 }
4491}
4492
Peter Zijlstraa571bbe2009-01-28 14:51:40 +01004493static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
4494{
Rik van Riel2c13c9192011-02-01 09:48:37 -05004495 if (cfs_rq->last == se)
4496 __clear_buddies_last(se);
4497
4498 if (cfs_rq->next == se)
4499 __clear_buddies_next(se);
Rik van Rielac53db52011-02-01 09:51:03 -05004500
4501 if (cfs_rq->skip == se)
4502 __clear_buddies_skip(se);
Peter Zijlstraa571bbe2009-01-28 14:51:40 +01004503}
4504
Peter Zijlstra6c16a6d2012-03-21 13:07:16 -07004505static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
Paul Turnerd8b49862011-07-21 09:43:41 -07004506
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004507static void
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01004508dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004509{
Dmitry Adamushkoa2a2d682007-10-15 17:00:13 +02004510 /*
4511 * Update run-time statistics of the 'current'.
4512 */
4513 update_curr(cfs_rq);
Yuyang Du13962232015-07-15 08:04:41 +08004514 dequeue_entity_load_avg(cfs_rq, se);
Dmitry Adamushkoa2a2d682007-10-15 17:00:13 +02004515
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05004516 update_stats_dequeue(cfs_rq, se, flags);
Peter Zijlstra67e9fb22007-10-15 17:00:10 +02004517
Peter Zijlstra2002c692008-11-11 11:52:33 +01004518 clear_buddies(cfs_rq, se);
Peter Zijlstra47932412008-11-04 21:25:09 +01004519
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02004520 if (se != cfs_rq->curr)
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02004521 __dequeue_entity(cfs_rq, se);
Linus Torvalds17bc14b2012-12-14 07:20:43 -08004522 se->on_rq = 0;
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02004523 account_entity_dequeue(cfs_rq, se);
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01004524
4525 /*
Peter Zijlstrab60205c2016-09-20 21:58:12 +02004526 * Normalize after update_curr(), which will also have moved
4527 * min_vruntime if @se is the one holding it back. But before doing
4528 * update_min_vruntime() again, which will discount @se's position and
4529 * can move min_vruntime forward still more.
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01004530 */
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01004531 if (!(flags & DEQUEUE_SLEEP))
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01004532 se->vruntime -= cfs_rq->min_vruntime;
Peter Zijlstra1e876232011-05-17 16:21:10 -07004533
Paul Turnerd8b49862011-07-21 09:43:41 -07004534 /* return excess runtime on last dequeue */
4535 return_cfs_rq_runtime(cfs_rq);
4536
Linus Torvalds17bc14b2012-12-14 07:20:43 -08004537 update_cfs_shares(cfs_rq);
Peter Zijlstrab60205c2016-09-20 21:58:12 +02004538
4539 /*
4540 * Now advance min_vruntime if @se was the entity holding it back,
4541 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
4542 * put back on, and if we advance min_vruntime, we'll be placed back
4543 * further than we started -- ie. we'll be penalized.
4544 */
4545 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
4546 update_min_vruntime(cfs_rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004547}
4548
4549/*
4550 * Preempt the current task with a newly woken task if needed:
4551 */
Peter Zijlstra7c92e542007-09-05 14:32:49 +02004552static void
Ingo Molnar2e09bf52007-10-15 17:00:05 +02004553check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004554{
Peter Zijlstra11697832007-09-05 14:32:49 +02004555 unsigned long ideal_runtime, delta_exec;
Wang Xingchaof4cfb332011-09-16 13:35:52 -04004556 struct sched_entity *se;
4557 s64 delta;
Peter Zijlstra11697832007-09-05 14:32:49 +02004558
Peter Zijlstra6d0f0ebd2007-10-15 17:00:05 +02004559 ideal_runtime = sched_slice(cfs_rq, curr);
Peter Zijlstra11697832007-09-05 14:32:49 +02004560 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
Mike Galbraitha9f3e2b2009-01-28 14:51:39 +01004561 if (delta_exec > ideal_runtime) {
Kirill Tkhai88751252014-06-29 00:03:57 +04004562 resched_curr(rq_of(cfs_rq));
Mike Galbraitha9f3e2b2009-01-28 14:51:39 +01004563 /*
4564 * The current task ran long enough, ensure it doesn't get
4565 * re-elected due to buddy favours.
4566 */
4567 clear_buddies(cfs_rq, curr);
Mike Galbraithf685cea2009-10-23 23:09:22 +02004568 return;
4569 }
4570
4571 /*
4572 * Ensure that a task that missed wakeup preemption by a
4573 * narrow margin doesn't have to wait for a full slice.
4574 * This also mitigates buddy induced latencies under load.
4575 */
Mike Galbraithf685cea2009-10-23 23:09:22 +02004576 if (delta_exec < sysctl_sched_min_granularity)
4577 return;
4578
Wang Xingchaof4cfb332011-09-16 13:35:52 -04004579 se = __pick_first_entity(cfs_rq);
4580 delta = curr->vruntime - se->vruntime;
Mike Galbraithf685cea2009-10-23 23:09:22 +02004581
Wang Xingchaof4cfb332011-09-16 13:35:52 -04004582 if (delta < 0)
4583 return;
Mike Galbraithd7d82942011-01-05 05:41:17 +01004584
Wang Xingchaof4cfb332011-09-16 13:35:52 -04004585 if (delta > ideal_runtime)
Kirill Tkhai88751252014-06-29 00:03:57 +04004586 resched_curr(rq_of(cfs_rq));
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004587}
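/*
 * Illustrative sketch (not part of the kernel build): the preemption tests
 * above with made-up numbers, assuming a 3ms ideal slice for curr and a
 * 0.75ms minimum granularity (both scale with CPU count in practice).
 */
#if 0
#include <stdio.h>

int main(void)
{
	long long ideal_runtime = 3000000;	/* assumed sched_slice() result */
	long long min_gran = 750000;		/* assumed min granularity */

	long long delta_exec = 1000000;		/* curr ran 1ms this slice */
	long long vdiff = 3500000;		/* curr vs. leftmost vruntime */

	if (delta_exec > ideal_runtime)
		printf("resched: slice used up\n");
	else if (delta_exec < min_gran)
		printf("keep running: below min granularity\n");
	else if (vdiff > ideal_runtime)
		printf("resched: leftmost task fell too far behind\n");
	else
		printf("keep running\n");
	return 0;
}
#endif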
4588
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02004589static void
Ingo Molnar8494f412007-08-09 11:16:48 +02004590set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004591{
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02004592 /* 'current' is not kept within the tree. */
4593 if (se->on_rq) {
4594 /*
4595 * Any task has to be enqueued before it get to execute on
4596 * a CPU. So account for the time it spent waiting on the
4597 * runqueue.
4598 */
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05004599 update_stats_wait_end(cfs_rq, se);
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02004600 __dequeue_entity(cfs_rq, se);
Yuyang Du9d89c252015-07-15 08:04:37 +08004601 update_load_avg(se, 1);
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02004602 }
4603
Ingo Molnar79303e92007-08-09 11:16:47 +02004604 update_stats_curr_start(cfs_rq, se);
Ingo Molnar429d43b2007-10-15 17:00:03 +02004605 cfs_rq->curr = se;
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05004606
Ingo Molnareba1ed42007-10-15 17:00:02 +02004607 /*
4608 * Track our maximum slice length, if the CPU's load is at
	4609 * least twice that of our own weight (i.e. don't track it
4610 * when there are only lesser-weight tasks around):
4611 */
Mel Gormancb251762016-02-05 09:08:36 +00004612 if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05004613 schedstat_set(se->statistics.slice_max,
4614 max((u64)schedstat_val(se->statistics.slice_max),
4615 se->sum_exec_runtime - se->prev_sum_exec_runtime));
Ingo Molnareba1ed42007-10-15 17:00:02 +02004616 }
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05004617
Peter Zijlstra4a55b452007-09-05 14:32:49 +02004618 se->prev_sum_exec_runtime = se->sum_exec_runtime;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004619}
4620
Peter Zijlstra3f3a4902008-10-24 11:06:16 +02004621static int
4622wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
4623
Rik van Rielac53db52011-02-01 09:51:03 -05004624/*
4625 * Pick the next process, keeping these things in mind, in this order:
4626 * 1) keep things fair between processes/task groups
4627 * 2) pick the "next" process, since someone really wants that to run
4628 * 3) pick the "last" process, for cache locality
4629 * 4) do not run the "skip" process, if something else is available
4630 */
Peter Zijlstra678d5712012-02-11 06:05:00 +01004631static struct sched_entity *
4632pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
Peter Zijlstraaa2ac252008-03-14 21:12:12 +01004633{
Peter Zijlstra678d5712012-02-11 06:05:00 +01004634 struct sched_entity *left = __pick_first_entity(cfs_rq);
4635 struct sched_entity *se;
4636
4637 /*
4638 * If curr is set we have to see if its left of the leftmost entity
4639 * still in the tree, provided there was anything in the tree at all.
4640 */
4641 if (!left || (curr && entity_before(curr, left)))
4642 left = curr;
4643
4644 se = left; /* ideally we run the leftmost entity */
Peter Zijlstraf4b67552008-11-04 21:25:07 +01004645
Rik van Rielac53db52011-02-01 09:51:03 -05004646 /*
4647 * Avoid running the skip buddy, if running something else can
4648 * be done without getting too unfair.
4649 */
4650 if (cfs_rq->skip == se) {
Peter Zijlstra678d5712012-02-11 06:05:00 +01004651 struct sched_entity *second;
4652
4653 if (se == curr) {
4654 second = __pick_first_entity(cfs_rq);
4655 } else {
4656 second = __pick_next_entity(se);
4657 if (!second || (curr && entity_before(curr, second)))
4658 second = curr;
4659 }
4660
Rik van Rielac53db52011-02-01 09:51:03 -05004661 if (second && wakeup_preempt_entity(second, left) < 1)
4662 se = second;
4663 }
Peter Zijlstraaa2ac252008-03-14 21:12:12 +01004664
Mike Galbraithf685cea2009-10-23 23:09:22 +02004665 /*
4666 * Prefer last buddy, try to return the CPU to a preempted task.
4667 */
4668 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
4669 se = cfs_rq->last;
4670
Rik van Rielac53db52011-02-01 09:51:03 -05004671 /*
4672 * Someone really wants this to run. If it's not unfair, run it.
4673 */
4674 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
4675 se = cfs_rq->next;
4676
Mike Galbraithf685cea2009-10-23 23:09:22 +02004677 clear_buddies(cfs_rq, se);
Peter Zijlstra47932412008-11-04 21:25:09 +01004678
4679 return se;
Peter Zijlstraaa2ac252008-03-14 21:12:12 +01004680}
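/*
 * Illustrative sketch (not part of the kernel build): the buddy precedence
 * in pick_next_entity() reduced to its ordering. fair_enough() is a made-up
 * stand-in for wakeup_preempt_entity(candidate, left) < 1, i.e. "picking
 * this buddy would not be too unfair to the leftmost task".
 */
#if 0
struct pick { long long vruntime; };

static int fair_enough(const struct pick *cand, const struct pick *left)
{
	/* stand-in for the real wakeup-granularity check */
	return cand && cand->vruntime - left->vruntime < 1000000;
}

static const struct pick *demo_pick(const struct pick *left,
				    const struct pick *second,
				    const struct pick *skip,
				    const struct pick *last,
				    const struct pick *next)
{
	const struct pick *se = left;		/* ideally run the leftmost */

	if (skip == se && fair_enough(second, left))
		se = second;			/* 4) avoid the skip buddy */
	if (last && fair_enough(last, left))
		se = last;			/* 3) cache-hot last buddy */
	if (next && fair_enough(next, left))
		se = next;			/* 2) explicitly wanted next */

	return se;
}
#endif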
4681
Peter Zijlstra678d5712012-02-11 06:05:00 +01004682static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004683
Ingo Molnarab6cde22007-08-09 11:16:48 +02004684static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004685{
4686 /*
4687 * If still on the runqueue then deactivate_task()
4688 * was not called and update_curr() has to be done:
4689 */
4690 if (prev->on_rq)
Ingo Molnarb7cc0892007-08-09 11:16:47 +02004691 update_curr(cfs_rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004692
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004693 /* throttle cfs_rqs exceeding runtime */
4694 check_cfs_rq_runtime(cfs_rq);
4695
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05004696 check_spread(cfs_rq, prev);
Mel Gormancb251762016-02-05 09:08:36 +00004697
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02004698 if (prev->on_rq) {
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05004699 update_stats_wait_start(cfs_rq, prev);
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02004700 /* Put 'current' back into the tree. */
4701 __enqueue_entity(cfs_rq, prev);
Paul Turner9d85f212012-10-04 13:18:29 +02004702 /* in !on_rq case, update occurred at dequeue */
Yuyang Du9d89c252015-07-15 08:04:37 +08004703 update_load_avg(prev, 0);
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02004704 }
Ingo Molnar429d43b2007-10-15 17:00:03 +02004705 cfs_rq->curr = NULL;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004706}
4707
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01004708static void
4709entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004710{
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004711 /*
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02004712 * Update run-time statistics of the 'current'.
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004713 */
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02004714 update_curr(cfs_rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004715
Paul Turner43365bd2010-12-15 19:10:17 -08004716 /*
Paul Turner9d85f212012-10-04 13:18:29 +02004717 * Ensure that runnable average is periodically updated.
4718 */
Yuyang Du9d89c252015-07-15 08:04:37 +08004719 update_load_avg(curr, 1);
Peter Zijlstrabf0bd942013-07-26 23:48:42 +02004720 update_cfs_shares(cfs_rq);
Paul Turner9d85f212012-10-04 13:18:29 +02004721
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01004722#ifdef CONFIG_SCHED_HRTICK
4723 /*
4724 * queued ticks are scheduled to match the slice, so don't bother
4725 * validating it and just reschedule.
4726 */
Harvey Harrison983ed7a2008-04-24 18:17:55 -07004727 if (queued) {
Kirill Tkhai88751252014-06-29 00:03:57 +04004728 resched_curr(rq_of(cfs_rq));
Harvey Harrison983ed7a2008-04-24 18:17:55 -07004729 return;
4730 }
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01004731 /*
4732 * don't let the period tick interfere with the hrtick preemption
4733 */
4734 if (!sched_feat(DOUBLE_TICK) &&
4735 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
4736 return;
4737#endif
4738
Yong Zhang2c2efae2011-07-29 16:20:33 +08004739 if (cfs_rq->nr_running > 1)
Ingo Molnar2e09bf52007-10-15 17:00:05 +02004740 check_preempt_tick(cfs_rq, curr);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004741}
4742
Paul Turnerab84d312011-07-21 09:43:28 -07004743
4744/**************************************************
4745 * CFS bandwidth control machinery
4746 */
4747
4748#ifdef CONFIG_CFS_BANDWIDTH
Peter Zijlstra029632f2011-10-25 10:00:11 +02004749
4750#ifdef HAVE_JUMP_LABEL
Ingo Molnarc5905af2012-02-24 08:31:31 +01004751static struct static_key __cfs_bandwidth_used;
Peter Zijlstra029632f2011-10-25 10:00:11 +02004752
4753static inline bool cfs_bandwidth_used(void)
4754{
Ingo Molnarc5905af2012-02-24 08:31:31 +01004755 return static_key_false(&__cfs_bandwidth_used);
Peter Zijlstra029632f2011-10-25 10:00:11 +02004756}
4757
Ben Segall1ee14e62013-10-16 11:16:12 -07004758void cfs_bandwidth_usage_inc(void)
Peter Zijlstra029632f2011-10-25 10:00:11 +02004759{
Ben Segall1ee14e62013-10-16 11:16:12 -07004760 static_key_slow_inc(&__cfs_bandwidth_used);
4761}
4762
4763void cfs_bandwidth_usage_dec(void)
4764{
4765 static_key_slow_dec(&__cfs_bandwidth_used);
Peter Zijlstra029632f2011-10-25 10:00:11 +02004766}
4767#else /* HAVE_JUMP_LABEL */
4768static bool cfs_bandwidth_used(void)
4769{
4770 return true;
4771}
4772
Ben Segall1ee14e62013-10-16 11:16:12 -07004773void cfs_bandwidth_usage_inc(void) {}
4774void cfs_bandwidth_usage_dec(void) {}
Peter Zijlstra029632f2011-10-25 10:00:11 +02004775#endif /* HAVE_JUMP_LABEL */
4776
Paul Turnerab84d312011-07-21 09:43:28 -07004777/*
4778 * default period for cfs group bandwidth.
4779 * default: 0.1s, units: nanoseconds
4780 */
4781static inline u64 default_cfs_period(void)
4782{
4783 return 100000000ULL;
4784}
Paul Turnerec12cb72011-07-21 09:43:30 -07004785
4786static inline u64 sched_cfs_bandwidth_slice(void)
4787{
4788 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
4789}
4790
Paul Turnera9cf55b2011-07-21 09:43:32 -07004791/*
4792 * Replenish runtime according to assigned quota and update expiration time.
4793 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
4794 * additional synchronization around rq->lock.
4795 *
4796 * requires cfs_b->lock
4797 */
Peter Zijlstra029632f2011-10-25 10:00:11 +02004798void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
Paul Turnera9cf55b2011-07-21 09:43:32 -07004799{
4800 u64 now;
4801
4802 if (cfs_b->quota == RUNTIME_INF)
4803 return;
4804
4805 now = sched_clock_cpu(smp_processor_id());
4806 cfs_b->runtime = cfs_b->quota;
4807 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
4808}
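/*
 * Illustrative sketch (not part of the kernel build): one refill with
 * made-up numbers. With the 100ms default period and an assumed 20ms quota,
 * every period the global pool snaps back to the full quota and the
 * deadline moves one period past "now"; any leftover is simply discarded.
 */
#if 0
#include <stdio.h>

int main(void)
{
	unsigned long long period = 100000000ULL;	/* default_cfs_period() */
	unsigned long long quota = 20000000ULL;		/* 20ms, cgroup setting */
	unsigned long long now = 5000000000ULL;		/* sched_clock, made up */

	unsigned long long runtime = 3000000ULL;	/* 3ms left from last period */
	unsigned long long expires;

	runtime = quota;				/* leftover is replaced */
	expires = now + period;				/* new global deadline */

	printf("runtime=%llu expires=%llu\n", runtime, expires);
	return 0;
}
#endif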
4809
Peter Zijlstra029632f2011-10-25 10:00:11 +02004810static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4811{
4812 return &tg->cfs_bandwidth;
4813}
4814
Paul Turnerf1b17282012-10-04 13:18:31 +02004815/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
4816static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4817{
4818 if (unlikely(cfs_rq->throttle_count))
Xunlei Pang1a99ae32016-05-10 21:03:18 +08004819 return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
Paul Turnerf1b17282012-10-04 13:18:31 +02004820
Frederic Weisbecker78becc22013-04-12 01:51:02 +02004821 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
Paul Turnerf1b17282012-10-04 13:18:31 +02004822}
4823
Paul Turner85dac902011-07-21 09:43:33 -07004824/* returns 0 on failure to allocate runtime */
4825static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
Paul Turnerec12cb72011-07-21 09:43:30 -07004826{
4827 struct task_group *tg = cfs_rq->tg;
4828 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
Paul Turnera9cf55b2011-07-21 09:43:32 -07004829 u64 amount = 0, min_amount, expires;
Paul Turnerec12cb72011-07-21 09:43:30 -07004830
4831 /* note: this is a positive sum as runtime_remaining <= 0 */
4832 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
4833
4834 raw_spin_lock(&cfs_b->lock);
4835 if (cfs_b->quota == RUNTIME_INF)
4836 amount = min_amount;
Paul Turner58088ad2011-07-21 09:43:31 -07004837 else {
Peter Zijlstra77a4d1a2015-04-15 11:41:57 +02004838 start_cfs_bandwidth(cfs_b);
Paul Turner58088ad2011-07-21 09:43:31 -07004839
4840 if (cfs_b->runtime > 0) {
4841 amount = min(cfs_b->runtime, min_amount);
4842 cfs_b->runtime -= amount;
4843 cfs_b->idle = 0;
4844 }
Paul Turnerec12cb72011-07-21 09:43:30 -07004845 }
Paul Turnera9cf55b2011-07-21 09:43:32 -07004846 expires = cfs_b->runtime_expires;
Paul Turnerec12cb72011-07-21 09:43:30 -07004847 raw_spin_unlock(&cfs_b->lock);
4848
4849 cfs_rq->runtime_remaining += amount;
Paul Turnera9cf55b2011-07-21 09:43:32 -07004850 /*
4851 * we may have advanced our local expiration to account for allowed
4852 * spread between our sched_clock and the one on which runtime was
4853 * issued.
4854 */
4855 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
4856 cfs_rq->runtime_expires = expires;
Paul Turner85dac902011-07-21 09:43:33 -07004857
4858 return cfs_rq->runtime_remaining > 0;
Paul Turnera9cf55b2011-07-21 09:43:32 -07004859}
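/*
 * Illustrative sketch (not part of the kernel build): the slice-based pull
 * from the global pool with made-up numbers, assuming the default 5ms
 * bandwidth slice. A cfs_rq that has gone 2ms into the red asks for enough
 * to get back to one full slice, bounded by what the pool still holds.
 */
#if 0
#include <stdio.h>

static long long min_ll(long long a, long long b)
{
	return a < b ? a : b;
}

int main(void)
{
	long long slice = 5000000;		/* assumed 5ms slice */
	long long remaining = -2000000;		/* local runtime, 2ms overdrawn */
	long long pool = 4000000;		/* global cfs_b->runtime */

	long long want = slice - remaining;	/* 7ms to get back to one slice */
	long long got = min_ll(pool, want);	/* only 4ms left in the pool */

	pool -= got;
	remaining += got;			/* back to 2ms in hand */

	printf("got=%lld local=%lld pool=%lld\n", got, remaining, pool);
	return 0;
}
#endif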
4860
4861/*
4862 * Note: This depends on the synchronization provided by sched_clock and the
4863 * fact that rq->clock snapshots this value.
4864 */
4865static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4866{
4867 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
Paul Turnera9cf55b2011-07-21 09:43:32 -07004868
4869 /* if the deadline is ahead of our clock, nothing to do */
Frederic Weisbecker78becc22013-04-12 01:51:02 +02004870 if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
Paul Turnera9cf55b2011-07-21 09:43:32 -07004871 return;
4872
4873 if (cfs_rq->runtime_remaining < 0)
4874 return;
4875
4876 /*
4877 * If the local deadline has passed we have to consider the
4878 * possibility that our sched_clock is 'fast' and the global deadline
4879 * has not truly expired.
4880 *
	4881 * Fortunately we can determine whether this is the case by checking
Ben Segall51f21762014-05-19 15:49:45 -07004882 * whether the global deadline has advanced. It is valid to compare
4883 * cfs_b->runtime_expires without any locks since we only care about
4884 * exact equality, so a partial write will still work.
Paul Turnera9cf55b2011-07-21 09:43:32 -07004885 */
4886
Ben Segall51f21762014-05-19 15:49:45 -07004887 if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
Paul Turnera9cf55b2011-07-21 09:43:32 -07004888 /* extend local deadline, drift is bounded above by 2 ticks */
4889 cfs_rq->runtime_expires += TICK_NSEC;
4890 } else {
4891 /* global deadline is ahead, expiration has passed */
4892 cfs_rq->runtime_remaining = 0;
4893 }
Paul Turnerec12cb72011-07-21 09:43:30 -07004894}
4895
Peter Zijlstra9dbdb152013-11-18 18:27:06 +01004896static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
Paul Turnerec12cb72011-07-21 09:43:30 -07004897{
Paul Turnera9cf55b2011-07-21 09:43:32 -07004898 /* dock delta_exec before expiring quota (as it could span periods) */
Paul Turnerec12cb72011-07-21 09:43:30 -07004899 cfs_rq->runtime_remaining -= delta_exec;
Paul Turnera9cf55b2011-07-21 09:43:32 -07004900 expire_cfs_rq_runtime(cfs_rq);
4901
4902 if (likely(cfs_rq->runtime_remaining > 0))
Paul Turnerec12cb72011-07-21 09:43:30 -07004903 return;
4904
Paul Turner85dac902011-07-21 09:43:33 -07004905 /*
4906 * if we're unable to extend our runtime we resched so that the active
4907 * hierarchy can be throttled
4908 */
4909 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
Kirill Tkhai88751252014-06-29 00:03:57 +04004910 resched_curr(rq_of(cfs_rq));
Paul Turnerec12cb72011-07-21 09:43:30 -07004911}
4912
Peter Zijlstra6c16a6d2012-03-21 13:07:16 -07004913static __always_inline
Peter Zijlstra9dbdb152013-11-18 18:27:06 +01004914void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
Paul Turnerec12cb72011-07-21 09:43:30 -07004915{
Paul Turner56f570e2011-11-07 20:26:33 -08004916 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
Paul Turnerec12cb72011-07-21 09:43:30 -07004917 return;
4918
4919 __account_cfs_rq_runtime(cfs_rq, delta_exec);
4920}
4921
Paul Turner85dac902011-07-21 09:43:33 -07004922static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4923{
Paul Turner56f570e2011-11-07 20:26:33 -08004924 return cfs_bandwidth_used() && cfs_rq->throttled;
Paul Turner85dac902011-07-21 09:43:33 -07004925}
4926
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07004927/*
4928 * Check if task is part of a hierarchy where some cfs_rq does not have any
4929 * runtime left.
4930 *
4931 * We can't rely on throttled_hierarchy() to do this test, as
4932 * cfs_rq->throttle_count will not be updated yet when this function is called
4933 * from scheduler_tick()
4934 */
4935static int task_will_be_throttled(struct task_struct *p)
4936{
4937 struct sched_entity *se = &p->se;
4938 struct cfs_rq *cfs_rq;
4939
4940 if (!cfs_bandwidth_used())
4941 return 0;
4942
4943 for_each_sched_entity(se) {
4944 cfs_rq = cfs_rq_of(se);
4945 if (!cfs_rq->runtime_enabled)
4946 continue;
4947 if (cfs_rq->runtime_remaining <= 0)
4948 return 1;
4949 }
4950
4951 return 0;
4952}
4953
Paul Turner64660c82011-07-21 09:43:36 -07004954/* check whether cfs_rq, or any parent, is throttled */
4955static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4956{
Paul Turner56f570e2011-11-07 20:26:33 -08004957 return cfs_bandwidth_used() && cfs_rq->throttle_count;
Paul Turner64660c82011-07-21 09:43:36 -07004958}
4959
4960/*
	4961 * Ensure that neither of the group entities corresponding to src_cpu nor
	4962 * dest_cpu is a member of a throttled hierarchy when performing group
4963 * load-balance operations.
4964 */
4965static inline int throttled_lb_pair(struct task_group *tg,
4966 int src_cpu, int dest_cpu)
4967{
4968 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
4969
4970 src_cfs_rq = tg->cfs_rq[src_cpu];
4971 dest_cfs_rq = tg->cfs_rq[dest_cpu];
4972
4973 return throttled_hierarchy(src_cfs_rq) ||
4974 throttled_hierarchy(dest_cfs_rq);
4975}
4976
4977/* updated child weight may affect parent so we have to do this bottom up */
4978static int tg_unthrottle_up(struct task_group *tg, void *data)
4979{
4980 struct rq *rq = data;
4981 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4982
4983 cfs_rq->throttle_count--;
Paul Turner64660c82011-07-21 09:43:36 -07004984 if (!cfs_rq->throttle_count) {
Paul Turnerf1b17282012-10-04 13:18:31 +02004985 /* adjust cfs_rq_clock_task() */
Frederic Weisbecker78becc22013-04-12 01:51:02 +02004986 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
Paul Turnerf1b17282012-10-04 13:18:31 +02004987 cfs_rq->throttled_clock_task;
Paul Turner64660c82011-07-21 09:43:36 -07004988 }
Paul Turner64660c82011-07-21 09:43:36 -07004989
4990 return 0;
4991}
4992
4993static int tg_throttle_down(struct task_group *tg, void *data)
4994{
4995 struct rq *rq = data;
4996 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4997
Paul Turner82958362012-10-04 13:18:31 +02004998 /* group is entering throttled state, stop time */
4999 if (!cfs_rq->throttle_count)
Frederic Weisbecker78becc22013-04-12 01:51:02 +02005000 cfs_rq->throttled_clock_task = rq_clock_task(rq);
Paul Turner64660c82011-07-21 09:43:36 -07005001 cfs_rq->throttle_count++;
5002
5003 return 0;
5004}
5005
Paul Turnerd3d9dc32011-07-21 09:43:39 -07005006static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner85dac902011-07-21 09:43:33 -07005007{
5008 struct rq *rq = rq_of(cfs_rq);
5009 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5010 struct sched_entity *se;
5011 long task_delta, dequeue = 1;
Peter Zijlstra77a4d1a2015-04-15 11:41:57 +02005012 bool empty;
Paul Turner85dac902011-07-21 09:43:33 -07005013
5014 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
5015
Paul Turnerf1b17282012-10-04 13:18:31 +02005016 /* freeze hierarchy runnable averages while throttled */
Paul Turner64660c82011-07-21 09:43:36 -07005017 rcu_read_lock();
5018 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
5019 rcu_read_unlock();
Paul Turner85dac902011-07-21 09:43:33 -07005020
5021 task_delta = cfs_rq->h_nr_running;
5022 for_each_sched_entity(se) {
5023 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
5024 /* throttled entity or throttle-on-deactivate */
5025 if (!se->on_rq)
5026 break;
5027
5028 if (dequeue)
5029 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
5030 qcfs_rq->h_nr_running -= task_delta;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07005031#ifdef CONFIG_SCHED_HMP
5032 dec_throttled_cfs_rq_hmp_stats(&qcfs_rq->hmp_stats, cfs_rq);
5033#endif
Paul Turner85dac902011-07-21 09:43:33 -07005034
5035 if (qcfs_rq->load.weight)
5036 dequeue = 0;
5037 }
5038
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07005039 if (!se) {
Kirill Tkhai72465442014-05-09 03:00:14 +04005040 sub_nr_running(rq, task_delta);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07005041#ifdef CONFIG_SCHED_HMP
5042 dec_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, cfs_rq);
5043#endif
5044 }
Paul Turner85dac902011-07-21 09:43:33 -07005045
5046 cfs_rq->throttled = 1;
Frederic Weisbecker78becc22013-04-12 01:51:02 +02005047 cfs_rq->throttled_clock = rq_clock(rq);
Paul Turner85dac902011-07-21 09:43:33 -07005048 raw_spin_lock(&cfs_b->lock);
Cong Wangd49db342015-06-24 12:41:47 -07005049 empty = list_empty(&cfs_b->throttled_cfs_rq);
Peter Zijlstra77a4d1a2015-04-15 11:41:57 +02005050
Ben Segallc06f04c2014-06-20 15:21:20 -07005051 /*
5052 * Add to the _head_ of the list, so that an already-started
5053 * distribute_cfs_runtime will not see us
5054 */
5055 list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
Peter Zijlstra77a4d1a2015-04-15 11:41:57 +02005056
5057 /*
5058 * If we're the first throttled task, make sure the bandwidth
5059 * timer is running.
5060 */
5061 if (empty)
5062 start_cfs_bandwidth(cfs_b);
5063
Paul Turner85dac902011-07-21 09:43:33 -07005064 raw_spin_unlock(&cfs_b->lock);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07005065
5066 /* Log effect on hmp stats after throttling */
5067 trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)),
5068 sched_irqload(cpu_of(rq)),
5069 power_cost(cpu_of(rq), 0),
5070 cpu_temp(cpu_of(rq)));
Paul Turner85dac902011-07-21 09:43:33 -07005071}
5072
Peter Zijlstra029632f2011-10-25 10:00:11 +02005073void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner671fd9d2011-07-21 09:43:34 -07005074{
5075 struct rq *rq = rq_of(cfs_rq);
5076 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5077 struct sched_entity *se;
5078 int enqueue = 1;
5079 long task_delta;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07005080 struct cfs_rq *tcfs_rq = cfs_rq;
Paul Turner671fd9d2011-07-21 09:43:34 -07005081
Michael Wang22b958d2013-06-04 14:23:39 +08005082 se = cfs_rq->tg->se[cpu_of(rq)];
Paul Turner671fd9d2011-07-21 09:43:34 -07005083
5084 cfs_rq->throttled = 0;
Frederic Weisbecker1a55af22013-04-12 01:51:01 +02005085
5086 update_rq_clock(rq);
5087
Paul Turner671fd9d2011-07-21 09:43:34 -07005088 raw_spin_lock(&cfs_b->lock);
Frederic Weisbecker78becc22013-04-12 01:51:02 +02005089 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
Paul Turner671fd9d2011-07-21 09:43:34 -07005090 list_del_rcu(&cfs_rq->throttled_list);
5091 raw_spin_unlock(&cfs_b->lock);
5092
Paul Turner64660c82011-07-21 09:43:36 -07005093 /* update hierarchical throttle state */
5094 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
5095
Paul Turner671fd9d2011-07-21 09:43:34 -07005096 if (!cfs_rq->load.weight)
5097 return;
5098
5099 task_delta = cfs_rq->h_nr_running;
5100 for_each_sched_entity(se) {
5101 if (se->on_rq)
5102 enqueue = 0;
5103
5104 cfs_rq = cfs_rq_of(se);
5105 if (enqueue)
5106 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
5107 cfs_rq->h_nr_running += task_delta;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07005108#ifdef CONFIG_SCHED_HMP
5109 inc_throttled_cfs_rq_hmp_stats(&cfs_rq->hmp_stats, tcfs_rq);
5110#endif
Paul Turner671fd9d2011-07-21 09:43:34 -07005111
5112 if (cfs_rq_throttled(cfs_rq))
5113 break;
5114 }
5115
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07005116 if (!se) {
Kirill Tkhai72465442014-05-09 03:00:14 +04005117 add_nr_running(rq, task_delta);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07005118#ifdef CONFIG_SCHED_HMP
5119 inc_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, tcfs_rq);
5120#endif
5121 }
Paul Turner671fd9d2011-07-21 09:43:34 -07005122
5123 /* determine whether we need to wake up potentially idle cpu */
5124 if (rq->curr == rq->idle && rq->cfs.nr_running)
Kirill Tkhai88751252014-06-29 00:03:57 +04005125 resched_curr(rq);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07005126
5127 /* Log effect on hmp stats after un-throttling */
5128 trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)),
5129 sched_irqload(cpu_of(rq)),
5130 power_cost(cpu_of(rq), 0),
5131 cpu_temp(cpu_of(rq)));
Paul Turner671fd9d2011-07-21 09:43:34 -07005132}
5133
5134static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
5135 u64 remaining, u64 expires)
5136{
5137 struct cfs_rq *cfs_rq;
Ben Segallc06f04c2014-06-20 15:21:20 -07005138 u64 runtime;
5139 u64 starting_runtime = remaining;
Paul Turner671fd9d2011-07-21 09:43:34 -07005140
5141 rcu_read_lock();
5142 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
5143 throttled_list) {
5144 struct rq *rq = rq_of(cfs_rq);
5145
5146 raw_spin_lock(&rq->lock);
5147 if (!cfs_rq_throttled(cfs_rq))
5148 goto next;
5149
5150 runtime = -cfs_rq->runtime_remaining + 1;
5151 if (runtime > remaining)
5152 runtime = remaining;
5153 remaining -= runtime;
5154
5155 cfs_rq->runtime_remaining += runtime;
5156 cfs_rq->runtime_expires = expires;
5157
5158 /* we check whether we're throttled above */
5159 if (cfs_rq->runtime_remaining > 0)
5160 unthrottle_cfs_rq(cfs_rq);
5161
5162next:
5163 raw_spin_unlock(&rq->lock);
5164
5165 if (!remaining)
5166 break;
5167 }
5168 rcu_read_unlock();
5169
Ben Segallc06f04c2014-06-20 15:21:20 -07005170 return starting_runtime - remaining;
Paul Turner671fd9d2011-07-21 09:43:34 -07005171}
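/*
 * Illustrative sketch (not part of the kernel build): the distribution loop
 * above on made-up numbers. Each throttled queue is only topped up to +1ns
 * of runtime -- just enough to unthrottle -- so a modest refill can still
 * wake several queues before the budget runs out.
 */
#if 0
#include <stdio.h>

int main(void)
{
	long long deficits[3] = { -1500000, -500000, -3000000 };	/* ns in the red */
	long long remaining = 2500000;		/* freshly refilled budget */
	long long start = remaining;
	int i;

	for (i = 0; i < 3 && remaining; i++) {
		long long runtime = -deficits[i] + 1;	/* just enough to go positive */

		if (runtime > remaining)
			runtime = remaining;
		remaining -= runtime;
		deficits[i] += runtime;

		printf("cfs_rq %d: %lld ns, %s\n", i, deficits[i],
		       deficits[i] > 0 ? "unthrottled" : "still throttled");
	}
	printf("distributed %lld ns\n", start - remaining);
	return 0;
}
#endif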
5172
Paul Turner58088ad2011-07-21 09:43:31 -07005173/*
5174 * Responsible for refilling a task_group's bandwidth and unthrottling its
5175 * cfs_rqs as appropriate. If there has been no activity within the last
5176 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
5177 * used to track this state.
5178 */
5179static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
5180{
Paul Turner671fd9d2011-07-21 09:43:34 -07005181 u64 runtime, runtime_expires;
Ben Segall51f21762014-05-19 15:49:45 -07005182 int throttled;
Paul Turner58088ad2011-07-21 09:43:31 -07005183
Paul Turner58088ad2011-07-21 09:43:31 -07005184 /* no need to continue the timer with no bandwidth constraint */
5185 if (cfs_b->quota == RUNTIME_INF)
Ben Segall51f21762014-05-19 15:49:45 -07005186 goto out_deactivate;
Paul Turner58088ad2011-07-21 09:43:31 -07005187
Paul Turner671fd9d2011-07-21 09:43:34 -07005188 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
Nikhil Raoe8da1b12011-07-21 09:43:40 -07005189 cfs_b->nr_periods += overrun;
Paul Turner671fd9d2011-07-21 09:43:34 -07005190
Ben Segall51f21762014-05-19 15:49:45 -07005191 /*
5192 * idle depends on !throttled (for the case of a large deficit), and if
5193 * we're going inactive then everything else can be deferred
5194 */
5195 if (cfs_b->idle && !throttled)
5196 goto out_deactivate;
Paul Turnera9cf55b2011-07-21 09:43:32 -07005197
5198 __refill_cfs_bandwidth_runtime(cfs_b);
5199
Paul Turner671fd9d2011-07-21 09:43:34 -07005200 if (!throttled) {
5201 /* mark as potentially idle for the upcoming period */
5202 cfs_b->idle = 1;
Ben Segall51f21762014-05-19 15:49:45 -07005203 return 0;
Paul Turner671fd9d2011-07-21 09:43:34 -07005204 }
Paul Turner58088ad2011-07-21 09:43:31 -07005205
Nikhil Raoe8da1b12011-07-21 09:43:40 -07005206 /* account preceding periods in which throttling occurred */
5207 cfs_b->nr_throttled += overrun;
5208
Paul Turner671fd9d2011-07-21 09:43:34 -07005209 runtime_expires = cfs_b->runtime_expires;
Paul Turner671fd9d2011-07-21 09:43:34 -07005210
5211 /*
Ben Segallc06f04c2014-06-20 15:21:20 -07005212 * This check is repeated as we are holding onto the new bandwidth while
5213 * we unthrottle. This can potentially race with an unthrottled group
5214 * trying to acquire new bandwidth from the global pool. This can result
5215 * in us over-using our runtime if it is all used during this loop, but
5216 * only by limited amounts in that extreme case.
Paul Turner671fd9d2011-07-21 09:43:34 -07005217 */
Ben Segallc06f04c2014-06-20 15:21:20 -07005218 while (throttled && cfs_b->runtime > 0) {
5219 runtime = cfs_b->runtime;
Paul Turner671fd9d2011-07-21 09:43:34 -07005220 raw_spin_unlock(&cfs_b->lock);
5221 /* we can't nest cfs_b->lock while distributing bandwidth */
5222 runtime = distribute_cfs_runtime(cfs_b, runtime,
5223 runtime_expires);
5224 raw_spin_lock(&cfs_b->lock);
5225
5226 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
Ben Segallc06f04c2014-06-20 15:21:20 -07005227
5228 cfs_b->runtime -= min(runtime, cfs_b->runtime);
Paul Turner671fd9d2011-07-21 09:43:34 -07005229 }
5230
Paul Turner671fd9d2011-07-21 09:43:34 -07005231 /*
5232 * While we are ensured activity in the period following an
5233 * unthrottle, this also covers the case in which the new bandwidth is
5234 * insufficient to cover the existing bandwidth deficit. (Forcing the
5235 * timer to remain active while there are any throttled entities.)
5236 */
5237 cfs_b->idle = 0;
Paul Turner58088ad2011-07-21 09:43:31 -07005238
Ben Segall51f21762014-05-19 15:49:45 -07005239 return 0;
5240
5241out_deactivate:
Ben Segall51f21762014-05-19 15:49:45 -07005242 return 1;
Paul Turner58088ad2011-07-21 09:43:31 -07005243}
Paul Turnerd3d9dc32011-07-21 09:43:39 -07005244
Paul Turnerd8b49862011-07-21 09:43:41 -07005245/* a cfs_rq won't donate quota below this amount */
5246static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
5247/* minimum remaining period time to redistribute slack quota */
5248static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
5249/* how long we wait to gather additional slack before distributing */
5250static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
5251
Ben Segalldb06e782013-10-16 11:16:17 -07005252/*
5253 * Are we near the end of the current quota period?
5254 *
5255 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
Thomas Gleixner4961b6e2015-04-14 21:09:05 +00005256 * hrtimer base being cleared by hrtimer_start. In the case of
Ben Segalldb06e782013-10-16 11:16:17 -07005257 * migrate_hrtimers, base is never cleared, so we are fine.
5258 */
Paul Turnerd8b49862011-07-21 09:43:41 -07005259static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
5260{
5261 struct hrtimer *refresh_timer = &cfs_b->period_timer;
5262 u64 remaining;
5263
5264	/* if the call-back is running, a quota refresh is already occurring */
5265 if (hrtimer_callback_running(refresh_timer))
5266 return 1;
5267
5268 /* is a quota refresh about to occur? */
5269 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
5270 if (remaining < min_expire)
5271 return 1;
5272
5273 return 0;
5274}
5275
5276static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
5277{
5278 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
5279
5280 /* if there's a quota refresh soon don't bother with slack */
5281 if (runtime_refresh_within(cfs_b, min_left))
5282 return;
5283
Peter Zijlstra4cfafd32015-05-14 12:23:11 +02005284 hrtimer_start(&cfs_b->slack_timer,
5285 ns_to_ktime(cfs_bandwidth_slack_period),
5286 HRTIMER_MODE_REL);
Paul Turnerd8b49862011-07-21 09:43:41 -07005287}
5288
5289/* we know any runtime found here is valid as update_curr() precedes return */
5290static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5291{
5292 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5293 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
5294
5295 if (slack_runtime <= 0)
5296 return;
5297
5298 raw_spin_lock(&cfs_b->lock);
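	/*
	 * Only hand back slack runtime that still belongs to the current
	 * period; stale runtime from an earlier period is simply dropped.
	 */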
5299 if (cfs_b->quota != RUNTIME_INF &&
5300 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
5301 cfs_b->runtime += slack_runtime;
5302
5303 /* we are under rq->lock, defer unthrottling using a timer */
5304 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
5305 !list_empty(&cfs_b->throttled_cfs_rq))
5306 start_cfs_slack_bandwidth(cfs_b);
5307 }
5308 raw_spin_unlock(&cfs_b->lock);
5309
5310 /* even if it's not valid for return we don't want to try again */
5311 cfs_rq->runtime_remaining -= slack_runtime;
5312}
5313
5314static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5315{
Paul Turner56f570e2011-11-07 20:26:33 -08005316 if (!cfs_bandwidth_used())
5317 return;
5318
Paul Turnerfccfdc62011-11-07 20:26:34 -08005319 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
Paul Turnerd8b49862011-07-21 09:43:41 -07005320 return;
5321
5322 __return_cfs_rq_runtime(cfs_rq);
5323}
5324
5325/*
5326 * This is done with a timer (instead of inline with bandwidth return) since
5327 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
5328 */
5329static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
5330{
5331 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
5332 u64 expires;
5333
5334 /* confirm we're still not at a refresh boundary */
Paul Turnerd8b49862011-07-21 09:43:41 -07005335 raw_spin_lock(&cfs_b->lock);
Ben Segalldb06e782013-10-16 11:16:17 -07005336 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
5337 raw_spin_unlock(&cfs_b->lock);
5338 return;
5339 }
5340
Ben Segallc06f04c2014-06-20 15:21:20 -07005341 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
Paul Turnerd8b49862011-07-21 09:43:41 -07005342 runtime = cfs_b->runtime;
Ben Segallc06f04c2014-06-20 15:21:20 -07005343
Paul Turnerd8b49862011-07-21 09:43:41 -07005344 expires = cfs_b->runtime_expires;
5345 raw_spin_unlock(&cfs_b->lock);
5346
5347 if (!runtime)
5348 return;
5349
5350 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
5351
5352 raw_spin_lock(&cfs_b->lock);
5353 if (expires == cfs_b->runtime_expires)
Ben Segallc06f04c2014-06-20 15:21:20 -07005354 cfs_b->runtime -= min(runtime, cfs_b->runtime);
Paul Turnerd8b49862011-07-21 09:43:41 -07005355 raw_spin_unlock(&cfs_b->lock);
5356}
5357
Paul Turnerd3d9dc32011-07-21 09:43:39 -07005358/*
5359 * When a group wakes up we want to make sure that its quota is not already
5360 * expired/exceeded; otherwise it may be allowed to steal additional ticks of
5361 * runtime as update_curr() throttling cannot trigger until it's on-rq.
5362 */
5363static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
5364{
Paul Turner56f570e2011-11-07 20:26:33 -08005365 if (!cfs_bandwidth_used())
5366 return;
5367
Paul Turnerd3d9dc32011-07-21 09:43:39 -07005368 /* an active group must be handled by the update_curr()->put() path */
5369 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
5370 return;
5371
5372 /* ensure the group is not already throttled */
5373 if (cfs_rq_throttled(cfs_rq))
5374 return;
5375
5376 /* update runtime allocation */
5377 account_cfs_rq_runtime(cfs_rq, 0);
5378 if (cfs_rq->runtime_remaining <= 0)
5379 throttle_cfs_rq(cfs_rq);
5380}
5381
Peter Zijlstra55e16d32016-06-22 15:14:26 +02005382static void sync_throttle(struct task_group *tg, int cpu)
5383{
5384 struct cfs_rq *pcfs_rq, *cfs_rq;
5385
5386 if (!cfs_bandwidth_used())
5387 return;
5388
5389 if (!tg->parent)
5390 return;
5391
5392 cfs_rq = tg->cfs_rq[cpu];
5393 pcfs_rq = tg->parent->cfs_rq[cpu];
5394
5395 cfs_rq->throttle_count = pcfs_rq->throttle_count;
Xunlei Pangb8922122016-07-09 15:54:22 +08005396 cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
Peter Zijlstra55e16d32016-06-22 15:14:26 +02005397}
5398
Paul Turnerd3d9dc32011-07-21 09:43:39 -07005399/* conditionally throttle active cfs_rq's from put_prev_entity() */
Peter Zijlstra678d5712012-02-11 06:05:00 +01005400static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
Paul Turnerd3d9dc32011-07-21 09:43:39 -07005401{
Paul Turner56f570e2011-11-07 20:26:33 -08005402 if (!cfs_bandwidth_used())
Peter Zijlstra678d5712012-02-11 06:05:00 +01005403 return false;
Paul Turner56f570e2011-11-07 20:26:33 -08005404
Paul Turnerd3d9dc32011-07-21 09:43:39 -07005405 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
Peter Zijlstra678d5712012-02-11 06:05:00 +01005406 return false;
Paul Turnerd3d9dc32011-07-21 09:43:39 -07005407
5408 /*
5409 * it's possible for a throttled entity to be forced into a running
5410	 * state (e.g. set_curr_task); in this case we're finished.
5411 */
5412 if (cfs_rq_throttled(cfs_rq))
Peter Zijlstra678d5712012-02-11 06:05:00 +01005413 return true;
Paul Turnerd3d9dc32011-07-21 09:43:39 -07005414
5415 throttle_cfs_rq(cfs_rq);
Peter Zijlstra678d5712012-02-11 06:05:00 +01005416 return true;
Paul Turnerd3d9dc32011-07-21 09:43:39 -07005417}
Peter Zijlstra029632f2011-10-25 10:00:11 +02005418
Peter Zijlstra029632f2011-10-25 10:00:11 +02005419static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
5420{
5421 struct cfs_bandwidth *cfs_b =
5422 container_of(timer, struct cfs_bandwidth, slack_timer);
Peter Zijlstra77a4d1a2015-04-15 11:41:57 +02005423
Peter Zijlstra029632f2011-10-25 10:00:11 +02005424 do_sched_cfs_slack_timer(cfs_b);
5425
5426 return HRTIMER_NORESTART;
5427}
5428
5429static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
5430{
5431 struct cfs_bandwidth *cfs_b =
5432 container_of(timer, struct cfs_bandwidth, period_timer);
Peter Zijlstra029632f2011-10-25 10:00:11 +02005433 int overrun;
5434 int idle = 0;
5435
Ben Segall51f21762014-05-19 15:49:45 -07005436 raw_spin_lock(&cfs_b->lock);
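	/*
	 * hrtimer_forward_now() advances the timer in whole periods and
	 * returns how many expired; keep refreshing bandwidth until the
	 * timer has caught up with the current time.
	 */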
Peter Zijlstra029632f2011-10-25 10:00:11 +02005437 for (;;) {
Peter Zijlstra77a4d1a2015-04-15 11:41:57 +02005438 overrun = hrtimer_forward_now(timer, cfs_b->period);
Peter Zijlstra029632f2011-10-25 10:00:11 +02005439 if (!overrun)
5440 break;
5441
5442 idle = do_sched_cfs_period_timer(cfs_b, overrun);
5443 }
Peter Zijlstra4cfafd32015-05-14 12:23:11 +02005444 if (idle)
5445 cfs_b->period_active = 0;
Ben Segall51f21762014-05-19 15:49:45 -07005446 raw_spin_unlock(&cfs_b->lock);
Peter Zijlstra029632f2011-10-25 10:00:11 +02005447
5448 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
5449}
5450
5451void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5452{
5453 raw_spin_lock_init(&cfs_b->lock);
5454 cfs_b->runtime = 0;
5455 cfs_b->quota = RUNTIME_INF;
5456 cfs_b->period = ns_to_ktime(default_cfs_period());
5457
5458 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
Peter Zijlstra4cfafd32015-05-14 12:23:11 +02005459 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
Peter Zijlstra029632f2011-10-25 10:00:11 +02005460 cfs_b->period_timer.function = sched_cfs_period_timer;
5461 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5462 cfs_b->slack_timer.function = sched_cfs_slack_timer;
5463}
5464
5465static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5466{
5467 cfs_rq->runtime_enabled = 0;
5468 INIT_LIST_HEAD(&cfs_rq->throttled_list);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07005469 init_cfs_rq_hmp_stats(cfs_rq);
Peter Zijlstra029632f2011-10-25 10:00:11 +02005470}
5471
Peter Zijlstra77a4d1a2015-04-15 11:41:57 +02005472void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
Peter Zijlstra029632f2011-10-25 10:00:11 +02005473{
Peter Zijlstra4cfafd32015-05-14 12:23:11 +02005474 lockdep_assert_held(&cfs_b->lock);
Peter Zijlstra029632f2011-10-25 10:00:11 +02005475
Peter Zijlstra4cfafd32015-05-14 12:23:11 +02005476 if (!cfs_b->period_active) {
5477 cfs_b->period_active = 1;
5478 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
5479 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
5480 }
Peter Zijlstra029632f2011-10-25 10:00:11 +02005481}
5482
5483static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5484{
Tetsuo Handa7f1a1692014-12-25 15:51:21 +09005485 /* init_cfs_bandwidth() was not called */
5486 if (!cfs_b->throttled_cfs_rq.next)
5487 return;
5488
Peter Zijlstra029632f2011-10-25 10:00:11 +02005489 hrtimer_cancel(&cfs_b->period_timer);
5490 hrtimer_cancel(&cfs_b->slack_timer);
5491}
5492
Kirill Tkhai0e59bda2014-06-25 12:19:42 +04005493static void __maybe_unused update_runtime_enabled(struct rq *rq)
5494{
5495 struct cfs_rq *cfs_rq;
5496
5497 for_each_leaf_cfs_rq(rq, cfs_rq) {
5498 struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
5499
5500 raw_spin_lock(&cfs_b->lock);
5501 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
5502 raw_spin_unlock(&cfs_b->lock);
5503 }
5504}
5505
Arnd Bergmann38dc3342013-01-25 14:14:22 +00005506static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
Peter Zijlstra029632f2011-10-25 10:00:11 +02005507{
5508 struct cfs_rq *cfs_rq;
5509
5510 for_each_leaf_cfs_rq(rq, cfs_rq) {
Peter Zijlstra029632f2011-10-25 10:00:11 +02005511 if (!cfs_rq->runtime_enabled)
5512 continue;
5513
5514 /*
5515 * clock_task is not advancing so we just need to make sure
5516 * there's some valid quota amount
5517 */
Ben Segall51f21762014-05-19 15:49:45 -07005518 cfs_rq->runtime_remaining = 1;
Kirill Tkhai0e59bda2014-06-25 12:19:42 +04005519 /*
5520 * Offline rq is schedulable till cpu is completely disabled
5521 * in take_cpu_down(), so we prevent new cfs throttling here.
5522 */
5523 cfs_rq->runtime_enabled = 0;
5524
Peter Zijlstra029632f2011-10-25 10:00:11 +02005525 if (cfs_rq_throttled(cfs_rq))
5526 unthrottle_cfs_rq(cfs_rq);
5527 }
5528}
5529
5530#else /* CONFIG_CFS_BANDWIDTH */
Paul Turnerf1b17282012-10-04 13:18:31 +02005531static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
5532{
Frederic Weisbecker78becc22013-04-12 01:51:02 +02005533 return rq_clock_task(rq_of(cfs_rq));
Paul Turnerf1b17282012-10-04 13:18:31 +02005534}
5535
Peter Zijlstra9dbdb152013-11-18 18:27:06 +01005536static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
Peter Zijlstra678d5712012-02-11 06:05:00 +01005537static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
Paul Turnerd3d9dc32011-07-21 09:43:39 -07005538static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
Peter Zijlstra55e16d32016-06-22 15:14:26 +02005539static inline void sync_throttle(struct task_group *tg, int cpu) {}
Peter Zijlstra6c16a6d2012-03-21 13:07:16 -07005540static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Paul Turner85dac902011-07-21 09:43:33 -07005541
5542static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5543{
5544 return 0;
5545}
Paul Turner64660c82011-07-21 09:43:36 -07005546
5547static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5548{
5549 return 0;
5550}
5551
5552static inline int throttled_lb_pair(struct task_group *tg,
5553 int src_cpu, int dest_cpu)
5554{
5555 return 0;
5556}
Peter Zijlstra029632f2011-10-25 10:00:11 +02005557
5558void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5559
5560#ifdef CONFIG_FAIR_GROUP_SCHED
5561static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Paul Turnerab84d312011-07-21 09:43:28 -07005562#endif
5563
Peter Zijlstra029632f2011-10-25 10:00:11 +02005564static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5565{
5566 return NULL;
5567}
5568static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
Kirill Tkhai0e59bda2014-06-25 12:19:42 +04005569static inline void update_runtime_enabled(struct rq *rq) {}
Peter Boonstoppela4c96ae2012-08-09 15:34:47 -07005570static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
Peter Zijlstra029632f2011-10-25 10:00:11 +02005571
5572#endif /* CONFIG_CFS_BANDWIDTH */
5573
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005574/**************************************************
5575 * CFS operations on tasks:
5576 */
5577
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005578#ifdef CONFIG_SCHED_HRTICK
5579static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
5580{
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005581 struct sched_entity *se = &p->se;
5582 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5583
Peter Zijlstra9148a3a2016-09-20 22:34:51 +02005584 SCHED_WARN_ON(task_rq(p) != rq);
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005585
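	/*
	 * With more than one runnable task, arm the hrtick to fire when p's
	 * current slice runs out; if it is already used up, resched now.
	 */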
Srivatsa Vaddagiri8bf46a32016-09-16 18:28:51 -07005586 if (rq->cfs.h_nr_running > 1) {
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005587 u64 slice = sched_slice(cfs_rq, se);
5588 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
5589 s64 delta = slice - ran;
5590
5591 if (delta < 0) {
5592 if (rq->curr == p)
Kirill Tkhai88751252014-06-29 00:03:57 +04005593 resched_curr(rq);
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005594 return;
5595 }
Peter Zijlstra31656512008-07-18 18:01:23 +02005596 hrtick_start(rq, delta);
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005597 }
5598}
Peter Zijlstraa4c2f002008-10-17 19:27:03 +02005599
5600/*
5601 * called from enqueue/dequeue and updates the hrtick when the
5602 * current task is from our class and nr_running is low enough
5603 * to matter.
5604 */
5605static void hrtick_update(struct rq *rq)
5606{
5607 struct task_struct *curr = rq->curr;
5608
Mike Galbraithb39e66e2011-11-22 15:20:07 +01005609 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
Peter Zijlstraa4c2f002008-10-17 19:27:03 +02005610 return;
5611
5612 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
5613 hrtick_start_fair(rq, curr);
5614}
Dhaval Giani55e12e52008-06-24 23:39:43 +05305615#else /* !CONFIG_SCHED_HRTICK */
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005616static inline void
5617hrtick_start_fair(struct rq *rq, struct task_struct *p)
5618{
5619}
Peter Zijlstraa4c2f002008-10-17 19:27:03 +02005620
5621static inline void hrtick_update(struct rq *rq)
5622{
5623}
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005624#endif
5625
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005626/*
5627 * The enqueue_task method is called before nr_running is
5628 * increased. Here we update the fair scheduling stats and
5629 * then put the task into the rbtree:
5630 */
Thomas Gleixnerea87bb72010-01-20 20:58:57 +00005631static void
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01005632enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005633{
5634 struct cfs_rq *cfs_rq;
Peter Zijlstra62fb1852008-02-25 17:34:02 +01005635 struct sched_entity *se = &p->se;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005636
Rafael J. Wysocki8c34ab12016-09-09 23:59:33 +02005637 /*
5638 * If in_iowait is set, the code below may not trigger any cpufreq
5639 * utilization updates, so do it here explicitly with the IOWAIT flag
5640 * passed.
5641 */
5642 if (p->in_iowait)
5643 cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
5644
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005645 for_each_sched_entity(se) {
Peter Zijlstra62fb1852008-02-25 17:34:02 +01005646 if (se->on_rq)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005647 break;
5648 cfs_rq = cfs_rq_of(se);
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01005649 enqueue_entity(cfs_rq, se, flags);
Paul Turner85dac902011-07-21 09:43:33 -07005650
5651 /*
5652 * end evaluation on encountering a throttled cfs_rq
5653 *
5654 * note: in the case of encountering a throttled cfs_rq we will
5655 * post the final h_nr_running increment below.
Peter Zijlstrae210bff2016-06-16 18:51:48 +02005656 */
Paul Turner85dac902011-07-21 09:43:33 -07005657 if (cfs_rq_throttled(cfs_rq))
5658 break;
Paul Turner953bfcd2011-07-21 09:43:27 -07005659 cfs_rq->h_nr_running++;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07005660 inc_cfs_rq_hmp_stats(cfs_rq, p, 1);
Paul Turner85dac902011-07-21 09:43:33 -07005661
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01005662 flags = ENQUEUE_WAKEUP;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005663 }
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005664
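	/*
	 * The loop above stopped at the first ancestor that was already on
	 * the rq (or at a throttled cfs_rq); the remaining ancestors still
	 * need their hierarchical counts, load averages and shares updated.
	 */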
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005665 for_each_sched_entity(se) {
Lin Ming0f317142011-07-22 09:14:31 +08005666 cfs_rq = cfs_rq_of(se);
Paul Turner953bfcd2011-07-21 09:43:27 -07005667 cfs_rq->h_nr_running++;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07005668 inc_cfs_rq_hmp_stats(cfs_rq, p, 1);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005669
Paul Turner85dac902011-07-21 09:43:33 -07005670 if (cfs_rq_throttled(cfs_rq))
5671 break;
5672
Yuyang Du9d89c252015-07-15 08:04:37 +08005673 update_load_avg(se, 1);
Linus Torvalds17bc14b2012-12-14 07:20:43 -08005674 update_cfs_shares(cfs_rq);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005675 }
5676
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07005677 if (!se) {
Kirill Tkhai72465442014-05-09 03:00:14 +04005678 add_nr_running(rq, 1);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07005679 inc_rq_hmp_stats(rq, p, 1);
5680 }
Yuyang Ducd126af2015-07-15 08:04:36 +08005681
Peter Zijlstraa4c2f002008-10-17 19:27:03 +02005682 hrtick_update(rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005683}
5684
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07005685static void set_next_buddy(struct sched_entity *se);
5686
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005687/*
5688 * The dequeue_task method is called before nr_running is
5689 * decreased. We remove the task from the rbtree and
5690 * update the fair scheduling stats:
5691 */
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01005692static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005693{
5694 struct cfs_rq *cfs_rq;
Peter Zijlstra62fb1852008-02-25 17:34:02 +01005695 struct sched_entity *se = &p->se;
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07005696 int task_sleep = flags & DEQUEUE_SLEEP;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005697
5698 for_each_sched_entity(se) {
5699 cfs_rq = cfs_rq_of(se);
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01005700 dequeue_entity(cfs_rq, se, flags);
Paul Turner85dac902011-07-21 09:43:33 -07005701
5702 /*
5703 * end evaluation on encountering a throttled cfs_rq
5704 *
5705 * note: in the case of encountering a throttled cfs_rq we will
5706 * post the final h_nr_running decrement below.
5707 */
5708 if (cfs_rq_throttled(cfs_rq))
5709 break;
Paul Turner953bfcd2011-07-21 09:43:27 -07005710 cfs_rq->h_nr_running--;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07005711 dec_cfs_rq_hmp_stats(cfs_rq, p, 1);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005712
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005713 /* Don't dequeue parent if it has other entities besides us */
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07005714 if (cfs_rq->load.weight) {
Konstantin Khlebnikov754bd592016-06-16 15:57:15 +03005715 /* Avoid re-evaluating load for this entity: */
5716 se = parent_entity(se);
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07005717 /*
5718 * Bias pick_next to pick a task from this cfs_rq, as
5719 * p is sleeping when it is within its sched_slice.
5720 */
Konstantin Khlebnikov754bd592016-06-16 15:57:15 +03005721 if (task_sleep && se && !throttled_hierarchy(cfs_rq))
5722 set_next_buddy(se);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005723 break;
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07005724 }
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01005725 flags |= DEQUEUE_SLEEP;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005726 }
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005727
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005728 for_each_sched_entity(se) {
Lin Ming0f317142011-07-22 09:14:31 +08005729 cfs_rq = cfs_rq_of(se);
Paul Turner953bfcd2011-07-21 09:43:27 -07005730 cfs_rq->h_nr_running--;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07005731 dec_cfs_rq_hmp_stats(cfs_rq, p, 1);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005732
Paul Turner85dac902011-07-21 09:43:33 -07005733 if (cfs_rq_throttled(cfs_rq))
5734 break;
5735
Yuyang Du9d89c252015-07-15 08:04:37 +08005736 update_load_avg(se, 1);
Linus Torvalds17bc14b2012-12-14 07:20:43 -08005737 update_cfs_shares(cfs_rq);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005738 }
5739
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07005740 if (!se) {
Kirill Tkhai72465442014-05-09 03:00:14 +04005741 sub_nr_running(rq, 1);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07005742 dec_rq_hmp_stats(rq, p, 1);
5743 }
Yuyang Ducd126af2015-07-15 08:04:36 +08005744
Peter Zijlstraa4c2f002008-10-17 19:27:03 +02005745 hrtick_update(rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005746}
5747
Gregory Haskinse7693a32008-01-25 21:08:09 +01005748#ifdef CONFIG_SMP
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02005749
5750/* Working cpumask for: load_balance, load_balance_newidle. */
5751DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
5752DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
5753
Frederic Weisbecker9fd81dd2016-04-19 17:36:51 +02005754#ifdef CONFIG_NO_HZ_COMMON
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005755/*
5756 * per rq 'load' array crap; XXX kill this.
5757 */
5758
5759/*
Peter Zijlstrad937cdc2015-10-19 13:49:30 +02005760 * The exact cpuload calculated at every tick would be:
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005761 *
Peter Zijlstrad937cdc2015-10-19 13:49:30 +02005762 * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
5763 *
5764 * If a cpu misses updates for n ticks (as it was idle) and update gets
5765 * called on the n+1-th tick when cpu may be busy, then we have:
5766 *
5767 * load_n = (1 - 1/2^i)^n * load_0
5768 * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005769 *
5770 * decay_load_missed() below does efficient calculation of
Peter Zijlstrad937cdc2015-10-19 13:49:30 +02005771 *
5772 * load' = (1 - 1/2^i)^n * load
5773 *
5774 * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
5775 * This allows us to precompute the above in said factors, thereby allowing the
5776 * reduction of an arbitrary n in O(log_2 n) steps. (See also
5777 * fixed_power_int())
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005778 *
5779 * The calculation is approximated on a 128 point scale.
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005780 */
5781#define DEGRADE_SHIFT 7
Peter Zijlstrad937cdc2015-10-19 13:49:30 +02005782
5783static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
5784static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
5785 { 0, 0, 0, 0, 0, 0, 0, 0 },
5786 { 64, 32, 8, 0, 0, 0, 0, 0 },
5787 { 96, 72, 40, 12, 1, 0, 0, 0 },
5788 { 112, 98, 75, 43, 15, 1, 0, 0 },
5789 { 120, 112, 98, 76, 45, 16, 2, 0 }
5790};
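/*
 * Example: with idx == 2 and 5 missed ticks, the set bits of 5 (0b101)
 * pick degrade_factor[2][0] = 96 and degrade_factor[2][2] = 40, scaling
 * load by 96/128 * 40/128 ~= 0.23, which approximates
 * (1 - 1/2^2)^5 ~= 0.237 on the 128 point scale.
 */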
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005791
5792/*
5793 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
5794 * would be when CPU is idle and so we just decay the old load without
5795 * adding any new load.
5796 */
5797static unsigned long
5798decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
5799{
5800 int j = 0;
5801
5802 if (!missed_updates)
5803 return load;
5804
5805 if (missed_updates >= degrade_zero_ticks[idx])
5806 return 0;
5807
5808 if (idx == 1)
5809 return load >> missed_updates;
5810
5811 while (missed_updates) {
5812 if (missed_updates % 2)
5813 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
5814
5815 missed_updates >>= 1;
5816 j++;
5817 }
5818 return load;
5819}
Frederic Weisbecker9fd81dd2016-04-19 17:36:51 +02005820#endif /* CONFIG_NO_HZ_COMMON */
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005821
Byungchul Park59543272015-10-14 18:47:35 +09005822/**
Frederic Weisbeckercee1afc2016-04-13 15:56:50 +02005823 * cpu_load_update - update the rq->cpu_load[] statistics
Byungchul Park59543272015-10-14 18:47:35 +09005824 * @this_rq: The rq to update statistics for
5825 * @this_load: The current load
5826 * @pending_updates: The number of missed updates
Byungchul Park59543272015-10-14 18:47:35 +09005827 *
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005828 * Update rq->cpu_load[] statistics. This function is usually called every
Byungchul Park59543272015-10-14 18:47:35 +09005829 * scheduler tick (TICK_NSEC).
5830 *
5831 * This function computes a decaying average:
5832 *
5833 * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
5834 *
5835 * Because of NOHZ it might not get called on every tick which gives need for
5836 * the @pending_updates argument.
5837 *
5838 * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
5839 * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
5840 * = A * (A * load[i]_n-2 + B) + B
5841 * = A * (A * (A * load[i]_n-3 + B) + B) + B
5842 * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
5843 * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
5844 * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
5845 * = (1 - 1/2^i)^n * (load[i]_0 - load) + load
5846 *
5847 * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
5848 * any change in load would have resulted in the tick being turned back on.
5849 *
5850 * For regular NOHZ, this reduces to:
5851 *
5852 * load[i]_n = (1 - 1/2^i)^n * load[i]_0
5853 *
5854 * see decay_load_missed(). For NOHZ_FULL we get to subtract and add the extra
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005855 * term.
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005856 */
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005857static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
5858 unsigned long pending_updates)
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005859{
Frederic Weisbecker9fd81dd2016-04-19 17:36:51 +02005860 unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005861 int i, scale;
5862
5863 this_rq->nr_load_updates++;
5864
5865 /* Update our load: */
5866 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
5867 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
5868 unsigned long old_load, new_load;
5869
5870 /* scale is effectively 1 << i now, and >> i divides by scale */
5871
Byungchul Park7400d3b2016-01-15 16:07:49 +09005872 old_load = this_rq->cpu_load[i];
Frederic Weisbecker9fd81dd2016-04-19 17:36:51 +02005873#ifdef CONFIG_NO_HZ_COMMON
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005874 old_load = decay_load_missed(old_load, pending_updates - 1, i);
Byungchul Park7400d3b2016-01-15 16:07:49 +09005875 if (tickless_load) {
5876 old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
5877 /*
5878 * old_load can never be a negative value because a
5879 * decayed tickless_load cannot be greater than the
5880 * original tickless_load.
5881 */
5882 old_load += tickless_load;
5883 }
Frederic Weisbecker9fd81dd2016-04-19 17:36:51 +02005884#endif
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005885 new_load = this_load;
5886 /*
5887 * Round up the averaging division if load is increasing. This
5888 * prevents us from getting stuck on 9 if the load is 10, for
5889 * example.
5890 */
5891 if (new_load > old_load)
5892 new_load += scale - 1;
5893
5894 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
5895 }
5896
5897 sched_avg_update(this_rq);
5898}
5899
Yuyang Du7ea241a2015-07-15 08:04:42 +08005900/* Used instead of source_load when we know the type == 0 */
5901static unsigned long weighted_cpuload(const int cpu)
5902{
5903 return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
5904}
5905
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005906#ifdef CONFIG_NO_HZ_COMMON
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005907/*
5908 * There is no sane way to deal with nohz on smp when using jiffies because the
5909 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
5910 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
5911 *
5912 * Therefore we need to avoid the delta approach from the regular tick when
5913 * possible since that would seriously skew the load calculation. This is why we
5914 * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
5915 * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
5916 * loop exit, nohz_idle_balance, nohz full exit...)
5917 *
5918 * This means we might still be one tick off for nohz periods.
5919 */
5920
5921static void cpu_load_update_nohz(struct rq *this_rq,
5922 unsigned long curr_jiffies,
5923 unsigned long load)
Frederic Weisbeckerbe68a682016-01-13 17:01:29 +01005924{
5925 unsigned long pending_updates;
5926
5927 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
5928 if (pending_updates) {
5929 this_rq->last_load_update_tick = curr_jiffies;
5930 /*
5931		 * In the regular NOHZ case, we were idle, which means load 0.
5932		 * In the NOHZ_FULL case, we were non-idle, so we should consider
5933 * its weighted load.
5934 */
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005935 cpu_load_update(this_rq, load, pending_updates);
Frederic Weisbeckerbe68a682016-01-13 17:01:29 +01005936 }
5937}
5938
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005939/*
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005940 * Called from nohz_idle_balance() to update the load ratings before doing the
5941 * idle balance.
5942 */
Frederic Weisbeckercee1afc2016-04-13 15:56:50 +02005943static void cpu_load_update_idle(struct rq *this_rq)
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005944{
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005945 /*
5946 * bail if there's load or we're actually up-to-date.
5947 */
Frederic Weisbeckerbe68a682016-01-13 17:01:29 +01005948 if (weighted_cpuload(cpu_of(this_rq)))
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005949 return;
5950
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005951 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005952}
5953
5954/*
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005955 * Record CPU load on nohz entry so we know the tickless load to account
5956 * on nohz exit. cpu_load[0] happens then to be updated more frequently
5957 * than other cpu_load[idx] but it should be fine as cpu_load readers
5958 * shouldn't rely on synchronized cpu_load[*] updates.
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005959 */
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005960void cpu_load_update_nohz_start(void)
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005961{
5962 struct rq *this_rq = this_rq();
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005963
5964 /*
5965 * This is all lockless but should be fine. If weighted_cpuload changes
5966 * concurrently we'll exit nohz. And cpu_load write can race with
5967 * cpu_load_update_idle() but both updater would be writing the same.
5968 */
5969 this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
5970}
5971
5972/*
5973 * Account the tickless load in the end of a nohz frame.
5974 */
5975void cpu_load_update_nohz_stop(void)
5976{
Jason Low316c1608d2015-04-28 13:00:20 -07005977 unsigned long curr_jiffies = READ_ONCE(jiffies);
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005978 struct rq *this_rq = this_rq();
5979 unsigned long load;
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005980
5981 if (curr_jiffies == this_rq->last_load_update_tick)
5982 return;
5983
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005984 load = weighted_cpuload(cpu_of(this_rq));
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005985 raw_spin_lock(&this_rq->lock);
Matt Flemingb52fad22016-05-03 20:46:54 +01005986 update_rq_clock(this_rq);
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005987 cpu_load_update_nohz(this_rq, curr_jiffies, load);
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005988 raw_spin_unlock(&this_rq->lock);
5989}
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005990#else /* !CONFIG_NO_HZ_COMMON */
5991static inline void cpu_load_update_nohz(struct rq *this_rq,
5992 unsigned long curr_jiffies,
5993 unsigned long load) { }
5994#endif /* CONFIG_NO_HZ_COMMON */
5995
5996static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
5997{
Frederic Weisbecker9fd81dd2016-04-19 17:36:51 +02005998#ifdef CONFIG_NO_HZ_COMMON
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005999 /* See the mess around cpu_load_update_nohz(). */
6000 this_rq->last_load_update_tick = READ_ONCE(jiffies);
Frederic Weisbecker9fd81dd2016-04-19 17:36:51 +02006001#endif
Frederic Weisbecker1f419062016-04-13 15:56:51 +02006002 cpu_load_update(this_rq, load, 1);
6003}
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02006004
6005/*
6006 * Called from scheduler_tick()
6007 */
Frederic Weisbeckercee1afc2016-04-13 15:56:50 +02006008void cpu_load_update_active(struct rq *this_rq)
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02006009{
Yuyang Du7ea241a2015-07-15 08:04:42 +08006010 unsigned long load = weighted_cpuload(cpu_of(this_rq));
Frederic Weisbecker1f419062016-04-13 15:56:51 +02006011
6012 if (tick_nohz_tick_stopped())
6013 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
6014 else
6015 cpu_load_update_periodic(this_rq, load);
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02006016}
6017
Peter Zijlstra029632f2011-10-25 10:00:11 +02006018/*
6019 * Return a low guess at the load of a migration-source cpu weighted
6020 * according to the scheduling class and "nice" value.
6021 *
6022 * We want to under-estimate the load of migration sources, to
6023 * balance conservatively.
6024 */
6025static unsigned long source_load(int cpu, int type)
6026{
6027 struct rq *rq = cpu_rq(cpu);
6028 unsigned long total = weighted_cpuload(cpu);
6029
6030 if (type == 0 || !sched_feat(LB_BIAS))
6031 return total;
6032
6033 return min(rq->cpu_load[type-1], total);
6034}
6035
6036/*
6037 * Return a high guess at the load of a migration-target cpu weighted
6038 * according to the scheduling class and "nice" value.
6039 */
6040static unsigned long target_load(int cpu, int type)
6041{
6042 struct rq *rq = cpu_rq(cpu);
6043 unsigned long total = weighted_cpuload(cpu);
6044
6045 if (type == 0 || !sched_feat(LB_BIAS))
6046 return total;
6047
6048 return max(rq->cpu_load[type-1], total);
6049}
6050
Nicolas Pitreced549f2014-05-26 18:19:38 -04006051static unsigned long capacity_of(int cpu)
Peter Zijlstra029632f2011-10-25 10:00:11 +02006052{
Nicolas Pitreced549f2014-05-26 18:19:38 -04006053 return cpu_rq(cpu)->cpu_capacity;
Peter Zijlstra029632f2011-10-25 10:00:11 +02006054}
6055
Vincent Guittotca6d75e2015-02-27 16:54:09 +01006056static unsigned long capacity_orig_of(int cpu)
6057{
6058 return cpu_rq(cpu)->cpu_capacity_orig;
6059}
6060
Peter Zijlstra029632f2011-10-25 10:00:11 +02006061static unsigned long cpu_avg_load_per_task(int cpu)
6062{
6063 struct rq *rq = cpu_rq(cpu);
Jason Low316c1608d2015-04-28 13:00:20 -07006064 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
Yuyang Du7ea241a2015-07-15 08:04:42 +08006065 unsigned long load_avg = weighted_cpuload(cpu);
Peter Zijlstra029632f2011-10-25 10:00:11 +02006066
6067 if (nr_running)
Alex Shib92486c2013-06-20 10:18:50 +08006068 return load_avg / nr_running;
Peter Zijlstra029632f2011-10-25 10:00:11 +02006069
6070 return 0;
6071}
6072
Peter Zijlstrabb3469a2008-06-27 13:41:27 +02006073#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstraf5bfb7d2008-06-27 13:41:39 +02006074/*
6075 * effective_load() calculates the load change as seen from the root_task_group
6076 *
6077 * Adding load to a group doesn't make a group heavier, but can cause movement
6078 * of group shares between cpus. Assuming the shares were perfectly aligned one
6079 * can calculate the shift in shares.
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02006080 *
6081 * Calculate the effective load difference if @wl is added (subtracted) to @tg
6082 * on this @cpu and results in a total addition (subtraction) of @wg to the
6083 * total group weight.
6084 *
6085 * Given a runqueue weight distribution (rw_i) we can compute a shares
6086 * distribution (s_i) using:
6087 *
6088 * s_i = rw_i / \Sum rw_j (1)
6089 *
6090 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
6091 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
6092 * shares distribution (s_i):
6093 *
6094 * rw_i = { 2, 4, 1, 0 }
6095 * s_i = { 2/7, 4/7, 1/7, 0 }
6096 *
6097 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
6098 * task used to run on and the CPU the waker is running on), we need to
6099 * compute the effect of waking a task on either CPU and, in case of a sync
6100 * wakeup, compute the effect of the current task going to sleep.
6101 *
6102 * So for a change of @wl to the local @cpu with an overall group weight change
6103 * of @wl we can compute the new shares distribution (s'_i) using:
6104 *
6105 * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
6106 *
6107 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
6108 * differences in waking a task to CPU 0. The additional task changes the
6109 * weight and shares distributions like:
6110 *
6111 * rw'_i = { 3, 4, 1, 0 }
6112 * s'_i = { 3/8, 4/8, 1/8, 0 }
6113 *
6114 * We can then compute the difference in effective weight by using:
6115 *
6116 * dw_i = S * (s'_i - s_i) (3)
6117 *
6118 * Where 'S' is the group weight as seen by its parent.
6119 *
6120 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
6121 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
6122 * 4/7) times the weight of the group.
Peter Zijlstraf5bfb7d2008-06-27 13:41:39 +02006123 */
Peter Zijlstra2069dd72010-11-15 15:47:00 -08006124static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
Peter Zijlstrabb3469a2008-06-27 13:41:27 +02006125{
Peter Zijlstra4be9daa2008-06-27 13:41:30 +02006126 struct sched_entity *se = tg->se[cpu];
Peter Zijlstraf1d239f2008-06-27 13:41:38 +02006127
Rik van Riel9722c2d2014-01-06 11:39:12 +00006128 if (!tg->parent) /* the trivial, non-cgroup case */
Peter Zijlstraf1d239f2008-06-27 13:41:38 +02006129 return wl;
6130
Peter Zijlstra4be9daa2008-06-27 13:41:30 +02006131 for_each_sched_entity(se) {
Peter Zijlstra7dd49122016-06-24 15:53:54 +02006132 struct cfs_rq *cfs_rq = se->my_q;
6133 long W, w = cfs_rq_load_avg(cfs_rq);
Peter Zijlstrabb3469a2008-06-27 13:41:27 +02006134
Peter Zijlstra7dd49122016-06-24 15:53:54 +02006135 tg = cfs_rq->tg;
Peter Zijlstra4be9daa2008-06-27 13:41:30 +02006136
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02006137 /*
6138 * W = @wg + \Sum rw_j
6139 */
Peter Zijlstra7dd49122016-06-24 15:53:54 +02006140 W = wg + atomic_long_read(&tg->load_avg);
6141
6142 /* Ensure \Sum rw_j >= rw_i */
6143 W -= cfs_rq->tg_load_avg_contrib;
6144 W += w;
Peter Zijlstra4be9daa2008-06-27 13:41:30 +02006145
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02006146 /*
6147 * w = rw_i + @wl
6148 */
Peter Zijlstra7dd49122016-06-24 15:53:54 +02006149 w += wl;
Peter Zijlstra940959e2008-09-23 15:33:42 +02006150
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02006151 /*
6152 * wl = S * s'_i; see (2)
6153 */
6154 if (W > 0 && w < W)
Dietmar Eggemannab522e32016-08-22 15:00:41 +01006155 wl = (w * (long)scale_load_down(tg->shares)) / W;
Paul Turner977dda72011-01-14 17:57:50 -08006156 else
Dietmar Eggemannab522e32016-08-22 15:00:41 +01006157 wl = scale_load_down(tg->shares);
Peter Zijlstra940959e2008-09-23 15:33:42 +02006158
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02006159 /*
6160 * Per the above, wl is the new se->load.weight value; since
6161 * those are clipped to [MIN_SHARES, ...) do so now. See
6162 * calc_cfs_shares().
6163 */
Paul Turner977dda72011-01-14 17:57:50 -08006164 if (wl < MIN_SHARES)
6165 wl = MIN_SHARES;
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02006166
6167 /*
6168 * wl = dw_i = S * (s'_i - s_i); see (3)
6169 */
Yuyang Du9d89c252015-07-15 08:04:37 +08006170 wl -= se->avg.load_avg;
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02006171
6172 /*
6173 * Recursively apply this logic to all parent groups to compute
6174 * the final effective load change on the root group. Since
6175 * only the @tg group gets extra weight, all parent groups can
6176 * only redistribute existing shares. @wl is the shift in shares
6177 * resulting from this level per the above.
6178 */
Peter Zijlstra4be9daa2008-06-27 13:41:30 +02006179 wg = 0;
Peter Zijlstra4be9daa2008-06-27 13:41:30 +02006180 }
6181
6182 return wl;
Peter Zijlstrabb3469a2008-06-27 13:41:27 +02006183}
6184#else
Peter Zijlstra4be9daa2008-06-27 13:41:30 +02006185
Mel Gorman58d081b2013-10-07 11:29:10 +01006186static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
Peter Zijlstra4be9daa2008-06-27 13:41:30 +02006187{
Peter Zijlstra83378262008-06-27 13:41:37 +02006188 return wl;
Peter Zijlstrabb3469a2008-06-27 13:41:27 +02006189}
Peter Zijlstra4be9daa2008-06-27 13:41:30 +02006190
Peter Zijlstrabb3469a2008-06-27 13:41:27 +02006191#endif
6192
Peter Zijlstrac58d25f2016-05-12 09:19:59 +02006193static void record_wakee(struct task_struct *p)
6194{
6195 /*
6196	 * Only decay a single time; tasks that have less than 1 wakeup per
6197 * jiffy will not have built up many flips.
6198 */
6199 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
6200 current->wakee_flips >>= 1;
6201 current->wakee_flip_decay_ts = jiffies;
6202 }
6203
6204 if (current->last_wakee != p) {
6205 current->last_wakee = p;
6206 current->wakee_flips++;
6207 }
6208}
6209
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02006210/*
6211 * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
Peter Zijlstrac58d25f2016-05-12 09:19:59 +02006212 *
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02006213 * A waker of many should wake a different task than the one last awakened
Peter Zijlstrac58d25f2016-05-12 09:19:59 +02006214 * at a frequency roughly N times higher than one of its wakees.
6215 *
6216 * In order to determine whether we should let the load spread vs consolidating
6217 * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
6218 * partner, and a factor of llc_size higher frequency in the other.
6219 *
6220 * With both conditions met, we can be relatively sure that the relationship is
6221 * non-monogamous, with partner count exceeding socket size.
6222 *
6223 * Waker/wakee being client/server, worker/dispatcher, interrupt source or
6224 * whatever is irrelevant; the spread criterion is that the apparent partner
6225 * count exceeds the socket size.
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02006226 */
Michael Wang62470412013-07-04 12:55:51 +08006227static int wake_wide(struct task_struct *p)
6228{
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02006229 unsigned int master = current->wakee_flips;
6230 unsigned int slave = p->wakee_flips;
Peter Zijlstra7d9ffa82013-07-04 12:56:46 +08006231 int factor = this_cpu_read(sd_llc_size);
Michael Wang62470412013-07-04 12:55:51 +08006232
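	/*
	 * Make master the heavier flipper of the pair: waking wide requires
	 * the lighter side to flip at least llc_size times and the heavier
	 * side llc_size times more often than the lighter one.
	 */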
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02006233 if (master < slave)
6234 swap(master, slave);
6235 if (slave < factor || master < slave * factor)
6236 return 0;
6237 return 1;
Michael Wang62470412013-07-04 12:55:51 +08006238}
6239
Morten Rasmussen772bd008c2016-06-22 18:03:13 +01006240static int wake_affine(struct sched_domain *sd, struct task_struct *p,
6241 int prev_cpu, int sync)
Ingo Molnar098fb9d2008-03-16 20:36:10 +01006242{
Paul Turnere37b6a72011-01-21 20:44:59 -08006243 s64 this_load, load;
Vincent Guittotbd61c982014-08-26 13:06:50 +02006244 s64 this_eff_load, prev_eff_load;
Morten Rasmussen772bd008c2016-06-22 18:03:13 +01006245 int idx, this_cpu;
Peter Zijlstrac88d5912009-09-10 13:50:02 +02006246 struct task_group *tg;
Peter Zijlstra83378262008-06-27 13:41:37 +02006247 unsigned long weight;
Mike Galbraithb3137bc2008-05-29 11:11:41 +02006248 int balanced;
Ingo Molnar098fb9d2008-03-16 20:36:10 +01006249
Peter Zijlstrac88d5912009-09-10 13:50:02 +02006250 idx = sd->wake_idx;
6251 this_cpu = smp_processor_id();
Peter Zijlstrac88d5912009-09-10 13:50:02 +02006252 load = source_load(prev_cpu, idx);
6253 this_load = target_load(this_cpu, idx);
Ingo Molnar098fb9d2008-03-16 20:36:10 +01006254
6255 /*
Ingo Molnar098fb9d2008-03-16 20:36:10 +01006256 * If sync wakeup then subtract the (maximum possible)
6257 * effect of the currently running task from the load
6258 * of the current CPU:
6259 */
Peter Zijlstra83378262008-06-27 13:41:37 +02006260 if (sync) {
6261 tg = task_group(current);
Yuyang Du9d89c252015-07-15 08:04:37 +08006262 weight = current->se.avg.load_avg;
Ingo Molnar098fb9d2008-03-16 20:36:10 +01006263
Peter Zijlstrac88d5912009-09-10 13:50:02 +02006264 this_load += effective_load(tg, this_cpu, -weight, -weight);
Peter Zijlstra83378262008-06-27 13:41:37 +02006265 load += effective_load(tg, prev_cpu, 0, -weight);
6266 }
6267
6268 tg = task_group(p);
Yuyang Du9d89c252015-07-15 08:04:37 +08006269 weight = p->se.avg.load_avg;
Peter Zijlstra83378262008-06-27 13:41:37 +02006270
Peter Zijlstra71a29aa2009-09-07 18:28:05 +02006271 /*
6272 * In low-load situations, where prev_cpu is idle and this_cpu is idle
Peter Zijlstrac88d5912009-09-10 13:50:02 +02006273 * due to the sync cause above having dropped this_load to 0, we'll
6274 * always have an imbalance, but there's really nothing you can do
6275 * about that, so that's good too.
Peter Zijlstra71a29aa2009-09-07 18:28:05 +02006276 *
6277 * Otherwise check if either cpus are near enough in load to allow this
6278 * task to be woken on this_cpu.
6279 */
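	/*
	 * The comparison below boils down to
	 *
	 *   100 * this_load / capacity(this_cpu) <=
	 *     (100 + (imbalance_pct - 100) / 2) * load / capacity(prev_cpu)
	 *
	 * (plus the effective_load() adjustments when this_load > 0):
	 * cross-multiplying by the other cpu's capacity avoids divisions, and
	 * the imbalance_pct margin biases the result toward an affine wakeup.
	 */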
Vincent Guittotbd61c982014-08-26 13:06:50 +02006280 this_eff_load = 100;
6281 this_eff_load *= capacity_of(prev_cpu);
Peter Zijlstrae51fd5e2010-05-31 12:37:30 +02006282
Vincent Guittotbd61c982014-08-26 13:06:50 +02006283 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
6284 prev_eff_load *= capacity_of(this_cpu);
6285
6286 if (this_load > 0) {
Peter Zijlstrae51fd5e2010-05-31 12:37:30 +02006287 this_eff_load *= this_load +
6288 effective_load(tg, this_cpu, weight, weight);
6289
Peter Zijlstrae51fd5e2010-05-31 12:37:30 +02006290 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
Vincent Guittotbd61c982014-08-26 13:06:50 +02006291 }
Peter Zijlstrae51fd5e2010-05-31 12:37:30 +02006292
Vincent Guittotbd61c982014-08-26 13:06:50 +02006293 balanced = this_eff_load <= prev_eff_load;
Mike Galbraithb3137bc2008-05-29 11:11:41 +02006294
Josh Poimboeufae928822016-06-17 12:43:24 -05006295 schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
Mike Galbraithb3137bc2008-05-29 11:11:41 +02006296
Vincent Guittot05bfb652014-08-26 13:06:45 +02006297 if (!balanced)
6298 return 0;
Ingo Molnar098fb9d2008-03-16 20:36:10 +01006299
Josh Poimboeufae928822016-06-17 12:43:24 -05006300 schedstat_inc(sd->ttwu_move_affine);
6301 schedstat_inc(p->se.statistics.nr_wakeups_affine);
Vincent Guittot05bfb652014-08-26 13:06:45 +02006302
6303 return 1;
Ingo Molnar098fb9d2008-03-16 20:36:10 +01006304}
6305
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006306/*
6307 * find_idlest_group finds and returns the least busy CPU group within the
6308 * domain.
6309 */
6310static struct sched_group *
Peter Zijlstra78e7ed52009-09-03 13:16:51 +02006311find_idlest_group(struct sched_domain *sd, struct task_struct *p,
Vincent Guittotc44f2a02013-10-18 13:52:21 +02006312 int this_cpu, int sd_flag)
Gregory Haskinse7693a32008-01-25 21:08:09 +01006313{
Andi Kleenb3bd3de2010-08-10 14:17:51 -07006314 struct sched_group *idlest = NULL, *group = sd->groups;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006315 unsigned long min_load = ULONG_MAX, this_load = 0;
Vincent Guittotc44f2a02013-10-18 13:52:21 +02006316 int load_idx = sd->forkexec_idx;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006317 int imbalance = 100 + (sd->imbalance_pct-100)/2;
Gregory Haskinse7693a32008-01-25 21:08:09 +01006318
Vincent Guittotc44f2a02013-10-18 13:52:21 +02006319 if (sd_flag & SD_BALANCE_WAKE)
6320 load_idx = sd->wake_idx;
6321
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006322 do {
6323 unsigned long load, avg_load;
6324 int local_group;
6325 int i;
Gregory Haskinse7693a32008-01-25 21:08:09 +01006326
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006327 /* Skip over this group if it has no CPUs allowed */
6328 if (!cpumask_intersects(sched_group_cpus(group),
Peter Zijlstrafa17b502011-06-16 12:23:22 +02006329 tsk_cpus_allowed(p)))
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006330 continue;
6331
6332 local_group = cpumask_test_cpu(this_cpu,
6333 sched_group_cpus(group));
6334
6335 /* Tally up the load of all CPUs in the group */
6336 avg_load = 0;
6337
6338 for_each_cpu(i, sched_group_cpus(group)) {
6339 /* Bias balancing toward cpus of our domain */
6340 if (local_group)
6341 load = source_load(i, load_idx);
6342 else
6343 load = target_load(i, load_idx);
6344
6345 avg_load += load;
6346 }
6347
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04006348 /* Adjust by relative CPU capacity of the group */
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04006349 avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006350
6351 if (local_group) {
6352 this_load = avg_load;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006353 } else if (avg_load < min_load) {
6354 min_load = avg_load;
6355 idlest = group;
6356 }
6357 } while (group = group->next, group != sd->groups);
6358
6359 if (!idlest || 100*this_load < imbalance*min_load)
6360 return NULL;
6361 return idlest;
6362}
6363
6364/*
6365 * find_idlest_cpu - find the idlest cpu among the cpus in group.
6366 */
6367static int
6368find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
6369{
6370 unsigned long load, min_load = ULONG_MAX;
Nicolas Pitre83a0a962014-09-04 11:32:10 -04006371 unsigned int min_exit_latency = UINT_MAX;
6372 u64 latest_idle_timestamp = 0;
6373 int least_loaded_cpu = this_cpu;
6374 int shallowest_idle_cpu = -1;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006375 int i;
6376
Morten Rasmusseneaecf412016-06-22 18:03:14 +01006377 /* Check if we have any choice: */
6378 if (group->group_weight == 1)
6379 return cpumask_first(sched_group_cpus(group));
6380
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006381 /* Traverse only the allowed CPUs */
Peter Zijlstrafa17b502011-06-16 12:23:22 +02006382 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
Nicolas Pitre83a0a962014-09-04 11:32:10 -04006383 if (idle_cpu(i)) {
6384 struct rq *rq = cpu_rq(i);
6385 struct cpuidle_state *idle = idle_get_state(rq);
6386 if (idle && idle->exit_latency < min_exit_latency) {
6387 /*
6388 * We give priority to a CPU whose idle state
6389 * has the smallest exit latency irrespective
6390 * of any idle timestamp.
6391 */
6392 min_exit_latency = idle->exit_latency;
6393 latest_idle_timestamp = rq->idle_stamp;
6394 shallowest_idle_cpu = i;
6395 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
6396 rq->idle_stamp > latest_idle_timestamp) {
6397 /*
6398 * If equal or no active idle state, then
6399 * the most recently idled CPU might have
6400 * a warmer cache.
6401 */
6402 latest_idle_timestamp = rq->idle_stamp;
6403 shallowest_idle_cpu = i;
6404 }
Yao Dongdong9f967422014-10-28 04:08:06 +00006405 } else if (shallowest_idle_cpu == -1) {
Nicolas Pitre83a0a962014-09-04 11:32:10 -04006406 load = weighted_cpuload(i);
6407 if (load < min_load || (load == min_load && i == this_cpu)) {
6408 min_load = load;
6409 least_loaded_cpu = i;
6410 }
Gregory Haskinse7693a32008-01-25 21:08:09 +01006411 }
6412 }
6413
Nicolas Pitre83a0a962014-09-04 11:32:10 -04006414 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006415}
Gregory Haskinse7693a32008-01-25 21:08:09 +01006416
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006417/*
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006418 * Implement a for_each_cpu() variant that starts the scan at a given cpu
6419 * (@start), and wraps around.
6420 *
6421 * This is used to scan for idle CPUs; such that not all CPUs looking for an
6422 * idle CPU find the same CPU. The down-side is that tasks tend to cycle
6423 * through the LLC domain.
6424 *
6425 * Especially tbench is found sensitive to this.
Peter Zijlstraa50bde52009-11-12 15:55:28 +01006426 */
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006427
6428static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
6429{
6430 int next;
6431
6432again:
6433 next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
6434
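	/* Once the scan has wrapped, stop when it reaches the start cpu again. */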
6435 if (*wrapped) {
6436 if (next >= start)
6437 return nr_cpumask_bits;
6438 } else {
6439 if (next >= nr_cpumask_bits) {
6440 *wrapped = 1;
6441 n = -1;
6442 goto again;
6443 }
6444 }
6445
6446 return next;
6447}
6448
6449#define for_each_cpu_wrap(cpu, mask, start, wrap) \
6450 for ((wrap) = 0, (cpu) = (start)-1; \
6451 (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \
6452 (cpu) < nr_cpumask_bits; )
6453
6454#ifdef CONFIG_SCHED_SMT
6455
6456static inline void set_idle_cores(int cpu, int val)
6457{
6458 struct sched_domain_shared *sds;
6459
6460 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
6461 if (sds)
6462 WRITE_ONCE(sds->has_idle_cores, val);
6463}
6464
6465static inline bool test_idle_cores(int cpu, bool def)
6466{
6467 struct sched_domain_shared *sds;
6468
6469 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
6470 if (sds)
6471 return READ_ONCE(sds->has_idle_cores);
6472
6473 return def;
6474}
6475
6476/*
6477 * Scans the local SMT mask to see if the entire core is idle, and records this
6478 * information in sd_llc_shared->has_idle_cores.
6479 *
6480 * Since SMT siblings share all cache levels, inspecting this limited remote
6481 * state should be fairly cheap.
6482 */
Peter Zijlstra1b568f02016-05-09 10:38:41 +02006483void __update_idle_core(struct rq *rq)
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006484{
6485 int core = cpu_of(rq);
6486 int cpu;
6487
6488 rcu_read_lock();
6489 if (test_idle_cores(core, true))
6490 goto unlock;
6491
6492 for_each_cpu(cpu, cpu_smt_mask(core)) {
6493 if (cpu == core)
6494 continue;
6495
6496 if (!idle_cpu(cpu))
6497 goto unlock;
6498 }
6499
6500 set_idle_cores(core, 1);
6501unlock:
6502 rcu_read_unlock();
6503}
6504
6505/*
6506 * Scan the entire LLC domain for idle cores; this dynamically switches off if
6507 * there are no idle cores left in the system; tracked through
6508 * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
6509 */
6510static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
6511{
6512 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
6513 int core, cpu, wrap;
6514
Peter Zijlstra1b568f02016-05-09 10:38:41 +02006515 if (!static_branch_likely(&sched_smt_present))
6516 return -1;
6517
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006518 if (!test_idle_cores(target, false))
6519 return -1;
6520
6521 cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
6522
6523 for_each_cpu_wrap(core, cpus, target, wrap) {
6524 bool idle = true;
6525
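		/*
		 * Drop each inspected sibling from the scan mask so that
		 * every core is visited at most once.
		 */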
6526 for_each_cpu(cpu, cpu_smt_mask(core)) {
6527 cpumask_clear_cpu(cpu, cpus);
6528 if (!idle_cpu(cpu))
6529 idle = false;
6530 }
6531
6532 if (idle)
6533 return core;
6534 }
6535
6536 /*
6537 * Failed to find an idle core; stop looking for one.
6538 */
6539 set_idle_cores(target, 0);
6540
6541 return -1;
6542}
6543
6544/*
6545 * Scan the local SMT mask for idle CPUs.
6546 */
6547static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
6548{
6549 int cpu;
6550
Peter Zijlstra1b568f02016-05-09 10:38:41 +02006551 if (!static_branch_likely(&sched_smt_present))
6552 return -1;
6553
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006554 for_each_cpu(cpu, cpu_smt_mask(target)) {
6555 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
6556 continue;
6557 if (idle_cpu(cpu))
6558 return cpu;
6559 }
6560
6561 return -1;
6562}
6563
6564#else /* CONFIG_SCHED_SMT */
6565
6566static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
6567{
6568 return -1;
6569}
6570
6571static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
6572{
6573 return -1;
6574}
6575
6576#endif /* CONFIG_SCHED_SMT */
6577
6578/*
6579 * Scan the LLC domain for idle CPUs; this is dynamically regulated by
6580 * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
6581 * average idle time for this rq (as found in rq->avg_idle).
6582 */
6583static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
6584{
Wanpeng Li9cfb38a2016-10-09 08:04:03 +08006585 struct sched_domain *this_sd;
6586 u64 avg_cost, avg_idle = this_rq()->avg_idle;
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006587 u64 time, cost;
6588 s64 delta;
6589 int cpu, wrap;
6590
Wanpeng Li9cfb38a2016-10-09 08:04:03 +08006591 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
6592 if (!this_sd)
6593 return -1;
6594
6595 avg_cost = this_sd->avg_scan_cost;
6596
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006597 /*
6598 * Due to large variance we need a large fuzz factor; hackbench in
6599 * particular is sensitive here.
6600 */
6601 if ((avg_idle / 512) < avg_cost)
6602 return -1;
6603
6604 time = local_clock();
6605
6606 for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
6607 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
6608 continue;
6609 if (idle_cpu(cpu))
6610 break;
6611 }
6612
6613 time = local_clock() - time;
6614 cost = this_sd->avg_scan_cost;
6615 delta = (s64)(time - cost) / 8;
6616 this_sd->avg_scan_cost += delta;
6617
6618 return cpu;
6619}
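
/*
 * Worked example of the cut-off and the cost tracking above (numbers are
 * illustrative only): with rq->avg_idle at 400us and sd->avg_scan_cost at
 * 2us, 400us / 512 is under 1us, which is below the 2us average scan cost,
 * so the scan is skipped entirely. If a scan does run and takes 4us while
 * the average was 2us, the average is nudged up by (4us - 2us) / 8, i.e.
 * avg_scan_cost moves from 2000ns to 2250ns; the divide by 8 makes the
 * estimate an exponentially weighted moving average rather than the last
 * sample.
 */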
6620
6621/*
6622 * Try and locate an idle core/thread in the LLC cache domain.
Peter Zijlstraa50bde52009-11-12 15:55:28 +01006623 */
Morten Rasmussen772bd008c2016-06-22 18:03:13 +01006624static int select_idle_sibling(struct task_struct *p, int prev, int target)
Peter Zijlstraa50bde52009-11-12 15:55:28 +01006625{
Suresh Siddha99bd5e22010-03-31 16:47:45 -07006626 struct sched_domain *sd;
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006627 int i;
Mike Galbraithe0a79f52013-01-28 12:19:25 +01006628
6629 if (idle_cpu(target))
6630 return target;
Peter Zijlstraa50bde52009-11-12 15:55:28 +01006631
6632 /*
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006633 * If the previous cpu is cache affine and idle, don't be stupid.
Peter Zijlstraa50bde52009-11-12 15:55:28 +01006634 */
Morten Rasmussen772bd008c2016-06-22 18:03:13 +01006635 if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
6636 return prev;
Peter Zijlstraa50bde52009-11-12 15:55:28 +01006637
Peter Zijlstra518cd622011-12-07 15:07:31 +01006638 sd = rcu_dereference(per_cpu(sd_llc, target));
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006639 if (!sd)
6640 return target;
Morten Rasmussen772bd008c2016-06-22 18:03:13 +01006641
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006642 i = select_idle_core(p, sd, target);
6643 if ((unsigned)i < nr_cpumask_bits)
Gregory Haskinse7693a32008-01-25 21:08:09 +01006644 return i;
Ingo Molnar098fb9d2008-03-16 20:36:10 +01006645
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006646 i = select_idle_cpu(p, sd, target);
6647 if ((unsigned)i < nr_cpumask_bits)
6648 return i;
Mike Galbraith970e1782012-06-12 05:18:32 +02006649
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006650 i = select_idle_smt(p, sd, target);
6651 if ((unsigned)i < nr_cpumask_bits)
6652 return i;
Linus Torvalds37407ea2012-09-16 12:29:43 -07006653
Peter Zijlstraa50bde52009-11-12 15:55:28 +01006654 return target;
6655}
Dietmar Eggemann231678b2015-08-14 17:23:13 +01006656
Vincent Guittot8bb5b002015-03-04 08:48:47 +01006657/*
Dietmar Eggemann9e91d612015-08-14 17:23:12 +01006658 * cpu_util returns the amount of capacity of a CPU that is used by CFS
Vincent Guittot8bb5b002015-03-04 08:48:47 +01006659 * tasks. The unit of the return value must be the one of capacity so we can
Dietmar Eggemann9e91d612015-08-14 17:23:12 +01006660 * compare the utilization with the capacity of the CPU that is available for
6661 * CFS task (ie cpu_capacity).
Dietmar Eggemann231678b2015-08-14 17:23:13 +01006662 *
6663 * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
6664 * recent utilization of currently non-runnable tasks on a CPU. It represents
6665 * the amount of utilization of a CPU in the range [0..capacity_orig] where
6666 * capacity_orig is the cpu_capacity available at the highest frequency
6667 * (arch_scale_freq_capacity()).
6668 * The utilization of a CPU converges towards a sum equal to or less than the
6669 * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
6670 * the running time on this CPU scaled by capacity_curr.
6671 *
6672 * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
6673 * higher than capacity_orig because of unfortunate rounding in
6674 * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
6675 * the average stabilizes with the new running time. We need to check that the
6676 * utilization stays within the range of [0..capacity_orig] and cap it if
6677 * necessary. Without utilization capping, a group could be seen as overloaded
6678 * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
6679 * available capacity. We allow utilization to overshoot capacity_curr (but not
6680 * capacity_orig) as it useful for predicting the capacity required after task
6681 * migrations (scheduler-driven DVFS).
Vincent Guittot8bb5b002015-03-04 08:48:47 +01006682 */
Dietmar Eggemann9e91d612015-08-14 17:23:12 +01006683static int cpu_util(int cpu)
Vincent Guittot8bb5b002015-03-04 08:48:47 +01006684{
Dietmar Eggemann9e91d612015-08-14 17:23:12 +01006685 unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
Vincent Guittot8bb5b002015-03-04 08:48:47 +01006686 unsigned long capacity = capacity_orig_of(cpu);
6687
Dietmar Eggemann231678b2015-08-14 17:23:13 +01006688 return (util >= capacity) ? capacity : util;
Vincent Guittot8bb5b002015-03-04 08:48:47 +01006689}
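
/*
 * Worked example: on a CPU with capacity_orig 1024, a cfs util_avg of 760
 * is reported as 760, while a transient util_avg of 1130 (e.g. right after
 * pulling a task whose average has not settled yet) is clamped to 1024 so
 * that group-level sums stay within the range the capacity comparison
 * expects.
 */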
Peter Zijlstraa50bde52009-11-12 15:55:28 +01006690
Morten Rasmussen32731632016-07-25 14:34:26 +01006691static inline int task_util(struct task_struct *p)
6692{
6693 return p->se.avg.util_avg;
6694}
6695
6696/*
6697 * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
6698 * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
6699 *
6700 * In that case WAKE_AFFINE doesn't make sense and we'll let
6701 * BALANCE_WAKE sort things out.
6702 */
6703static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
6704{
6705 long min_cap, max_cap;
6706
6707 min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
6708 max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
6709
6710 /* Minimum capacity is close to max, no need to abort wake_affine */
6711 if (max_cap - min_cap < max_cap >> 3)
6712 return 0;
6713
6714 return min_cap * 1024 < task_util(p) * capacity_margin;
6715}
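
/*
 * Worked example (capacities are illustrative, and capacity_margin is
 * assumed to carry its usual ~25% headroom encoding of 1280/1024): on a
 * symmetric part where both CPUs have capacity_orig 1024, max - min is 0,
 * below 1024 >> 3 = 128, so the affine fast path is never vetoed here.
 * On a big.LITTLE-style part with capacities 1024 and 512, the difference
 * of 512 exceeds 128, so the task_util() check decides: a task with
 * util_avg 300 gives 512 * 1024 = 524288 versus 300 * 1280 = 384000, the
 * capacity side wins and wake_affine stays enabled; a task with util_avg
 * 500 gives 500 * 1280 = 640000 > 524288 and the wakeup is left to
 * BALANCE_WAKE to sort out, as the comment above describes.
 */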
6716
Peter Zijlstraa50bde52009-11-12 15:55:28 +01006717/*
Morten Rasmussende91b9c2014-02-18 14:14:24 +00006718 * select_task_rq_fair: Select target runqueue for the waking task in domains
6719 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
6720 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006721 *
Morten Rasmussende91b9c2014-02-18 14:14:24 +00006722 * Balances load by selecting the idlest cpu in the idlest group, or under
6723 * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006724 *
Morten Rasmussende91b9c2014-02-18 14:14:24 +00006725 * Returns the target cpu number.
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006726 *
6727 * preempt must be disabled.
6728 */
Peter Zijlstra0017d732010-03-24 18:34:10 +01006729static int
Peter Zijlstraac66f542013-10-07 11:29:16 +01006730select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006731{
Peter Zijlstra29cd8ba2009-09-17 09:01:14 +02006732 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
Peter Zijlstrac88d5912009-09-10 13:50:02 +02006733 int cpu = smp_processor_id();
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02006734 int new_cpu = prev_cpu;
Suresh Siddha99bd5e22010-03-31 16:47:45 -07006735 int want_affine = 0;
Peter Zijlstra5158f4e2009-09-16 13:46:59 +02006736 int sync = wake_flags & WF_SYNC;
Gregory Haskinse7693a32008-01-25 21:08:09 +01006737
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07006738#ifdef CONFIG_SCHED_HMP
6739 return select_best_cpu(p, prev_cpu, 0, sync);
6740#endif
6741
Peter Zijlstrac58d25f2016-05-12 09:19:59 +02006742 if (sd_flag & SD_BALANCE_WAKE) {
6743 record_wakee(p);
Morten Rasmussen32731632016-07-25 14:34:26 +01006744 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
6745 && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
Peter Zijlstrac58d25f2016-05-12 09:19:59 +02006746 }
Gregory Haskinse7693a32008-01-25 21:08:09 +01006747
Peter Zijlstradce840a2011-04-07 14:09:50 +02006748 rcu_read_lock();
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006749 for_each_domain(cpu, tmp) {
Peter Zijlstrae4f428882009-12-16 18:04:34 +01006750 if (!(tmp->flags & SD_LOAD_BALANCE))
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02006751 break;
Peter Zijlstrae4f428882009-12-16 18:04:34 +01006752
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006753 /*
Suresh Siddha99bd5e22010-03-31 16:47:45 -07006754 * If both cpu and prev_cpu are part of this domain,
6755 * cpu is a valid SD_WAKE_AFFINE target.
Peter Zijlstrafe3bcfe2009-11-12 15:55:29 +01006756 */
Suresh Siddha99bd5e22010-03-31 16:47:45 -07006757 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
6758 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
6759 affine_sd = tmp;
Alex Shif03542a2012-07-26 08:55:34 +08006760 break;
Peter Zijlstrac88d5912009-09-10 13:50:02 +02006761 }
6762
Alex Shif03542a2012-07-26 08:55:34 +08006763 if (tmp->flags & sd_flag)
Peter Zijlstra29cd8ba2009-09-17 09:01:14 +02006764 sd = tmp;
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02006765 else if (!want_affine)
6766 break;
Peter Zijlstrac88d5912009-09-10 13:50:02 +02006767 }
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006768
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02006769 if (affine_sd) {
6770 sd = NULL; /* Prefer wake_affine over balance flags */
Morten Rasmussen772bd008c2016-06-22 18:03:13 +01006771 if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02006772 new_cpu = cpu;
Mike Galbraith8b911ac2010-03-11 17:17:16 +01006773 }
Peter Zijlstra3b640892009-09-16 13:44:33 +02006774
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02006775 if (!sd) {
6776 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
Morten Rasmussen772bd008c2016-06-22 18:03:13 +01006777 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02006778
6779 } else while (sd) {
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006780 struct sched_group *group;
Peter Zijlstrac88d5912009-09-10 13:50:02 +02006781 int weight;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006782
Peter Zijlstra0763a662009-09-14 19:37:39 +02006783 if (!(sd->flags & sd_flag)) {
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006784 sd = sd->child;
6785 continue;
6786 }
6787
Vincent Guittotc44f2a02013-10-18 13:52:21 +02006788 group = find_idlest_group(sd, p, cpu, sd_flag);
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006789 if (!group) {
6790 sd = sd->child;
6791 continue;
6792 }
6793
Peter Zijlstrad7c33c42009-09-11 12:45:38 +02006794 new_cpu = find_idlest_cpu(group, p, cpu);
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006795 if (new_cpu == -1 || new_cpu == cpu) {
6796 /* Now try balancing at a lower domain level of cpu */
6797 sd = sd->child;
6798 continue;
6799 }
6800
6801 /* Now try balancing at a lower domain level of new_cpu */
6802 cpu = new_cpu;
Peter Zijlstra669c55e2010-04-16 14:59:29 +02006803 weight = sd->span_weight;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006804 sd = NULL;
6805 for_each_domain(cpu, tmp) {
Peter Zijlstra669c55e2010-04-16 14:59:29 +02006806 if (weight <= tmp->span_weight)
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006807 break;
Peter Zijlstra0763a662009-09-14 19:37:39 +02006808 if (tmp->flags & sd_flag)
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006809 sd = tmp;
6810 }
6811 /* while loop will break here if sd == NULL */
Gregory Haskinse7693a32008-01-25 21:08:09 +01006812 }
Peter Zijlstradce840a2011-04-07 14:09:50 +02006813 rcu_read_unlock();
Gregory Haskinse7693a32008-01-25 21:08:09 +01006814
Peter Zijlstrac88d5912009-09-10 13:50:02 +02006815 return new_cpu;
Gregory Haskinse7693a32008-01-25 21:08:09 +01006816}
Paul Turner0a74bef2012-10-04 13:18:30 +02006817
6818/*
6819 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
6820 * cfs_rq_of(p) references at time of call are still valid and identify the
Byungchul Park525628c2015-11-18 09:34:59 +09006821 * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
Paul Turner0a74bef2012-10-04 13:18:30 +02006822 */
xiaofeng.yan5a4fd032015-09-23 14:55:59 +08006823static void migrate_task_rq_fair(struct task_struct *p)
Paul Turner0a74bef2012-10-04 13:18:30 +02006824{
Paul Turneraff3e492012-10-04 13:18:30 +02006825 /*
Peter Zijlstra59efa0b2016-05-10 18:24:37 +02006826 * As blocked tasks retain absolute vruntime the migration needs to
6827 * deal with this by subtracting the old and adding the new
6828 * min_vruntime -- the latter is done by enqueue_entity() when placing
6829 * the task on the new runqueue.
6830 */
6831 if (p->state == TASK_WAKING) {
6832 struct sched_entity *se = &p->se;
6833 struct cfs_rq *cfs_rq = cfs_rq_of(se);
6834 u64 min_vruntime;
6835
6836#ifndef CONFIG_64BIT
6837 u64 min_vruntime_copy;
6838
6839 do {
6840 min_vruntime_copy = cfs_rq->min_vruntime_copy;
6841 smp_rmb();
6842 min_vruntime = cfs_rq->min_vruntime;
6843 } while (min_vruntime != min_vruntime_copy);
6844#else
6845 min_vruntime = cfs_rq->min_vruntime;
6846#endif
6847
6848 se->vruntime -= min_vruntime;
6849 }
6850
6851 /*
Yuyang Du9d89c252015-07-15 08:04:37 +08006852	 * We are supposed to update the task to "current" time, so that it is up to date
6853	 * and ready to go to the new CPU/cfs_rq. But we have difficulty in getting
6854	 * what the current time is, so simply throw away the out-of-date time. This
6855	 * will result in the wakee task being less decayed, but giving the wakee more
6856	 * load does not sound bad.
Paul Turneraff3e492012-10-04 13:18:30 +02006857 */
Yuyang Du9d89c252015-07-15 08:04:37 +08006858 remove_entity_load_avg(&p->se);
6859
6860 /* Tell new CPU we are migrated */
6861 p->se.avg.last_update_time = 0;
Ben Segall3944a922014-05-15 15:59:20 -07006862
6863 /* We have migrated, no longer consider this task hot */
Yuyang Du9d89c252015-07-15 08:04:37 +08006864 p->se.exec_start = 0;
Paul Turner0a74bef2012-10-04 13:18:30 +02006865}
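
/*
 * Worked example of the vruntime adjustment above (numbers illustrative):
 * a wakee that last ran with vruntime 998ms on a source cfs_rq whose
 * min_vruntime has advanced to 1000ms carries a relative offset of -2ms
 * after the subtraction; enqueue_entity() then adds the destination
 * cfs_rq's own min_vruntime back in, so the task keeps its 2ms of credit
 * rather than an absolute timestamp that means nothing on the new
 * runqueue.
 */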
Yuyang Du12695572015-07-15 08:04:40 +08006866
6867static void task_dead_fair(struct task_struct *p)
6868{
6869 remove_entity_load_avg(&p->se);
6870}
Gregory Haskinse7693a32008-01-25 21:08:09 +01006871#endif /* CONFIG_SMP */
6872
Peter Zijlstrae52fb7c2009-01-14 12:39:19 +01006873static unsigned long
6874wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
Peter Zijlstra0bbd3332008-04-19 19:44:57 +02006875{
6876 unsigned long gran = sysctl_sched_wakeup_granularity;
6877
6878 /*
Peter Zijlstrae52fb7c2009-01-14 12:39:19 +01006879	 * Since it is curr that is running now, convert the gran from real-time
6880	 * to virtual-time in its units.
Mike Galbraith13814d42010-03-11 17:17:04 +01006881 *
6882 * By using 'se' instead of 'curr' we penalize light tasks, so
6883 * they get preempted easier. That is, if 'se' < 'curr' then
6884 * the resulting gran will be larger, therefore penalizing the
6885 * lighter, if otoh 'se' > 'curr' then the resulting gran will
6886 * be smaller, again penalizing the lighter task.
6887 *
6888 * This is especially important for buddies when the leftmost
6889 * task is higher priority than the buddy.
Peter Zijlstra0bbd3332008-04-19 19:44:57 +02006890 */
Shaohua Lif4ad9bd2011-04-08 12:53:09 +08006891 return calc_delta_fair(gran, se);
Peter Zijlstra0bbd3332008-04-19 19:44:57 +02006892}
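
/*
 * Worked example (assuming the pre-scaling 1ms wakeup granularity and
 * NICE_0_LOAD == 1024): for a wakee at nice 0, calc_delta_fair() leaves
 * the granularity at 1ms of virtual time; for a lighter wakee at nice +5
 * (weight 335) it becomes roughly 1ms * 1024 / 335 ~= 3ms, so the light
 * task needs about three times the vruntime lead before it is allowed to
 * preempt, which is exactly the penalty described above.
 */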
6893
6894/*
Peter Zijlstra464b7522008-10-24 11:06:15 +02006895 * Should 'se' preempt 'curr'.
6896 *
6897 * |s1
6898 * |s2
6899 * |s3
6900 * g
6901 * |<--->|c
6902 *
6903 * w(c, s1) = -1
6904 * w(c, s2) = 0
6905 * w(c, s3) = 1
6906 *
6907 */
6908static int
6909wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
6910{
6911 s64 gran, vdiff = curr->vruntime - se->vruntime;
6912
6913 if (vdiff <= 0)
6914 return -1;
6915
Peter Zijlstrae52fb7c2009-01-14 12:39:19 +01006916 gran = wakeup_gran(curr, se);
Peter Zijlstra464b7522008-10-24 11:06:15 +02006917 if (vdiff > gran)
6918 return 1;
6919
6920 return 0;
6921}
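
/*
 * Worked example, using the picture above: with curr->vruntime at 10ms and
 * a virtual granularity of 1ms, a wakee at 11ms (vdiff = -1ms, case s1)
 * returns -1; a wakee at 9.5ms (vdiff = 0.5ms, inside the granularity,
 * case s2) returns 0; and a wakee at 8ms (vdiff = 2ms, case s3) returns 1
 * and triggers preemption in check_preempt_wakeup().
 */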
6922
Peter Zijlstra02479092008-11-04 21:25:10 +01006923static void set_last_buddy(struct sched_entity *se)
6924{
Venkatesh Pallipadi69c80f32011-04-13 18:21:09 -07006925 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
6926 return;
6927
6928 for_each_sched_entity(se)
6929 cfs_rq_of(se)->last = se;
Peter Zijlstra02479092008-11-04 21:25:10 +01006930}
6931
6932static void set_next_buddy(struct sched_entity *se)
6933{
Venkatesh Pallipadi69c80f32011-04-13 18:21:09 -07006934 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
6935 return;
6936
6937 for_each_sched_entity(se)
6938 cfs_rq_of(se)->next = se;
Peter Zijlstra02479092008-11-04 21:25:10 +01006939}
6940
Rik van Rielac53db52011-02-01 09:51:03 -05006941static void set_skip_buddy(struct sched_entity *se)
6942{
Venkatesh Pallipadi69c80f32011-04-13 18:21:09 -07006943 for_each_sched_entity(se)
6944 cfs_rq_of(se)->skip = se;
Rik van Rielac53db52011-02-01 09:51:03 -05006945}
6946
Peter Zijlstra464b7522008-10-24 11:06:15 +02006947/*
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02006948 * Preempt the current task with a newly woken task if needed:
6949 */
Peter Zijlstra5a9b86f2009-09-16 13:47:58 +02006950static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02006951{
6952 struct task_struct *curr = rq->curr;
Srivatsa Vaddagiri8651a862007-10-15 17:00:12 +02006953 struct sched_entity *se = &curr->se, *pse = &p->se;
Mike Galbraith03e89e42008-12-16 08:45:30 +01006954 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
Mike Galbraithf685cea2009-10-23 23:09:22 +02006955 int scale = cfs_rq->nr_running >= sched_nr_latency;
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07006956 int next_buddy_marked = 0;
Mike Galbraith03e89e42008-12-16 08:45:30 +01006957
Ingo Molnar4ae7d5c2008-03-19 01:42:00 +01006958 if (unlikely(se == pse))
6959 return;
6960
Paul Turner5238cdd2011-07-21 09:43:37 -07006961 /*
Kirill Tkhai163122b2014-08-20 13:48:29 +04006962 * This is possible from callers such as attach_tasks(), in which we
Paul Turner5238cdd2011-07-21 09:43:37 -07006963	 * unconditionally check_preempt_curr() after an enqueue (which may have
6964	 * led to a throttle). This both saves work and prevents false
6965 * next-buddy nomination below.
6966 */
6967 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
6968 return;
6969
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07006970 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
Mike Galbraith3cb63d52009-09-11 12:01:17 +02006971 set_next_buddy(pse);
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07006972 next_buddy_marked = 1;
6973 }
Peter Zijlstra57fdc262008-09-23 15:33:45 +02006974
Bharata B Raoaec0a512008-08-28 14:42:49 +05306975 /*
6976 * We can come here with TIF_NEED_RESCHED already set from new task
6977 * wake up path.
Paul Turner5238cdd2011-07-21 09:43:37 -07006978 *
6979 * Note: this also catches the edge-case of curr being in a throttled
6980 * group (e.g. via set_curr_task), since update_curr() (in the
6981 * enqueue of curr) will have resulted in resched being set. This
6982 * prevents us from potentially nominating it as a false LAST_BUDDY
6983 * below.
Bharata B Raoaec0a512008-08-28 14:42:49 +05306984 */
6985 if (test_tsk_need_resched(curr))
6986 return;
6987
Darren Harta2f5c9a2011-02-22 13:04:33 -08006988 /* Idle tasks are by definition preempted by non-idle tasks. */
6989 if (unlikely(curr->policy == SCHED_IDLE) &&
6990 likely(p->policy != SCHED_IDLE))
6991 goto preempt;
6992
Ingo Molnar91c234b2007-10-15 17:00:18 +02006993 /*
Darren Harta2f5c9a2011-02-22 13:04:33 -08006994 * Batch and idle tasks do not preempt non-idle tasks (their preemption
6995 * is driven by the tick):
Ingo Molnar91c234b2007-10-15 17:00:18 +02006996 */
Ingo Molnar8ed92e52012-10-14 14:28:50 +02006997 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
Ingo Molnar91c234b2007-10-15 17:00:18 +02006998 return;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02006999
Peter Zijlstra3a7e73a2009-11-28 18:51:02 +01007000 find_matching_se(&se, &pse);
Paul Turner9bbd7372011-07-05 19:07:21 -07007001 update_curr(cfs_rq_of(se));
Peter Zijlstra3a7e73a2009-11-28 18:51:02 +01007002 BUG_ON(!pse);
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07007003 if (wakeup_preempt_entity(se, pse) == 1) {
7004 /*
7005 * Bias pick_next to pick the sched entity that is
7006 * triggering this preemption.
7007 */
7008 if (!next_buddy_marked)
7009 set_next_buddy(pse);
Peter Zijlstra3a7e73a2009-11-28 18:51:02 +01007010 goto preempt;
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07007011 }
Jupyung Leea65ac742009-11-17 18:51:40 +09007012
Peter Zijlstra3a7e73a2009-11-28 18:51:02 +01007013 return;
7014
7015preempt:
Kirill Tkhai88751252014-06-29 00:03:57 +04007016 resched_curr(rq);
Peter Zijlstra3a7e73a2009-11-28 18:51:02 +01007017 /*
7018 * Only set the backward buddy when the current task is still
7019 * on the rq. This can happen when a wakeup gets interleaved
7020 * with schedule on the ->pre_schedule() or idle_balance()
7021 * point, either of which can * drop the rq lock.
7022 *
7023 * Also, during early boot the idle thread is in the fair class,
7024	 * for obvious reasons it is a bad idea to schedule back to it.
7025 */
7026 if (unlikely(!se->on_rq || curr == rq->idle))
7027 return;
7028
7029 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
7030 set_last_buddy(se);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02007031}
7032
Peter Zijlstra606dba22012-02-11 06:05:00 +01007033static struct task_struct *
Peter Zijlstrae7904a22015-08-01 19:25:08 +02007034pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02007035{
7036 struct cfs_rq *cfs_rq = &rq->cfs;
7037 struct sched_entity *se;
Peter Zijlstra678d5712012-02-11 06:05:00 +01007038 struct task_struct *p;
Peter Zijlstra37e117c2014-02-14 12:25:08 +01007039 int new_tasks;
Peter Zijlstra678d5712012-02-11 06:05:00 +01007040
Peter Zijlstra6e831252014-02-11 16:11:48 +01007041again:
Peter Zijlstra678d5712012-02-11 06:05:00 +01007042#ifdef CONFIG_FAIR_GROUP_SCHED
7043 if (!cfs_rq->nr_running)
Peter Zijlstra38033c32014-01-23 20:32:21 +01007044 goto idle;
Peter Zijlstra678d5712012-02-11 06:05:00 +01007045
Peter Zijlstra3f1d2a32014-02-12 10:49:30 +01007046 if (prev->sched_class != &fair_sched_class)
Peter Zijlstra678d5712012-02-11 06:05:00 +01007047 goto simple;
7048
7049 /*
7050 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
7051	 * likely that the next task is from the same cgroup as the current one.
7052 *
7053 * Therefore attempt to avoid putting and setting the entire cgroup
7054 * hierarchy, only change the part that actually changes.
7055 */
7056
7057 do {
7058 struct sched_entity *curr = cfs_rq->curr;
7059
7060 /*
7061 * Since we got here without doing put_prev_entity() we also
7062 * have to consider cfs_rq->curr. If it is still a runnable
7063 * entity, update_curr() will update its vruntime, otherwise
7064 * forget we've ever seen it.
7065 */
Ben Segall54d27362015-04-06 15:28:10 -07007066 if (curr) {
7067 if (curr->on_rq)
7068 update_curr(cfs_rq);
7069 else
7070 curr = NULL;
Peter Zijlstra678d5712012-02-11 06:05:00 +01007071
Ben Segall54d27362015-04-06 15:28:10 -07007072 /*
7073 * This call to check_cfs_rq_runtime() will do the
7074 * throttle and dequeue its entity in the parent(s).
7075 * Therefore the 'simple' nr_running test will indeed
7076 * be correct.
7077 */
7078 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
7079 goto simple;
7080 }
Peter Zijlstra678d5712012-02-11 06:05:00 +01007081
7082 se = pick_next_entity(cfs_rq, curr);
7083 cfs_rq = group_cfs_rq(se);
7084 } while (cfs_rq);
7085
7086 p = task_of(se);
7087
7088 /*
7089 * Since we haven't yet done put_prev_entity and if the selected task
7090 * is a different task than we started out with, try and touch the
7091 * least amount of cfs_rqs.
7092 */
7093 if (prev != p) {
7094 struct sched_entity *pse = &prev->se;
7095
7096 while (!(cfs_rq = is_same_group(se, pse))) {
7097 int se_depth = se->depth;
7098 int pse_depth = pse->depth;
7099
7100 if (se_depth <= pse_depth) {
7101 put_prev_entity(cfs_rq_of(pse), pse);
7102 pse = parent_entity(pse);
7103 }
7104 if (se_depth >= pse_depth) {
7105 set_next_entity(cfs_rq_of(se), se);
7106 se = parent_entity(se);
7107 }
7108 }
7109
7110 put_prev_entity(cfs_rq, pse);
7111 set_next_entity(cfs_rq, se);
7112 }
7113
7114 if (hrtick_enabled(rq))
7115 hrtick_start_fair(rq, p);
7116
7117 return p;
7118simple:
7119 cfs_rq = &rq->cfs;
7120#endif
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02007121
Tim Blechmann36ace272009-11-24 11:55:45 +01007122 if (!cfs_rq->nr_running)
Peter Zijlstra38033c32014-01-23 20:32:21 +01007123 goto idle;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02007124
Peter Zijlstra3f1d2a32014-02-12 10:49:30 +01007125 put_prev_task(rq, prev);
Peter Zijlstra606dba22012-02-11 06:05:00 +01007126
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02007127 do {
Peter Zijlstra678d5712012-02-11 06:05:00 +01007128 se = pick_next_entity(cfs_rq, NULL);
Peter Zijlstraf4b67552008-11-04 21:25:07 +01007129 set_next_entity(cfs_rq, se);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02007130 cfs_rq = group_cfs_rq(se);
7131 } while (cfs_rq);
7132
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01007133 p = task_of(se);
Peter Zijlstra678d5712012-02-11 06:05:00 +01007134
Mike Galbraithb39e66e2011-11-22 15:20:07 +01007135 if (hrtick_enabled(rq))
7136 hrtick_start_fair(rq, p);
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01007137
7138 return p;
Peter Zijlstra38033c32014-01-23 20:32:21 +01007139
7140idle:
Peter Zijlstracbce1a62015-06-11 14:46:54 +02007141 /*
7142 * This is OK, because current is on_cpu, which avoids it being picked
7143 * for load-balance and preemption/IRQs are still disabled avoiding
7144 * further scheduler activity on it and we're being very careful to
7145 * re-start the picking loop.
7146 */
Peter Zijlstrae7904a22015-08-01 19:25:08 +02007147 lockdep_unpin_lock(&rq->lock, cookie);
Kirill Tkhaie4aa3582014-03-06 13:31:55 +04007148 new_tasks = idle_balance(rq);
Peter Zijlstrae7904a22015-08-01 19:25:08 +02007149 lockdep_repin_lock(&rq->lock, cookie);
Peter Zijlstra37e117c2014-02-14 12:25:08 +01007150 /*
7151 * Because idle_balance() releases (and re-acquires) rq->lock, it is
7152 * possible for any higher priority task to appear. In that case we
7153 * must re-start the pick_next_entity() loop.
7154 */
Kirill Tkhaie4aa3582014-03-06 13:31:55 +04007155 if (new_tasks < 0)
Peter Zijlstra37e117c2014-02-14 12:25:08 +01007156 return RETRY_TASK;
7157
Kirill Tkhaie4aa3582014-03-06 13:31:55 +04007158 if (new_tasks > 0)
Peter Zijlstra38033c32014-01-23 20:32:21 +01007159 goto again;
Peter Zijlstra38033c32014-01-23 20:32:21 +01007160
7161 return NULL;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02007162}
7163
7164/*
7165 * Account for a descheduled task:
7166 */
Ingo Molnar31ee5292007-08-09 11:16:49 +02007167static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02007168{
7169 struct sched_entity *se = &prev->se;
7170 struct cfs_rq *cfs_rq;
7171
7172 for_each_sched_entity(se) {
7173 cfs_rq = cfs_rq_of(se);
Ingo Molnarab6cde22007-08-09 11:16:48 +02007174 put_prev_entity(cfs_rq, se);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02007175 }
7176}
7177
Rik van Rielac53db52011-02-01 09:51:03 -05007178/*
7179 * sched_yield() is very simple
7180 *
7181 * The magic of dealing with the ->skip buddy is in pick_next_entity.
7182 */
7183static void yield_task_fair(struct rq *rq)
7184{
7185 struct task_struct *curr = rq->curr;
7186 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
7187 struct sched_entity *se = &curr->se;
7188
7189 /*
7190 * Are we the only task in the tree?
7191 */
7192 if (unlikely(rq->nr_running == 1))
7193 return;
7194
7195 clear_buddies(cfs_rq, se);
7196
7197 if (curr->policy != SCHED_BATCH) {
7198 update_rq_clock(rq);
7199 /*
7200 * Update run-time statistics of the 'current'.
7201 */
7202 update_curr(cfs_rq);
Mike Galbraith916671c2011-11-22 15:21:26 +01007203 /*
7204 * Tell update_rq_clock() that we've just updated,
7205 * so we don't do microscopic update in schedule()
7206 * and double the fastpath cost.
7207 */
Peter Zijlstra9edfbfe2015-01-05 11:18:11 +01007208 rq_clock_skip_update(rq, true);
Rik van Rielac53db52011-02-01 09:51:03 -05007209 }
7210
7211 set_skip_buddy(se);
7212}
7213
Mike Galbraithd95f4122011-02-01 09:50:51 -05007214static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
7215{
7216 struct sched_entity *se = &p->se;
7217
Paul Turner5238cdd2011-07-21 09:43:37 -07007218 /* throttled hierarchies are not runnable */
7219 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
Mike Galbraithd95f4122011-02-01 09:50:51 -05007220 return false;
7221
7222 /* Tell the scheduler that we'd really like pse to run next. */
7223 set_next_buddy(se);
7224
Mike Galbraithd95f4122011-02-01 09:50:51 -05007225 yield_task_fair(rq);
7226
7227 return true;
7228}
7229
Peter Williams681f3e62007-10-24 18:23:51 +02007230#ifdef CONFIG_SMP
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02007231/**************************************************
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02007232 * Fair scheduling class load-balancing methods.
7233 *
7234 * BASICS
7235 *
7236 * The purpose of load-balancing is to achieve the same basic fairness the
7237 * per-cpu scheduler provides, namely provide a proportional amount of compute
7238 * time to each task. This is expressed in the following equation:
7239 *
7240 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
7241 *
7242 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
7243 * W_i,0 is defined as:
7244 *
7245 * W_i,0 = \Sum_j w_i,j (2)
7246 *
7247 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
Yuyang Du1c3de5e2016-03-30 07:07:51 +08007248 * is derived from the nice value as per sched_prio_to_weight[].
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02007249 *
7250 * The weight average is an exponential decay average of the instantaneous
7251 * weight:
7252 *
7253 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
7254 *
Nicolas Pitreced549f2014-05-26 18:19:38 -04007255 * C_i is the compute capacity of cpu i, typically it is the
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02007256 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
7257 * can also include other factors [XXX].
7258 *
7259 * To achieve this balance we define a measure of imbalance which follows
7260 * directly from (1):
7261 *
Nicolas Pitreced549f2014-05-26 18:19:38 -04007262 * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02007263 *
7264	 * We then move tasks around to minimize the imbalance. In the continuous
7265 * function space it is obvious this converges, in the discrete case we get
7266 * a few fun cases generally called infeasible weight scenarios.
7267 *
7268 * [XXX expand on:
7269 * - infeasible weights;
7270 * - local vs global optima in the discrete case. ]
7271 *
7272 *
7273 * SCHED DOMAINS
7274 *
7275 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
7276 * for all i,j solution, we create a tree of cpus that follows the hardware
7277 * topology where each level pairs two lower groups (or better). This results
7278 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
7279 * tree to only the first of the previous level and we decrease the frequency
7280 * of load-balance at each level inv. proportional to the number of cpus in
7281 * the groups.
7282 *
7283 * This yields:
7284 *
7285 * log_2 n 1 n
7286 * \Sum { --- * --- * 2^i } = O(n) (5)
7287 * i = 0 2^i 2^i
7288 * `- size of each group
7289 * | | `- number of cpus doing load-balance
7290 * | `- freq
7291 * `- sum over all levels
7292 *
7293 * Coupled with a limit on how many tasks we can migrate every balance pass,
7294 * this makes (5) the runtime complexity of the balancer.
7295 *
7296 * An important property here is that each CPU is still (indirectly) connected
7297 * to every other cpu in at most O(log n) steps:
7298 *
7299 * The adjacency matrix of the resulting graph is given by:
7300 *
Byungchul Park97a71422015-07-05 18:33:48 +09007301 * log_2 n
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02007302 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
7303 * k = 0
7304 *
7305 * And you'll find that:
7306 *
7307 * A^(log_2 n)_i,j != 0 for all i,j (7)
7308 *
7309 * Showing there's indeed a path between every cpu in at most O(log n) steps.
7310 * The task movement gives a factor of O(m), giving a convergence complexity
7311 * of:
7312 *
7313 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
7314 *
7315 *
7316 * WORK CONSERVING
7317 *
7318 * In order to avoid CPUs going idle while there's still work to do, new idle
7319 * balancing is more aggressive and has the newly idle cpu iterate up the domain
7320 * tree itself instead of relying on other CPUs to bring it work.
7321 *
7322 * This adds some complexity to both (5) and (8) but it reduces the total idle
7323 * time.
7324 *
7325 * [XXX more?]
7326 *
7327 *
7328 * CGROUPS
7329 *
7330 * Cgroups make a horror show out of (2), instead of a simple sum we get:
7331 *
7332 * s_k,i
7333 * W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
7334 * S_k
7335 *
7336 * Where
7337 *
7338 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
7339 *
7340 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
7341 *
7342	 * The big problem is S_k, which is a global sum needed to compute a local (W_i)
7343 * property.
7344 *
7345 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
7346 * rewrite all of this once again.]
Byungchul Park97a71422015-07-05 18:33:48 +09007347 */
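
/*
 * Worked example for (2) and (9) (weights are illustrative): with two
 * plain nice-0 tasks runnable on CPU0, (2) gives W_0,0 = 1024 + 1024.
 * Put those same two tasks in a cgroup with three runnable tasks in
 * total, one of which runs on CPU1, and (9)/(10) give s_k,0 = 2048 and
 * S_k = 3072, so the group contributes roughly 2/3 of its shares to
 * CPU0's weight and 1/3 to CPU1's -- the global sum S_k is what makes
 * this per-cpu weight expensive to keep exact.
 */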
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02007348
Hiroshi Shimamotoed387b72012-01-31 11:40:32 +09007349static unsigned long __read_mostly max_load_balance_interval = HZ/10;
7350
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01007351enum fbq_type { regular, remote, all };
7352
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01007353#define LBF_ALL_PINNED 0x01
Peter Zijlstra367456c2012-02-20 21:49:09 +01007354#define LBF_NEED_BREAK 0x02
Peter Zijlstra62633222013-08-19 12:41:09 +02007355#define LBF_DST_PINNED 0x04
7356#define LBF_SOME_PINNED 0x08
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07007357#define LBF_BIG_TASK_ACTIVE_BALANCE 0x80
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07007358#define LBF_IGNORE_BIG_TASKS 0x100
7359#define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200
7360#define LBF_MOVED_RELATED_THREAD_GROUP_TASK 0x400
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01007361
7362struct lb_env {
7363 struct sched_domain *sd;
7364
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01007365 struct rq *src_rq;
Prashanth Nageshappa85c1e7d2012-06-19 17:47:34 +05307366 int src_cpu;
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01007367
7368 int dst_cpu;
7369 struct rq *dst_rq;
7370
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05307371 struct cpumask *dst_grpmask;
7372 int new_dst_cpu;
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01007373 enum cpu_idle_type idle;
Peter Zijlstrabd939f42012-05-02 14:20:37 +02007374 long imbalance;
Michael Wangb94031302012-07-12 16:10:13 +08007375 /* The set of CPUs under consideration for load-balancing */
7376 struct cpumask *cpus;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07007377 unsigned int busiest_grp_capacity;
7378 unsigned int busiest_nr_running;
Michael Wangb94031302012-07-12 16:10:13 +08007379
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01007380 unsigned int flags;
Peter Zijlstra367456c2012-02-20 21:49:09 +01007381
7382 unsigned int loop;
7383 unsigned int loop_break;
7384 unsigned int loop_max;
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01007385
7386 enum fbq_type fbq_type;
Kirill Tkhai163122b2014-08-20 13:48:29 +04007387 struct list_head tasks;
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07007388 enum sched_boost_policy boost_policy;
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01007389};
7390
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007391/*
Peter Zijlstra029632f2011-10-25 10:00:11 +02007392 * Is this task likely cache-hot:
7393 */
Hillf Danton5d5e2b12014-06-10 10:58:43 +02007394static int task_hot(struct task_struct *p, struct lb_env *env)
Peter Zijlstra029632f2011-10-25 10:00:11 +02007395{
7396 s64 delta;
7397
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007398 lockdep_assert_held(&env->src_rq->lock);
7399
Peter Zijlstra029632f2011-10-25 10:00:11 +02007400 if (p->sched_class != &fair_sched_class)
7401 return 0;
7402
7403 if (unlikely(p->policy == SCHED_IDLE))
7404 return 0;
7405
7406 /*
7407 * Buddy candidates are cache hot:
7408 */
Hillf Danton5d5e2b12014-06-10 10:58:43 +02007409 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
Peter Zijlstra029632f2011-10-25 10:00:11 +02007410 (&p->se == cfs_rq_of(&p->se)->next ||
7411 &p->se == cfs_rq_of(&p->se)->last))
7412 return 1;
7413
7414 if (sysctl_sched_migration_cost == -1)
7415 return 1;
7416 if (sysctl_sched_migration_cost == 0)
7417 return 0;
7418
Hillf Danton5d5e2b12014-06-10 10:58:43 +02007419 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
Peter Zijlstra029632f2011-10-25 10:00:11 +02007420
7421 return delta < (s64)sysctl_sched_migration_cost;
7422}
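
/*
 * Worked example (using the default 500us sysctl_sched_migration_cost):
 * a task that was executing as recently as 200us ago on the source rq is
 * considered cache hot and is skipped unless too many balance attempts
 * have already failed, while one that has not run for 2ms is treated as
 * cold and may be detached freely.
 */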
7423
Mel Gorman3a7053b2013-10-07 11:29:00 +01007424#ifdef CONFIG_NUMA_BALANCING
Rik van Rielc1ceac62015-05-14 22:59:36 -04007425/*
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307426 * Returns 1, if task migration degrades locality
7427 * Returns 0, if task migration improves locality i.e migration preferred.
7428 * Returns -1, if task migration is not affected by locality.
Rik van Rielc1ceac62015-05-14 22:59:36 -04007429 */
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307430static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
Mel Gorman3a7053b2013-10-07 11:29:00 +01007431{
Rik van Rielb1ad0652014-05-15 13:03:06 -04007432 struct numa_group *numa_group = rcu_dereference(p->numa_group);
Rik van Rielc1ceac62015-05-14 22:59:36 -04007433 unsigned long src_faults, dst_faults;
Mel Gorman3a7053b2013-10-07 11:29:00 +01007434 int src_nid, dst_nid;
7435
Srikar Dronamraju2a595722015-08-11 21:54:21 +05307436 if (!static_branch_likely(&sched_numa_balancing))
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307437 return -1;
7438
Srikar Dronamrajuc3b9bc52015-08-11 16:30:12 +05307439 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307440 return -1;
Mel Gorman7a0f3082013-10-07 11:29:01 +01007441
7442 src_nid = cpu_to_node(env->src_cpu);
7443 dst_nid = cpu_to_node(env->dst_cpu);
7444
Mel Gorman83e1d2c2013-10-07 11:29:27 +01007445 if (src_nid == dst_nid)
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307446 return -1;
Mel Gorman7a0f3082013-10-07 11:29:01 +01007447
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307448 /* Migrating away from the preferred node is always bad. */
7449 if (src_nid == p->numa_preferred_nid) {
7450 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
7451 return 1;
7452 else
7453 return -1;
7454 }
Mel Gorman83e1d2c2013-10-07 11:29:27 +01007455
Rik van Rielc1ceac62015-05-14 22:59:36 -04007456 /* Encourage migration to the preferred node. */
7457 if (dst_nid == p->numa_preferred_nid)
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307458 return 0;
Rik van Rielc1ceac62015-05-14 22:59:36 -04007459
7460 if (numa_group) {
7461 src_faults = group_faults(p, src_nid);
7462 dst_faults = group_faults(p, dst_nid);
7463 } else {
7464 src_faults = task_faults(p, src_nid);
7465 dst_faults = task_faults(p, dst_nid);
7466 }
7467
7468 return dst_faults < src_faults;
Mel Gorman7a0f3082013-10-07 11:29:01 +01007469}
7470
Mel Gorman3a7053b2013-10-07 11:29:00 +01007471#else
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307472static inline int migrate_degrades_locality(struct task_struct *p,
Mel Gorman3a7053b2013-10-07 11:29:00 +01007473 struct lb_env *env)
7474{
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307475 return -1;
Mel Gorman7a0f3082013-10-07 11:29:01 +01007476}
Mel Gorman3a7053b2013-10-07 11:29:00 +01007477#endif
7478
Peter Zijlstra029632f2011-10-25 10:00:11 +02007479/*
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007480 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
7481 */
7482static
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01007483int can_migrate_task(struct task_struct *p, struct lb_env *env)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007484{
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307485 int tsk_cache_hot;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07007486 int twf, group_cpus;
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007487
7488 lockdep_assert_held(&env->src_rq->lock);
7489
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007490 /*
7491 * We do not migrate tasks that are:
Joonsoo Kimd3198082013-04-23 17:27:40 +09007492 * 1) throttled_lb_pair, or
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007493 * 2) cannot be migrated to this CPU due to cpus_allowed, or
Joonsoo Kimd3198082013-04-23 17:27:40 +09007494 * 3) running (obviously), or
7495 * 4) are cache-hot on their current CPU.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007496 */
Joonsoo Kimd3198082013-04-23 17:27:40 +09007497 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
7498 return 0;
7499
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01007500 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
Joonsoo Kime02e60c2013-04-23 17:27:42 +09007501 int cpu;
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05307502
Josh Poimboeufae928822016-06-17 12:43:24 -05007503 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05307504
Peter Zijlstra62633222013-08-19 12:41:09 +02007505 env->flags |= LBF_SOME_PINNED;
7506
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05307507 /*
7508 * Remember if this task can be migrated to any other cpu in
7509 * our sched_group. We may want to revisit it if we couldn't
7510 * meet load balance goals by pulling other tasks on src_cpu.
7511 *
7512 * Also avoid computing new_dst_cpu if we have already computed
7513 * one in current iteration.
7514 */
Peter Zijlstra62633222013-08-19 12:41:09 +02007515 if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05307516 return 0;
7517
Joonsoo Kime02e60c2013-04-23 17:27:42 +09007518 /* Prevent to re-select dst_cpu via env's cpus */
7519 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
7520 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
Peter Zijlstra62633222013-08-19 12:41:09 +02007521 env->flags |= LBF_DST_PINNED;
Joonsoo Kime02e60c2013-04-23 17:27:42 +09007522 env->new_dst_cpu = cpu;
7523 break;
7524 }
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05307525 }
Joonsoo Kime02e60c2013-04-23 17:27:42 +09007526
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007527 return 0;
7528 }
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05307529
7530	/* Record that we found at least one task that could run on dst_cpu */
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01007531 env->flags &= ~LBF_ALL_PINNED;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007532
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07007533 if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu)) {
7534 if (nr_big_tasks(env->src_rq) && !is_big_task(p))
7535 return 0;
7536
7537 if (env->boost_policy == SCHED_BOOST_ON_BIG &&
7538 !task_sched_boost(p))
7539 return 0;
7540 }
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07007541
7542 twf = task_will_fit(p, env->dst_cpu);
7543
7544 /*
7545 * Attempt to not pull tasks that don't fit. We may get lucky and find
7546 * one that actually fits.
7547 */
7548 if (env->flags & LBF_IGNORE_BIG_TASKS && !twf)
7549 return 0;
7550
7551 if (env->flags & LBF_IGNORE_PREFERRED_CLUSTER_TASKS &&
7552 !preferred_cluster(rq_cluster(cpu_rq(env->dst_cpu)), p))
7553 return 0;
7554
7555 /*
7556 * Group imbalance can sometimes cause work to be pulled across groups
7557 * even though the group could have managed the imbalance on its own.
7558 * Prevent inter-cluster migrations for big tasks when the number of
7559 * tasks is lower than the capacity of the group.
7560 */
7561 group_cpus = DIV_ROUND_UP(env->busiest_grp_capacity,
7562 SCHED_CAPACITY_SCALE);
7563 if (!twf && env->busiest_nr_running <= group_cpus)
7564 return 0;
7565
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01007566 if (task_running(env->src_rq, p)) {
Josh Poimboeufae928822016-06-17 12:43:24 -05007567 schedstat_inc(p->se.statistics.nr_failed_migrations_running);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007568 return 0;
7569 }
7570
7571 /*
7572 * Aggressive migration if:
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07007573	 * 1) IDLE or NEWLY_IDLE balance, or
7574	 * 2) the destination NUMA node is preferred, or
7575	 * 3) the task is cache cold, or
7576	 * 4) too many balance attempts have failed.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007577 */
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307578 tsk_cache_hot = migrate_degrades_locality(p, env);
7579 if (tsk_cache_hot == -1)
7580 tsk_cache_hot = task_hot(p, env);
Mel Gorman3a7053b2013-10-07 11:29:00 +01007581
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07007582 if (env->idle != CPU_NOT_IDLE || tsk_cache_hot <= 0 ||
Kirill Tkhai7a96c232014-09-22 22:36:12 +04007583 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307584 if (tsk_cache_hot == 1) {
Josh Poimboeufae928822016-06-17 12:43:24 -05007585 schedstat_inc(env->sd->lb_hot_gained[env->idle]);
7586 schedstat_inc(p->se.statistics.nr_forced_migrations);
Mel Gorman3a7053b2013-10-07 11:29:00 +01007587 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007588 return 1;
7589 }
7590
Josh Poimboeufae928822016-06-17 12:43:24 -05007591 schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
Zhang Hang4e2dcb72013-04-10 14:04:55 +08007592 return 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007593}
7594
Peter Zijlstra897c3952009-12-17 17:45:42 +01007595/*
Kirill Tkhai163122b2014-08-20 13:48:29 +04007596 * detach_task() -- detach the task for the migration specified in env
Peter Zijlstra897c3952009-12-17 17:45:42 +01007597 */
Kirill Tkhai163122b2014-08-20 13:48:29 +04007598static void detach_task(struct task_struct *p, struct lb_env *env)
7599{
7600 lockdep_assert_held(&env->src_rq->lock);
7601
Kirill Tkhai163122b2014-08-20 13:48:29 +04007602 p->on_rq = TASK_ON_RQ_MIGRATING;
Joonwoo Park3ea94de2015-11-12 19:38:54 -08007603 deactivate_task(env->src_rq, p, 0);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07007604 double_lock_balance(env->src_rq, env->dst_rq);
Kirill Tkhai163122b2014-08-20 13:48:29 +04007605 set_task_cpu(p, env->dst_cpu);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07007606 if (task_in_related_thread_group(p))
7607 env->flags |= LBF_MOVED_RELATED_THREAD_GROUP_TASK;
7608 double_unlock_balance(env->src_rq, env->dst_rq);
Kirill Tkhai163122b2014-08-20 13:48:29 +04007609}
7610
7611/*
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007612 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
Peter Zijlstra897c3952009-12-17 17:45:42 +01007613 * part of active balancing operations within "domain".
Peter Zijlstra897c3952009-12-17 17:45:42 +01007614 *
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007615 * Returns a task if successful and NULL otherwise.
Peter Zijlstra897c3952009-12-17 17:45:42 +01007616 */
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007617static struct task_struct *detach_one_task(struct lb_env *env)
Peter Zijlstra897c3952009-12-17 17:45:42 +01007618{
7619 struct task_struct *p, *n;
Peter Zijlstra897c3952009-12-17 17:45:42 +01007620
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007621 lockdep_assert_held(&env->src_rq->lock);
7622
Peter Zijlstra367456c2012-02-20 21:49:09 +01007623 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
Peter Zijlstra367456c2012-02-20 21:49:09 +01007624 if (!can_migrate_task(p, env))
7625 continue;
Peter Zijlstra897c3952009-12-17 17:45:42 +01007626
Kirill Tkhai163122b2014-08-20 13:48:29 +04007627 detach_task(p, env);
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007628
Peter Zijlstra367456c2012-02-20 21:49:09 +01007629 /*
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007630 * Right now, this is only the second place where
Kirill Tkhai163122b2014-08-20 13:48:29 +04007631 * lb_gained[env->idle] is updated (other is detach_tasks)
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007632 * so we can safely collect stats here rather than
Kirill Tkhai163122b2014-08-20 13:48:29 +04007633 * inside detach_tasks().
Peter Zijlstra367456c2012-02-20 21:49:09 +01007634 */
Josh Poimboeufae928822016-06-17 12:43:24 -05007635 schedstat_inc(env->sd->lb_gained[env->idle]);
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007636 return p;
Peter Zijlstra897c3952009-12-17 17:45:42 +01007637 }
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007638 return NULL;
Peter Zijlstra897c3952009-12-17 17:45:42 +01007639}
7640
Peter Zijlstraeb953082012-04-17 13:38:40 +02007641static const unsigned int sched_nr_migrate_break = 32;
7642
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007643/*
Kirill Tkhai163122b2014-08-20 13:48:29 +04007644 * detach_tasks() -- tries to detach up to imbalance weighted load from
7645 * busiest_rq, as part of a balancing operation within domain "sd".
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007646 *
Kirill Tkhai163122b2014-08-20 13:48:29 +04007647 * Returns number of detached tasks if successful and 0 otherwise.
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007648 */
Kirill Tkhai163122b2014-08-20 13:48:29 +04007649static int detach_tasks(struct lb_env *env)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007650{
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007651 struct list_head *tasks = &env->src_rq->cfs_tasks;
7652 struct task_struct *p;
Peter Zijlstra367456c2012-02-20 21:49:09 +01007653 unsigned long load;
Kirill Tkhai163122b2014-08-20 13:48:29 +04007654 int detached = 0;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07007655 int orig_loop = env->loop;
Kirill Tkhai163122b2014-08-20 13:48:29 +04007656
7657 lockdep_assert_held(&env->src_rq->lock);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007658
Peter Zijlstrabd939f42012-05-02 14:20:37 +02007659 if (env->imbalance <= 0)
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007660 return 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007661
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07007662 if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu))
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07007663 env->flags |= LBF_IGNORE_BIG_TASKS;
7664 else if (!same_cluster(env->dst_cpu, env->src_cpu))
7665 env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS;
7666
7667redo:
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007668 while (!list_empty(tasks)) {
Yuyang Du985d3a42015-07-06 06:11:51 +08007669 /*
7670 * We don't want to steal all, otherwise we may be treated likewise,
7671 * which could at worst lead to a livelock crash.
7672 */
7673 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
7674 break;
7675
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007676 p = list_first_entry(tasks, struct task_struct, se.group_node);
7677
Peter Zijlstra367456c2012-02-20 21:49:09 +01007678 env->loop++;
7679 /* We've more or less seen every task there is, call it quits */
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007680 if (env->loop > env->loop_max)
Peter Zijlstra367456c2012-02-20 21:49:09 +01007681 break;
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007682
7683 /* take a breather every nr_migrate tasks */
Peter Zijlstra367456c2012-02-20 21:49:09 +01007684 if (env->loop > env->loop_break) {
Peter Zijlstraeb953082012-04-17 13:38:40 +02007685 env->loop_break += sched_nr_migrate_break;
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01007686 env->flags |= LBF_NEED_BREAK;
Peter Zijlstraee00e662009-12-17 17:25:20 +01007687 break;
Peter Zijlstraa195f002011-09-22 15:30:18 +02007688 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007689
Joonsoo Kimd3198082013-04-23 17:27:40 +09007690 if (!can_migrate_task(p, env))
Peter Zijlstra367456c2012-02-20 21:49:09 +01007691 goto next;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007692
Peter Zijlstra367456c2012-02-20 21:49:09 +01007693 load = task_h_load(p);
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007694
Peter Zijlstraeb953082012-04-17 13:38:40 +02007695 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
Peter Zijlstra367456c2012-02-20 21:49:09 +01007696 goto next;
7697
Peter Zijlstrabd939f42012-05-02 14:20:37 +02007698 if ((load / 2) > env->imbalance)
Peter Zijlstra367456c2012-02-20 21:49:09 +01007699 goto next;
7700
Kirill Tkhai163122b2014-08-20 13:48:29 +04007701 detach_task(p, env);
7702 list_add(&p->se.group_node, &env->tasks);
7703
7704 detached++;
Peter Zijlstrabd939f42012-05-02 14:20:37 +02007705 env->imbalance -= load;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007706
7707#ifdef CONFIG_PREEMPT
Peter Zijlstraee00e662009-12-17 17:25:20 +01007708 /*
7709 * NEWIDLE balancing is a source of latency, so preemptible
Kirill Tkhai163122b2014-08-20 13:48:29 +04007710 * kernels will stop after the first task is detached to minimize
Peter Zijlstraee00e662009-12-17 17:25:20 +01007711 * the critical section.
7712 */
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007713 if (env->idle == CPU_NEWLY_IDLE)
Peter Zijlstraee00e662009-12-17 17:25:20 +01007714 break;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007715#endif
7716
Peter Zijlstraee00e662009-12-17 17:25:20 +01007717 /*
7718 * We only want to steal up to the prescribed amount of
7719 * weighted load.
7720 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02007721 if (env->imbalance <= 0)
Peter Zijlstraee00e662009-12-17 17:25:20 +01007722 break;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007723
Peter Zijlstra367456c2012-02-20 21:49:09 +01007724 continue;
7725next:
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007726 list_move_tail(&p->se.group_node, tasks);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007727 }
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007728
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07007729 if (env->flags & (LBF_IGNORE_BIG_TASKS |
7730 LBF_IGNORE_PREFERRED_CLUSTER_TASKS) && !detached) {
7731 tasks = &env->src_rq->cfs_tasks;
7732 env->flags &= ~(LBF_IGNORE_BIG_TASKS |
7733 LBF_IGNORE_PREFERRED_CLUSTER_TASKS);
7734 env->loop = orig_loop;
7735 goto redo;
7736 }
7737
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007738 /*
Kirill Tkhai163122b2014-08-20 13:48:29 +04007739 * Right now, this is one of only two places we collect this stat
7740 * so we can safely collect detach_one_task() stats here rather
7741 * than inside detach_one_task().
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007742 */
Josh Poimboeufae928822016-06-17 12:43:24 -05007743 schedstat_add(env->sd->lb_gained[env->idle], detached);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007744
Kirill Tkhai163122b2014-08-20 13:48:29 +04007745 return detached;
7746}
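
/*
 * Worked example of the bookkeeping above (loads illustrative): with an
 * env->imbalance of 900 and candidate tasks of weighted load 300, 2100 and
 * 700, the 300 task is detached (imbalance drops to 600), the 2100 task is
 * skipped because half of it already exceeds the remaining imbalance, the
 * 700 task is detached (imbalance goes to -100), and the loop stops since
 * the imbalance is no longer positive.
 */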
7747
7748/*
7749 * attach_task() -- attach the task detached by detach_task() to its new rq.
7750 */
7751static void attach_task(struct rq *rq, struct task_struct *p)
7752{
7753 lockdep_assert_held(&rq->lock);
7754
7755 BUG_ON(task_rq(p) != rq);
Kirill Tkhai163122b2014-08-20 13:48:29 +04007756 activate_task(rq, p, 0);
Joonwoo Park3ea94de2015-11-12 19:38:54 -08007757 p->on_rq = TASK_ON_RQ_QUEUED;
Kirill Tkhai163122b2014-08-20 13:48:29 +04007758 check_preempt_curr(rq, p, 0);
7759}
7760
7761/*
7762 * attach_one_task() -- attaches the task returned from detach_one_task() to
7763 * its new rq.
7764 */
7765static void attach_one_task(struct rq *rq, struct task_struct *p)
7766{
7767 raw_spin_lock(&rq->lock);
7768 attach_task(rq, p);
7769 raw_spin_unlock(&rq->lock);
7770}
7771
7772/*
7773 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
7774 * new rq.
7775 */
7776static void attach_tasks(struct lb_env *env)
7777{
7778 struct list_head *tasks = &env->tasks;
7779 struct task_struct *p;
7780
7781 raw_spin_lock(&env->dst_rq->lock);
7782
7783 while (!list_empty(tasks)) {
7784 p = list_first_entry(tasks, struct task_struct, se.group_node);
7785 list_del_init(&p->se.group_node);
7786
7787 attach_task(env->dst_rq, p);
7788 }
7789
7790 raw_spin_unlock(&env->dst_rq->lock);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007791}
7792
Peter Zijlstra230059de2009-12-17 17:47:12 +01007793#ifdef CONFIG_FAIR_GROUP_SCHED
Paul Turner48a16752012-10-04 13:18:31 +02007794static void update_blocked_averages(int cpu)
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08007795{
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08007796 struct rq *rq = cpu_rq(cpu);
Paul Turner48a16752012-10-04 13:18:31 +02007797 struct cfs_rq *cfs_rq;
7798 unsigned long flags;
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08007799
Paul Turner48a16752012-10-04 13:18:31 +02007800 raw_spin_lock_irqsave(&rq->lock, flags);
7801 update_rq_clock(rq);
Yuyang Du9d89c252015-07-15 08:04:37 +08007802
Peter Zijlstra9763b672011-07-13 13:09:25 +02007803 /*
7804 * Iterates the task_group tree in a bottom up fashion, see
7805 * list_add_leaf_cfs_rq() for details.
7806 */
Paul Turner64660c82011-07-21 09:43:36 -07007807 for_each_leaf_cfs_rq(rq, cfs_rq) {
Yuyang Du9d89c252015-07-15 08:04:37 +08007808 /* throttled entities do not contribute to load */
7809 if (throttled_hierarchy(cfs_rq))
7810 continue;
Paul Turner48a16752012-10-04 13:18:31 +02007811
Steve Mucklea2c6c912016-03-24 15:26:07 -07007812 if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
Yuyang Du9d89c252015-07-15 08:04:37 +08007813 update_tg_load_avg(cfs_rq, 0);
7814 }
Paul Turner48a16752012-10-04 13:18:31 +02007815 raw_spin_unlock_irqrestore(&rq->lock, flags);
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08007816}
7817
Peter Zijlstra9763b672011-07-13 13:09:25 +02007818/*
Vladimir Davydov68520792013-07-15 17:49:19 +04007819 * Compute the hierarchical load factor for cfs_rq and all its ancestors.
Peter Zijlstra9763b672011-07-13 13:09:25 +02007820 * This needs to be done in a top-down fashion because the load of a child
7821 * group is a fraction of its parent's load.
7822 */
Vladimir Davydov68520792013-07-15 17:49:19 +04007823static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
Peter Zijlstra9763b672011-07-13 13:09:25 +02007824{
Vladimir Davydov68520792013-07-15 17:49:19 +04007825 struct rq *rq = rq_of(cfs_rq);
7826 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
Peter Zijlstraa35b6462012-08-08 21:46:40 +02007827 unsigned long now = jiffies;
Vladimir Davydov68520792013-07-15 17:49:19 +04007828 unsigned long load;
Peter Zijlstraa35b6462012-08-08 21:46:40 +02007829
Vladimir Davydov68520792013-07-15 17:49:19 +04007830 if (cfs_rq->last_h_load_update == now)
Peter Zijlstraa35b6462012-08-08 21:46:40 +02007831 return;
7832
Vladimir Davydov68520792013-07-15 17:49:19 +04007833 cfs_rq->h_load_next = NULL;
7834 for_each_sched_entity(se) {
7835 cfs_rq = cfs_rq_of(se);
7836 cfs_rq->h_load_next = se;
7837 if (cfs_rq->last_h_load_update == now)
7838 break;
7839 }
Peter Zijlstraa35b6462012-08-08 21:46:40 +02007840
Vladimir Davydov68520792013-07-15 17:49:19 +04007841 if (!se) {
Yuyang Du7ea241a2015-07-15 08:04:42 +08007842 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
Vladimir Davydov68520792013-07-15 17:49:19 +04007843 cfs_rq->last_h_load_update = now;
7844 }
7845
7846 while ((se = cfs_rq->h_load_next) != NULL) {
7847 load = cfs_rq->h_load;
Yuyang Du7ea241a2015-07-15 08:04:42 +08007848 load = div64_ul(load * se->avg.load_avg,
7849 cfs_rq_load_avg(cfs_rq) + 1);
Vladimir Davydov68520792013-07-15 17:49:19 +04007850 cfs_rq = group_cfs_rq(se);
7851 cfs_rq->h_load = load;
7852 cfs_rq->last_h_load_update = now;
7853 }
Peter Zijlstra9763b672011-07-13 13:09:25 +02007854}
7855
Peter Zijlstra367456c2012-02-20 21:49:09 +01007856static unsigned long task_h_load(struct task_struct *p)
Peter Zijlstra230059de2009-12-17 17:47:12 +01007857{
Peter Zijlstra367456c2012-02-20 21:49:09 +01007858 struct cfs_rq *cfs_rq = task_cfs_rq(p);
Peter Zijlstra230059de2009-12-17 17:47:12 +01007859
Vladimir Davydov68520792013-07-15 17:49:19 +04007860 update_cfs_rq_h_load(cfs_rq);
Yuyang Du9d89c252015-07-15 08:04:37 +08007861 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
Yuyang Du7ea241a2015-07-15 08:04:42 +08007862 cfs_rq_load_avg(cfs_rq) + 1);
Peter Zijlstra230059de2009-12-17 17:47:12 +01007863}
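/*
 * Illustrative numbers for the scaling above (values assumed, not taken from
 * any trace): if the task's group is entitled to h_load = 512 on this cpu,
 * the task's own load_avg is 400 and the group cfs_rq's load_avg is 800,
 * then task_h_load(p) ~= 400 * 512 / 801 ~= 255, i.e. the task is charged
 * roughly half of the group's hierarchical load.
 */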
7864#else
Paul Turner48a16752012-10-04 13:18:31 +02007865static inline void update_blocked_averages(int cpu)
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08007866{
Vincent Guittot6c1d47c2015-07-15 08:04:38 +08007867 struct rq *rq = cpu_rq(cpu);
7868 struct cfs_rq *cfs_rq = &rq->cfs;
7869 unsigned long flags;
7870
7871 raw_spin_lock_irqsave(&rq->lock, flags);
7872 update_rq_clock(rq);
Steve Mucklea2c6c912016-03-24 15:26:07 -07007873 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
Vincent Guittot6c1d47c2015-07-15 08:04:38 +08007874 raw_spin_unlock_irqrestore(&rq->lock, flags);
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08007875}
7876
Peter Zijlstra367456c2012-02-20 21:49:09 +01007877static unsigned long task_h_load(struct task_struct *p)
7878{
Yuyang Du9d89c252015-07-15 08:04:37 +08007879 return p->se.avg.load_avg;
Peter Zijlstra230059de2009-12-17 17:47:12 +01007880}
7881#endif
7882
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007883/********** Helpers for find_busiest_group ************************/
Rik van Rielcaeb1782014-07-28 14:16:28 -04007884
7885enum group_type {
7886 group_other = 0,
7887 group_imbalanced,
7888 group_overloaded,
7889};
7890
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007891/*
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007892 * sg_lb_stats - stats of a sched_group required for load_balancing
7893 */
7894struct sg_lb_stats {
7895 unsigned long avg_load; /*Avg load across the CPUs of the group */
7896 unsigned long group_load; /* Total load over the CPUs of the group */
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007897 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09007898 unsigned long load_per_task;
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04007899 unsigned long group_capacity;
Dietmar Eggemann9e91d612015-08-14 17:23:12 +01007900 unsigned long group_util; /* Total utilization of the group */
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02007901 unsigned int sum_nr_running; /* Nr tasks running in the group */
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07007902#ifdef CONFIG_SCHED_HMP
7903 unsigned long sum_nr_big_tasks;
7904 u64 group_cpu_load; /* Scaled load of all CPUs of the group */
7905#endif
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02007906 unsigned int idle_cpus;
7907 unsigned int group_weight;
Rik van Rielcaeb1782014-07-28 14:16:28 -04007908 enum group_type group_type;
Vincent Guittotea678212015-02-27 16:54:11 +01007909 int group_no_capacity;
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01007910#ifdef CONFIG_NUMA_BALANCING
7911 unsigned int nr_numa_running;
7912 unsigned int nr_preferred_running;
7913#endif
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007914};
7915
Joonsoo Kim56cf5152013-08-06 17:36:43 +09007916/*
7917 * sd_lb_stats - Structure to store the statistics of a sched_domain
7918 * during load balancing.
7919 */
7920struct sd_lb_stats {
7921 struct sched_group *busiest; /* Busiest group in this sd */
7922 struct sched_group *local; /* Local group in this sd */
7923 unsigned long total_load; /* Total load of all groups in sd */
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04007924 unsigned long total_capacity; /* Total capacity of all groups in sd */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09007925 unsigned long avg_load; /* Average load across all groups in sd */
7926
Joonsoo Kim56cf5152013-08-06 17:36:43 +09007927 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02007928 struct sg_lb_stats local_stat; /* Statistics of the local group */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09007929};
7930
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02007931static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
7932{
7933 /*
7934 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
7935 * local_stat because update_sg_lb_stats() does a full clear/assignment.
7936 * We must however clear busiest_stat::avg_load because
7937 * update_sd_pick_busiest() reads this before assignment.
7938 */
7939 *sds = (struct sd_lb_stats){
7940 .busiest = NULL,
7941 .local = NULL,
7942 .total_load = 0UL,
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04007943 .total_capacity = 0UL,
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02007944 .busiest_stat = {
7945 .avg_load = 0UL,
Rik van Rielcaeb1782014-07-28 14:16:28 -04007946 .sum_nr_running = 0,
7947 .group_type = group_other,
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07007948#ifdef CONFIG_SCHED_HMP
7949 .sum_nr_big_tasks = 0UL,
7950 .group_cpu_load = 0ULL,
7951#endif
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02007952 },
7953 };
7954}
7955
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07007956#ifdef CONFIG_SCHED_HMP
7957
7958static int
7959bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
7960{
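	/*
	 * Roughly: inter-cluster pulls are only skipped when cluster spill
	 * is restricted and no full or conservative boost is active.
	 * Pulling into a cluster with an equal or lower power cost is always
	 * allowed, as is pulling big tasks into a higher-capacity cluster.
	 * Otherwise we bail out while the busiest group is still below both
	 * its spill-load and spill-nr_run thresholds.
	 */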
7961 int local_cpu, busiest_cpu;
7962 int local_capacity, busiest_capacity;
7963 int local_pwr_cost, busiest_pwr_cost;
7964 int nr_cpus;
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07007965 int boost = sched_boost();
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07007966
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07007967 if (!sysctl_sched_restrict_cluster_spill ||
7968 boost == FULL_THROTTLE_BOOST || boost == CONSERVATIVE_BOOST)
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07007969 return 0;
7970
7971 local_cpu = group_first_cpu(sds->local);
7972 busiest_cpu = group_first_cpu(sds->busiest);
7973
7974 local_capacity = cpu_max_possible_capacity(local_cpu);
7975 busiest_capacity = cpu_max_possible_capacity(busiest_cpu);
7976
7977 local_pwr_cost = cpu_max_power_cost(local_cpu);
7978 busiest_pwr_cost = cpu_max_power_cost(busiest_cpu);
7979
Pavankumar Kondeti7b0a1442016-04-13 15:13:56 +05307980 if (local_pwr_cost <= busiest_pwr_cost)
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07007981 return 0;
7982
7983 if (local_capacity > busiest_capacity &&
7984 sds->busiest_stat.sum_nr_big_tasks)
7985 return 0;
7986
7987 nr_cpus = cpumask_weight(sched_group_cpus(sds->busiest));
7988 if ((sds->busiest_stat.group_cpu_load < nr_cpus * sched_spill_load) &&
7989 (sds->busiest_stat.sum_nr_running <
7990 nr_cpus * sysctl_sched_spill_nr_run))
7991 return 1;
7992
7993 return 0;
7994}
7995
7996#else /* CONFIG_SCHED_HMP */
7997
7998static inline int
7999bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
8000{
8001 return 0;
8002}
8003
8004#endif /* CONFIG_SCHED_HMP */
8005
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008006/**
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008007 * get_sd_load_idx - Obtain the load index for a given sched domain.
8008 * @sd: The sched_domain whose load_idx is to be obtained.
Kamalesh Babulaled1b7732013-10-13 23:06:15 +05308009 * @idle: The idle status of the CPU for which the sd's load_idx is obtained.
Yacine Belkadie69f6182013-07-12 20:45:47 +02008010 *
8011 * Return: The load index.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008012 */
8013static inline int get_sd_load_idx(struct sched_domain *sd,
8014 enum cpu_idle_type idle)
8015{
8016 int load_idx;
8017
8018 switch (idle) {
8019 case CPU_NOT_IDLE:
8020 load_idx = sd->busy_idx;
8021 break;
8022
8023 case CPU_NEWLY_IDLE:
8024 load_idx = sd->newidle_idx;
8025 break;
8026 default:
8027 load_idx = sd->idle_idx;
8028 break;
8029 }
8030
8031 return load_idx;
8032}
8033
Nicolas Pitreced549f2014-05-26 18:19:38 -04008034static unsigned long scale_rt_capacity(int cpu)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008035{
8036 struct rq *rq = cpu_rq(cpu);
Vincent Guittotb5b48602015-02-27 16:54:08 +01008037 u64 total, used, age_stamp, avg;
Peter Zijlstracadefd32014-02-27 10:40:35 +01008038 s64 delta;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008039
Peter Zijlstrab654f7d2012-05-22 14:04:28 +02008040 /*
8041 * Since we're reading these variables without serialization make sure
8042 * we read them once before doing sanity checks on them.
8043 */
Jason Low316c1608d2015-04-28 13:00:20 -07008044 age_stamp = READ_ONCE(rq->age_stamp);
8045 avg = READ_ONCE(rq->rt_avg);
Peter Zijlstracebde6d2015-01-05 11:18:10 +01008046 delta = __rq_clock_broken(rq) - age_stamp;
Venkatesh Pallipadiaa483802010-10-04 17:03:22 -07008047
Peter Zijlstracadefd32014-02-27 10:40:35 +01008048 if (unlikely(delta < 0))
8049 delta = 0;
8050
8051 total = sched_avg_period() + delta;
Peter Zijlstrab654f7d2012-05-22 14:04:28 +02008052
Vincent Guittotb5b48602015-02-27 16:54:08 +01008053 used = div_u64(avg, total);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008054
Vincent Guittotb5b48602015-02-27 16:54:08 +01008055 if (likely(used < SCHED_CAPACITY_SCALE))
8056 return SCHED_CAPACITY_SCALE - used;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008057
Vincent Guittotb5b48602015-02-27 16:54:08 +01008058 return 1;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008059}
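/*
 * For example (illustrative figures, assuming rt_avg accumulates RT/IRQ time
 * scaled by the frequency capacity, ~SCHED_CAPACITY_SCALE at full speed): an
 * rq whose RT/IRQ activity consumed about 25% of the averaging period sees
 * used ~= 256, so roughly 768 of the 1024 capacity units are reported as
 * left over for CFS tasks.
 */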
8060
Nicolas Pitreced549f2014-05-26 18:19:38 -04008061static void update_cpu_capacity(struct sched_domain *sd, int cpu)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008062{
Morten Rasmussen8cd56012015-08-14 17:23:10 +01008063 unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008064 struct sched_group *sdg = sd->groups;
8065
Vincent Guittotca6d75e2015-02-27 16:54:09 +01008066 cpu_rq(cpu)->cpu_capacity_orig = capacity;
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10008067
Nicolas Pitreced549f2014-05-26 18:19:38 -04008068 capacity *= scale_rt_capacity(cpu);
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04008069 capacity >>= SCHED_CAPACITY_SHIFT;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008070
Nicolas Pitreced549f2014-05-26 18:19:38 -04008071 if (!capacity)
8072 capacity = 1;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008073
Nicolas Pitreced549f2014-05-26 18:19:38 -04008074 cpu_rq(cpu)->cpu_capacity = capacity;
8075 sdg->sgc->capacity = capacity;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008076}
8077
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008078void update_group_capacity(struct sched_domain *sd, int cpu)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008079{
8080 struct sched_domain *child = sd->child;
8081 struct sched_group *group, *sdg = sd->groups;
Vincent Guittotdc7ff762015-03-03 11:35:03 +01008082 unsigned long capacity;
Vincent Guittot4ec44122011-12-12 20:21:08 +01008083 unsigned long interval;
8084
8085 interval = msecs_to_jiffies(sd->balance_interval);
8086 interval = clamp(interval, 1UL, max_load_balance_interval);
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008087 sdg->sgc->next_update = jiffies + interval;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008088
8089 if (!child) {
Nicolas Pitreced549f2014-05-26 18:19:38 -04008090 update_cpu_capacity(sd, cpu);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008091 return;
8092 }
8093
Vincent Guittotdc7ff762015-03-03 11:35:03 +01008094 capacity = 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008095
Peter Zijlstra74a5ce22012-05-23 18:00:43 +02008096 if (child->flags & SD_OVERLAP) {
8097 /*
8098 * SD_OVERLAP domains cannot assume that child groups
8099 * span the current group.
8100 */
8101
Peter Zijlstra863bffc2013-08-28 11:44:39 +02008102 for_each_cpu(cpu, sched_group_cpus(sdg)) {
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008103 struct sched_group_capacity *sgc;
Srikar Dronamraju9abf24d2013-11-12 22:11:26 +05308104 struct rq *rq = cpu_rq(cpu);
Peter Zijlstra863bffc2013-08-28 11:44:39 +02008105
Olav Haugan3f2cb302016-05-31 14:34:46 -07008106 if (cpumask_test_cpu(cpu, cpu_isolated_mask))
8107 continue;
Srikar Dronamraju9abf24d2013-11-12 22:11:26 +05308108 /*
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008109 * build_sched_domains() -> init_sched_groups_capacity()
Srikar Dronamraju9abf24d2013-11-12 22:11:26 +05308110 * gets here before we've attached the domains to the
8111 * runqueues.
8112 *
Nicolas Pitreced549f2014-05-26 18:19:38 -04008113 * Use capacity_of(), which is set irrespective of domains
8114 * in update_cpu_capacity().
Srikar Dronamraju9abf24d2013-11-12 22:11:26 +05308115 *
Vincent Guittotdc7ff762015-03-03 11:35:03 +01008116 * This prevents capacity from being 0 and
Srikar Dronamraju9abf24d2013-11-12 22:11:26 +05308117 * causing divide-by-zero issues on boot.
Srikar Dronamraju9abf24d2013-11-12 22:11:26 +05308118 */
8119 if (unlikely(!rq->sd)) {
Nicolas Pitreced549f2014-05-26 18:19:38 -04008120 capacity += capacity_of(cpu);
Srikar Dronamraju9abf24d2013-11-12 22:11:26 +05308121 continue;
8122 }
8123
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008124 sgc = rq->sd->groups->sgc;
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008125 capacity += sgc->capacity;
Peter Zijlstra863bffc2013-08-28 11:44:39 +02008126 }
Peter Zijlstra74a5ce22012-05-23 18:00:43 +02008127 } else {
8128 /*
8129 * !SD_OVERLAP domains can assume that child groups
8130 * span the current group.
Byungchul Park97a71422015-07-05 18:33:48 +09008131 */
Peter Zijlstra74a5ce22012-05-23 18:00:43 +02008132
8133 group = child->groups;
8134 do {
Olav Haugan3f2cb302016-05-31 14:34:46 -07008135 cpumask_t *cpus = sched_group_cpus(group);
8136
8137 /* Revisit this later. This won't work for MT domain */
8138 if (!cpu_isolated(cpumask_first(cpus)))
8139 capacity += group->sgc->capacity;
Peter Zijlstra74a5ce22012-05-23 18:00:43 +02008140 group = group->next;
8141 } while (group != child->groups);
8142 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008143
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008144 sdg->sgc->capacity = capacity;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008145}
8146
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10008147/*
Vincent Guittotea678212015-02-27 16:54:11 +01008148 * Check whether the capacity of the rq has been noticeably reduced by side
8149 * activity. The imbalance_pct is used for the threshold.
8150 * Return true if the capacity is reduced
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10008151 */
8152static inline int
Vincent Guittotea678212015-02-27 16:54:11 +01008153check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10008154{
Vincent Guittotea678212015-02-27 16:54:11 +01008155 return ((rq->cpu_capacity * sd->imbalance_pct) <
8156 (rq->cpu_capacity_orig * 100));
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10008157}
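/*
 * Example with assumed values: for cpu_capacity_orig = 1024 and an
 * imbalance_pct of 125, this returns true once side activity has pushed
 * cpu_capacity below 1024 * 100 / 125 ~= 819, i.e. once roughly 20% of the
 * original capacity has been eaten.
 */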
8158
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02008159/*
8160 * Group imbalance indicates (and tries to solve) the problem where balancing
8161 * groups is inadequate due to tsk_cpus_allowed() constraints.
8162 *
8163 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
8164 * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
8165 * Something like:
8166 *
8167 * { 0 1 2 3 } { 4 5 6 7 }
8168 * * * * *
8169 *
8170 * If we were to balance group-wise we'd place two tasks in the first group and
8171 * two tasks in the second group. Clearly this is undesired as it will overload
8172 * cpu 3 and leave one of the cpus in the second group unused.
8173 *
8174 * The current solution to this issue is detecting the skew in the first group
Peter Zijlstra62633222013-08-19 12:41:09 +02008175 * by noticing the lower domain failed to reach balance and had difficulty
8176 * moving tasks due to affinity constraints.
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02008177 *
8178 * When this is so detected; this group becomes a candidate for busiest; see
Kamalesh Babulaled1b7732013-10-13 23:06:15 +05308179 * update_sd_pick_busiest(). And calculate_imbalance() and
Peter Zijlstra62633222013-08-19 12:41:09 +02008180 * find_busiest_group() avoid some of the usual balance conditions to allow it
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02008181 * to create an effective group imbalance.
8182 *
8183 * This is a somewhat tricky proposition since the next run might not find the
8184 * group imbalance and decide the groups need to be balanced again. A most
8185 * subtle and fragile situation.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008186 */
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02008187
Peter Zijlstra62633222013-08-19 12:41:09 +02008188static inline int sg_imbalanced(struct sched_group *group)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008189{
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008190 return group->sgc->imbalance;
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02008191}
8192
Peter Zijlstrab37d9312013-08-28 11:50:34 +02008193/*
Vincent Guittotea678212015-02-27 16:54:11 +01008194 * group_has_capacity returns true if the group has spare capacity that could
8195 * be used by some tasks.
8196 * We consider that a group has spare capacity if the number of tasks is
Dietmar Eggemann9e91d612015-08-14 17:23:12 +01008197 * smaller than the number of CPUs or if the utilization is lower than the
8198 * available capacity for CFS tasks.
Vincent Guittotea678212015-02-27 16:54:11 +01008199 * For the latter, we use a threshold to stabilize the state, to take into
8200 * account the variance of the tasks' load and to return true if the available
8201 * capacity is meaningful for the load balancer.
Vincent Guittotea678212015-02-27 16:54:11 +01008202 * As an example, an available capacity of 1% can appear but it doesn't bring
8203 * any benefit to the load balancer.
Peter Zijlstrab37d9312013-08-28 11:50:34 +02008204 */
Vincent Guittotea678212015-02-27 16:54:11 +01008205static inline bool
8206group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
Peter Zijlstrab37d9312013-08-28 11:50:34 +02008207{
Vincent Guittotea678212015-02-27 16:54:11 +01008208 if (sgs->sum_nr_running < sgs->group_weight)
8209 return true;
Peter Zijlstrab37d9312013-08-28 11:50:34 +02008210
Vincent Guittotea678212015-02-27 16:54:11 +01008211 if ((sgs->group_capacity * 100) >
Dietmar Eggemann9e91d612015-08-14 17:23:12 +01008212 (sgs->group_util * env->sd->imbalance_pct))
Vincent Guittotea678212015-02-27 16:54:11 +01008213 return true;
Peter Zijlstrab37d9312013-08-28 11:50:34 +02008214
Vincent Guittotea678212015-02-27 16:54:11 +01008215 return false;
Peter Zijlstrab37d9312013-08-28 11:50:34 +02008216}
8217
Vincent Guittotea678212015-02-27 16:54:11 +01008218/*
8219 * group_is_overloaded returns true if the group has more tasks than it can
8220 * handle.
8221 * group_is_overloaded is not equal to !group_has_capacity because a group
8222 * with the exact right number of tasks has no more spare capacity but is not
8223 * overloaded so both group_has_capacity and group_is_overloaded return
8224 * false.
8225 */
8226static inline bool
8227group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
Rik van Rielcaeb1782014-07-28 14:16:28 -04008228{
Vincent Guittotea678212015-02-27 16:54:11 +01008229 if (sgs->sum_nr_running <= sgs->group_weight)
8230 return false;
8231
8232 if ((sgs->group_capacity * 100) <
Dietmar Eggemann9e91d612015-08-14 17:23:12 +01008233 (sgs->group_util * env->sd->imbalance_pct))
Vincent Guittotea678212015-02-27 16:54:11 +01008234 return true;
8235
8236 return false;
8237}
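/*
 * Worked example for the two thresholds above (illustrative numbers): with
 * group_capacity = 2048 (two 1024-capacity cpus) and an imbalance_pct of
 * 125, the cutover point is 2048 * 100 / 125 ~= 1638, i.e. ~80% utilization.
 * group_has_capacity() is true while the group runs fewer tasks than it has
 * cpus or while group_util stays below that mark; group_is_overloaded()
 * only triggers once the group runs more tasks than cpus *and* group_util
 * exceeds it.
 */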
8238
Leo Yan79a89f92015-09-15 18:56:45 +08008239static inline enum
8240group_type group_classify(struct sched_group *group,
8241 struct sg_lb_stats *sgs)
Vincent Guittotea678212015-02-27 16:54:11 +01008242{
8243 if (sgs->group_no_capacity)
Rik van Rielcaeb1782014-07-28 14:16:28 -04008244 return group_overloaded;
8245
8246 if (sg_imbalanced(group))
8247 return group_imbalanced;
8248
8249 return group_other;
8250}
8251
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008252/**
8253 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
8254 * @env: The load balancing environment.
8255 * @group: sched_group whose statistics are to be updated.
8256 * @load_idx: Load index of sched_domain of this_cpu for load calc.
8257 * @local_group: Does group contain this_cpu.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008258 * @sgs: variable to hold the statistics for this group.
Masanari Iidacd3bd4e2014-07-28 12:38:06 +09008259 * @overload: Indicate more than one runnable task for any CPU.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008260 */
8261static inline void update_sg_lb_stats(struct lb_env *env,
8262 struct sched_group *group, int load_idx,
Tim Chen4486edd2014-06-23 12:16:49 -07008263 int local_group, struct sg_lb_stats *sgs,
8264 bool *overload)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008265{
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02008266 unsigned long load;
Waiman Longa426f992015-11-25 14:09:38 -05008267 int i, nr_running;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008268
Peter Zijlstrab72ff132013-08-28 10:32:32 +02008269 memset(sgs, 0, sizeof(*sgs));
8270
Michael Wangb94031302012-07-12 16:10:13 +08008271 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008272 struct rq *rq = cpu_rq(i);
8273
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008274 trace_sched_cpu_load_lb(cpu_rq(i), idle_cpu(i),
8275 sched_irqload(i),
8276 power_cost(i, 0),
8277 cpu_temp(i));
8278
Olav Haugan3f2cb302016-05-31 14:34:46 -07008279 if (cpu_isolated(i))
8280 continue;
8281
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008282 /* Bias balancing toward cpus of our domain */
Peter Zijlstra62633222013-08-19 12:41:09 +02008283 if (local_group)
Peter Zijlstra04f733b2012-05-11 00:12:02 +02008284 load = target_load(i, load_idx);
Peter Zijlstra62633222013-08-19 12:41:09 +02008285 else
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008286 load = source_load(i, load_idx);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008287
8288 sgs->group_load += load;
Dietmar Eggemann9e91d612015-08-14 17:23:12 +01008289 sgs->group_util += cpu_util(i);
Vincent Guittot65fdac02014-08-26 13:06:46 +02008290 sgs->sum_nr_running += rq->cfs.h_nr_running;
Tim Chen4486edd2014-06-23 12:16:49 -07008291
Waiman Longa426f992015-11-25 14:09:38 -05008292 nr_running = rq->nr_running;
8293 if (nr_running > 1)
Tim Chen4486edd2014-06-23 12:16:49 -07008294 *overload = true;
8295
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008296#ifdef CONFIG_SCHED_HMP
8297 sgs->sum_nr_big_tasks += rq->hmp_stats.nr_big_tasks;
8298 sgs->group_cpu_load += cpu_load(i);
8299#endif
8300
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01008301#ifdef CONFIG_NUMA_BALANCING
8302 sgs->nr_numa_running += rq->nr_numa_running;
8303 sgs->nr_preferred_running += rq->nr_preferred_running;
8304#endif
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008305 sgs->sum_weighted_load += weighted_cpuload(i);
Waiman Longa426f992015-11-25 14:09:38 -05008306 /*
8307 * No need to call idle_cpu() if nr_running is not 0
8308 */
8309 if (!nr_running && idle_cpu(i))
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07008310 sgs->idle_cpus++;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008311 }
8312
Olav Haugan3f2cb302016-05-31 14:34:46 -07008313 /* Isolated CPU has no weight */
8314 if (!group->group_weight) {
8315 sgs->group_capacity = 0;
8316 sgs->avg_load = 0;
8317 sgs->group_no_capacity = 1;
8318 sgs->group_type = group_other;
8319 sgs->group_weight = group->group_weight;
8320 } else {
8321 /* Adjust by relative CPU capacity of the group */
8322 sgs->group_capacity = group->sgc->capacity;
8323 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) /
8324 sgs->group_capacity;
8325
8326 sgs->group_weight = group->group_weight;
8327
8328 sgs->group_no_capacity = group_is_overloaded(env, sgs);
8329 sgs->group_type = group_classify(group, sgs);
8330 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008331
Suresh Siddhadd5feea2010-02-23 16:13:52 -08008332 if (sgs->sum_nr_running)
Peter Zijlstra38d0f772013-08-15 19:47:56 +02008333 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008334}
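/*
 * The avg_load computed above is load normalized by capacity.  With
 * illustrative numbers: group_load = 1536 spread over a group of capacity
 * 2048 yields avg_load = 1536 * 1024 / 2048 = 768, which can be compared
 * directly against groups of different sizes or cpu strengths.
 */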
8335
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008336#ifdef CONFIG_SCHED_HMP
8337static bool update_sd_pick_busiest_active_balance(struct lb_env *env,
8338 struct sd_lb_stats *sds,
8339 struct sched_group *sg,
8340 struct sg_lb_stats *sgs)
8341{
8342 if (env->idle != CPU_NOT_IDLE &&
8343 cpu_capacity(env->dst_cpu) > group_rq_capacity(sg)) {
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008344 if (sgs->sum_nr_big_tasks >
8345 sds->busiest_stat.sum_nr_big_tasks) {
8346 env->flags |= LBF_BIG_TASK_ACTIVE_BALANCE;
8347 return true;
8348 }
8349 }
8350
8351 return false;
8352}
8353#else
8354static bool update_sd_pick_busiest_active_balance(struct lb_env *env,
8355 struct sd_lb_stats *sds,
8356 struct sched_group *sg,
8357 struct sg_lb_stats *sgs)
8358{
8359 return false;
8360}
8361#endif
8362
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008363/**
Michael Neuling532cb4c2010-06-08 14:57:02 +10008364 * update_sd_pick_busiest - return 1 on busiest group
Randy Dunlapcd968912012-06-08 13:18:33 -07008365 * @env: The load balancing environment.
Michael Neuling532cb4c2010-06-08 14:57:02 +10008366 * @sds: sched_domain statistics
8367 * @sg: sched_group candidate to be checked for being the busiest
Michael Neulingb6b12292010-06-10 12:06:21 +10008368 * @sgs: sched_group statistics
Michael Neuling532cb4c2010-06-08 14:57:02 +10008369 *
8370 * Determine if @sg is a busier group than the previously selected
8371 * busiest group.
Yacine Belkadie69f6182013-07-12 20:45:47 +02008372 *
8373 * Return: %true if @sg is a busier group than the previously selected
8374 * busiest group. %false otherwise.
Michael Neuling532cb4c2010-06-08 14:57:02 +10008375 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008376static bool update_sd_pick_busiest(struct lb_env *env,
Michael Neuling532cb4c2010-06-08 14:57:02 +10008377 struct sd_lb_stats *sds,
8378 struct sched_group *sg,
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008379 struct sg_lb_stats *sgs)
Michael Neuling532cb4c2010-06-08 14:57:02 +10008380{
Rik van Rielcaeb1782014-07-28 14:16:28 -04008381 struct sg_lb_stats *busiest = &sds->busiest_stat;
Michael Neuling532cb4c2010-06-08 14:57:02 +10008382
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008383 if (update_sd_pick_busiest_active_balance(env, sds, sg, sgs))
8384 return true;
8385
Rik van Rielcaeb1782014-07-28 14:16:28 -04008386 if (sgs->group_type > busiest->group_type)
Michael Neuling532cb4c2010-06-08 14:57:02 +10008387 return true;
8388
Rik van Rielcaeb1782014-07-28 14:16:28 -04008389 if (sgs->group_type < busiest->group_type)
8390 return false;
8391
8392 if (sgs->avg_load <= busiest->avg_load)
8393 return false;
8394
8395 /* This is the busiest node in its class. */
8396 if (!(env->sd->flags & SD_ASYM_PACKING))
Michael Neuling532cb4c2010-06-08 14:57:02 +10008397 return true;
8398
Srikar Dronamraju1f621e02016-04-06 18:47:40 +05308399 /* No ASYM_PACKING if target cpu is already busy */
8400 if (env->idle == CPU_NOT_IDLE)
8401 return true;
Michael Neuling532cb4c2010-06-08 14:57:02 +10008402 /*
8403 * ASYM_PACKING needs to move all the work to the lowest
8404 * numbered CPUs in the group, therefore mark all groups
8405 * higher than ourself as busy.
8406 */
Rik van Rielcaeb1782014-07-28 14:16:28 -04008407 if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
Michael Neuling532cb4c2010-06-08 14:57:02 +10008408 if (!sds->busiest)
8409 return true;
8410
Srikar Dronamraju1f621e02016-04-06 18:47:40 +05308411 /* Prefer to move work from the highest possible cpu */
8412 if (group_first_cpu(sds->busiest) < group_first_cpu(sg))
Michael Neuling532cb4c2010-06-08 14:57:02 +10008413 return true;
8414 }
8415
8416 return false;
8417}
8418
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01008419#ifdef CONFIG_NUMA_BALANCING
8420static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8421{
8422 if (sgs->sum_nr_running > sgs->nr_numa_running)
8423 return regular;
8424 if (sgs->sum_nr_running > sgs->nr_preferred_running)
8425 return remote;
8426 return all;
8427}
8428
8429static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8430{
8431 if (rq->nr_running > rq->nr_numa_running)
8432 return regular;
8433 if (rq->nr_running > rq->nr_preferred_running)
8434 return remote;
8435 return all;
8436}
8437#else
8438static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8439{
8440 return all;
8441}
8442
8443static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8444{
8445 return regular;
8446}
8447#endif /* CONFIG_NUMA_BALANCING */
8448
Michael Neuling532cb4c2010-06-08 14:57:02 +10008449/**
Hui Kang461819a2011-10-11 23:00:59 -04008450 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
Randy Dunlapcd968912012-06-08 13:18:33 -07008451 * @env: The load balancing environment.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008452 * @sds: variable to hold the statistics for this sched_domain.
8453 */
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01008454static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008455{
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008456 struct sched_domain *child = env->sd->child;
8457 struct sched_group *sg = env->sd->groups;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008458 struct sg_lb_stats tmp_sgs;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008459 int load_idx, prefer_sibling = 0;
Tim Chen4486edd2014-06-23 12:16:49 -07008460 bool overload = false;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008461
8462 if (child && child->flags & SD_PREFER_SIBLING)
8463 prefer_sibling = 1;
8464
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008465 load_idx = get_sd_load_idx(env->sd, env->idle);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008466
8467 do {
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008468 struct sg_lb_stats *sgs = &tmp_sgs;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008469 int local_group;
8470
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008471 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008472 if (local_group) {
8473 sds->local = sg;
8474 sgs = &sds->local_stat;
Peter Zijlstrab72ff132013-08-28 10:32:32 +02008475
8476 if (env->idle != CPU_NEWLY_IDLE ||
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008477 time_after_eq(jiffies, sg->sgc->next_update))
8478 update_group_capacity(env->sd, env->dst_cpu);
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008479 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008480
Tim Chen4486edd2014-06-23 12:16:49 -07008481 update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
8482 &overload);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008483
Peter Zijlstrab72ff132013-08-28 10:32:32 +02008484 if (local_group)
8485 goto next_group;
8486
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008487 /*
8488 * In case the child domain prefers tasks go to siblings
Vincent Guittotea678212015-02-27 16:54:11 +01008489 * first, lower the sg capacity so that we'll try
Nikhil Rao75dd3212010-10-15 13:12:30 -07008490 * and move all the excess tasks away. We lower the capacity
8491 * of a group only if the local group has the capacity to fit
Vincent Guittotea678212015-02-27 16:54:11 +01008492 * these excess tasks. The extra check prevents the case where
8493 * you always pull from the heaviest group when it is already
8494 * under-utilized (possible when a large weight task outweighs
8495 * the tasks on the system).
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008496 */
Peter Zijlstrab72ff132013-08-28 10:32:32 +02008497 if (prefer_sibling && sds->local &&
Vincent Guittotea678212015-02-27 16:54:11 +01008498 group_has_capacity(env, &sds->local_stat) &&
8499 (sgs->sum_nr_running > 1)) {
8500 sgs->group_no_capacity = 1;
Leo Yan79a89f92015-09-15 18:56:45 +08008501 sgs->group_type = group_classify(sg, sgs);
Wanpeng Licb0b9f22014-11-05 07:44:50 +08008502 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008503
Peter Zijlstrab72ff132013-08-28 10:32:32 +02008504 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
Michael Neuling532cb4c2010-06-08 14:57:02 +10008505 sds->busiest = sg;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008506 sds->busiest_stat = *sgs;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008507 env->busiest_nr_running = sgs->sum_nr_running;
8508 env->busiest_grp_capacity = sgs->group_capacity;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008509 }
8510
Peter Zijlstrab72ff132013-08-28 10:32:32 +02008511next_group:
8512 /* Now, start updating sd_lb_stats */
8513 sds->total_load += sgs->group_load;
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008514 sds->total_capacity += sgs->group_capacity;
Peter Zijlstrab72ff132013-08-28 10:32:32 +02008515
Michael Neuling532cb4c2010-06-08 14:57:02 +10008516 sg = sg->next;
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008517 } while (sg != env->sd->groups);
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01008518
8519 if (env->sd->flags & SD_NUMA)
8520 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
Tim Chen4486edd2014-06-23 12:16:49 -07008521
8522 if (!env->sd->parent) {
8523 /* update overload indicator if we are at root domain */
8524 if (env->dst_rq->rd->overload != overload)
8525 env->dst_rq->rd->overload = overload;
8526 }
8527
Michael Neuling532cb4c2010-06-08 14:57:02 +10008528}
8529
Michael Neuling532cb4c2010-06-08 14:57:02 +10008530/**
8531 * check_asym_packing - Check to see if the group is packed into the
8532 * sched domain.
8533 *
8534 * This is primarily intended to be used at the sibling level. Some
8535 * cores like POWER7 prefer to use lower numbered SMT threads. In the
8536 * case of POWER7, it can move to lower SMT modes only when higher
8537 * threads are idle. When in lower SMT modes, the threads will
8538 * perform better since they share less core resources. Hence when we
8539 * have idle threads, we want them to be the higher ones.
8540 *
8541 * This packing function is run on idle threads. It checks to see if
8542 * the busiest CPU in this domain (core in the P7 case) has a higher
8543 * CPU number than the packing function is being run on. Here we are
8544 * assuming a lower CPU number will be equivalent to a lower SMT thread
8545 * number.
8546 *
Yacine Belkadie69f6182013-07-12 20:45:47 +02008547 * Return: 1 when packing is required and a task should be moved to
Michael Neulingb6b12292010-06-10 12:06:21 +10008548 * this CPU. The amount of the imbalance is returned in *imbalance.
8549 *
Randy Dunlapcd968912012-06-08 13:18:33 -07008550 * @env: The load balancing environment.
Michael Neuling532cb4c2010-06-08 14:57:02 +10008551 * @sds: Statistics of the sched_domain which is to be packed
Michael Neuling532cb4c2010-06-08 14:57:02 +10008552 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008553static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
Michael Neuling532cb4c2010-06-08 14:57:02 +10008554{
8555 int busiest_cpu;
8556
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008557 if (!(env->sd->flags & SD_ASYM_PACKING))
Michael Neuling532cb4c2010-06-08 14:57:02 +10008558 return 0;
8559
Srikar Dronamraju1f621e02016-04-06 18:47:40 +05308560 if (env->idle == CPU_NOT_IDLE)
8561 return 0;
8562
Michael Neuling532cb4c2010-06-08 14:57:02 +10008563 if (!sds->busiest)
8564 return 0;
8565
8566 busiest_cpu = group_first_cpu(sds->busiest);
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008567 if (env->dst_cpu > busiest_cpu)
Michael Neuling532cb4c2010-06-08 14:57:02 +10008568 return 0;
8569
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008570 env->imbalance = DIV_ROUND_CLOSEST(
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008571 sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04008572 SCHED_CAPACITY_SCALE);
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008573
Michael Neuling532cb4c2010-06-08 14:57:02 +10008574 return 1;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008575}
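/*
 * Note on the imbalance above: since avg_load was computed as
 * group_load * SCHED_CAPACITY_SCALE / group_capacity, multiplying back by
 * group_capacity / SCHED_CAPACITY_SCALE roughly recovers group_load, so
 * packing asks to move approximately the busiest group's entire load
 * (e.g. avg_load = 768 on capacity 2048 gives an imbalance of ~1536).
 */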
8576
8577/**
8578 * fix_small_imbalance - Calculate the minor imbalance that exists
8579 * amongst the groups of a sched_domain, during
8580 * load balancing.
Randy Dunlapcd968912012-06-08 13:18:33 -07008581 * @env: The load balancing environment.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008582 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008583 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008584static inline
8585void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008586{
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008587 unsigned long tmp, capa_now = 0, capa_move = 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008588 unsigned int imbn = 2;
Suresh Siddhadd5feea2010-02-23 16:13:52 -08008589 unsigned long scaled_busy_load_per_task;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008590 struct sg_lb_stats *local, *busiest;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008591
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008592 local = &sds->local_stat;
8593 busiest = &sds->busiest_stat;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008594
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008595 if (!local->sum_nr_running)
8596 local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
8597 else if (busiest->load_per_task > local->load_per_task)
8598 imbn = 1;
Suresh Siddhadd5feea2010-02-23 16:13:52 -08008599
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008600 scaled_busy_load_per_task =
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04008601 (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008602 busiest->group_capacity;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008603
Vladimir Davydov3029ede2013-09-15 17:49:14 +04008604 if (busiest->avg_load + scaled_busy_load_per_task >=
8605 local->avg_load + (scaled_busy_load_per_task * imbn)) {
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008606 env->imbalance = busiest->load_per_task;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008607 return;
8608 }
8609
8610 /*
8611 * OK, we don't have enough imbalance to justify moving tasks,
Nicolas Pitreced549f2014-05-26 18:19:38 -04008612 * however we may be able to increase total CPU capacity used by
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008613 * moving them.
8614 */
8615
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008616 capa_now += busiest->group_capacity *
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008617 min(busiest->load_per_task, busiest->avg_load);
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008618 capa_now += local->group_capacity *
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008619 min(local->load_per_task, local->avg_load);
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04008620 capa_now /= SCHED_CAPACITY_SCALE;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008621
8622 /* Amount of load we'd subtract */
Vincent Guittota2cd4262014-03-11 17:26:06 +01008623 if (busiest->avg_load > scaled_busy_load_per_task) {
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008624 capa_move += busiest->group_capacity *
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008625 min(busiest->load_per_task,
Vincent Guittota2cd4262014-03-11 17:26:06 +01008626 busiest->avg_load - scaled_busy_load_per_task);
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008627 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008628
8629 /* Amount of load we'd add */
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008630 if (busiest->avg_load * busiest->group_capacity <
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04008631 busiest->load_per_task * SCHED_CAPACITY_SCALE) {
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008632 tmp = (busiest->avg_load * busiest->group_capacity) /
8633 local->group_capacity;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008634 } else {
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04008635 tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008636 local->group_capacity;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008637 }
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008638 capa_move += local->group_capacity *
Peter Zijlstra3ae11c92013-08-15 20:37:48 +02008639 min(local->load_per_task, local->avg_load + tmp);
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04008640 capa_move /= SCHED_CAPACITY_SCALE;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008641
8642 /* Move if we gain throughput */
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008643 if (capa_move > capa_now)
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008644 env->imbalance = busiest->load_per_task;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008645}
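/*
 * Roughly: when the computed imbalance is too small to justify moving load,
 * model the effect of moving one busiest-group task anyway.  capa_now
 * estimates the capacity both groups currently put to use, capa_move the
 * same quantity after the hypothetical move; only if capa_move comes out
 * higher is env->imbalance bumped to one task's worth of load
 * (busiest->load_per_task).
 */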
8646
8647/**
8648 * calculate_imbalance - Calculate the amount of imbalance present within the
8649 * groups of a given sched_domain during load balance.
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008650 * @env: load balance environment
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008651 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008652 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008653static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008654{
Suresh Siddhadd5feea2010-02-23 16:13:52 -08008655 unsigned long max_pull, load_above_capacity = ~0UL;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008656 struct sg_lb_stats *local, *busiest;
Suresh Siddhadd5feea2010-02-23 16:13:52 -08008657
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008658 local = &sds->local_stat;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008659 busiest = &sds->busiest_stat;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008660
Rik van Rielcaeb1782014-07-28 14:16:28 -04008661 if (busiest->group_type == group_imbalanced) {
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02008662 /*
8663 * In the group_imb case we cannot rely on group-wide averages
8664 * to ensure cpu-load equilibrium, look at wider averages. XXX
8665 */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008666 busiest->load_per_task =
8667 min(busiest->load_per_task, sds->avg_load);
Suresh Siddhadd5feea2010-02-23 16:13:52 -08008668 }
8669
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008670 /*
Dietmar Eggemann885e5422016-04-29 20:32:39 +01008671 * Avg load of busiest sg can be less and avg load of local sg can
8672 * be greater than avg load across all sgs of sd because avg load
8673 * factors in sg capacity and sgs with smaller group_type are
8674 * skipped when updating the busiest sg:
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008675 */
Vladimir Davydovb1885552013-09-15 17:49:13 +04008676 if (busiest->avg_load <= sds->avg_load ||
8677 local->avg_load >= sds->avg_load) {
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008678 env->imbalance = 0;
8679 return fix_small_imbalance(env, sds);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008680 }
8681
Peter Zijlstra9a5d9ba2014-07-29 17:15:11 +02008682 /*
8683 * If there aren't any idle cpus, avoid creating some.
8684 */
8685 if (busiest->group_type == group_overloaded &&
8686 local->group_type == group_overloaded) {
Peter Zijlstra1be0eb22016-05-06 12:21:23 +02008687 load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
Morten Rasmussencfa10332016-04-29 20:32:40 +01008688 if (load_above_capacity > busiest->group_capacity) {
Vincent Guittotea678212015-02-27 16:54:11 +01008689 load_above_capacity -= busiest->group_capacity;
Dietmar Eggemann26656212016-08-10 11:27:27 +01008690 load_above_capacity *= scale_load_down(NICE_0_LOAD);
Morten Rasmussencfa10332016-04-29 20:32:40 +01008691 load_above_capacity /= busiest->group_capacity;
8692 } else
Vincent Guittotea678212015-02-27 16:54:11 +01008693 load_above_capacity = ~0UL;
Suresh Siddhadd5feea2010-02-23 16:13:52 -08008694 }
8695
8696 /*
8697 * We're trying to get all the cpus to the average_load, so we don't
8698 * want to push ourselves above the average load, nor do we wish to
8699 * reduce the max loaded cpu below the average load. At the same time,
Dietmar Eggemann0a9b23c2016-04-29 20:32:38 +01008700 * we also don't want to reduce the group load below the group
8701 * capacity. Thus we look for the minimum possible imbalance.
Suresh Siddhadd5feea2010-02-23 16:13:52 -08008702 */
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02008703 max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008704
8705 /* How much load to actually move to equalise the imbalance */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008706 env->imbalance = min(
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008707 max_pull * busiest->group_capacity,
8708 (sds->avg_load - local->avg_load) * local->group_capacity
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04008709 ) / SCHED_CAPACITY_SCALE;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008710
8711 /*
8712 * If *imbalance is less than the average load per runnable task
Lucas De Marchi25985ed2011-03-30 22:57:33 -03008713 * there is no guarantee that any tasks will be moved, so we may need
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008714 * to bump its value to force at least one task to be
8715 * moved
8716 */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008717 if (env->imbalance < busiest->load_per_task)
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008718 return fix_small_imbalance(env, sds);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008719}
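/*
 * Illustrative run of the math above (values assumed): busiest avg_load =
 * 1536, local avg_load = 512, domain avg_load = 1024, both group capacities
 * 1024.  max_pull = min(1536 - 1024, load_above_capacity) = 512, and
 * env->imbalance = min(512 * 1024, (1024 - 512) * 1024) / 1024 = 512, i.e.
 * enough load is requested to pull both groups toward the domain average
 * without overshooting it.
 */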
Nikhil Raofab47622010-10-15 13:12:29 -07008720
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008721/******* find_busiest_group() helpers end here *********************/
8722
8723/**
8724 * find_busiest_group - Returns the busiest group within the sched_domain
Dietmar Eggemann0a9b23c2016-04-29 20:32:38 +01008725 * if there is an imbalance.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008726 *
8727 * Also calculates the amount of weighted load which should be moved
8728 * to restore balance.
8729 *
Randy Dunlapcd968912012-06-08 13:18:33 -07008730 * @env: The load balancing environment.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008731 *
Yacine Belkadie69f6182013-07-12 20:45:47 +02008732 * Return: The busiest group if imbalance exists.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008733 */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008734static struct sched_group *find_busiest_group(struct lb_env *env)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008735{
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008736 struct sg_lb_stats *local, *busiest;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008737 struct sd_lb_stats sds;
8738
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02008739 init_sd_lb_stats(&sds);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008740
8741 /*
8742 * Compute the various statistics relevant for load balancing at
8743 * this level.
8744 */
Joonsoo Kim23f0d202013-08-06 17:36:42 +09008745 update_sd_lb_stats(env, &sds);
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008746 local = &sds.local_stat;
8747 busiest = &sds.busiest_stat;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008748
Vincent Guittotea678212015-02-27 16:54:11 +01008749 /* ASYM feature bypasses nice load balance check */
Srikar Dronamraju1f621e02016-04-06 18:47:40 +05308750 if (check_asym_packing(env, &sds))
Michael Neuling532cb4c2010-06-08 14:57:02 +10008751 return sds.busiest;
8752
Peter Zijlstracc57aa82011-02-21 18:55:32 +01008753 /* There is no busy sibling group to pull tasks from */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008754 if (!sds.busiest || busiest->sum_nr_running == 0)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008755 goto out_balanced;
8756
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07008757 if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE)
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008758 goto force_balance;
8759
8760 if (bail_inter_cluster_balance(env, &sds))
8761 goto out_balanced;
8762
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04008763 sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
8764 / sds.total_capacity;
Ken Chenb0432d82011-04-07 17:23:22 -07008765
Peter Zijlstra866ab432011-02-21 18:56:47 +01008766 /*
8767 * If the busiest group is imbalanced the below checks don't
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02008768 * work because they assume all things are equal, which typically
Peter Zijlstra866ab432011-02-21 18:56:47 +01008769 * isn't true due to cpus_allowed constraints and the like.
8770 */
Rik van Rielcaeb1782014-07-28 14:16:28 -04008771 if (busiest->group_type == group_imbalanced)
Peter Zijlstra866ab432011-02-21 18:56:47 +01008772 goto force_balance;
8773
Peter Zijlstracc57aa82011-02-21 18:55:32 +01008774 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
Vincent Guittotea678212015-02-27 16:54:11 +01008775 if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
8776 busiest->group_no_capacity)
Nikhil Raofab47622010-10-15 13:12:29 -07008777 goto force_balance;
8778
Peter Zijlstracc57aa82011-02-21 18:55:32 +01008779 /*
Zhihui Zhang9c58c792014-09-20 21:24:36 -04008780 * If the local group is busier than the selected busiest group
Peter Zijlstracc57aa82011-02-21 18:55:32 +01008781 * don't try and pull any tasks.
8782 */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008783 if (local->avg_load >= busiest->avg_load)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008784 goto out_balanced;
8785
Peter Zijlstracc57aa82011-02-21 18:55:32 +01008786 /*
8787 * Don't pull any tasks if this group is already above the domain
8788 * average load.
8789 */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008790 if (local->avg_load >= sds.avg_load)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008791 goto out_balanced;
8792
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008793 if (env->idle == CPU_IDLE) {
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07008794 /*
Vincent Guittot43f4d662014-10-01 15:38:55 +02008795 * This cpu is idle. If the busiest group is not overloaded
8796 * and there is no imbalance between this and busiest group
8797 * wrt idle cpus, it is balanced. The imbalance becomes
8798 * significant if the diff is greater than 1, otherwise we
8799 * might end up just moving the imbalance to another group
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07008800 */
Vincent Guittot43f4d662014-10-01 15:38:55 +02008801 if ((busiest->group_type != group_overloaded) &&
8802 (local->idle_cpus <= (busiest->idle_cpus + 1)))
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07008803 goto out_balanced;
Peter Zijlstrac186faf2011-02-21 18:52:53 +01008804 } else {
8805 /*
8806 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
8807 * imbalance_pct to be conservative.
8808 */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008809 if (100 * busiest->avg_load <=
8810 env->sd->imbalance_pct * local->avg_load)
Peter Zijlstrac186faf2011-02-21 18:52:53 +01008811 goto out_balanced;
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07008812 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008813
Nikhil Raofab47622010-10-15 13:12:29 -07008814force_balance:
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008815 /* Looks like there is an imbalance. Compute it */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008816 calculate_imbalance(env, &sds);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008817 return sds.busiest;
8818
8819out_balanced:
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008820 env->imbalance = 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008821 return NULL;
8822}
8823
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008824#ifdef CONFIG_SCHED_HMP
8825static struct rq *find_busiest_queue_hmp(struct lb_env *env,
8826 struct sched_group *group)
8827{
8828 struct rq *busiest = NULL, *busiest_big = NULL;
8829 u64 max_runnable_avg = 0, max_runnable_avg_big = 0;
8830 int max_nr_big = 0, nr_big;
8831 bool find_big = !!(env->flags & LBF_BIG_TASK_ACTIVE_BALANCE);
8832 int i;
Olav Haugand67250b2016-11-01 17:30:36 -07008833 cpumask_t cpus;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008834
Olav Haugand67250b2016-11-01 17:30:36 -07008835 cpumask_andnot(&cpus, sched_group_cpus(group), cpu_isolated_mask);
8836
8837 for_each_cpu(i, &cpus) {
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008838 struct rq *rq = cpu_rq(i);
8839 u64 cumulative_runnable_avg =
8840 rq->hmp_stats.cumulative_runnable_avg;
8841
8842 if (!cpumask_test_cpu(i, env->cpus))
8843 continue;
8844
8846 if (find_big) {
8847 nr_big = nr_big_tasks(rq);
8848 if (nr_big > max_nr_big ||
8849 (nr_big > 0 && nr_big == max_nr_big &&
8850 cumulative_runnable_avg > max_runnable_avg_big)) {
8851 max_runnable_avg_big = cumulative_runnable_avg;
8852 busiest_big = rq;
8853 max_nr_big = nr_big;
8854 continue;
8855 }
8856 }
8857
8858 if (cumulative_runnable_avg > max_runnable_avg) {
8859 max_runnable_avg = cumulative_runnable_avg;
8860 busiest = rq;
8861 }
8862 }
8863
8864 if (busiest_big)
8865 return busiest_big;
8866
8867 env->flags &= ~LBF_BIG_TASK_ACTIVE_BALANCE;
8868 return busiest;
8869}
8870#else
8871static inline struct rq *find_busiest_queue_hmp(struct lb_env *env,
8872 struct sched_group *group)
8873{
8874 return NULL;
8875}
8876#endif
8877
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008878/*
8879 * find_busiest_queue - find the busiest runqueue among the cpus in group.
8880 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008881static struct rq *find_busiest_queue(struct lb_env *env,
Michael Wangb94031302012-07-12 16:10:13 +08008882 struct sched_group *group)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008883{
8884 struct rq *busiest = NULL, *rq;
Nicolas Pitreced549f2014-05-26 18:19:38 -04008885 unsigned long busiest_load = 0, busiest_capacity = 1;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008886 int i;
8887
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008888#ifdef CONFIG_SCHED_HMP
8889 return find_busiest_queue_hmp(env, group);
8890#endif
8891
Peter Zijlstra6906a402013-08-19 15:20:21 +02008892 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
Vincent Guittotea678212015-02-27 16:54:11 +01008893 unsigned long capacity, wl;
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01008894 enum fbq_type rt;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008895
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01008896 rq = cpu_rq(i);
8897 rt = fbq_classify_rq(rq);
8898
8899 /*
8900 * We classify groups/runqueues into three groups:
8901 * - regular: there are !numa tasks
8902 * - remote: there are numa tasks that run on the 'wrong' node
8903 * - all: there is no distinction
8904 *
8905 * In order to avoid migrating ideally placed numa tasks,
8906		 * ignore those when there are better options.
8907 *
8908 * If we ignore the actual busiest queue to migrate another
8909 * task, the next balance pass can still reduce the busiest
8910 * queue by moving tasks around inside the node.
8911 *
8912 * If we cannot move enough load due to this classification
8913 * the next pass will adjust the group classification and
8914 * allow migration of more tasks.
8915 *
8916 * Both cases only affect the total convergence complexity.
8917 */
8918 if (rt > env->fbq_type)
8919 continue;
8920
Nicolas Pitreced549f2014-05-26 18:19:38 -04008921 capacity = capacity_of(i);
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10008922
Thomas Gleixner6e40f5b2010-02-16 16:48:56 +01008923 wl = weighted_cpuload(i);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008924
Thomas Gleixner6e40f5b2010-02-16 16:48:56 +01008925 /*
8926 * When comparing with imbalance, use weighted_cpuload()
Nicolas Pitreced549f2014-05-26 18:19:38 -04008927 * which is not scaled with the cpu capacity.
Thomas Gleixner6e40f5b2010-02-16 16:48:56 +01008928 */
Vincent Guittotea678212015-02-27 16:54:11 +01008929
8930 if (rq->nr_running == 1 && wl > env->imbalance &&
8931 !check_cpu_capacity(rq, env->sd))
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008932 continue;
8933
Thomas Gleixner6e40f5b2010-02-16 16:48:56 +01008934 /*
8935 * For the load comparisons with the other cpu's, consider
Nicolas Pitreced549f2014-05-26 18:19:38 -04008936 * the weighted_cpuload() scaled with the cpu capacity, so
8937 * that the load can be moved away from the cpu that is
8938 * potentially running at a lower capacity.
Joonsoo Kim95a79b82013-08-06 17:36:41 +09008939 *
Nicolas Pitreced549f2014-05-26 18:19:38 -04008940 * Thus we're looking for max(wl_i / capacity_i), crosswise
Joonsoo Kim95a79b82013-08-06 17:36:41 +09008941 * multiplication to rid ourselves of the division works out
Nicolas Pitreced549f2014-05-26 18:19:38 -04008942 * to: wl_i * capacity_j > wl_j * capacity_i; where j is
8943 * our previous maximum.
Thomas Gleixner6e40f5b2010-02-16 16:48:56 +01008944 */
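		/*
		 * Illustrative numbers only: wl_i = 600 on capacity_i = 512
		 * versus a previous max of wl_j = 500 on capacity_j = 1024.
		 * 600 * 1024 > 500 * 512, so cpu i becomes the new busiest:
		 * its load per unit of capacity is higher even though the
		 * raw loads are comparable.
		 */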
Nicolas Pitreced549f2014-05-26 18:19:38 -04008945 if (wl * busiest_capacity > busiest_load * capacity) {
Joonsoo Kim95a79b82013-08-06 17:36:41 +09008946 busiest_load = wl;
Nicolas Pitreced549f2014-05-26 18:19:38 -04008947 busiest_capacity = capacity;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008948 busiest = rq;
8949 }
8950 }
8951
8952 return busiest;
8953}
8954
8955/*
8956 * Max backoff if we encounter pinned tasks. Pretty arbitrary value; any
8957 * value works so long as it is large enough.
8958 */
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008959#define MAX_PINNED_INTERVAL 16
8960#define NEED_ACTIVE_BALANCE_THRESHOLD 10
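/*
 * need_active_balance() falls back to active (push) migration only once
 * sd->nr_balance_failed exceeds cache_nice_tries plus this threshold;
 * after kicking an active balance, load_balance() re-arms the counter
 * close to the threshold instead of clearing it, so repeated failures
 * escalate quickly.
 */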
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008961
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008962static int need_active_balance(struct lb_env *env)
Peter Zijlstra1af3ed32009-12-23 15:10:31 +01008963{
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008964 struct sched_domain *sd = env->sd;
8965
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07008966 if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE)
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008967 return 1;
8968
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008969 if (env->idle == CPU_NEWLY_IDLE) {
Michael Neuling532cb4c2010-06-08 14:57:02 +10008970
8971 /*
8972 * ASYM_PACKING needs to force migrate tasks from busy but
8973 * higher numbered CPUs in order to pack all tasks in the
8974 * lowest numbered CPUs.
8975 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008976 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
Michael Neuling532cb4c2010-06-08 14:57:02 +10008977 return 1;
Peter Zijlstra1af3ed32009-12-23 15:10:31 +01008978 }
8979
Vincent Guittot1aaf90a2015-02-27 16:54:14 +01008980 /*
8981 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
8982 * It's worth migrating the task if the src_cpu's capacity is reduced
8983 * because of other sched_class or IRQs if more capacity stays
8984 * available on dst_cpu.
8985 */
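	/*
	 * For example (hypothetical numbers): src capacity 430 out of 1024
	 * because of RT/IRQ pressure, dst capacity 1024, imbalance_pct 117:
	 * 430 * 117 < 1024 * 100, so migrating the lone CFS task pays off.
	 */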
8986 if ((env->idle != CPU_NOT_IDLE) &&
8987 (env->src_rq->cfs.h_nr_running == 1)) {
8988 if ((check_cpu_capacity(env->src_rq, sd)) &&
8989 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
8990 return 1;
8991 }
8992
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008993 return unlikely(sd->nr_balance_failed >
8994 sd->cache_nice_tries + NEED_ACTIVE_BALANCE_THRESHOLD);
Peter Zijlstra1af3ed32009-12-23 15:10:31 +01008995}
8996
Olav Haugand67250b2016-11-01 17:30:36 -07008997static int group_balance_cpu_not_isolated(struct sched_group *sg)
8998{
8999 cpumask_t cpus;
9000
9001 cpumask_and(&cpus, sched_group_cpus(sg), sched_group_mask(sg));
9002 cpumask_andnot(&cpus, &cpus, cpu_isolated_mask);
9003 return cpumask_first(&cpus);
9004}
9005
Tejun Heo969c7922010-05-06 18:49:21 +02009006static int active_load_balance_cpu_stop(void *data);
9007
Joonsoo Kim23f0d202013-08-06 17:36:42 +09009008static int should_we_balance(struct lb_env *env)
9009{
9010 struct sched_group *sg = env->sd->groups;
9011 struct cpumask *sg_cpus, *sg_mask;
9012 int cpu, balance_cpu = -1;
9013
9014 /*
9015	 * In the newly idle case, we will allow all the cpus
9016 * to do the newly idle load balance.
9017 */
9018 if (env->idle == CPU_NEWLY_IDLE)
9019 return 1;
9020
9021 sg_cpus = sched_group_cpus(sg);
9022 sg_mask = sched_group_mask(sg);
9023 /* Try to find first idle cpu */
9024 for_each_cpu_and(cpu, sg_cpus, env->cpus) {
Olav Haugand67250b2016-11-01 17:30:36 -07009025 if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu) ||
9026 cpu_isolated(cpu))
Joonsoo Kim23f0d202013-08-06 17:36:42 +09009027 continue;
9028
9029 balance_cpu = cpu;
9030 break;
9031 }
9032
9033 if (balance_cpu == -1)
Olav Haugand67250b2016-11-01 17:30:36 -07009034 balance_cpu = group_balance_cpu_not_isolated(sg);
Joonsoo Kim23f0d202013-08-06 17:36:42 +09009035
9036 /*
9037 * First idle cpu or the first cpu(busiest) in this sched group
9038 * is eligible for doing load balancing at this and above domains.
9039 */
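	/*
	 * For example, in a group {0,1,2,3} where cpu 2 is the first idle,
	 * non-isolated cpu, only the balance attempt with dst_cpu == 2
	 * proceeds; the other cpus return 0 and skip this domain.
	 */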
Joonsoo Kimb0cff9d2013-09-10 15:54:49 +09009040 return balance_cpu == env->dst_cpu;
Joonsoo Kim23f0d202013-08-06 17:36:42 +09009041}
9042
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009043/*
9044 * Check this_cpu to ensure it is balanced within domain. Attempt to move
9045 * tasks if there is an imbalance.
9046 */
9047static int load_balance(int this_cpu, struct rq *this_rq,
9048 struct sched_domain *sd, enum cpu_idle_type idle,
Joonsoo Kim23f0d202013-08-06 17:36:42 +09009049 int *continue_balancing)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009050{
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009051 int ld_moved = 0, cur_ld_moved, active_balance = 0;
Peter Zijlstra62633222013-08-19 12:41:09 +02009052 struct sched_domain *sd_parent = sd->parent;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009053 struct sched_group *group = NULL;
9054 struct rq *busiest = NULL;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009055 unsigned long flags;
Christoph Lameter4ba29682014-08-26 19:12:21 -05009056 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009057
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01009058 struct lb_env env = {
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07009059 .sd = sd,
9060 .dst_cpu = this_cpu,
9061 .dst_rq = this_rq,
9062 .dst_grpmask = sched_group_cpus(sd->groups),
9063 .idle = idle,
9064 .loop_break = sched_nr_migrate_break,
9065 .cpus = cpus,
9066 .fbq_type = all,
9067 .tasks = LIST_HEAD_INIT(env.tasks),
9068 .imbalance = 0,
9069 .flags = 0,
9070 .loop = 0,
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009071 .busiest_nr_running = 0,
9072 .busiest_grp_capacity = 0,
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07009073 .boost_policy = sched_boost_policy(),
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01009074 };
9075
Joonsoo Kimcfc03112013-04-23 17:27:39 +09009076 /*
9077 * For NEWLY_IDLE load_balancing, we don't need to consider
9078 * other cpus in our group
9079 */
Joonsoo Kime02e60c2013-04-23 17:27:42 +09009080 if (idle == CPU_NEWLY_IDLE)
Joonsoo Kimcfc03112013-04-23 17:27:39 +09009081 env.dst_grpmask = NULL;
Joonsoo Kimcfc03112013-04-23 17:27:39 +09009082
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009083 cpumask_copy(cpus, cpu_active_mask);
9084
Josh Poimboeufae928822016-06-17 12:43:24 -05009085 schedstat_inc(sd->lb_count[idle]);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009086
9087redo:
Joonsoo Kim23f0d202013-08-06 17:36:42 +09009088 if (!should_we_balance(&env)) {
9089 *continue_balancing = 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009090 goto out_balanced;
Joonsoo Kim23f0d202013-08-06 17:36:42 +09009091 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009092
Joonsoo Kim23f0d202013-08-06 17:36:42 +09009093 group = find_busiest_group(&env);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009094 if (!group) {
Josh Poimboeufae928822016-06-17 12:43:24 -05009095 schedstat_inc(sd->lb_nobusyg[idle]);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009096 goto out_balanced;
9097 }
9098
Michael Wangb94031302012-07-12 16:10:13 +08009099 busiest = find_busiest_queue(&env, group);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009100 if (!busiest) {
Josh Poimboeufae928822016-06-17 12:43:24 -05009101 schedstat_inc(sd->lb_nobusyq[idle]);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009102 goto out_balanced;
9103 }
9104
Michael Wang78feefc2012-08-06 16:41:59 +08009105 BUG_ON(busiest == env.dst_rq);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009106
Josh Poimboeufae928822016-06-17 12:43:24 -05009107 schedstat_add(sd->lb_imbalance[idle], env.imbalance);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009108
Vincent Guittot1aaf90a2015-02-27 16:54:14 +01009109 env.src_cpu = busiest->cpu;
9110 env.src_rq = busiest;
9111
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009112 ld_moved = 0;
9113 if (busiest->nr_running > 1) {
9114 /*
9115 * Attempt to move tasks. If find_busiest_group has found
9116 * an imbalance but busiest->nr_running <= 1, the group is
9117 * still unbalanced. ld_moved simply stays zero, so it is
9118 * correctly treated as an imbalance.
9119 */
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01009120 env.flags |= LBF_ALL_PINNED;
Peter Zijlstrac82513e2012-04-26 13:12:27 +02009121 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01009122
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01009123more_balance:
Kirill Tkhai163122b2014-08-20 13:48:29 +04009124 raw_spin_lock_irqsave(&busiest->lock, flags);
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05309125
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009126 /* The world might have changed. Validate assumptions */
9127 if (busiest->nr_running <= 1) {
9128 raw_spin_unlock_irqrestore(&busiest->lock, flags);
9129 env.flags &= ~LBF_ALL_PINNED;
9130 goto no_move;
9131 }
9132
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05309133 /*
9134 * cur_ld_moved - load moved in current iteration
9135 * ld_moved - cumulative load moved across iterations
9136 */
Kirill Tkhai163122b2014-08-20 13:48:29 +04009137 cur_ld_moved = detach_tasks(&env);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009138
9139 /*
Kirill Tkhai163122b2014-08-20 13:48:29 +04009140 * We've detached some tasks from busiest_rq. Every
9141 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
9142 * unlock busiest->lock, and we are able to be sure
9143 * that nobody can manipulate the tasks in parallel.
9144 * See task_rq_lock() family for the details.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009145 */
Kirill Tkhai163122b2014-08-20 13:48:29 +04009146
9147 raw_spin_unlock(&busiest->lock);
9148
9149 if (cur_ld_moved) {
9150 attach_tasks(&env);
9151 ld_moved += cur_ld_moved;
9152 }
9153
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009154 local_irq_restore(flags);
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05309155
Joonsoo Kimf1cd0852013-04-23 17:27:37 +09009156 if (env.flags & LBF_NEED_BREAK) {
9157 env.flags &= ~LBF_NEED_BREAK;
9158 goto more_balance;
9159 }
9160
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05309161 /*
9162 * Revisit (affine) tasks on src_cpu that couldn't be moved to
9163 * us and move them to an alternate dst_cpu in our sched_group
9164 * where they can run. The upper limit on how many times we
9165		 * iterate on the same src_cpu depends on the number of cpus in our
9166 * sched_group.
9167 *
9168 * This changes load balance semantics a bit on who can move
9169 * load to a given_cpu. In addition to the given_cpu itself
9170		 * (or an ilb_cpu acting on its behalf where given_cpu is
9171 * nohz-idle), we now have balance_cpu in a position to move
9172 * load to given_cpu. In rare situations, this may cause
9173 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
9174 * _independently_ and at _same_ time to move some load to
9175		 * given_cpu) causing excess load to be moved to given_cpu.
9176		 * This, however, should not happen often in practice, and
9177 * moreover subsequent load balance cycles should correct the
9178 * excess load moved.
9179 */
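		/*
		 * For instance, if the remaining tasks are pinned away from
		 * dst_cpu but allowed on another cpu of our group, we drop
		 * dst_cpu from env.cpus, retarget env.dst_cpu/dst_rq to
		 * env.new_dst_cpu and retry from more_balance with the same
		 * src_cpu.
		 */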
Peter Zijlstra62633222013-08-19 12:41:09 +02009180 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05309181
Vladimir Davydov7aff2e32013-09-15 21:30:13 +04009182			/* Prevent re-selecting dst_cpu via env's cpus */
9183 cpumask_clear_cpu(env.dst_cpu, env.cpus);
9184
Michael Wang78feefc2012-08-06 16:41:59 +08009185 env.dst_rq = cpu_rq(env.new_dst_cpu);
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05309186 env.dst_cpu = env.new_dst_cpu;
Peter Zijlstra62633222013-08-19 12:41:09 +02009187 env.flags &= ~LBF_DST_PINNED;
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05309188 env.loop = 0;
9189 env.loop_break = sched_nr_migrate_break;
Joonsoo Kime02e60c2013-04-23 17:27:42 +09009190
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05309191 /*
9192 * Go back to "more_balance" rather than "redo" since we
9193 * need to continue with same src_cpu.
9194 */
9195 goto more_balance;
9196 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009197
Peter Zijlstra62633222013-08-19 12:41:09 +02009198 /*
9199 * We failed to reach balance because of affinity.
9200 */
9201 if (sd_parent) {
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04009202 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
Peter Zijlstra62633222013-08-19 12:41:09 +02009203
Vincent Guittotafdeee02014-08-26 13:06:44 +02009204 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
Peter Zijlstra62633222013-08-19 12:41:09 +02009205 *group_imbalance = 1;
Peter Zijlstra62633222013-08-19 12:41:09 +02009206 }
9207
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009208 /* All tasks on this runqueue were pinned by CPU affinity */
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01009209 if (unlikely(env.flags & LBF_ALL_PINNED)) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009210 cpumask_clear_cpu(cpu_of(busiest), cpus);
Prashanth Nageshappabbf18b12012-06-19 17:52:07 +05309211 if (!cpumask_empty(cpus)) {
9212 env.loop = 0;
9213 env.loop_break = sched_nr_migrate_break;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009214 goto redo;
Prashanth Nageshappabbf18b12012-06-19 17:52:07 +05309215 }
Vincent Guittotafdeee02014-08-26 13:06:44 +02009216 goto out_all_pinned;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009217 }
9218 }
9219
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009220no_move:
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009221 if (!ld_moved) {
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07009222 if (!(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE))
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009223 schedstat_inc(sd->lb_failed[idle]);
Venkatesh Pallipadi58b26c42010-09-10 18:19:17 -07009224 /*
9225 * Increment the failure counter only on periodic balance.
9226 * We do not want newidle balance, which can be very
9227		 * frequent, to pollute the failure counter, causing
9228 * excessive cache_hot migrations and active balances.
9229 */
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009230 if (idle != CPU_NEWLY_IDLE &&
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07009231 !(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE))
Venkatesh Pallipadi58b26c42010-09-10 18:19:17 -07009232 sd->nr_balance_failed++;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009233
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009234 if (need_active_balance(&env)) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009235 raw_spin_lock_irqsave(&busiest->lock, flags);
9236
Tejun Heo969c7922010-05-06 18:49:21 +02009237			/*
9238			 * Don't kick active_load_balance_cpu_stop if the curr
9239			 * task on the busiest cpu can't be moved to this_cpu.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009240 */
9241 if (!cpumask_test_cpu(this_cpu,
Peter Zijlstrafa17b502011-06-16 12:23:22 +02009242 tsk_cpus_allowed(busiest->curr))) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009243 raw_spin_unlock_irqrestore(&busiest->lock,
9244 flags);
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01009245 env.flags |= LBF_ALL_PINNED;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009246 goto out_one_pinned;
9247 }
9248
Tejun Heo969c7922010-05-06 18:49:21 +02009249 /*
9250 * ->active_balance synchronizes accesses to
9251 * ->active_balance_work. Once set, it's cleared
9252 * only after active load balance is finished.
9253 */
Olav Haugand67250b2016-11-01 17:30:36 -07009254 if (!busiest->active_balance &&
9255 !cpu_isolated(cpu_of(busiest))) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009256 busiest->active_balance = 1;
9257 busiest->push_cpu = this_cpu;
9258 active_balance = 1;
9259 }
9260 raw_spin_unlock_irqrestore(&busiest->lock, flags);
Tejun Heo969c7922010-05-06 18:49:21 +02009261
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009262 if (active_balance) {
Tejun Heo969c7922010-05-06 18:49:21 +02009263 stop_one_cpu_nowait(cpu_of(busiest),
9264 active_load_balance_cpu_stop, busiest,
9265 &busiest->active_balance_work);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009266 *continue_balancing = 0;
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009267 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009268
Srikar Dronamrajud02c0712016-03-23 17:54:44 +05309269 /* We've kicked active balancing, force task migration. */
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009270 sd->nr_balance_failed = sd->cache_nice_tries +
9271 NEED_ACTIVE_BALANCE_THRESHOLD - 1;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009272 }
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009273 } else {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009274 sd->nr_balance_failed = 0;
9275
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009276 /* Assumes one 'busiest' cpu that we pulled tasks from */
9277 if (!same_freq_domain(this_cpu, cpu_of(busiest))) {
9278 int check_groups = !!(env.flags &
9279 LBF_MOVED_RELATED_THREAD_GROUP_TASK);
9280
9281 check_for_freq_change(this_rq, false, check_groups);
9282 check_for_freq_change(busiest, false, check_groups);
9283 } else {
9284 check_for_freq_change(this_rq, true, false);
9285 }
9286 }
9287
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009288 if (likely(!active_balance)) {
9289 /* We were unbalanced, so reset the balancing interval */
9290 sd->balance_interval = sd->min_interval;
9291 } else {
9292 /*
9293 * If we've begun active balancing, start to back off. This
9294 * case may not be covered by the all_pinned logic if there
9295 * is only 1 task on the busy runqueue (because we don't call
Kirill Tkhai163122b2014-08-20 13:48:29 +04009296 * detach_tasks).
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009297 */
9298 if (sd->balance_interval < sd->max_interval)
9299 sd->balance_interval *= 2;
9300 }
9301
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009302 goto out;
9303
9304out_balanced:
Vincent Guittotafdeee02014-08-26 13:06:44 +02009305 /*
9306 * We reach balance although we may have faced some affinity
9307 * constraints. Clear the imbalance flag if it was set.
9308 */
9309 if (sd_parent) {
9310 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
9311
9312 if (*group_imbalance)
9313 *group_imbalance = 0;
9314 }
9315
9316out_all_pinned:
9317 /*
9318 * We reach balance because all tasks are pinned at this level so
9319	 * we can't migrate them. Leave the imbalance flag set so the parent
9320	 * level can try to migrate them.
9321 */
Josh Poimboeufae928822016-06-17 12:43:24 -05009322 schedstat_inc(sd->lb_balanced[idle]);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009323
9324 sd->nr_balance_failed = 0;
9325
9326out_one_pinned:
9327 /* tune up the balancing interval */
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01009328 if (((env.flags & LBF_ALL_PINNED) &&
Peter Zijlstra5b54b562011-09-22 15:23:13 +02009329 sd->balance_interval < MAX_PINNED_INTERVAL) ||
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009330 (sd->balance_interval < sd->max_interval))
9331 sd->balance_interval *= 2;
9332
Venkatesh Pallipadi46e49b32011-02-14 14:38:50 -08009333 ld_moved = 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009334out:
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009335 trace_sched_load_balance(this_cpu, idle, *continue_balancing,
9336 group ? group->cpumask[0] : 0,
9337 busiest ? busiest->nr_running : 0,
9338 env.imbalance, env.flags, ld_moved,
9339 sd->balance_interval);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009340 return ld_moved;
9341}
9342
Jason Low52a08ef2014-05-08 17:49:22 -07009343static inline unsigned long
9344get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
9345{
9346 unsigned long interval = sd->balance_interval;
9347
9348 if (cpu_busy)
9349 interval *= sd->busy_factor;
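	/*
	 * Illustrative defaults: balance_interval = 8 (ms) and
	 * busy_factor = 32 give 256ms on a busy cpu before the value is
	 * converted to jiffies and clamped below.
	 */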
9350
9351 /* scale ms to jiffies */
9352 interval = msecs_to_jiffies(interval);
9353 interval = clamp(interval, 1UL, max_load_balance_interval);
9354
9355 return interval;
9356}
9357
9358static inline void
Leo Yan31851a92016-08-05 14:31:29 +08009359update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
Jason Low52a08ef2014-05-08 17:49:22 -07009360{
9361 unsigned long interval, next;
9362
Leo Yan31851a92016-08-05 14:31:29 +08009363 /* used by idle balance, so cpu_busy = 0 */
9364 interval = get_sd_balance_interval(sd, 0);
Jason Low52a08ef2014-05-08 17:49:22 -07009365 next = sd->last_balance + interval;
9366
9367 if (time_after(*next_balance, next))
9368 *next_balance = next;
9369}
9370
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009371/*
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009372 * idle_balance is called by schedule() if this_cpu is about to become
9373 * idle. Attempts to pull tasks from other CPUs.
9374 */
Peter Zijlstra6e831252014-02-11 16:11:48 +01009375static int idle_balance(struct rq *this_rq)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009376{
Jason Low52a08ef2014-05-08 17:49:22 -07009377 unsigned long next_balance = jiffies + HZ;
9378 int this_cpu = this_rq->cpu;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009379 struct sched_domain *sd;
9380 int pulled_task = 0;
Jason Low9bd721c2013-09-13 11:26:52 -07009381 u64 curr_cost = 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009382
Olav Haugan3f2cb302016-05-31 14:34:46 -07009383 if (cpu_isolated(this_cpu))
9384 return 0;
9385
Peter Zijlstra6e831252014-02-11 16:11:48 +01009386 /*
9387 * We must set idle_stamp _before_ calling idle_balance(), such that we
9388 * measure the duration of idle_balance() as idle time.
9389 */
9390 this_rq->idle_stamp = rq_clock(this_rq);
9391
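	/*
	 * Bail out early if the average idle period is shorter than the
	 * migration cost, or if no runqueue in the root domain is
	 * overloaded: newidle balancing would only add latency here, so
	 * just refresh next_balance from this_rq->sd and return.
	 */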
Tim Chen4486edd2014-06-23 12:16:49 -07009392 if (this_rq->avg_idle < sysctl_sched_migration_cost ||
9393 !this_rq->rd->overload) {
Jason Low52a08ef2014-05-08 17:49:22 -07009394 rcu_read_lock();
9395 sd = rcu_dereference_check_sched_domain(this_rq->sd);
9396 if (sd)
Leo Yan31851a92016-08-05 14:31:29 +08009397 update_next_balance(sd, &next_balance);
Jason Low52a08ef2014-05-08 17:49:22 -07009398 rcu_read_unlock();
9399
Peter Zijlstra6e831252014-02-11 16:11:48 +01009400 goto out;
Jason Low52a08ef2014-05-08 17:49:22 -07009401 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009402
Peter Zijlstraf492e122009-12-23 15:29:42 +01009403 raw_spin_unlock(&this_rq->lock);
9404
Paul Turner48a16752012-10-04 13:18:31 +02009405 update_blocked_averages(this_cpu);
Peter Zijlstradce840a2011-04-07 14:09:50 +02009406 rcu_read_lock();
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009407 for_each_domain(this_cpu, sd) {
Joonsoo Kim23f0d202013-08-06 17:36:42 +09009408 int continue_balancing = 1;
Jason Low9bd721c2013-09-13 11:26:52 -07009409 u64 t0, domain_cost;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009410
9411 if (!(sd->flags & SD_LOAD_BALANCE))
9412 continue;
9413
Jason Low52a08ef2014-05-08 17:49:22 -07009414 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
Leo Yan31851a92016-08-05 14:31:29 +08009415 update_next_balance(sd, &next_balance);
Jason Low9bd721c2013-09-13 11:26:52 -07009416 break;
Jason Low52a08ef2014-05-08 17:49:22 -07009417 }
Jason Low9bd721c2013-09-13 11:26:52 -07009418
Peter Zijlstraf492e122009-12-23 15:29:42 +01009419 if (sd->flags & SD_BALANCE_NEWIDLE) {
Jason Low9bd721c2013-09-13 11:26:52 -07009420 t0 = sched_clock_cpu(this_cpu);
9421
Peter Zijlstraf492e122009-12-23 15:29:42 +01009422 pulled_task = load_balance(this_cpu, this_rq,
Joonsoo Kim23f0d202013-08-06 17:36:42 +09009423 sd, CPU_NEWLY_IDLE,
9424 &continue_balancing);
Jason Low9bd721c2013-09-13 11:26:52 -07009425
9426 domain_cost = sched_clock_cpu(this_cpu) - t0;
9427 if (domain_cost > sd->max_newidle_lb_cost)
9428 sd->max_newidle_lb_cost = domain_cost;
9429
9430 curr_cost += domain_cost;
Peter Zijlstraf492e122009-12-23 15:29:42 +01009431 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009432
Leo Yan31851a92016-08-05 14:31:29 +08009433 update_next_balance(sd, &next_balance);
Jason Low39a4d9c2014-04-23 18:30:35 -07009434
9435 /*
9436 * Stop searching for tasks to pull if there are
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009437 * now runnable tasks on the balance rq or if
9438 * continue_balancing has been unset (only possible
9439 * due to active migration).
Jason Low39a4d9c2014-04-23 18:30:35 -07009440 */
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009441 if (pulled_task || this_rq->nr_running > 0 ||
9442 !continue_balancing)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009443 break;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009444 }
Peter Zijlstradce840a2011-04-07 14:09:50 +02009445 rcu_read_unlock();
Peter Zijlstraf492e122009-12-23 15:29:42 +01009446
9447 raw_spin_lock(&this_rq->lock);
9448
Jason Low0e5b5332014-04-28 15:45:54 -07009449 if (curr_cost > this_rq->max_idle_balance_cost)
9450 this_rq->max_idle_balance_cost = curr_cost;
9451
Daniel Lezcanoe5fc6612014-01-17 10:04:02 +01009452 /*
Jason Low0e5b5332014-04-28 15:45:54 -07009453	 * While browsing the domains, we released the rq lock; a task could
9454 * have been enqueued in the meantime. Since we're not going idle,
9455 * pretend we pulled a task.
Daniel Lezcanoe5fc6612014-01-17 10:04:02 +01009456 */
Jason Low0e5b5332014-04-28 15:45:54 -07009457 if (this_rq->cfs.h_nr_running && !pulled_task)
Peter Zijlstra6e831252014-02-11 16:11:48 +01009458 pulled_task = 1;
Daniel Lezcanoe5fc6612014-01-17 10:04:02 +01009459
Peter Zijlstra6e831252014-02-11 16:11:48 +01009460out:
Jason Low52a08ef2014-05-08 17:49:22 -07009461 /* Move the next balance forward */
9462 if (time_after(this_rq->next_balance, next_balance))
9463 this_rq->next_balance = next_balance;
9464
Kirill Tkhaie4aa3582014-03-06 13:31:55 +04009465 /* Is there a task of a high priority class? */
Kirill Tkhai46383642014-03-15 02:15:07 +04009466 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
Kirill Tkhaie4aa3582014-03-06 13:31:55 +04009467 pulled_task = -1;
9468
Dietmar Eggemann38c6ade2015-10-20 13:04:41 +01009469 if (pulled_task)
Peter Zijlstra6e831252014-02-11 16:11:48 +01009470 this_rq->idle_stamp = 0;
9471
Daniel Lezcano3c4017c2014-01-17 10:04:03 +01009472 return pulled_task;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009473}
9474
9475/*
Tejun Heo969c7922010-05-06 18:49:21 +02009476 * active_load_balance_cpu_stop is run by the cpu stopper. It pushes
9477 * running tasks off the busiest CPU onto idle CPUs. It requires at
9478 * least 1 task to be running on each physical CPU where possible, and
9479 * avoids physical / logical imbalances.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009480 */
Tejun Heo969c7922010-05-06 18:49:21 +02009481static int active_load_balance_cpu_stop(void *data)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009482{
Tejun Heo969c7922010-05-06 18:49:21 +02009483 struct rq *busiest_rq = data;
9484 int busiest_cpu = cpu_of(busiest_rq);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009485 int target_cpu = busiest_rq->push_cpu;
Tejun Heo969c7922010-05-06 18:49:21 +02009486 struct rq *target_rq = cpu_rq(target_cpu);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009487 struct sched_domain *sd = NULL;
Kirill Tkhaie5673f22014-08-20 13:48:01 +04009488 struct task_struct *p = NULL;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009489 struct task_struct *push_task;
9490 int push_task_detached = 0;
9491 struct lb_env env = {
9492 .sd = sd,
9493 .dst_cpu = target_cpu,
9494 .dst_rq = target_rq,
9495 .src_cpu = busiest_rq->cpu,
9496 .src_rq = busiest_rq,
9497 .idle = CPU_IDLE,
9498 .busiest_nr_running = 0,
9499 .busiest_grp_capacity = 0,
9500 .flags = 0,
9501 .loop = 0,
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07009502 .boost_policy = sched_boost_policy(),
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009503 };
9504 bool moved = false;
Tejun Heo969c7922010-05-06 18:49:21 +02009505
9506 raw_spin_lock_irq(&busiest_rq->lock);
9507
9508 /* make sure the requested cpu hasn't gone down in the meantime */
9509 if (unlikely(busiest_cpu != smp_processor_id() ||
9510 !busiest_rq->active_balance))
9511 goto out_unlock;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009512
9513 /* Is there any task to move? */
9514 if (busiest_rq->nr_running <= 1)
Tejun Heo969c7922010-05-06 18:49:21 +02009515 goto out_unlock;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009516
9517 /*
9518 * This condition is "impossible", if it occurs
9519 * we need to fix it. Originally reported by
9520 * Bjorn Helgaas on a 128-cpu setup.
9521 */
9522 BUG_ON(busiest_rq == target_rq);
9523
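	/*
	 * Targeted push: if a specific task was nominated via
	 * busiest_rq->push_task, detach just that task (provided it is
	 * still queued, running on this cpu and the target cpu is online)
	 * and skip the generic sched-domain search below.
	 */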
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009524 push_task = busiest_rq->push_task;
9525 target_cpu = busiest_rq->push_cpu;
9526 if (push_task) {
9527 if (task_on_rq_queued(push_task) &&
9528 push_task->state == TASK_RUNNING &&
9529 task_cpu(push_task) == busiest_cpu &&
9530 cpu_online(target_cpu)) {
9531 detach_task(push_task, &env);
9532 push_task_detached = 1;
9533 moved = true;
9534 }
9535 goto out_unlock;
9536 }
9537
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009538 /* Search for an sd spanning us and the target CPU. */
Peter Zijlstradce840a2011-04-07 14:09:50 +02009539 rcu_read_lock();
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009540 for_each_domain(target_cpu, sd) {
9541 if ((sd->flags & SD_LOAD_BALANCE) &&
9542 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
9543 break;
9544 }
9545
9546 if (likely(sd)) {
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009547 env.sd = sd;
Josh Poimboeufae928822016-06-17 12:43:24 -05009548 schedstat_inc(sd->alb_count);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009549
Kirill Tkhaie5673f22014-08-20 13:48:01 +04009550 p = detach_one_task(&env);
Srikar Dronamrajud02c0712016-03-23 17:54:44 +05309551 if (p) {
Josh Poimboeufae928822016-06-17 12:43:24 -05009552 schedstat_inc(sd->alb_pushed);
Srikar Dronamrajud02c0712016-03-23 17:54:44 +05309553 /* Active balancing done, reset the failure counter. */
9554 sd->nr_balance_failed = 0;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009555 moved = true;
Srikar Dronamrajud02c0712016-03-23 17:54:44 +05309556 } else {
Josh Poimboeufae928822016-06-17 12:43:24 -05009557 schedstat_inc(sd->alb_failed);
Srikar Dronamrajud02c0712016-03-23 17:54:44 +05309558 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009559 }
Peter Zijlstradce840a2011-04-07 14:09:50 +02009560 rcu_read_unlock();
Tejun Heo969c7922010-05-06 18:49:21 +02009561out_unlock:
9562 busiest_rq->active_balance = 0;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009563 push_task = busiest_rq->push_task;
9564 target_cpu = busiest_rq->push_cpu;
9565
9566 if (push_task)
9567 busiest_rq->push_task = NULL;
9568
Kirill Tkhaie5673f22014-08-20 13:48:01 +04009569 raw_spin_unlock(&busiest_rq->lock);
9570
Syed Rameez Mustafaebc437b2016-12-13 15:57:19 -08009571 if (push_task) {
9572 if (push_task_detached)
9573 attach_one_task(target_rq, push_task);
9574 put_task_struct(push_task);
9575 clear_reserved(target_cpu);
9576 }
9577
Kirill Tkhaie5673f22014-08-20 13:48:01 +04009578 if (p)
9579 attach_one_task(target_rq, p);
9580
9581 local_irq_enable();
9582
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009583 if (moved && !same_freq_domain(busiest_cpu, target_cpu)) {
9584 int check_groups = !!(env.flags &
9585 LBF_MOVED_RELATED_THREAD_GROUP_TASK);
9586 check_for_freq_change(busiest_rq, false, check_groups);
9587 check_for_freq_change(target_rq, false, check_groups);
9588 } else if (moved) {
9589 check_for_freq_change(target_rq, true, false);
9590 }
9591
Tejun Heo969c7922010-05-06 18:49:21 +02009592 return 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009593}
9594
Mike Galbraithd987fc72011-12-05 10:01:47 +01009595static inline int on_null_domain(struct rq *rq)
9596{
9597 return unlikely(!rcu_dereference_sched(rq->sd));
9598}
9599
Frederic Weisbecker3451d022011-08-10 23:21:01 +02009600#ifdef CONFIG_NO_HZ_COMMON
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009601/*
9602 * idle load balancing details
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009603 * - When one of the busy CPUs notices that idle rebalancing may be
9604 *   needed, it kicks the idle load balancer, which then does idle
9605 * load balancing for all the idle CPUs.
9606 */
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009607static struct {
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009608 cpumask_var_t idle_cpus_mask;
Suresh Siddha0b005cf2011-12-01 17:07:34 -08009609 atomic_t nr_cpus;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009610 unsigned long next_balance; /* in jiffy units */
9611} nohz ____cacheline_aligned;
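/*
 * idle_cpus_mask tracks cpus that have entered tickless idle (see
 * nohz_balance_enter_idle()), nr_cpus caches its weight so the kick
 * path can bail out cheaply, and next_balance is the earliest jiffy
 * at which an idle rebalance is due.
 */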
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009612
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009613#ifdef CONFIG_SCHED_HMP
9614static inline int find_new_hmp_ilb(int type)
9615{
9616 int call_cpu = raw_smp_processor_id();
9617 struct sched_domain *sd;
9618 int ilb;
9619
9620 rcu_read_lock();
9621
9622 /* Pick an idle cpu "closest" to call_cpu */
9623 for_each_domain(call_cpu, sd) {
9624 for_each_cpu_and(ilb, nohz.idle_cpus_mask,
9625 sched_domain_span(sd)) {
9626 if (idle_cpu(ilb) && (type != NOHZ_KICK_RESTRICT ||
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009627 cpu_max_power_cost(ilb) <=
9628 cpu_max_power_cost(call_cpu))) {
9629 rcu_read_unlock();
9630 reset_balance_interval(ilb);
9631 return ilb;
9632 }
9633 }
9634 }
9635
9636 rcu_read_unlock();
9637 return nr_cpu_ids;
9638}
9639#endif /* CONFIG_SCHED_HMP */
9640
9641static inline int find_new_ilb(int type)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009642{
Suresh Siddha0b005cf2011-12-01 17:07:34 -08009643 int ilb = cpumask_first(nohz.idle_cpus_mask);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009644
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009645#ifdef CONFIG_SCHED_HMP
9646 return find_new_hmp_ilb(type);
9647#endif
9648
Suresh Siddha786d6dc72011-12-01 17:07:35 -08009649 if (ilb < nr_cpu_ids && idle_cpu(ilb))
9650 return ilb;
9651
9652 return nr_cpu_ids;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009653}
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009654
9655/*
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009656 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
9657 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
9658 * CPU (if there is one).
9659 */
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009660static void nohz_balancer_kick(int type)
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009661{
9662 int ilb_cpu;
9663
9664 nohz.next_balance++;
9665
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009666 ilb_cpu = find_new_ilb(type);
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009667
Suresh Siddha0b005cf2011-12-01 17:07:34 -08009668 if (ilb_cpu >= nr_cpu_ids)
9669 return;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009670
Suresh Siddhacd490c52011-12-06 11:26:34 -08009671 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
Suresh Siddha1c792db2011-12-01 17:07:32 -08009672 return;
9673 /*
9674 * Use smp_send_reschedule() instead of resched_cpu().
9675	 * This way we generate a sched IPI on the target cpu, which
9676	 * is idle, and the softirq performing nohz idle load balancing
9677	 * will run before returning from the IPI.
9678 */
9679 smp_send_reschedule(ilb_cpu);
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009680 return;
9681}
9682
Olav Haugan3f2cb302016-05-31 14:34:46 -07009683void nohz_balance_clear_nohz_mask(int cpu)
9684{
9685 if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
9686 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
9687 atomic_dec(&nohz.nr_cpus);
9688 }
9689}
9690
Thomas Gleixner20a5c8c2016-03-10 12:54:20 +01009691void nohz_balance_exit_idle(unsigned int cpu)
Suresh Siddha71325962012-01-19 18:28:57 -08009692{
9693 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
Mike Galbraithd987fc72011-12-05 10:01:47 +01009694 /*
9695		 * Completely isolated CPUs never set NOHZ_TICK_STOPPED, so we must test.
9696 */
Olav Haugan3f2cb302016-05-31 14:34:46 -07009697 nohz_balance_clear_nohz_mask(cpu);
Suresh Siddha71325962012-01-19 18:28:57 -08009698 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
9699 }
9700}
9701
Suresh Siddha69e1e812011-12-01 17:07:33 -08009702static inline void set_cpu_sd_state_busy(void)
9703{
9704 struct sched_domain *sd;
Preeti U Murthy37dc6b52013-10-30 08:42:52 +05309705 int cpu = smp_processor_id();
Suresh Siddha69e1e812011-12-01 17:07:33 -08009706
Suresh Siddha69e1e812011-12-01 17:07:33 -08009707 rcu_read_lock();
Peter Zijlstra0e369d72016-05-09 10:38:01 +02009708 sd = rcu_dereference(per_cpu(sd_llc, cpu));
Vincent Guittot25f55d92013-04-23 16:59:02 +02009709
9710 if (!sd || !sd->nohz_idle)
9711 goto unlock;
9712 sd->nohz_idle = 0;
9713
Peter Zijlstra0e369d72016-05-09 10:38:01 +02009714 atomic_inc(&sd->shared->nr_busy_cpus);
Vincent Guittot25f55d92013-04-23 16:59:02 +02009715unlock:
Suresh Siddha69e1e812011-12-01 17:07:33 -08009716 rcu_read_unlock();
9717}
9718
9719void set_cpu_sd_state_idle(void)
9720{
9721 struct sched_domain *sd;
Preeti U Murthy37dc6b52013-10-30 08:42:52 +05309722 int cpu = smp_processor_id();
Suresh Siddha69e1e812011-12-01 17:07:33 -08009723
Suresh Siddha69e1e812011-12-01 17:07:33 -08009724 rcu_read_lock();
Peter Zijlstra0e369d72016-05-09 10:38:01 +02009725 sd = rcu_dereference(per_cpu(sd_llc, cpu));
Vincent Guittot25f55d92013-04-23 16:59:02 +02009726
9727 if (!sd || sd->nohz_idle)
9728 goto unlock;
9729 sd->nohz_idle = 1;
9730
Peter Zijlstra0e369d72016-05-09 10:38:01 +02009731 atomic_dec(&sd->shared->nr_busy_cpus);
Vincent Guittot25f55d92013-04-23 16:59:02 +02009732unlock:
Suresh Siddha69e1e812011-12-01 17:07:33 -08009733 rcu_read_unlock();
9734}
9735
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009736/*
Alex Shic1cc0172012-09-10 15:10:58 +08009737 * This routine will record that the cpu is going idle with tick stopped.
Suresh Siddha0b005cf2011-12-01 17:07:34 -08009738 * This info will be used in performing idle load balancing in the future.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009739 */
Alex Shic1cc0172012-09-10 15:10:58 +08009740void nohz_balance_enter_idle(int cpu)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009741{
Suresh Siddha71325962012-01-19 18:28:57 -08009742 /*
9743 * If this cpu is going down, then nothing needs to be done.
9744 */
9745 if (!cpu_active(cpu))
9746 return;
9747
Alex Shic1cc0172012-09-10 15:10:58 +08009748 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
9749 return;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009750
Mike Galbraithd987fc72011-12-05 10:01:47 +01009751 /*
9752 * If we're a completely isolated CPU, we don't play.
9753 */
Olav Haugan3f2cb302016-05-31 14:34:46 -07009754 if (on_null_domain(cpu_rq(cpu)) || cpu_isolated(cpu))
Mike Galbraithd987fc72011-12-05 10:01:47 +01009755 return;
9756
Alex Shic1cc0172012-09-10 15:10:58 +08009757 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
9758 atomic_inc(&nohz.nr_cpus);
9759 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009760}
9761#endif
9762
9763static DEFINE_SPINLOCK(balancing);
9764
Peter Zijlstra49c022e2011-04-05 10:14:25 +02009765/*
9766 * Scale the max load_balance interval with the number of CPUs in the system.
9767 * This trades load-balance latency on larger machines for less cross talk.
9768 */
Peter Zijlstra029632f2011-10-25 10:00:11 +02009769void update_max_interval(void)
Peter Zijlstra49c022e2011-04-05 10:14:25 +02009770{
Olav Haugan3f2cb302016-05-31 14:34:46 -07009771 cpumask_t avail_mask;
9772 unsigned int available_cpus;
9773
9774 cpumask_andnot(&avail_mask, cpu_online_mask, cpu_isolated_mask);
9775 available_cpus = cpumask_weight(&avail_mask);
9776
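	/*
	 * For example (assuming HZ=100): 8 online, non-isolated cpus give
	 * max_load_balance_interval = 100 * 8 / 10 = 80 jiffies (800ms).
	 */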
9777 max_load_balance_interval = HZ*available_cpus/10;
Peter Zijlstra49c022e2011-04-05 10:14:25 +02009778}
9779
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009780/*
9781 * It checks each scheduling domain to see if it is due to be balanced,
9782 * and initiates a balancing operation if so.
9783 *
Libinb9b08532013-04-01 19:14:01 +08009784 * Balancing parameters are set up in init_sched_domains.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009785 */
Daniel Lezcanof7ed0a82014-01-06 12:34:43 +01009786static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009787{
Joonsoo Kim23f0d202013-08-06 17:36:42 +09009788 int continue_balancing = 1;
Daniel Lezcanof7ed0a82014-01-06 12:34:43 +01009789 int cpu = rq->cpu;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009790 unsigned long interval;
Peter Zijlstra04f733b2012-05-11 00:12:02 +02009791 struct sched_domain *sd;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009792 /* Earliest time when we have to do rebalance again */
9793 unsigned long next_balance = jiffies + 60*HZ;
9794 int update_next_balance = 0;
Jason Lowf48627e2013-09-13 11:26:53 -07009795 int need_serialize, need_decay = 0;
9796 u64 max_cost = 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009797
Paul Turner48a16752012-10-04 13:18:31 +02009798 update_blocked_averages(cpu);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08009799
Peter Zijlstradce840a2011-04-07 14:09:50 +02009800 rcu_read_lock();
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009801 for_each_domain(cpu, sd) {
Jason Lowf48627e2013-09-13 11:26:53 -07009802 /*
9803 * Decay the newidle max times here because this is a regular
9804 * visit to all the domains. Decay ~1% per second.
9805 */
9806 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
9807 sd->max_newidle_lb_cost =
9808 (sd->max_newidle_lb_cost * 253) / 256;
9809 sd->next_decay_max_lb_cost = jiffies + HZ;
9810 need_decay = 1;
9811 }
9812 max_cost += sd->max_newidle_lb_cost;
9813
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009814 if (!(sd->flags & SD_LOAD_BALANCE))
9815 continue;
9816
Jason Lowf48627e2013-09-13 11:26:53 -07009817 /*
9818 * Stop the load balance at this level. There is another
9819 * CPU in our sched group which is doing load balancing more
9820 * actively.
9821 */
9822 if (!continue_balancing) {
9823 if (need_decay)
9824 continue;
9825 break;
9826 }
9827
Jason Low52a08ef2014-05-08 17:49:22 -07009828 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009829
9830 need_serialize = sd->flags & SD_SERIALIZE;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009831 if (need_serialize) {
9832 if (!spin_trylock(&balancing))
9833 goto out;
9834 }
9835
9836 if (time_after_eq(jiffies, sd->last_balance + interval)) {
Joonsoo Kim23f0d202013-08-06 17:36:42 +09009837 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009838 /*
Peter Zijlstra62633222013-08-19 12:41:09 +02009839 * The LBF_DST_PINNED logic could have changed
Joonsoo Kimde5eb2d2013-04-23 17:27:38 +09009840 * env->dst_cpu, so we can't know our idle
9841 * state even if we migrated tasks. Update it.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009842 */
Joonsoo Kimde5eb2d2013-04-23 17:27:38 +09009843 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009844 }
9845 sd->last_balance = jiffies;
Jason Low52a08ef2014-05-08 17:49:22 -07009846 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009847 }
9848 if (need_serialize)
9849 spin_unlock(&balancing);
9850out:
9851 if (time_after(next_balance, sd->last_balance + interval)) {
9852 next_balance = sd->last_balance + interval;
9853 update_next_balance = 1;
9854 }
Jason Lowf48627e2013-09-13 11:26:53 -07009855 }
9856 if (need_decay) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009857 /*
Jason Lowf48627e2013-09-13 11:26:53 -07009858 * Ensure the rq-wide value also decays but keep it at a
9859 * reasonable floor to avoid funnies with rq->avg_idle.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009860 */
Jason Lowf48627e2013-09-13 11:26:53 -07009861 rq->max_idle_balance_cost =
9862 max((u64)sysctl_sched_migration_cost, max_cost);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009863 }
Peter Zijlstradce840a2011-04-07 14:09:50 +02009864 rcu_read_unlock();
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009865
9866 /*
9867 * next_balance will be updated only when there is a need.
9868	 * When the cpu is attached to a null domain, for example, it will not be
9869 * updated.
9870 */
Vincent Guittotc5afb6a2015-08-03 11:55:50 +02009871 if (likely(update_next_balance)) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009872 rq->next_balance = next_balance;
Vincent Guittotc5afb6a2015-08-03 11:55:50 +02009873
9874#ifdef CONFIG_NO_HZ_COMMON
9875 /*
9876 * If this CPU has been elected to perform the nohz idle
9877	 * balance, the other idle CPUs have already rebalanced with
9878 * nohz_idle_balance() and nohz.next_balance has been
9879 * updated accordingly. This CPU is now running the idle load
9880 * balance for itself and we need to update the
9881 * nohz.next_balance accordingly.
9882 */
9883 if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
9884 nohz.next_balance = rq->next_balance;
9885#endif
9886 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009887}
9888
Frederic Weisbecker3451d022011-08-10 23:21:01 +02009889#ifdef CONFIG_NO_HZ_COMMON
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009890/*
Frederic Weisbecker3451d022011-08-10 23:21:01 +02009891 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009892 * rebalancing for all the cpus for whom scheduler ticks are stopped.
9893 */
Daniel Lezcano208cb162014-01-06 12:34:44 +01009894static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009895{
Daniel Lezcano208cb162014-01-06 12:34:44 +01009896 int this_cpu = this_rq->cpu;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009897 struct rq *rq;
9898 int balance_cpu;
Vincent Guittotc5afb6a2015-08-03 11:55:50 +02009899 /* Earliest time when we have to do rebalance again */
9900 unsigned long next_balance = jiffies + 60*HZ;
9901 int update_next_balance = 0;
Olav Haugand67250b2016-11-01 17:30:36 -07009902 cpumask_t cpus;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009903
Suresh Siddha1c792db2011-12-01 17:07:32 -08009904 if (idle != CPU_IDLE ||
9905 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
9906 goto end;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009907
Olav Haugand67250b2016-11-01 17:30:36 -07009908 cpumask_andnot(&cpus, nohz.idle_cpus_mask, cpu_isolated_mask);
9909
9910 for_each_cpu(balance_cpu, &cpus) {
Suresh Siddha8a6d42d2011-12-06 11:19:37 -08009911 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009912 continue;
9913
9914 /*
9915 * If this cpu gets work to do, stop the load balancing
9916 * work being done for other cpus. Next load
9917 * balancing owner will pick it up.
9918 */
Suresh Siddha1c792db2011-12-01 17:07:32 -08009919 if (need_resched())
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009920 break;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009921
Vincent Guittot5ed4f1d2012-09-13 06:11:26 +02009922 rq = cpu_rq(balance_cpu);
9923
Tim Chened61bbc2014-05-20 14:39:27 -07009924 /*
9925 * If time for next balance is due,
9926 * do the balance.
9927 */
9928 if (time_after_eq(jiffies, rq->next_balance)) {
9929 raw_spin_lock_irq(&rq->lock);
9930 update_rq_clock(rq);
Frederic Weisbeckercee1afc2016-04-13 15:56:50 +02009931 cpu_load_update_idle(rq);
Tim Chened61bbc2014-05-20 14:39:27 -07009932 raw_spin_unlock_irq(&rq->lock);
9933 rebalance_domains(rq, CPU_IDLE);
9934 }
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009935
Vincent Guittotc5afb6a2015-08-03 11:55:50 +02009936 if (time_after(next_balance, rq->next_balance)) {
9937 next_balance = rq->next_balance;
9938 update_next_balance = 1;
9939 }
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009940 }
Vincent Guittotc5afb6a2015-08-03 11:55:50 +02009941
9942 /*
9943 * next_balance will be updated only when there is a need.
9944	 * When the CPU is attached to a null domain, for example, it will not be
9945 * updated.
9946 */
9947 if (likely(update_next_balance))
9948 nohz.next_balance = next_balance;
Suresh Siddha1c792db2011-12-01 17:07:32 -08009949end:
9950 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009951}
9952
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009953#ifdef CONFIG_SCHED_HMP
9954static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
9955{
9956 struct sched_domain *sd;
9957 int i;
9958
9959 if (rq->nr_running < 2)
9960 return 0;
9961
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07009962 if (!sysctl_sched_restrict_cluster_spill ||
9963 sched_boost_policy() == SCHED_BOOST_ON_ALL)
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009964 return 1;
9965
Pavankumar Kondeti7b0a1442016-04-13 15:13:56 +05309966 if (cpu_max_power_cost(cpu) == max_power_cost)
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07009967 return 1;
9968
9969 rcu_read_lock();
9970 sd = rcu_dereference_check_sched_domain(rq->sd);
9971 if (!sd) {
9972 rcu_read_unlock();
9973 return 0;
9974 }
9975
9976 for_each_cpu(i, sched_domain_span(sd)) {
9977 if (cpu_load(i) < sched_spill_load &&
9978 cpu_rq(i)->nr_running <
9979 sysctl_sched_spill_nr_run) {
9980			/*
9981			 * Change the kick type to limit to CPUs of equal or lower capacity.
9982			 */
9983 *type = NOHZ_KICK_RESTRICT;
9984 break;
9985 }
9986 }
9987 rcu_read_unlock();
9988 return 1;
9989}
9990#endif /* CONFIG_SCHED_HMP */
9991
9992static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type)
9993{
9994 unsigned long now = jiffies;
9995
9996 /*
9997 * None are in tickless mode and hence no need for NOHZ idle load
9998 * balancing.
9999 */
10000 if (likely(!atomic_read(&nohz.nr_cpus)))
10001 return 0;
10002
10003#ifdef CONFIG_SCHED_HMP
10004 return _nohz_kick_needed_hmp(rq, cpu, type);
10005#endif
10006
10007 if (time_before(now, nohz.next_balance))
10008 return 0;
10009
10010 return (rq->nr_running >= 2);
10011}
10012
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010013/*
Suresh Siddha0b005cf2011-12-01 17:07:34 -080010014 * Current heuristic for kicking the idle load balancer in the presence
Vincent Guittot1aaf90a2015-02-27 16:54:14 +010010015 * of an idle cpu in the system.
Suresh Siddha0b005cf2011-12-01 17:07:34 -080010016 * - This rq has more than one task.
Vincent Guittot1aaf90a2015-02-27 16:54:14 +010010017 * - This rq has at least one CFS task and the capacity of the CPU is
10018 * significantly reduced because of RT tasks or IRQs.
10019 * - At the parent of the LLC scheduler domain level, this cpu's scheduler
10020 *   group has multiple busy cpus.
Suresh Siddha0b005cf2011-12-01 17:07:34 -080010021 * - For SD_ASYM_PACKING, if the lower numbered cpus in the scheduler
10022 * domain span are idle.
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010023 */
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010024static inline bool nohz_kick_needed(struct rq *rq, int *type)
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010025{
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010026#ifndef CONFIG_SCHED_HMP
Peter Zijlstra0e369d72016-05-09 10:38:01 +020010027 struct sched_domain_shared *sds;
Suresh Siddha0b005cf2011-12-01 17:07:34 -080010028 struct sched_domain *sd;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010029 int nr_busy;
10030#endif
10031 int cpu = rq->cpu;
Vincent Guittot1aaf90a2015-02-27 16:54:14 +010010032 bool kick = false;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010033
Daniel Lezcano4a725622014-01-06 12:34:39 +010010034 if (unlikely(rq->idle_balance))
Vincent Guittot1aaf90a2015-02-27 16:54:14 +010010035 return false;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010036
Suresh Siddha1c792db2011-12-01 17:07:32 -080010037 /*
10038	 * We may recently have been in ticked or tickless idle mode. At the first
10039 * busy tick after returning from idle, we will update the busy stats.
10040 */
Suresh Siddha69e1e812011-12-01 17:07:33 -080010041 set_cpu_sd_state_busy();
Alex Shic1cc0172012-09-10 15:10:58 +080010042 nohz_balance_exit_idle(cpu);
Suresh Siddha0b005cf2011-12-01 17:07:34 -080010043
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010044 if (_nohz_kick_needed(rq, cpu, type))
Vincent Guittot1aaf90a2015-02-27 16:54:14 +010010045 return true;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010046
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010047#ifndef CONFIG_SCHED_HMP
Peter Zijlstra067491b2011-12-07 14:32:08 +010010048 rcu_read_lock();
Peter Zijlstra0e369d72016-05-09 10:38:01 +020010049 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
10050 if (sds) {
10051 /*
10052 * XXX: write a coherent comment on why we do this.
10053 * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
10054 */
10055 nr_busy = atomic_read(&sds->nr_busy_cpus);
Vincent Guittot1aaf90a2015-02-27 16:54:14 +010010056 if (nr_busy > 1) {
10057 kick = true;
10058 goto unlock;
10059 }
10060
10061 }
10062
10063 sd = rcu_dereference(rq->sd);
10064 if (sd) {
10065 if ((rq->cfs.h_nr_running >= 1) &&
10066 check_cpu_capacity(rq, sd)) {
10067 kick = true;
10068 goto unlock;
10069 }
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010070 }
Preeti U Murthy37dc6b52013-10-30 08:42:52 +053010071
10072 sd = rcu_dereference(per_cpu(sd_asym, cpu));
Preeti U Murthy37dc6b52013-10-30 08:42:52 +053010073 if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
Vincent Guittot1aaf90a2015-02-27 16:54:14 +010010074 sched_domain_span(sd)) < cpu)) {
10075 kick = true;
10076 goto unlock;
10077 }
Preeti U Murthy37dc6b52013-10-30 08:42:52 +053010078
Vincent Guittot1aaf90a2015-02-27 16:54:14 +010010079unlock:
Peter Zijlstra067491b2011-12-07 14:32:08 +010010080 rcu_read_unlock();
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010081#endif
Vincent Guittot1aaf90a2015-02-27 16:54:14 +010010082 return kick;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010083}
10084#else
Daniel Lezcano208cb162014-01-06 12:34:44 +010010085static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010086#endif
10087
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010088/*
10089 * run_rebalance_domains is triggered when needed from the scheduler tick.
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010090 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010091 */
Emese Revfy0766f782016-06-20 20:42:34 +020010092static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010093{
Daniel Lezcano208cb162014-01-06 12:34:44 +010010094 struct rq *this_rq = this_rq();
Suresh Siddha6eb57e02011-10-03 15:09:01 -070010095 enum cpu_idle_type idle = this_rq->idle_balance ?
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010096 CPU_IDLE : CPU_NOT_IDLE;
10097
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010098 /*
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010099 * If this cpu has a pending nohz_balance_kick, then do the
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010100 * balancing on behalf of the other idle cpus whose ticks are
Preeti U Murthyd4573c32015-03-26 18:32:44 +053010101 * stopped. Do nohz_idle_balance *before* rebalance_domains to
10102 * give the idle cpus a chance to load balance. Else we may
10103 * load balance only within the local sched_domain hierarchy
10104 * and abort nohz_idle_balance altogether if we pull some load.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010105 */
Daniel Lezcano208cb162014-01-06 12:34:44 +010010106 nohz_idle_balance(this_rq, idle);
Preeti U Murthyd4573c32015-03-26 18:32:44 +053010107 rebalance_domains(this_rq, idle);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010108}
10109
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010110/*
10111 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010112 */
Daniel Lezcano7caff662014-01-06 12:34:38 +010010113void trigger_load_balance(struct rq *rq)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010114{
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010115 int type = NOHZ_KICK_ANY;
10116
Olav Haugan3f2cb302016-05-31 14:34:46 -070010117 /* No need to rebalance while attached to the NULL domain or
 10118 * while the cpu is isolated.
10119 */
10120 if (unlikely(on_null_domain(rq)) || cpu_isolated(cpu_of(rq)))
Daniel Lezcanoc7260992014-01-06 12:34:45 +010010121 return;
10122
10123 if (time_after_eq(jiffies, rq->next_balance))
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010124 raise_softirq(SCHED_SOFTIRQ);
Frederic Weisbecker3451d022011-08-10 23:21:01 +020010125#ifdef CONFIG_NO_HZ_COMMON
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010126 if (nohz_kick_needed(rq, &type))
10127 nohz_balancer_kick(type);
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010128#endif
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010129}
10130
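/*
 * A cpu is coming online for this class: refresh the scaled scheduler
 * tunables and re-enable CFS runtime accounting on this runqueue.
 */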
Christian Ehrhardt0bcdcf22009-11-30 12:16:46 +010010131static void rq_online_fair(struct rq *rq)
10132{
10133 update_sysctl();
Kirill Tkhai0e59bda2014-06-25 12:19:42 +040010134
10135 update_runtime_enabled(rq);
Christian Ehrhardt0bcdcf22009-11-30 12:16:46 +010010136}
10137
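/*
 * A cpu is going offline for this class: refresh the scaled scheduler
 * tunables and unthrottle any throttled cfs_rqs still on this runqueue.
 */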
10138static void rq_offline_fair(struct rq *rq)
10139{
10140 update_sysctl();
Peter Boonstoppela4c96ae2012-08-09 15:34:47 -070010141
10142 /* Ensure any throttled groups are reachable by pick_next_task */
10143 unthrottle_offline_cfs_rqs(rq);
Christian Ehrhardt0bcdcf22009-11-30 12:16:46 +010010144}
10145
Dhaval Giani55e12e52008-06-24 23:39:43 +053010146#endif /* CONFIG_SMP */
Peter Williamse1d14842007-10-24 18:23:51 +020010147
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020010148/*
10149 * scheduler tick hitting a task of our scheduling class:
10150 */
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +010010151static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020010152{
10153 struct cfs_rq *cfs_rq;
10154 struct sched_entity *se = &curr->se;
10155
10156 for_each_sched_entity(se) {
10157 cfs_rq = cfs_rq_of(se);
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +010010158 entity_tick(cfs_rq, se, queued);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020010159 }
Ben Segall18bf2802012-10-04 12:51:20 +020010160
Srikar Dronamrajub52da862015-10-02 07:48:25 +053010161 if (static_branch_unlikely(&sched_numa_balancing))
Peter Zijlstracbee9f82012-10-25 14:16:43 +020010162 task_tick_numa(rq, curr);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020010163}
10164
10165/*
Peter Zijlstracd29fe62009-11-27 17:32:46 +010010166 * called on fork with the child task as argument from the parent's context
10167 * - child not yet on the tasklist
10168 * - preemption disabled
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020010169 */
Peter Zijlstracd29fe62009-11-27 17:32:46 +010010170static void task_fork_fair(struct task_struct *p)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020010171{
Daisuke Nishimura4fc420c2011-12-15 14:36:55 +090010172 struct cfs_rq *cfs_rq;
10173 struct sched_entity *se = &p->se, *curr;
Peter Zijlstracd29fe62009-11-27 17:32:46 +010010174 struct rq *rq = this_rq();
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020010175
Peter Zijlstrae210bff2016-06-16 18:51:48 +020010176 raw_spin_lock(&rq->lock);
Peter Zijlstra861d0342010-08-19 13:31:43 +020010177 update_rq_clock(rq);
10178
Daisuke Nishimura4fc420c2011-12-15 14:36:55 +090010179 cfs_rq = task_cfs_rq(current);
10180 curr = cfs_rq->curr;
Peter Zijlstrae210bff2016-06-16 18:51:48 +020010181 if (curr) {
10182 update_curr(cfs_rq);
Mike Galbraithb5d9d732009-09-08 11:12:28 +020010183 se->vruntime = curr->vruntime;
Peter Zijlstrae210bff2016-06-16 18:51:48 +020010184 }
Peter Zijlstraaeb73b02007-10-15 17:00:05 +020010185 place_entity(cfs_rq, se, 1);
Peter Zijlstra4d78e7b2007-10-15 17:00:04 +020010186
Peter Zijlstracd29fe62009-11-27 17:32:46 +010010187 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
Dmitry Adamushko87fefa32007-10-15 17:00:08 +020010188 /*
Ingo Molnaredcb60a2007-10-15 17:00:08 +020010189 * Upon rescheduling, sched_class::put_prev_task() will place
10190 * 'current' within the tree based on its new key value.
10191 */
Peter Zijlstra4d78e7b2007-10-15 17:00:04 +020010192 swap(curr->vruntime, se->vruntime);
Kirill Tkhai88751252014-06-29 00:03:57 +040010193 resched_curr(rq);
Peter Zijlstra4d78e7b2007-10-15 17:00:04 +020010194 }
10195
Peter Zijlstra88ec22d2009-12-16 18:04:41 +010010196 se->vruntime -= cfs_rq->min_vruntime;
Peter Zijlstrae210bff2016-06-16 18:51:48 +020010197 raw_spin_unlock(&rq->lock);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020010198}
10199
Steven Rostedtcb469842008-01-25 21:08:22 +010010200/*
10201 * Priority of the task has changed. Check to see if we preempt
10202 * the current task.
10203 */
Peter Zijlstrada7a7352011-01-17 17:03:27 +010010204static void
10205prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
Steven Rostedtcb469842008-01-25 21:08:22 +010010206{
Kirill Tkhaida0c1e62014-08-20 13:47:32 +040010207 if (!task_on_rq_queued(p))
Peter Zijlstrada7a7352011-01-17 17:03:27 +010010208 return;
10209
Steven Rostedtcb469842008-01-25 21:08:22 +010010210 /*
10211 * Reschedule if we are currently running on this runqueue and
10212 * our priority decreased, or if we are not currently running on
 10213 * this runqueue and our priority is higher than the current task's.
10214 */
Peter Zijlstrada7a7352011-01-17 17:03:27 +010010215 if (rq->curr == p) {
Steven Rostedtcb469842008-01-25 21:08:22 +010010216 if (p->prio > oldprio)
Kirill Tkhai88751252014-06-29 00:03:57 +040010217 resched_curr(rq);
Steven Rostedtcb469842008-01-25 21:08:22 +010010218 } else
Peter Zijlstra15afe092008-09-20 23:38:02 +020010219 check_preempt_curr(rq, p, 0);
Steven Rostedtcb469842008-01-25 21:08:22 +010010220}
10221
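/*
 * Return true if p's vruntime needs no renormalization against its
 * cfs_rq's min_vruntime when the task is detached from or attached to
 * a cfs_rq.
 */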
Byungchul Parkdaa59402015-08-20 20:22:00 +090010222static inline bool vruntime_normalized(struct task_struct *p)
10223{
10224 struct sched_entity *se = &p->se;
10225
10226 /*
10227 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
10228 * the dequeue_entity(.flags=0) will already have normalized the
10229 * vruntime.
10230 */
10231 if (p->on_rq)
10232 return true;
10233
10234 /*
 10235 * When !on_rq, the task's vruntime has usually NOT been normalized.
10236 * But there are some cases where it has already been normalized:
10237 *
 10238 * - A forked child that is waiting to be woken up by
 10239 * wake_up_new_task().
 10240 * - A task that has been woken up by try_to_wake_up() and is
 10241 * waiting to actually be woken up by sched_ttwu_pending().
10242 */
10243 if (!se->sum_exec_runtime || p->state == TASK_WAKING)
10244 return true;
10245
10246 return false;
10247}
10248
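/*
 * Detach p from its cfs_rq: renormalize the vruntime where needed and
 * remove the entity's contribution from the cfs_rq's load averages.
 */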
10249static void detach_task_cfs_rq(struct task_struct *p)
Peter Zijlstrada7a7352011-01-17 17:03:27 +010010250{
10251 struct sched_entity *se = &p->se;
10252 struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstra01011472016-06-17 11:20:46 +020010253 u64 now = cfs_rq_clock_task(cfs_rq);
Peter Zijlstrada7a7352011-01-17 17:03:27 +010010254
Byungchul Parkdaa59402015-08-20 20:22:00 +090010255 if (!vruntime_normalized(p)) {
Peter Zijlstrada7a7352011-01-17 17:03:27 +010010256 /*
10257 * Fix up our vruntime so that the current sleep doesn't
10258 * cause 'unlimited' sleep bonus.
10259 */
10260 place_entity(cfs_rq, se, 0);
10261 se->vruntime -= cfs_rq->min_vruntime;
10262 }
Paul Turner9ee474f2012-10-04 13:18:30 +020010263
Yuyang Du9d89c252015-07-15 08:04:37 +080010264 /* Catch up with the cfs_rq and remove our load when we leave */
Peter Zijlstra7c3edd22016-07-13 10:56:25 +020010265 update_cfs_rq_load_avg(now, cfs_rq, false);
Byungchul Parka05e8c52015-08-20 20:21:56 +090010266 detach_entity_load_avg(cfs_rq, se);
Peter Zijlstra7c3edd22016-07-13 10:56:25 +020010267 update_tg_load_avg(cfs_rq, false);
Peter Zijlstrada7a7352011-01-17 17:03:27 +010010268}
10269
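/*
 * Attach p to its cfs_rq: restore the entity depth, add the entity's
 * contribution to the cfs_rq's load averages and, where needed, add the
 * cfs_rq's min_vruntime back onto the task's vruntime.
 */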
Byungchul Parkdaa59402015-08-20 20:22:00 +090010270static void attach_task_cfs_rq(struct task_struct *p)
Steven Rostedtcb469842008-01-25 21:08:22 +010010271{
Kirill Tkhaif36c0192014-08-06 12:06:01 +040010272 struct sched_entity *se = &p->se;
Byungchul Parkdaa59402015-08-20 20:22:00 +090010273 struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstra01011472016-06-17 11:20:46 +020010274 u64 now = cfs_rq_clock_task(cfs_rq);
Byungchul Park7855a352015-08-10 18:02:55 +090010275
10276#ifdef CONFIG_FAIR_GROUP_SCHED
Michael wangeb7a59b2014-02-20 11:14:53 +080010277 /*
 10278 * Since the real depth could have changed (only the FAIR
 10279 * class maintains a depth value), reset the depth properly.
10280 */
10281 se->depth = se->parent ? se->parent->depth + 1 : 0;
10282#endif
Byungchul Park7855a352015-08-10 18:02:55 +090010283
Byungchul Park6efdb102015-08-20 20:21:59 +090010284 /* Synchronize task with its cfs_rq */
Peter Zijlstra7c3edd22016-07-13 10:56:25 +020010285 update_cfs_rq_load_avg(now, cfs_rq, false);
Byungchul Parkdaa59402015-08-20 20:22:00 +090010286 attach_entity_load_avg(cfs_rq, se);
Peter Zijlstra7c3edd22016-07-13 10:56:25 +020010287 update_tg_load_avg(cfs_rq, false);
Byungchul Park6efdb102015-08-20 20:21:59 +090010288
Byungchul Parkdaa59402015-08-20 20:22:00 +090010289 if (!vruntime_normalized(p))
10290 se->vruntime += cfs_rq->min_vruntime;
10291}
Byungchul Park7855a352015-08-10 18:02:55 +090010292
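/* p is leaving the fair class: detach it from its cfs_rq. */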
Byungchul Parkdaa59402015-08-20 20:22:00 +090010293static void switched_from_fair(struct rq *rq, struct task_struct *p)
10294{
10295 detach_task_cfs_rq(p);
10296}
10297
10298static void switched_to_fair(struct rq *rq, struct task_struct *p)
10299{
10300 attach_task_cfs_rq(p);
10301
10302 if (task_on_rq_queued(p)) {
Byungchul Park7855a352015-08-10 18:02:55 +090010303 /*
Byungchul Parkdaa59402015-08-20 20:22:00 +090010304 * We were most likely switched from sched_rt, so
 10305 * force a reschedule if we are running, otherwise just see
 10306 * whether we can still preempt the current task.
Byungchul Park7855a352015-08-10 18:02:55 +090010307 */
Byungchul Parkdaa59402015-08-20 20:22:00 +090010308 if (rq->curr == p)
10309 resched_curr(rq);
10310 else
10311 check_preempt_curr(rq, p, 0);
Byungchul Park7855a352015-08-10 18:02:55 +090010312 }
Steven Rostedtcb469842008-01-25 21:08:22 +010010313}
10314
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +020010315/*
 * Account for a task changing its policy or group.
10316 *
 10317 * This routine is mostly called to set the cfs_rq->curr field when a task
10318 * migrates between groups/classes.
10319 */
10320static void set_curr_task_fair(struct rq *rq)
10321{
10322 struct sched_entity *se = &rq->curr->se;
10323
Paul Turnerec12cb72011-07-21 09:43:30 -070010324 for_each_sched_entity(se) {
10325 struct cfs_rq *cfs_rq = cfs_rq_of(se);
10326
10327 set_next_entity(cfs_rq, se);
10328 /* ensure bandwidth has been allocated on our new cfs_rq */
10329 account_cfs_rq_runtime(cfs_rq, 0);
10330 }
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +020010331}
10332
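/*
 * Initialize a cfs_rq: empty timeline rbtree, seeded min_vruntime and,
 * on SMP, cleared removed-load/util accumulators.
 */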
Peter Zijlstra029632f2011-10-25 10:00:11 +020010333void init_cfs_rq(struct cfs_rq *cfs_rq)
10334{
10335 cfs_rq->tasks_timeline = RB_ROOT;
Peter Zijlstra029632f2011-10-25 10:00:11 +020010336 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
10337#ifndef CONFIG_64BIT
10338 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
10339#endif
Alex Shi141965c2013-06-26 13:05:39 +080010340#ifdef CONFIG_SMP
Yuyang Du9d89c252015-07-15 08:04:37 +080010341 atomic_long_set(&cfs_rq->removed_load_avg, 0);
10342 atomic_long_set(&cfs_rq->removed_util_avg, 0);
Paul Turner9ee474f2012-10-04 13:18:30 +020010343#endif
Peter Zijlstra029632f2011-10-25 10:00:11 +020010344}
10345
Peter Zijlstra810b3812008-02-29 15:21:01 -050010346#ifdef CONFIG_FAIR_GROUP_SCHED
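/* p was placed into a new group: update its runqueue pointers and entity depth. */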
Vincent Guittotea86cb42016-06-17 13:38:55 +020010347static void task_set_group_fair(struct task_struct *p)
10348{
10349 struct sched_entity *se = &p->se;
10350
10351 set_task_rq(p, task_cpu(p));
10352 se->depth = se->parent ? se->parent->depth + 1 : 0;
10353}
10354
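/*
 * p is moving between task groups: detach it from the old cfs_rq, switch
 * its runqueue pointers and attach it to the new group's cfs_rq.
 */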
Peter Zijlstrabc54da22015-08-31 17:13:55 +020010355static void task_move_group_fair(struct task_struct *p)
Peter Zijlstra810b3812008-02-29 15:21:01 -050010356{
Byungchul Parkdaa59402015-08-20 20:22:00 +090010357 detach_task_cfs_rq(p);
Peter Zijlstrab2b5ce02010-10-15 15:24:15 +020010358 set_task_rq(p, task_cpu(p));
Byungchul Park6efdb102015-08-20 20:21:59 +090010359
10360#ifdef CONFIG_SMP
 10361 /* Flag that se's cfs_rq has changed -- the entity migrated */
10362 p->se.avg.last_update_time = 0;
10363#endif
Byungchul Parkdaa59402015-08-20 20:22:00 +090010364 attach_task_cfs_rq(p);
Peter Zijlstra810b3812008-02-29 15:21:01 -050010365}
Peter Zijlstra029632f2011-10-25 10:00:11 +020010366
Vincent Guittotea86cb42016-06-17 13:38:55 +020010367static void task_change_group_fair(struct task_struct *p, int type)
10368{
10369 switch (type) {
10370 case TASK_SET_GROUP:
10371 task_set_group_fair(p);
10372 break;
10373
10374 case TASK_MOVE_GROUP:
10375 task_move_group_fair(p);
10376 break;
10377 }
10378}
10379
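/*
 * Release a task group's CFS state: its bandwidth pool and the per-cpu
 * cfs_rq and sched_entity structures.
 */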
Peter Zijlstra029632f2011-10-25 10:00:11 +020010380void free_fair_sched_group(struct task_group *tg)
10381{
10382 int i;
10383
10384 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
10385
10386 for_each_possible_cpu(i) {
10387 if (tg->cfs_rq)
10388 kfree(tg->cfs_rq[i]);
Peter Zijlstra6fe1f342016-01-21 22:24:16 +010010389 if (tg->se)
Peter Zijlstra029632f2011-10-25 10:00:11 +020010390 kfree(tg->se[i]);
10391 }
10392
10393 kfree(tg->cfs_rq);
10394 kfree(tg->se);
10395}
10396
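/*
 * Allocate and initialize per-cpu cfs_rq and sched_entity structures for
 * a new task group; returns 1 on success and 0 on allocation failure.
 */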
10397int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
10398{
Peter Zijlstra029632f2011-10-25 10:00:11 +020010399 struct sched_entity *se;
Peter Zijlstrab7fa30c2016-06-09 15:07:50 +020010400 struct cfs_rq *cfs_rq;
Peter Zijlstra029632f2011-10-25 10:00:11 +020010401 int i;
10402
10403 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
10404 if (!tg->cfs_rq)
10405 goto err;
10406 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
10407 if (!tg->se)
10408 goto err;
10409
10410 tg->shares = NICE_0_LOAD;
10411
10412 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
10413
10414 for_each_possible_cpu(i) {
10415 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
10416 GFP_KERNEL, cpu_to_node(i));
10417 if (!cfs_rq)
10418 goto err;
10419
10420 se = kzalloc_node(sizeof(struct sched_entity),
10421 GFP_KERNEL, cpu_to_node(i));
10422 if (!se)
10423 goto err_free_rq;
10424
10425 init_cfs_rq(cfs_rq);
10426 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
Yuyang Du540247f2015-07-15 08:04:39 +080010427 init_entity_runnable_average(se);
Peter Zijlstra029632f2011-10-25 10:00:11 +020010428 }
10429
10430 return 1;
10431
10432err_free_rq:
10433 kfree(cfs_rq);
10434err:
10435 return 0;
10436}
10437
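/*
 * Make a newly created task group visible to the scheduler: finish
 * initializing its entities' utilization averages and synchronize its
 * throttle state on every cpu.
 */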
Peter Zijlstra8663e242016-06-22 14:58:02 +020010438void online_fair_sched_group(struct task_group *tg)
10439{
10440 struct sched_entity *se;
10441 struct rq *rq;
10442 int i;
10443
10444 for_each_possible_cpu(i) {
10445 rq = cpu_rq(i);
10446 se = tg->se[i];
10447
10448 raw_spin_lock_irq(&rq->lock);
10449 post_init_entity_util_avg(se);
Peter Zijlstra55e16d32016-06-22 15:14:26 +020010450 sync_throttle(tg, i);
Peter Zijlstra8663e242016-06-22 14:58:02 +020010451 raw_spin_unlock_irq(&rq->lock);
10452 }
10453}
10454
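/*
 * Remove a task group from every cpu: flush its entities' remaining load
 * and unlink its cfs_rqs from the leaf cfs_rq lists.
 */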
Peter Zijlstra6fe1f342016-01-21 22:24:16 +010010455void unregister_fair_sched_group(struct task_group *tg)
Peter Zijlstra029632f2011-10-25 10:00:11 +020010456{
Peter Zijlstra029632f2011-10-25 10:00:11 +020010457 unsigned long flags;
Peter Zijlstra6fe1f342016-01-21 22:24:16 +010010458 struct rq *rq;
10459 int cpu;
Peter Zijlstra029632f2011-10-25 10:00:11 +020010460
Peter Zijlstra6fe1f342016-01-21 22:24:16 +010010461 for_each_possible_cpu(cpu) {
10462 if (tg->se[cpu])
10463 remove_entity_load_avg(tg->se[cpu]);
Peter Zijlstra029632f2011-10-25 10:00:11 +020010464
Peter Zijlstra6fe1f342016-01-21 22:24:16 +010010465 /*
 10466 * Only empty task groups can be destroyed, so we can speculatively
10467 * check on_list without danger of it being re-added.
10468 */
10469 if (!tg->cfs_rq[cpu]->on_list)
10470 continue;
10471
10472 rq = cpu_rq(cpu);
10473
10474 raw_spin_lock_irqsave(&rq->lock, flags);
10475 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
10476 raw_spin_unlock_irqrestore(&rq->lock, flags);
10477 }
Peter Zijlstra029632f2011-10-25 10:00:11 +020010478}
10479
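/*
 * Link one cpu's group cfs_rq and its owning sched_entity into the task
 * group and the cfs_rq hierarchy, setting parent and depth accordingly.
 */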
10480void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
10481 struct sched_entity *se, int cpu,
10482 struct sched_entity *parent)
10483{
10484 struct rq *rq = cpu_rq(cpu);
10485
10486 cfs_rq->tg = tg;
10487 cfs_rq->rq = rq;
Peter Zijlstra029632f2011-10-25 10:00:11 +020010488 init_cfs_rq_runtime(cfs_rq);
10489
10490 tg->cfs_rq[cpu] = cfs_rq;
10491 tg->se[cpu] = se;
10492
10493 /* se could be NULL for root_task_group */
10494 if (!se)
10495 return;
10496
Peter Zijlstrafed14d42012-02-11 06:05:00 +010010497 if (!parent) {
Peter Zijlstra029632f2011-10-25 10:00:11 +020010498 se->cfs_rq = &rq->cfs;
Peter Zijlstrafed14d42012-02-11 06:05:00 +010010499 se->depth = 0;
10500 } else {
Peter Zijlstra029632f2011-10-25 10:00:11 +020010501 se->cfs_rq = parent->my_q;
Peter Zijlstrafed14d42012-02-11 06:05:00 +010010502 se->depth = parent->depth + 1;
10503 }
Peter Zijlstra029632f2011-10-25 10:00:11 +020010504
10505 se->my_q = cfs_rq;
Paul Turner0ac9b1c2013-10-16 11:16:27 -070010506 /* guarantee group entities always have weight */
10507 update_load_set(&se->load, NICE_0_LOAD);
Peter Zijlstra029632f2011-10-25 10:00:11 +020010508 se->parent = parent;
10509}
10510
10511static DEFINE_MUTEX(shares_mutex);
10512
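/*
 * Set a task group's weight (shares) and propagate the change through
 * each cpu's cfs_rq hierarchy under the runqueue lock.
 */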
10513int sched_group_set_shares(struct task_group *tg, unsigned long shares)
10514{
10515 int i;
10516 unsigned long flags;
10517
10518 /*
10519 * We can't change the weight of the root cgroup.
10520 */
10521 if (!tg->se[0])
10522 return -EINVAL;
10523
10524 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
10525
10526 mutex_lock(&shares_mutex);
10527 if (tg->shares == shares)
10528 goto done;
10529
10530 tg->shares = shares;
10531 for_each_possible_cpu(i) {
10532 struct rq *rq = cpu_rq(i);
10533 struct sched_entity *se;
10534
10535 se = tg->se[i];
10536 /* Propagate contribution to hierarchy */
10537 raw_spin_lock_irqsave(&rq->lock, flags);
Frederic Weisbecker71b1da42013-04-12 01:50:59 +020010538
10539 /* Possible calls to update_curr() need rq clock */
10540 update_rq_clock(rq);
Linus Torvalds17bc14b2012-12-14 07:20:43 -080010541 for_each_sched_entity(se)
Peter Zijlstra029632f2011-10-25 10:00:11 +020010542 update_cfs_shares(group_cfs_rq(se));
10543 raw_spin_unlock_irqrestore(&rq->lock, flags);
10544 }
10545
10546done:
10547 mutex_unlock(&shares_mutex);
10548 return 0;
10549}
10550#else /* CONFIG_FAIR_GROUP_SCHED */
10551
10552void free_fair_sched_group(struct task_group *tg) { }
10553
10554int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
10555{
10556 return 1;
10557}
10558
Peter Zijlstra8663e242016-06-22 14:58:02 +020010559void online_fair_sched_group(struct task_group *tg) { }
10560
Peter Zijlstra6fe1f342016-01-21 22:24:16 +010010561void unregister_fair_sched_group(struct task_group *tg) { }
Peter Zijlstra029632f2011-10-25 10:00:11 +020010562
10563#endif /* CONFIG_FAIR_GROUP_SCHED */
10564
Peter Zijlstra810b3812008-02-29 15:21:01 -050010565
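/*
 * Report the effective round-robin interval for the task: its CFS slice
 * on a busy runqueue, 0 on an otherwise idle one.
 */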
H Hartley Sweeten6d686f42010-01-13 20:21:52 -070010566static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
Peter Williams0d721ce2009-09-21 01:31:53 +000010567{
10568 struct sched_entity *se = &task->se;
Peter Williams0d721ce2009-09-21 01:31:53 +000010569 unsigned int rr_interval = 0;
10570
10571 /*
10572 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
10573 * idle runqueue:
10574 */
Peter Williams0d721ce2009-09-21 01:31:53 +000010575 if (rq->cfs.load.weight)
Zhu Yanhaia59f4e02013-01-08 12:56:52 +080010576 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
Peter Williams0d721ce2009-09-21 01:31:53 +000010577
10578 return rr_interval;
10579}
10580
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020010581/*
10582 * All the scheduling class methods:
10583 */
Peter Zijlstra029632f2011-10-25 10:00:11 +020010584const struct sched_class fair_sched_class = {
Ingo Molnar5522d5d2007-10-15 17:00:12 +020010585 .next = &idle_sched_class,
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020010586 .enqueue_task = enqueue_task_fair,
10587 .dequeue_task = dequeue_task_fair,
10588 .yield_task = yield_task_fair,
Mike Galbraithd95f4122011-02-01 09:50:51 -050010589 .yield_to_task = yield_to_task_fair,
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020010590
Ingo Molnar2e09bf52007-10-15 17:00:05 +020010591 .check_preempt_curr = check_preempt_wakeup,
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020010592
10593 .pick_next_task = pick_next_task_fair,
10594 .put_prev_task = put_prev_task_fair,
10595
Peter Williams681f3e62007-10-24 18:23:51 +020010596#ifdef CONFIG_SMP
Li Zefan4ce72a22008-10-22 15:25:26 +080010597 .select_task_rq = select_task_rq_fair,
Paul Turner0a74bef2012-10-04 13:18:30 +020010598 .migrate_task_rq = migrate_task_rq_fair,
Alex Shi141965c2013-06-26 13:05:39 +080010599
Christian Ehrhardt0bcdcf22009-11-30 12:16:46 +010010600 .rq_online = rq_online_fair,
10601 .rq_offline = rq_offline_fair,
Peter Zijlstra88ec22d2009-12-16 18:04:41 +010010602
Yuyang Du12695572015-07-15 08:04:40 +080010603 .task_dead = task_dead_fair,
Peter Zijlstrac5b28032015-05-15 17:43:35 +020010604 .set_cpus_allowed = set_cpus_allowed_common,
Peter Williams681f3e62007-10-24 18:23:51 +020010605#endif
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020010606
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +020010607 .set_curr_task = set_curr_task_fair,
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020010608 .task_tick = task_tick_fair,
Peter Zijlstracd29fe62009-11-27 17:32:46 +010010609 .task_fork = task_fork_fair,
Steven Rostedtcb469842008-01-25 21:08:22 +010010610
10611 .prio_changed = prio_changed_fair,
Peter Zijlstrada7a7352011-01-17 17:03:27 +010010612 .switched_from = switched_from_fair,
Steven Rostedtcb469842008-01-25 21:08:22 +010010613 .switched_to = switched_to_fair,
Peter Zijlstra810b3812008-02-29 15:21:01 -050010614
Peter Williams0d721ce2009-09-21 01:31:53 +000010615 .get_rr_interval = get_rr_interval_fair,
10616
Stanislaw Gruszka6e998912014-11-12 16:58:44 +010010617 .update_curr = update_curr_fair,
10618
Peter Zijlstra810b3812008-02-29 15:21:01 -050010619#ifdef CONFIG_FAIR_GROUP_SCHED
Vincent Guittotea86cb42016-06-17 13:38:55 +020010620 .task_change_group = task_change_group_fair,
Peter Zijlstra810b3812008-02-29 15:21:01 -050010621#endif
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -070010622#ifdef CONFIG_SCHED_HMP
10623 .fixup_hmp_sched_stats = fixup_hmp_sched_stats_fair,
10624#endif
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020010625};
10626
10627#ifdef CONFIG_SCHED_DEBUG
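/* Walk all leaf cfs_rqs of a cpu and print their debug statistics. */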
Peter Zijlstra029632f2011-10-25 10:00:11 +020010628void print_cfs_stats(struct seq_file *m, int cpu)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020010629{
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020010630 struct cfs_rq *cfs_rq;
10631
Peter Zijlstra5973e5b2008-01-25 21:08:34 +010010632 rcu_read_lock();
Ingo Molnarc3b64f12007-08-09 11:16:51 +020010633 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
Ingo Molnar5cef9ec2007-08-09 11:16:47 +020010634 print_cfs_rq(m, cpu, cfs_rq);
Peter Zijlstra5973e5b2008-01-25 21:08:34 +010010635 rcu_read_unlock();
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020010636}
Srikar Dronamraju397f2372015-06-25 22:51:43 +053010637
10638#ifdef CONFIG_NUMA_BALANCING
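/* Print p's per-node NUMA fault counts for the task and its numa_group. */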
10639void show_numa_stats(struct task_struct *p, struct seq_file *m)
10640{
10641 int node;
10642 unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
10643
10644 for_each_online_node(node) {
10645 if (p->numa_faults) {
10646 tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
10647 tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
10648 }
10649 if (p->numa_group) {
10650 gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
10651 gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
10652 }
10653 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
10654 }
10655}
10656#endif /* CONFIG_NUMA_BALANCING */
10657#endif /* CONFIG_SCHED_DEBUG */
Peter Zijlstra029632f2011-10-25 10:00:11 +020010658
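/*
 * Boot-time setup for the fair class: register the load-balancing softirq
 * and, with NO_HZ, the cpumask used to track nohz-idle cpus.
 */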
10659__init void init_sched_fair_class(void)
10660{
10661#ifdef CONFIG_SMP
10662 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
10663
Frederic Weisbecker3451d022011-08-10 23:21:01 +020010664#ifdef CONFIG_NO_HZ_COMMON
Diwakar Tundlam554ceca2012-03-07 14:44:26 -080010665 nohz.next_balance = jiffies;
Peter Zijlstra029632f2011-10-25 10:00:11 +020010666 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
Peter Zijlstra029632f2011-10-25 10:00:11 +020010667#endif
10668#endif /* SMP */
10669
10670}