/*
 *  kernel/sched/core.c
 *
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 *
 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
 *              make semaphores SMP safe
 *  1998-11-19  Implemented schedule_timeout() and related stuff
 *              by Andrea Arcangeli
 *  2002-01-04  New ultra-scalable O(1) scheduler by Ingo Molnar:
 *              hybrid priority-list and round-robin design with
 *              an array-switch method of distributing timeslices
 *              and per-CPU runqueues.  Cleanups and useful suggestions
 *              by Davide Libenzi, preemptible kernel bits by Robert Love.
 *  2003-09-03  Interactivity tuning by Con Kolivas.
 *  2004-04-02  Scheduler domains code by Nick Piggin
 *  2007-04-15  Work begun on replacing all interactivity tuning with a
 *              fair scheduling design by Con Kolivas.
 *  2007-05-05  Load balancing (smp-nice) and other improvements
 *              by Peter Williams
 *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
 *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
 *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
 *              Thomas Gleixner, Mike Kravetz
 */

#include <linux/kasan.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <linux/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/perf_event.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/pid_namespace.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
#include <linux/init_task.h>
#include <linux/context_tracking.h>
#include <linux/compiler.h>
#include <linux/frame.h>
#include <linux/prefetch.h>
#include <linux/irq.h>
#include <linux/cpufreq_times.h>
#include <linux/sched/loadavg.h>
#include <linux/cgroup-defs.h>

#include <asm/switch_to.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include <asm/mutex.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif

#include "sched.h"
#include "walt.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"
#include "../time/tick-internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
#include "walt.h"

ATOMIC_NOTIFIER_HEAD(load_alert_notifier_head);

DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

static void update_rq_clock_task(struct rq *rq, s64 delta);

void update_rq_clock(struct rq *rq)
{
        s64 delta;

        lockdep_assert_held(&rq->lock);

        if (rq->clock_skip_update & RQCF_ACT_SKIP)
                return;

        delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
        if (delta < 0)
                return;
        rq->clock += delta;
        update_rq_clock_task(rq, delta);
}

/*
 * Debugging: various feature bits
 */

#define SCHED_FEAT(name, enabled)       \
        (1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "features.h"
        0;

#undef SCHED_FEAT

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * period over which we average the RT time consumption, measured
 * in ms.
 *
 * default: 1s
 */
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;

/*
 * period over which we measure -rt task cpu usage in us.
 * default: 1s
 */
unsigned int sysctl_sched_rt_period = 1000000;

__read_mostly int scheduler_running;

/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;

/* cpus with isolated domains */
cpumask_var_t cpu_isolated_map;

struct rq *
lock_rq_of(struct task_struct *p, struct rq_flags *flags)
{
        return task_rq_lock(p, flags);
}

void
unlock_rq_of(struct rq *rq, struct task_struct *p, struct rq_flags *flags)
{
        task_rq_unlock(rq, p, flags);
}

/*
 * __task_rq_lock - lock the rq @p resides on.
 */
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
        __acquires(rq->lock)
{
        struct rq *rq;

        lockdep_assert_held(&p->pi_lock);

        for (;;) {
                rq = task_rq(p);
                raw_spin_lock(&rq->lock);
                if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
                        rq_pin_lock(rq, rf);
                        return rq;
                }
                raw_spin_unlock(&rq->lock);

                while (unlikely(task_on_rq_migrating(p)))
                        cpu_relax();
        }
}

/*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 */
struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
        __acquires(p->pi_lock)
        __acquires(rq->lock)
{
        struct rq *rq;

        for (;;) {
                raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
                rq = task_rq(p);
                raw_spin_lock(&rq->lock);
                /*
                 *      move_queued_task()              task_rq_lock()
                 *
                 *      ACQUIRE (rq->lock)
                 *      [S] ->on_rq = MIGRATING         [L] rq = task_rq()
                 *      WMB (__set_task_cpu())          ACQUIRE (rq->lock);
                 *      [S] ->cpu = new_cpu             [L] task_rq()
                 *                                      [L] ->on_rq
                 *      RELEASE (rq->lock)
                 *
                 * If we observe the old cpu in task_rq_lock, the acquire of
                 * the old rq->lock will fully serialize against the stores.
                 *
                 * If we observe the new cpu in task_rq_lock, the acquire will
                 * pair with the WMB to ensure we must then also see migrating.
                 */
                if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
                        rq_pin_lock(rq, rf);
                        return rq;
                }
                raw_spin_unlock(&rq->lock);
                raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);

                while (unlikely(task_on_rq_migrating(p)))
                        cpu_relax();
        }
}
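
/*
 * Minimal usage sketch for the locking pair above (it mirrors the
 * lock_rq_of()/unlock_rq_of() wrappers earlier in this file and callers
 * such as __set_cpus_allowed_ptr() below); 'p' is a caller-provided task:
 *
 *      struct rq_flags rf;
 *      struct rq *rq;
 *
 *      rq = task_rq_lock(p, &rf);
 *      ... inspect or modify p's scheduling state with a stable task<->rq
 *      ... association and pi_lock + rq->lock held
 *      task_rq_unlock(rq, p, &rf);
 */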

#ifdef CONFIG_SCHED_HRTICK
/*
 * Use HR-timers to deliver accurate preemption points.
 */

static void hrtick_clear(struct rq *rq)
{
        if (hrtimer_active(&rq->hrtick_timer))
                hrtimer_cancel(&rq->hrtick_timer);
}

/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
        struct rq *rq = container_of(timer, struct rq, hrtick_timer);

        WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

        raw_spin_lock(&rq->lock);
        update_rq_clock(rq);
        rq->curr->sched_class->task_tick(rq, rq->curr, 1);
        raw_spin_unlock(&rq->lock);

        return HRTIMER_NORESTART;
}

#ifdef CONFIG_SMP

static void __hrtick_restart(struct rq *rq)
{
        struct hrtimer *timer = &rq->hrtick_timer;

        hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
}

/*
 * called from hardirq (IPI) context
 */
static void __hrtick_start(void *arg)
{
        struct rq *rq = arg;

        raw_spin_lock(&rq->lock);
        __hrtick_restart(rq);
        rq->hrtick_csd_pending = 0;
        raw_spin_unlock(&rq->lock);
}

/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
        struct hrtimer *timer = &rq->hrtick_timer;
        ktime_t time;
        s64 delta;

        /*
         * Don't schedule slices shorter than 10000ns, that just
         * doesn't make sense and can cause timer DoS.
         */
        delta = max_t(s64, delay, 10000LL);
        time = ktime_add_ns(timer->base->get_time(), delta);

        hrtimer_set_expires(timer, time);

        if (rq == this_rq()) {
                __hrtick_restart(rq);
        } else if (!rq->hrtick_csd_pending) {
                smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
                rq->hrtick_csd_pending = 1;
        }
}

#else
/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
        /*
         * Don't schedule slices shorter than 10000ns, that just
         * doesn't make sense. Rely on vruntime for fairness.
         */
        delay = max_t(u64, delay, 10000LL);
        hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
                      HRTIMER_MODE_REL_PINNED);
}
#endif /* CONFIG_SMP */

static void init_rq_hrtick(struct rq *rq)
{
#ifdef CONFIG_SMP
        rq->hrtick_csd_pending = 0;

        rq->hrtick_csd.flags = 0;
        rq->hrtick_csd.func = __hrtick_start;
        rq->hrtick_csd.info = rq;
#endif

        hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        rq->hrtick_timer.function = hrtick;
}
#else   /* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}

static inline void init_rq_hrtick(struct rq *rq)
{
}
#endif  /* CONFIG_SCHED_HRTICK */

/*
 * cmpxchg based fetch_or, macro so it works for different integer types
 */
#define fetch_or(ptr, mask)                                             \
        ({                                                              \
                typeof(ptr) _ptr = (ptr);                               \
                typeof(mask) _mask = (mask);                            \
                typeof(*_ptr) _old, _val = *_ptr;                       \
                                                                        \
                for (;;) {                                              \
                        _old = cmpxchg(_ptr, _val, _val | _mask);       \
                        if (_old == _val)                               \
                                break;                                  \
                        _val = _old;                                    \
                }                                                       \
        _old;                                                           \
})

#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
/*
 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
 * this avoids any races wrt polling state changes and thereby avoids
 * spurious IPIs.
 */
static bool set_nr_and_not_polling(struct task_struct *p)
{
        struct thread_info *ti = task_thread_info(p);
        return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
}

/*
 * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
 *
 * If this returns true, then the idle task promises to call
 * sched_ttwu_pending() and reschedule soon.
 */
static bool set_nr_if_polling(struct task_struct *p)
{
        struct thread_info *ti = task_thread_info(p);
        typeof(ti->flags) old, val = READ_ONCE(ti->flags);

        for (;;) {
                if (!(val & _TIF_POLLING_NRFLAG))
                        return false;
                if (val & _TIF_NEED_RESCHED)
                        return true;
                old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
                if (old == val)
                        break;
                val = old;
        }
        return true;
}

#else
static bool set_nr_and_not_polling(struct task_struct *p)
{
        set_tsk_need_resched(p);
        return true;
}

#ifdef CONFIG_SMP
static bool set_nr_if_polling(struct task_struct *p)
{
        return false;
}
#endif
#endif

void wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
        struct wake_q_node *node = &task->wake_q;

        /*
         * Atomically grab the task, if ->wake_q is !nil already it means
         * it's already queued (either by us or someone else) and will get the
         * wakeup due to that.
         *
         * This cmpxchg() implies a full barrier, which pairs with the write
         * barrier implied by the wakeup in wake_up_q().
         */
        if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
                return;

        get_task_struct(task);

        /*
         * The head is context local, there can be no concurrency.
         */
        *head->lastp = node;
        head->lastp = &node->next;
}

void wake_up_q(struct wake_q_head *head)
{
        struct wake_q_node *node = head->first;

        while (node != WAKE_Q_TAIL) {
                struct task_struct *task;

                task = container_of(node, struct task_struct, wake_q);
                BUG_ON(!task);
                /* task can safely be re-inserted now */
                node = node->next;
                task->wake_q.next = NULL;

                /*
                 * wake_up_process() implies a wmb() to pair with the queueing
                 * in wake_q_add() so as not to miss wakeups.
                 */
                wake_up_process(task);
                put_task_struct(task);
        }
}
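
/*
 * Minimal usage sketch for the wake-queue API above: collect tasks with
 * wake_q_add() while holding a lock, then issue the wakeups with
 * wake_up_q() after dropping it. The explicit initializer assumes only
 * what wake_q_add()/wake_up_q() rely on (an empty list has
 * first == WAKE_Q_TAIL and lastp == &first); in-tree callers use the
 * WAKE_Q()/DEFINE_WAKE_Q() helper (name varies by kernel version) for
 * the same thing. 'some_lock' and 'some_task' are caller-provided.
 *
 *      struct wake_q_head wake_q = { WAKE_Q_TAIL, &wake_q.first };
 *
 *      spin_lock(&some_lock);
 *      wake_q_add(&wake_q, some_task);
 *      spin_unlock(&some_lock);
 *      wake_up_q(&wake_q);
 */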

/*
 * resched_curr - mark rq's current task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
void resched_curr(struct rq *rq)
{
        struct task_struct *curr = rq->curr;
        int cpu;

        lockdep_assert_held(&rq->lock);

        if (test_tsk_need_resched(curr))
                return;

        cpu = cpu_of(rq);

        if (cpu == smp_processor_id()) {
                set_tsk_need_resched(curr);
                set_preempt_need_resched();
                return;
        }

        if (set_nr_and_not_polling(curr))
                smp_send_reschedule(cpu);
        else
                trace_sched_wake_idle_without_ipi(cpu);
}

void resched_cpu(int cpu)
{
        struct rq *rq = cpu_rq(cpu);
        unsigned long flags;

        raw_spin_lock_irqsave(&rq->lock, flags);
        if (cpu_online(cpu) || cpu == smp_processor_id())
                resched_curr(rq);
        raw_spin_unlock_irqrestore(&rq->lock, flags);
}

#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
/*
 * In the semi idle case, use the nearest busy cpu for migrating timers
 * from an idle cpu. This is good for power-savings.
 *
 * We don't do similar optimization for completely idle system, as
 * selecting an idle cpu will add more delays to the timers than intended
 * (as that cpu's timer base may not be up to date wrt jiffies etc).
 */
int get_nohz_timer_target(void)
{
        int i, cpu = smp_processor_id();
        struct sched_domain *sd;

        if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
                return cpu;

        rcu_read_lock();
        for_each_domain(cpu, sd) {
                for_each_cpu(i, sched_domain_span(sd)) {
                        if (cpu == i)
                                continue;

                        if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
                                cpu = i;
                                goto unlock;
                        }
                }
        }

        if (!is_housekeeping_cpu(cpu))
                cpu = housekeeping_any_cpu();
unlock:
        rcu_read_unlock();
        return cpu;
}
/*
 * When add_timer_on() enqueues a timer into the timer wheel of an
 * idle CPU then this timer might expire before the next timer event
 * which is scheduled to wake up that CPU. In case of a completely
 * idle system the next event might even be infinite time into the
 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
 * leaves the inner idle loop so the newly added timer is taken into
 * account when the CPU goes back to idle and evaluates the timer
 * wheel for the next timer event.
 */
static void wake_up_idle_cpu(int cpu)
{
        struct rq *rq = cpu_rq(cpu);

        if (cpu == smp_processor_id())
                return;

        if (set_nr_and_not_polling(rq->idle))
                smp_send_reschedule(cpu);
        else
                trace_sched_wake_idle_without_ipi(cpu);
}

static bool wake_up_full_nohz_cpu(int cpu)
{
        /*
         * We just need the target to call irq_exit() and re-evaluate
         * the next tick. The nohz full kick at least implies that.
         * If needed we can still optimize that later with an
         * empty IRQ.
         */
        if (cpu_is_offline(cpu))
                return true;  /* Don't try to wake offline CPUs. */
        if (tick_nohz_full_cpu(cpu)) {
                if (cpu != smp_processor_id() ||
                    tick_nohz_tick_stopped())
                        tick_nohz_full_kick_cpu(cpu);
                return true;
        }

        return false;
}

/*
 * Wake up the specified CPU. If the CPU is going offline, it is the
 * caller's responsibility to deal with the lost wakeup, for example,
 * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
 */
void wake_up_nohz_cpu(int cpu)
{
        if (!wake_up_full_nohz_cpu(cpu))
                wake_up_idle_cpu(cpu);
}

static inline bool got_nohz_idle_kick(void)
{
        int cpu = smp_processor_id();

        if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
                return false;

        if (idle_cpu(cpu) && !need_resched())
                return true;

        /*
         * We can't run Idle Load Balance on this CPU for this time so we
         * cancel it and clear NOHZ_BALANCE_KICK
         */
        clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
        return false;
}

#else /* CONFIG_NO_HZ_COMMON */

static inline bool got_nohz_idle_kick(void)
{
        return false;
}

#endif /* CONFIG_NO_HZ_COMMON */

#ifdef CONFIG_NO_HZ_FULL
bool sched_can_stop_tick(struct rq *rq)
{
        int fifo_nr_running;

        /* Deadline tasks, even if single, need the tick */
        if (rq->dl.dl_nr_running)
                return false;

        /*
         * If there are more than one RR tasks, we need the tick to effect the
         * actual RR behaviour.
         */
        if (rq->rt.rr_nr_running) {
                if (rq->rt.rr_nr_running == 1)
                        return true;
                else
                        return false;
        }

        /*
         * If there's no RR tasks, but FIFO tasks, we can skip the tick, no
         * forced preemption between FIFO tasks.
         */
        fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
        if (fifo_nr_running)
                return true;

        /*
         * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
         * if there's more than one we need the tick for involuntary
         * preemption.
         */
        if (rq->nr_running > 1)
                return false;

        return true;
}
#endif /* CONFIG_NO_HZ_FULL */

void sched_avg_update(struct rq *rq)
{
        s64 period = sched_avg_period();

        while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
                /*
                 * Inline assembly required to prevent the compiler
                 * optimising this loop into a divmod call.
                 * See __iter_div_u64_rem() for another example of this.
                 */
                asm("" : "+rm" (rq->age_stamp));
                rq->age_stamp += period;
                rq->rt_avg /= 2;
        }
}

#endif /* CONFIG_SMP */

#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
                        (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
/*
 * Iterate task_group tree rooted at *from, calling @down when first entering a
 * node and @up when leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
 */
int walk_tg_tree_from(struct task_group *from,
                             tg_visitor down, tg_visitor up, void *data)
{
        struct task_group *parent, *child;
        int ret;

        parent = from;

down:
        ret = (*down)(parent, data);
        if (ret)
                goto out;
        list_for_each_entry_rcu(child, &parent->children, siblings) {
                parent = child;
                goto down;

up:
                continue;
        }
        ret = (*up)(parent, data);
        if (ret || parent == from)
                goto out;

        child = parent;
        parent = parent->parent;
        if (parent)
                goto up;
out:
        return ret;
}

int tg_nop(struct task_group *tg, void *data)
{
        return 0;
}
#endif
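
/*
 * Illustrative sketch of a walk_tg_tree_from() caller (an assumed example,
 * not one taken from this file): visit every task_group below the root,
 * doing work on the way down and nothing on the way up, under RCU as the
 * comment above requires. 'tg_visit_down' is a hypothetical visitor.
 *
 *      static int tg_visit_down(struct task_group *tg, void *data)
 *      {
 *              // per-group work goes here
 *              return 0;
 *      }
 *
 *      rcu_read_lock();
 *      walk_tg_tree_from(&root_task_group, tg_visit_down, tg_nop, NULL);
 *      rcu_read_unlock();
 */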

static void set_load_weight(struct task_struct *p)
{
        int prio = p->static_prio - MAX_RT_PRIO;
        struct load_weight *load = &p->se.load;

        /*
         * SCHED_IDLE tasks get minimal weight:
         */
        if (idle_policy(p->policy)) {
                load->weight = scale_load(WEIGHT_IDLEPRIO);
                load->inv_weight = WMULT_IDLEPRIO;
                return;
        }

        load->weight = scale_load(sched_prio_to_weight[prio]);
        load->inv_weight = sched_prio_to_wmult[prio];
}

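/*
 * Worked example for set_load_weight() above, assuming the stock
 * sched_prio_to_weight[] table: a nice-0 task has static_prio == 120, so
 * prio == 120 - MAX_RT_PRIO == 20 and load->weight == scale_load(1024)
 * (NICE_0_LOAD); nice -10 maps to index 10 and weight 9548, nice +10 to
 * index 30 and weight 110, i.e. roughly a 1.25x weight change per nice
 * level.
 */
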
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
        update_rq_clock(rq);
        if (!(flags & ENQUEUE_RESTORE)) {
                sched_info_queued(rq, p);
                psi_enqueue(p, flags & ENQUEUE_WAKEUP);
        }
        p->sched_class->enqueue_task(rq, p, flags);
        walt_update_last_enqueue(p);
        trace_sched_enq_deq_task(p, 1, cpumask_bits(&p->cpus_allowed)[0]);
}

static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
        update_rq_clock(rq);
        if (!(flags & DEQUEUE_SAVE)) {
                sched_info_dequeued(rq, p);
                psi_dequeue(p, flags & DEQUEUE_SLEEP);
        }
        p->sched_class->dequeue_task(rq, p, flags);
        trace_sched_enq_deq_task(p, 0, cpumask_bits(&p->cpus_allowed)[0]);
}

void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
        if (task_contributes_to_load(p))
                rq->nr_uninterruptible--;

        enqueue_task(rq, p, flags);
}

void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
        if (task_contributes_to_load(p))
                rq->nr_uninterruptible++;

        if (flags & DEQUEUE_SLEEP)
                clear_ed_task(p, rq);

        dequeue_task(rq, p, flags);
}

static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
 * In theory, the compiler should just see 0 here, and optimize out the call
 * to sched_rt_avg_update. But I don't trust it...
 */
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
        s64 steal = 0, irq_delta = 0;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
        irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

        /*
         * Since irq_time is only updated on {soft,}irq_exit, we might run into
         * this case when a previous update_rq_clock() happened inside a
         * {soft,}irq region.
         *
         * When this happens, we stop ->clock_task and only update the
         * prev_irq_time stamp to account for the part that fit, so that a next
         * update will consume the rest. This ensures ->clock_task is
         * monotonic.
         *
825 * It does however cause some slight miss-attribution of {soft,}irq
         * time, a more accurate solution would be to update the irq_time using
         * the current rq->clock timestamp, except that would require using
         * atomic ops.
         */
        if (irq_delta > delta)
                irq_delta = delta;

        rq->prev_irq_time += irq_delta;
        delta -= irq_delta;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
        if (static_key_false((&paravirt_steal_rq_enabled))) {
                steal = paravirt_steal_clock(cpu_of(rq));
                steal -= rq->prev_steal_time_rq;

                if (unlikely(steal > delta))
                        steal = delta;

                rq->prev_steal_time_rq += steal;
                delta -= steal;
        }
#endif

        rq->clock_task += delta;

#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
        if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
                sched_rt_avg_update(rq, irq_delta + steal);
#endif
}

void sched_set_stop_task(int cpu, struct task_struct *stop)
{
        struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
        struct task_struct *old_stop = cpu_rq(cpu)->stop;

        if (stop) {
                /*
                 * Make it appear like a SCHED_FIFO task, it's something
                 * userspace knows about and won't get confused about.
                 *
                 * Also, it will make PI more or less work without too
                 * much confusion -- but then, stop work should not
                 * rely on PI working anyway.
                 */
                sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);

                stop->sched_class = &stop_sched_class;
        }

        cpu_rq(cpu)->stop = stop;

        if (old_stop) {
                /*
                 * Reset it back to a normal scheduling class so that
                 * it can die in pieces.
                 */
                old_stop->sched_class = &rt_sched_class;
        }
}

/*
 * __normal_prio - return the priority that is based on the static prio
 */
static inline int __normal_prio(struct task_struct *p)
{
        return p->static_prio;
}

/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers. Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */
static inline int normal_prio(struct task_struct *p)
{
        int prio;

        if (task_has_dl_policy(p))
                prio = MAX_DL_PRIO-1;
        else if (task_has_rt_policy(p))
                prio = MAX_RT_PRIO-1 - p->rt_priority;
        else
                prio = __normal_prio(p);
        return prio;
}

/*
 * Calculate the current priority, i.e. the priority
 * taken into account by the scheduler. This value might
 * be boosted by RT tasks, or might be boosted by
 * interactivity modifiers. Will be RT if the task got
 * RT-boosted. If not then it returns p->normal_prio.
 */
static int effective_prio(struct task_struct *p)
{
        p->normal_prio = normal_prio(p);
        /*
         * If we are RT tasks or we were boosted to RT priority,
         * keep the priority unchanged. Otherwise, update priority
         * to the normal priority:
         */
        if (!rt_prio(p->prio))
                return p->normal_prio;
        return p->prio;
}

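/*
 * Quick worked example for the prio calculations above, assuming the
 * usual MAX_RT_PRIO == 100 and MAX_DL_PRIO == 0 layout: a SCHED_FIFO task
 * with rt_priority 50 gets normal_prio == 99 - 50 == 49, a SCHED_DEADLINE
 * task gets -1, and a SCHED_NORMAL nice-0 task keeps its static_prio of
 * 120. Lower numbers mean higher priority.
 */
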
/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 *
 * Return: 1 if the task is currently executing. 0 otherwise.
 */
inline int task_curr(const struct task_struct *p)
{
        return cpu_curr(task_cpu(p)) == p;
}

/*
 * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
 * use the balance_callback list if you want balancing.
 *
 * this means any call to check_class_changed() must be followed by a call to
 * balance_callback().
 */
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
                                       const struct sched_class *prev_class,
                                       int oldprio)
{
        if (prev_class != p->sched_class) {
                if (prev_class->switched_from)
                        prev_class->switched_from(rq, p);

                p->sched_class->switched_to(rq, p);
        } else if (oldprio != p->prio || dl_task(p))
                p->sched_class->prio_changed(rq, p, oldprio);
}

void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
        const struct sched_class *class;

        if (p->sched_class == rq->curr->sched_class) {
                rq->curr->sched_class->check_preempt_curr(rq, p, flags);
        } else {
                for_each_class(class) {
                        if (class == rq->curr->sched_class)
                                break;
                        if (class == p->sched_class) {
                                resched_curr(rq);
                                break;
                        }
                }
        }

        /*
         * A queue event has occurred, and we're going to schedule.  In
         * this case, we can save a useless back to back clock update.
         */
        if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
                rq_clock_skip_update(rq, true);
}

#ifdef CONFIG_SMP
/*
 * This is how migration works:
 *
 * 1) we invoke migration_cpu_stop() on the target CPU using
 *    stop_one_cpu().
 * 2) stopper starts to run (implicitly forcing the migrated thread
 *    off the CPU)
 * 3) it checks whether the migrated task is still in the wrong runqueue.
 * 4) if it's in the wrong runqueue then the migration thread removes
 *    it and puts it into the right queue.
 * 5) stopper completes and stop_one_cpu() returns and the migration
 *    is done.
 */

/*
 * move_queued_task - move a queued task to new rq.
 *
 * Returns (locked) new rq. Old rq's lock is released.
 */
static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
{
        lockdep_assert_held(&rq->lock);

        p->on_rq = TASK_ON_RQ_MIGRATING;
        dequeue_task(rq, p, 0);
        double_lock_balance(rq, cpu_rq(new_cpu));
        set_task_cpu(p, new_cpu);
        double_rq_unlock(cpu_rq(new_cpu), rq);

        rq = cpu_rq(new_cpu);

        raw_spin_lock(&rq->lock);
        BUG_ON(task_cpu(p) != new_cpu);
        enqueue_task(rq, p, 0);
        p->on_rq = TASK_ON_RQ_QUEUED;
        check_preempt_curr(rq, p, 0);

        return rq;
}

struct migration_arg {
        struct task_struct *task;
        int dest_cpu;
};

/*
 * Move (not current) task off this cpu, onto dest cpu. We're doing
 * this because either it can't run here any more (set_cpus_allowed()
 * away from this CPU, or CPU going down), or because we're
 * attempting to rebalance this task on exec (sched_exec).
 *
 * So we race with normal scheduler movements, but that's OK, as long
 * as the task is no longer on this CPU.
 */
static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
{
        if (p->flags & PF_KTHREAD) {
                if (unlikely(!cpu_online(dest_cpu)))
                        return rq;
        } else {
                if (unlikely(!cpu_active(dest_cpu)))
                        return rq;
        }

        /* Affinity changed (again). */
        if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
                return rq;

        rq = move_queued_task(rq, p, dest_cpu);

        return rq;
}

/*
 * migration_cpu_stop - this will be executed by a highprio stopper thread
 * and performs thread migration by bumping thread off CPU then
 * 'pushing' onto another runqueue.
 */
static int migration_cpu_stop(void *data)
{
        struct migration_arg *arg = data;
        struct task_struct *p = arg->task;
        struct rq *rq = this_rq();
        bool moved = false;

        /*
         * The original target cpu might have gone down and we might
         * be on another cpu but it doesn't matter.
         */
        local_irq_disable();
        /*
         * We need to explicitly wake pending tasks before running
         * __migrate_task() such that we will not miss enforcing cpus_allowed
         * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
         */
        sched_ttwu_pending();

        raw_spin_lock(&p->pi_lock);
        raw_spin_lock(&rq->lock);
        /*
         * If task_rq(p) != rq, it cannot be migrated here, because we're
         * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
         * we're holding p->pi_lock.
         */
        if (task_rq(p) == rq) {
                if (task_on_rq_queued(p)) {
                        rq = __migrate_task(rq, p, arg->dest_cpu);
                        moved = true;
                } else {
                        p->wake_cpu = arg->dest_cpu;
                }
        }
        raw_spin_unlock(&rq->lock);
        raw_spin_unlock(&p->pi_lock);

        local_irq_enable();

        return 0;
}

/*
 * sched_class::set_cpus_allowed must do the below, but is not required to
 * actually call this function.
 */
void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
{
        cpumask_copy(&p->cpus_allowed, new_mask);
        p->nr_cpus_allowed = cpumask_weight(new_mask);
}

void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
        struct rq *rq = task_rq(p);
        bool queued, running;

        lockdep_assert_held(&p->pi_lock);

        queued = task_on_rq_queued(p);
        running = task_current(rq, p);

        if (queued) {
                /*
                 * Because __kthread_bind() calls this on blocked tasks without
                 * holding rq->lock.
                 */
                lockdep_assert_held(&rq->lock);
                dequeue_task(rq, p, DEQUEUE_SAVE);
        }
        if (running)
                put_prev_task(rq, p);

        p->sched_class->set_cpus_allowed(p, new_mask);

        if (queued)
                enqueue_task(rq, p, ENQUEUE_RESTORE);
        if (running)
                set_curr_task(rq, p);
}

/*
 * Change a given task's CPU affinity. Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task, the
 * task must not exit() & deallocate itself prematurely. The
 * call is not atomic; no spinlocks may be held.
 */
static int __set_cpus_allowed_ptr(struct task_struct *p,
                                  const struct cpumask *new_mask, bool check)
{
        const struct cpumask *cpu_valid_mask = cpu_active_mask;
        unsigned int dest_cpu;
        struct rq_flags rf;
        struct rq *rq;
        int ret = 0;
        cpumask_t allowed_mask;

        rq = task_rq_lock(p, &rf);
        update_rq_clock(rq);

        if (p->flags & PF_KTHREAD) {
                /*
                 * Kernel threads are allowed on online && !active CPUs
                 */
                cpu_valid_mask = cpu_online_mask;
        }

        /*
         * Must re-check here, to close a race against __kthread_bind(),
         * sched_setaffinity() is not guaranteed to observe the flag.
         */
        if (check && (p->flags & PF_NO_SETAFFINITY)) {
                ret = -EINVAL;
                goto out;
        }

        if (cpumask_equal(&p->cpus_allowed, new_mask))
                goto out;

        cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask);
        cpumask_and(&allowed_mask, &allowed_mask, cpu_valid_mask);

        dest_cpu = cpumask_any(&allowed_mask);
        if (dest_cpu >= nr_cpu_ids) {
                cpumask_and(&allowed_mask, cpu_valid_mask, new_mask);
                dest_cpu = cpumask_any(&allowed_mask);
                if (dest_cpu >= nr_cpu_ids) {
                        ret = -EINVAL;
                        goto out;
                }
        }

        do_set_cpus_allowed(p, new_mask);

        if (p->flags & PF_KTHREAD) {
                /*
                 * For kernel threads that do indeed end up on online &&
                 * !active we want to ensure they are strict per-cpu threads.
                 */
                WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
                        !cpumask_intersects(new_mask, cpu_active_mask) &&
                        p->nr_cpus_allowed != 1);
        }

        /* Can the task run on the task's current CPU? If so, we're done */
        if (cpumask_test_cpu(task_cpu(p), &allowed_mask))
                goto out;

        if (task_running(rq, p) || p->state == TASK_WAKING) {
                struct migration_arg arg = { p, dest_cpu };
                /* Need help from migration thread: drop lock and wait. */
                task_rq_unlock(rq, p, &rf);
                stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
                tlb_migrate_finish(p->mm);
                return 0;
        } else if (task_on_rq_queued(p)) {
                /*
                 * OK, since we're going to drop the lock immediately
                 * afterwards anyway.
                 */
                rq_unpin_lock(rq, &rf);
                rq = move_queued_task(rq, p, dest_cpu);
                rq_repin_lock(rq, &rf);
        }
out:
        task_rq_unlock(rq, p, &rf);

        return ret;
}

int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
        return __set_cpus_allowed_ptr(p, new_mask, false);
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
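
/*
 * Minimal usage sketch for the exported helper above: pinning a task to a
 * single CPU (error handling omitted). 'task' and 'target_cpu' are
 * caller-provided; the call may block while waiting on the stopper, so it
 * must not be made with spinlocks held (see the comment above
 * __set_cpus_allowed_ptr()).
 *
 *      ret = set_cpus_allowed_ptr(task, cpumask_of(target_cpu));
 */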
1248
Ingo Molnardd41f592007-07-09 18:51:59 +02001249void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
Ingo Molnarc65cc872007-07-09 18:51:58 +02001250{
Peter Zijlstrae2912002009-12-16 18:04:36 +01001251#ifdef CONFIG_SCHED_DEBUG
1252 /*
1253 * We should never call set_task_cpu() on a blocked task,
1254 * ttwu() will sort out the placement.
1255 */
Peter Zijlstra077614e2009-12-17 13:16:31 +01001256 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
Oleg Nesterove2336f62014-10-08 20:33:48 +02001257 !p->on_rq);
Peter Zijlstra0122ec52011-04-05 17:23:51 +02001258
Joonwoo Park3ea94de2015-11-12 19:38:54 -08001259 /*
1260 * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
1261 * because schedstat_wait_{start,end} rebase migrating task's wait_start
1262 * time relying on p->on_rq.
1263 */
1264 WARN_ON_ONCE(p->state == TASK_RUNNING &&
1265 p->sched_class == &fair_sched_class &&
1266 (p->on_rq && !task_on_rq_migrating(p)));
1267
Peter Zijlstra0122ec52011-04-05 17:23:51 +02001268#ifdef CONFIG_LOCKDEP
Peter Zijlstra6c6c54e2011-06-03 17:37:07 +02001269 /*
 1270	 * The caller should hold either p->pi_lock or rq->lock when changing
1271 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
1272 *
1273 * sched_move_task() holds both and thus holding either pins the cgroup,
Peter Zijlstra8323f262012-06-22 13:36:05 +02001274 * see task_group().
Peter Zijlstra6c6c54e2011-06-03 17:37:07 +02001275 *
1276 * Furthermore, all task_rq users should acquire both locks, see
1277 * task_rq_lock().
1278 */
Peter Zijlstra0122ec52011-04-05 17:23:51 +02001279 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
1280 lockdep_is_held(&task_rq(p)->lock)));
1281#endif
Peter Zijlstrae2912002009-12-16 18:04:36 +01001282#endif
1283
Pavankumar Kondeti25532bf2017-08-01 15:50:46 +05301284 trace_sched_migrate_task(p, new_cpu, task_util(p));
1285
Peter Zijlstra0c697742009-12-22 15:43:19 +01001286 if (task_cpu(p) != new_cpu) {
Paul Turner0a74bef2012-10-04 13:18:30 +02001287 if (p->sched_class->migrate_task_rq)
xiaofeng.yan5a4fd032015-09-23 14:55:59 +08001288 p->sched_class->migrate_task_rq(p);
Peter Zijlstra0c697742009-12-22 15:43:19 +01001289 p->se.nr_migrations++;
Peter Zijlstraff303e62015-04-17 20:05:30 +02001290 perf_event_task_migrate(p);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07001291
1292 fixup_busy_time(p, new_cpu);
Peter Zijlstra0c697742009-12-22 15:43:19 +01001293 }
Ingo Molnardd41f592007-07-09 18:51:59 +02001294
1295 __set_task_cpu(p, new_cpu);
Ingo Molnarc65cc872007-07-09 18:51:58 +02001296}
1297
Peter Zijlstraac66f542013-10-07 11:29:16 +01001298static void __migrate_swap_task(struct task_struct *p, int cpu)
1299{
Kirill Tkhaida0c1e62014-08-20 13:47:32 +04001300 if (task_on_rq_queued(p)) {
Peter Zijlstraac66f542013-10-07 11:29:16 +01001301 struct rq *src_rq, *dst_rq;
1302
1303 src_rq = task_rq(p);
1304 dst_rq = cpu_rq(cpu);
1305
Joonwoo Park3ea94de2015-11-12 19:38:54 -08001306 p->on_rq = TASK_ON_RQ_MIGRATING;
Peter Zijlstraac66f542013-10-07 11:29:16 +01001307 deactivate_task(src_rq, p, 0);
1308 set_task_cpu(p, cpu);
1309 activate_task(dst_rq, p, 0);
Joonwoo Park3ea94de2015-11-12 19:38:54 -08001310 p->on_rq = TASK_ON_RQ_QUEUED;
Peter Zijlstraac66f542013-10-07 11:29:16 +01001311 check_preempt_curr(dst_rq, p, 0);
1312 } else {
1313 /*
1314 * Task isn't running anymore; make it appear like we migrated
1315 * it before it went to sleep. This means on wakeup we make the
Leo Yana1fd4652016-08-05 14:32:38 +08001316 * previous cpu our target instead of where it really is.
Peter Zijlstraac66f542013-10-07 11:29:16 +01001317 */
1318 p->wake_cpu = cpu;
1319 }
1320}
1321
1322struct migration_swap_arg {
1323 struct task_struct *src_task, *dst_task;
1324 int src_cpu, dst_cpu;
1325};
1326
1327static int migrate_swap_stop(void *data)
1328{
1329 struct migration_swap_arg *arg = data;
1330 struct rq *src_rq, *dst_rq;
1331 int ret = -EAGAIN;
1332
Peter Zijlstra62694cd2015-10-09 18:36:29 +02001333 if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
1334 return -EAGAIN;
1335
Peter Zijlstraac66f542013-10-07 11:29:16 +01001336 src_rq = cpu_rq(arg->src_cpu);
1337 dst_rq = cpu_rq(arg->dst_cpu);
1338
Peter Zijlstra74602312013-10-10 20:17:22 +02001339 double_raw_lock(&arg->src_task->pi_lock,
1340 &arg->dst_task->pi_lock);
Peter Zijlstraac66f542013-10-07 11:29:16 +01001341 double_rq_lock(src_rq, dst_rq);
Peter Zijlstra62694cd2015-10-09 18:36:29 +02001342
Peter Zijlstraac66f542013-10-07 11:29:16 +01001343 if (task_cpu(arg->dst_task) != arg->dst_cpu)
1344 goto unlock;
1345
1346 if (task_cpu(arg->src_task) != arg->src_cpu)
1347 goto unlock;
1348
1349 if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
1350 goto unlock;
1351
1352 if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
1353 goto unlock;
1354
1355 __migrate_swap_task(arg->src_task, arg->dst_cpu);
1356 __migrate_swap_task(arg->dst_task, arg->src_cpu);
1357
1358 ret = 0;
1359
1360unlock:
1361 double_rq_unlock(src_rq, dst_rq);
Peter Zijlstra74602312013-10-10 20:17:22 +02001362 raw_spin_unlock(&arg->dst_task->pi_lock);
1363 raw_spin_unlock(&arg->src_task->pi_lock);
Peter Zijlstraac66f542013-10-07 11:29:16 +01001364
1365 return ret;
1366}
1367
1368/*
1369 * Cross migrate two tasks
1370 */
1371int migrate_swap(struct task_struct *cur, struct task_struct *p)
1372{
1373 struct migration_swap_arg arg;
1374 int ret = -EINVAL;
1375
Peter Zijlstraac66f542013-10-07 11:29:16 +01001376 arg = (struct migration_swap_arg){
1377 .src_task = cur,
1378 .src_cpu = task_cpu(cur),
1379 .dst_task = p,
1380 .dst_cpu = task_cpu(p),
1381 };
1382
1383 if (arg.src_cpu == arg.dst_cpu)
1384 goto out;
1385
Peter Zijlstra6acce3e2013-10-11 14:38:20 +02001386 /*
1387 * These three tests are all lockless; this is OK since all of them
1388 * will be re-checked with proper locks held further down the line.
1389 */
Peter Zijlstraac66f542013-10-07 11:29:16 +01001390 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1391 goto out;
1392
1393 if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
1394 goto out;
1395
1396 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
1397 goto out;
1398
Mel Gorman286549d2014-01-21 15:51:03 -08001399 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
Peter Zijlstraac66f542013-10-07 11:29:16 +01001400 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1401
1402out:
Peter Zijlstraac66f542013-10-07 11:29:16 +01001403 return ret;
1404}
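
/*
 * Illustrative sketch (assumption): how a balancing site might pair two
 * tasks for migrate_swap(). "cur" is the task running here and "p" a
 * candidate on another CPU; both names and "ret" are placeholders.
 *
 *	if (cpumask_test_cpu(task_cpu(p), tsk_cpus_allowed(cur)) &&
 *	    cpumask_test_cpu(task_cpu(cur), tsk_cpus_allowed(p)))
 *		ret = migrate_swap(cur, p);	// 0, -EINVAL or -EAGAIN
 *
 * The affinity and cpu_active() checks in migrate_swap() are only a
 * lockless fast path; migrate_swap_stop() repeats them under both rq
 * locks before performing the actual swap.
 */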
1405
Linus Torvalds1da177e2005-04-16 15:20:36 -07001406/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001407 * wait_task_inactive - wait for a thread to unschedule.
1408 *
Roland McGrath85ba2d82008-07-25 19:45:58 -07001409 * If @match_state is nonzero, it's the @p->state value just checked and
1410 * not expected to change. If it changes, i.e. @p might have woken up,
1411 * then return zero. When we succeed in waiting for @p to be off its CPU,
1412 * we return a positive number (its total switch count). If a second call
1413 * a short while later returns the same number, the caller can be sure that
1414 * @p has remained unscheduled the whole time.
1415 *
Linus Torvalds1da177e2005-04-16 15:20:36 -07001416 * The caller must ensure that the task *will* unschedule sometime soon,
1417 * else this function might spin for a *long* time. This function can't
1418 * be called with interrupts off, or it may introduce deadlock with
1419 * smp_call_function() if an IPI is sent by the same process we are
1420 * waiting to become inactive.
1421 */
Roland McGrath85ba2d82008-07-25 19:45:58 -07001422unsigned long wait_task_inactive(struct task_struct *p, long match_state)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001423{
Kirill Tkhaida0c1e62014-08-20 13:47:32 +04001424 int running, queued;
Peter Zijlstraeb580752015-07-31 21:28:18 +02001425 struct rq_flags rf;
Roland McGrath85ba2d82008-07-25 19:45:58 -07001426 unsigned long ncsw;
Ingo Molnar70b97a72006-07-03 00:25:42 -07001427 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001428
Andi Kleen3a5c3592007-10-15 17:00:14 +02001429 for (;;) {
1430 /*
1431 * We do the initial early heuristics without holding
1432 * any task-queue locks at all. We'll only try to get
1433 * the runqueue lock when things look like they will
1434 * work out!
1435 */
1436 rq = task_rq(p);
Linus Torvaldsfa490cf2007-06-18 09:34:40 -07001437
Andi Kleen3a5c3592007-10-15 17:00:14 +02001438 /*
1439 * If the task is actively running on another CPU
1440 * still, just relax and busy-wait without holding
1441 * any locks.
1442 *
 1443		 * NOTE! Since we don't hold any locks, we can't even be
 1444		 * sure that "rq" stays as the right runqueue!
1445 * But we don't care, since "task_running()" will
1446 * return false if the runqueue has changed and p
1447 * is actually now running somewhere else!
1448 */
Roland McGrath85ba2d82008-07-25 19:45:58 -07001449 while (task_running(rq, p)) {
1450 if (match_state && unlikely(p->state != match_state))
1451 return 0;
Andi Kleen3a5c3592007-10-15 17:00:14 +02001452 cpu_relax();
Roland McGrath85ba2d82008-07-25 19:45:58 -07001453 }
Linus Torvaldsfa490cf2007-06-18 09:34:40 -07001454
Andi Kleen3a5c3592007-10-15 17:00:14 +02001455 /*
1456 * Ok, time to look more closely! We need the rq
1457 * lock now, to be *sure*. If we're wrong, we'll
1458 * just go back and repeat.
1459 */
Peter Zijlstraeb580752015-07-31 21:28:18 +02001460 rq = task_rq_lock(p, &rf);
Peter Zijlstra27a9da62010-05-04 20:36:56 +02001461 trace_sched_wait_task(p);
Andi Kleen3a5c3592007-10-15 17:00:14 +02001462 running = task_running(rq, p);
Kirill Tkhaida0c1e62014-08-20 13:47:32 +04001463 queued = task_on_rq_queued(p);
Roland McGrath85ba2d82008-07-25 19:45:58 -07001464 ncsw = 0;
Oleg Nesterovf31e11d2008-08-20 16:54:44 -07001465 if (!match_state || p->state == match_state)
Oleg Nesterov93dcf552008-08-20 16:54:44 -07001466 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
Peter Zijlstraeb580752015-07-31 21:28:18 +02001467 task_rq_unlock(rq, p, &rf);
Linus Torvaldsfa490cf2007-06-18 09:34:40 -07001468
Andi Kleen3a5c3592007-10-15 17:00:14 +02001469 /*
Roland McGrath85ba2d82008-07-25 19:45:58 -07001470 * If it changed from the expected state, bail out now.
1471 */
1472 if (unlikely(!ncsw))
1473 break;
1474
1475 /*
Andi Kleen3a5c3592007-10-15 17:00:14 +02001476 * Was it really running after all now that we
1477 * checked with the proper locks actually held?
1478 *
1479 * Oops. Go back and try again..
1480 */
1481 if (unlikely(running)) {
1482 cpu_relax();
1483 continue;
1484 }
1485
1486 /*
1487 * It's not enough that it's not actively running,
1488 * it must be off the runqueue _entirely_, and not
1489 * preempted!
1490 *
Luis Henriques80dd99b2009-03-16 19:58:09 +00001491 * So if it was still runnable (but just not actively
Andi Kleen3a5c3592007-10-15 17:00:14 +02001492 * running right now), it's preempted, and we should
1493 * yield - it could be a while.
1494 */
Kirill Tkhaida0c1e62014-08-20 13:47:32 +04001495 if (unlikely(queued)) {
Syed Rameez Mustafac9cce0c2017-01-12 21:31:07 -08001496 ktime_t to = ktime_set(0, NSEC_PER_MSEC);
Thomas Gleixner8eb90c32011-02-23 23:52:21 +00001497
1498 set_current_state(TASK_UNINTERRUPTIBLE);
1499 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
Andi Kleen3a5c3592007-10-15 17:00:14 +02001500 continue;
1501 }
1502
1503 /*
1504 * Ahh, all good. It wasn't running, and it wasn't
1505 * runnable, which means that it will never become
1506 * running in the future either. We're all done!
1507 */
1508 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001509 }
Roland McGrath85ba2d82008-07-25 19:45:58 -07001510
1511 return ncsw;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001512}
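
/*
 * Illustrative sketch (assumed caller, not from this file): the documented
 * "call twice and compare" usage of wait_task_inactive(). @p and the
 * TASK_TRACED state are placeholders for whatever the caller waits on.
 *
 *	unsigned long ncsw;
 *
 *	ncsw = wait_task_inactive(p, TASK_TRACED);
 *	if (!ncsw)
 *		return -ESRCH;	// state changed; @p may have woken up
 *
 *	// ... later, to check that @p never ran in between ...
 *	if (wait_task_inactive(p, TASK_TRACED) == ncsw)
 *		;		// @p stayed unscheduled the whole time
 */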
1513
1514/***
1515 * kick_process - kick a running thread to enter/exit the kernel
1516 * @p: the to-be-kicked thread
1517 *
1518 * Cause a process which is running on another CPU to enter
1519 * kernel-mode, without any delay. (to get signals handled.)
1520 *
Lucas De Marchi25985ed2011-03-30 22:57:33 -03001521 * NOTE: this function doesn't have to take the runqueue lock,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001522 * because all it wants to ensure is that the remote task enters
1523 * the kernel. If the IPI races and the task has been migrated
1524 * to another CPU then no harm is done and the purpose has been
1525 * achieved as well.
1526 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001527void kick_process(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001528{
1529 int cpu;
1530
1531 preempt_disable();
1532 cpu = task_cpu(p);
1533 if ((cpu != smp_processor_id()) && task_curr(p))
1534 smp_send_reschedule(cpu);
1535 preempt_enable();
1536}
Rusty Russellb43e3522009-06-12 22:27:00 -06001537EXPORT_SYMBOL_GPL(kick_process);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001538
Oleg Nesterov30da6882010-03-15 10:10:19 +01001539/*
Peter Zijlstra013fdb82011-04-05 17:23:45 +02001540 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
Peter Zijlstra (Intel)e9d867a2016-03-10 12:54:08 +01001541 *
1542 * A few notes on cpu_active vs cpu_online:
1543 *
1544 * - cpu_active must be a subset of cpu_online
1545 *
1546 * - on cpu-up we allow per-cpu kthreads on the online && !active cpu,
1547 * see __set_cpus_allowed_ptr(). At this point the newly online
1548 * cpu isn't yet part of the sched domains, and balancing will not
1549 * see it.
1550 *
1551 * - on cpu-down we clear cpu_active() to mask the sched domains and
 1552 *   prevent the load balancer from placing new tasks on the soon to be
 1553 *   removed cpu. Existing tasks will remain running there and will be
 1554 *   taken off.
1555 *
 1556 * This means that fallback selection must not select !active CPUs,
 1557 * and can assume that any active CPU must be online. Conversely
1558 * select_task_rq() below may allow selection of !active CPUs in order
1559 * to satisfy the above rules.
Oleg Nesterov30da6882010-03-15 10:10:19 +01001560 */
Olav Haugan3f2cb302016-05-31 14:34:46 -07001561static int select_fallback_rq(int cpu, struct task_struct *p, bool allow_iso)
Peter Zijlstra5da9a0f2009-12-16 18:04:38 +01001562{
Tang Chenaa00d892013-02-22 16:33:33 -08001563 int nid = cpu_to_node(cpu);
1564 const struct cpumask *nodemask = NULL;
Syed Rameez Mustafa1855bff2016-09-30 17:21:40 -07001565 enum { cpuset, possible, fail, bug } state = cpuset;
Peter Zijlstra2baab4e2012-03-20 15:57:01 +01001566 int dest_cpu;
Olav Haugan3f2cb302016-05-31 14:34:46 -07001567 int isolated_candidate = -1;
Peter Zijlstra5da9a0f2009-12-16 18:04:38 +01001568
Tang Chenaa00d892013-02-22 16:33:33 -08001569 /*
1570 * If the node that the cpu is on has been offlined, cpu_to_node()
1571 * will return -1. There is no cpu on the node, and we should
1572 * select the cpu on the other node.
1573 */
1574 if (nid != -1) {
1575 nodemask = cpumask_of_node(nid);
1576
1577 /* Look for allowed, online CPU in same node. */
1578 for_each_cpu(dest_cpu, nodemask) {
Tang Chenaa00d892013-02-22 16:33:33 -08001579 if (!cpu_active(dest_cpu))
1580 continue;
Olav Haugan3f2cb302016-05-31 14:34:46 -07001581 if (cpu_isolated(dest_cpu))
1582 continue;
Tang Chenaa00d892013-02-22 16:33:33 -08001583 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1584 return dest_cpu;
1585 }
Peter Zijlstra2baab4e2012-03-20 15:57:01 +01001586 }
Peter Zijlstra5da9a0f2009-12-16 18:04:38 +01001587
Peter Zijlstra2baab4e2012-03-20 15:57:01 +01001588 for (;;) {
1589 /* Any allowed, online CPU? */
Srivatsa S. Bhate3831ed2012-03-30 19:40:28 +05301590 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
Tejun Heofeb245e2016-06-16 15:35:04 -04001591 if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu))
1592 continue;
1593 if (!cpu_online(dest_cpu))
Peter Zijlstra2baab4e2012-03-20 15:57:01 +01001594 continue;
Olav Haugan3f2cb302016-05-31 14:34:46 -07001595 if (cpu_isolated(dest_cpu)) {
1596 if (allow_iso)
1597 isolated_candidate = dest_cpu;
1598 continue;
1599 }
1600 goto out;
1601 }
1602
1603 if (isolated_candidate != -1) {
1604 dest_cpu = isolated_candidate;
Peter Zijlstra2baab4e2012-03-20 15:57:01 +01001605 goto out;
1606 }
Peter Zijlstra5da9a0f2009-12-16 18:04:38 +01001607
Oleg Nesterove73e85f2015-10-10 20:53:15 +02001608 /* No more Mr. Nice Guy. */
Peter Zijlstra2baab4e2012-03-20 15:57:01 +01001609 switch (state) {
1610 case cpuset:
Oleg Nesterove73e85f2015-10-10 20:53:15 +02001611 if (IS_ENABLED(CONFIG_CPUSETS)) {
1612 cpuset_cpus_allowed_fallback(p);
1613 state = possible;
1614 break;
1615 }
1616 /* fall-through */
Peter Zijlstra2baab4e2012-03-20 15:57:01 +01001617 case possible:
1618 do_set_cpus_allowed(p, cpu_possible_mask);
1619 state = fail;
1620 break;
1621
1622 case fail:
Syed Rameez Mustafa1855bff2016-09-30 17:21:40 -07001623 allow_iso = true;
1624 state = bug;
1625 break;
1626
1627 case bug:
Peter Zijlstra2baab4e2012-03-20 15:57:01 +01001628 BUG();
1629 break;
1630 }
1631 }
1632
1633out:
1634 if (state != cpuset) {
1635 /*
1636 * Don't tell them about moving exiting tasks or
1637 * kernel threads (both mm NULL), since they never
 1638		 * leave the kernel.
1639 */
1640 if (p->mm && printk_ratelimit()) {
John Stultzaac74dc2014-06-04 16:11:40 -07001641 printk_deferred("process %d (%s) no longer affine to cpu%d\n",
Peter Zijlstra2baab4e2012-03-20 15:57:01 +01001642 task_pid_nr(p), p->comm, cpu);
1643 }
Peter Zijlstra5da9a0f2009-12-16 18:04:38 +01001644 }
1645
1646 return dest_cpu;
1647}
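
/*
 * Summary of the escalation above: select_fallback_rq() first widens the
 * task's cpuset via cpuset_cpus_allowed_fallback(), then falls back to
 * cpu_possible_mask, then (specific to this kernel) permits isolated CPUs,
 * and only BUG()s once even that yields no runnable destination.
 */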
1648
Peter Zijlstrae2912002009-12-16 18:04:36 +01001649/*
Peter Zijlstra013fdb82011-04-05 17:23:45 +02001650 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
Peter Zijlstrae2912002009-12-16 18:04:36 +01001651 */
Peter Zijlstra970b13b2009-11-25 13:31:39 +01001652static inline
Peter Zijlstraac66f542013-10-07 11:29:16 +01001653int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
Peter Zijlstra970b13b2009-11-25 13:31:39 +01001654{
Olav Haugan3f2cb302016-05-31 14:34:46 -07001655 bool allow_isolated = (p->flags & PF_KTHREAD);
1656
Peter Zijlstracbce1a62015-06-11 14:46:54 +02001657 lockdep_assert_held(&p->pi_lock);
1658
Thomas Gleixner50605ff2016-05-11 14:23:31 +02001659 if (tsk_nr_cpus_allowed(p) > 1)
Wanpeng Li6c1d9412014-11-05 09:14:37 +08001660 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
Peter Zijlstra (Intel)e9d867a2016-03-10 12:54:08 +01001661 else
1662 cpu = cpumask_any(tsk_cpus_allowed(p));
Peter Zijlstrae2912002009-12-16 18:04:36 +01001663
1664 /*
1665 * In order not to call set_task_cpu() on a blocking task we need
1666 * to rely on ttwu() to place the task on a valid ->cpus_allowed
1667 * cpu.
1668 *
1669 * Since this is common to all placement strategies, this lives here.
1670 *
1671 * [ this allows ->select_task() to simply return task_cpu(p) and
1672 * not worry about this generic constraint ]
1673 */
Peter Zijlstrafa17b502011-06-16 12:23:22 +02001674 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
Olav Haugan3f2cb302016-05-31 14:34:46 -07001675 !cpu_online(cpu)) ||
1676 (cpu_isolated(cpu) && !allow_isolated))
1677 cpu = select_fallback_rq(task_cpu(p), p, allow_isolated);
Peter Zijlstrae2912002009-12-16 18:04:36 +01001678
1679 return cpu;
Peter Zijlstra970b13b2009-11-25 13:31:39 +01001680}
Mike Galbraith09a40af2010-04-15 07:29:59 +02001681
Pavankumar Kondetid4127502017-07-20 08:56:15 +05301682static void update_avg(u64 *avg, u64 sample)
Mike Galbraith09a40af2010-04-15 07:29:59 +02001683{
1684 s64 diff = sample - *avg;
1685 *avg += diff >> 3;
1686}
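
/*
 * update_avg() above is an exponentially weighted moving average: the
 * ">> 3" gives the new sample a weight of 1/8. Worked example with assumed
 * values: *avg == 800 and sample == 1600 give diff == 800, so *avg becomes
 * 800 + (800 >> 3) == 900, i.e. the average moves one eighth of the way
 * towards each new sample.
 */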
Peter Zijlstra25834c72015-05-15 17:43:34 +02001687
1688#else
1689
1690static inline int __set_cpus_allowed_ptr(struct task_struct *p,
1691 const struct cpumask *new_mask, bool check)
1692{
1693 return set_cpus_allowed_ptr(p, new_mask);
1694}
1695
Peter Zijlstra5cc389b2015-06-11 14:46:50 +02001696#endif /* CONFIG_SMP */
Peter Zijlstra970b13b2009-11-25 13:31:39 +01001697
Peter Zijlstrad7c01d22011-04-05 17:23:43 +02001698static void
Peter Zijlstrab84cb5d2011-04-05 17:23:55 +02001699ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
Tejun Heo9ed38112009-12-03 15:08:03 +09001700{
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05001701 struct rq *rq;
1702
1703 if (!schedstat_enabled())
1704 return;
1705
1706 rq = this_rq();
Tejun Heo9ed38112009-12-03 15:08:03 +09001707
Peter Zijlstrad7c01d22011-04-05 17:23:43 +02001708#ifdef CONFIG_SMP
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05001709 if (cpu == rq->cpu) {
Josh Poimboeufae928822016-06-17 12:43:24 -05001710 schedstat_inc(rq->ttwu_local);
1711 schedstat_inc(p->se.statistics.nr_wakeups_local);
Peter Zijlstrad7c01d22011-04-05 17:23:43 +02001712 } else {
1713 struct sched_domain *sd;
1714
Josh Poimboeufae928822016-06-17 12:43:24 -05001715 schedstat_inc(p->se.statistics.nr_wakeups_remote);
Peter Zijlstra057f3fa2011-04-18 11:24:34 +02001716 rcu_read_lock();
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05001717 for_each_domain(rq->cpu, sd) {
Peter Zijlstrad7c01d22011-04-05 17:23:43 +02001718 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
Josh Poimboeufae928822016-06-17 12:43:24 -05001719 schedstat_inc(sd->ttwu_wake_remote);
Peter Zijlstrad7c01d22011-04-05 17:23:43 +02001720 break;
1721 }
1722 }
Peter Zijlstra057f3fa2011-04-18 11:24:34 +02001723 rcu_read_unlock();
Peter Zijlstrad7c01d22011-04-05 17:23:43 +02001724 }
Peter Zijlstraf339b9d2011-05-31 10:49:20 +02001725
1726 if (wake_flags & WF_MIGRATED)
Josh Poimboeufae928822016-06-17 12:43:24 -05001727 schedstat_inc(p->se.statistics.nr_wakeups_migrate);
Peter Zijlstrad7c01d22011-04-05 17:23:43 +02001728#endif /* CONFIG_SMP */
1729
Josh Poimboeufae928822016-06-17 12:43:24 -05001730 schedstat_inc(rq->ttwu_count);
1731 schedstat_inc(p->se.statistics.nr_wakeups);
Peter Zijlstrad7c01d22011-04-05 17:23:43 +02001732
1733 if (wake_flags & WF_SYNC)
Josh Poimboeufae928822016-06-17 12:43:24 -05001734 schedstat_inc(p->se.statistics.nr_wakeups_sync);
Tejun Heo9ed38112009-12-03 15:08:03 +09001735}
1736
Peter Zijlstra1de64442015-09-30 17:44:13 +02001737static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
Tejun Heo9ed38112009-12-03 15:08:03 +09001738{
Tejun Heo9ed38112009-12-03 15:08:03 +09001739 activate_task(rq, p, en_flags);
Kirill Tkhaida0c1e62014-08-20 13:47:32 +04001740 p->on_rq = TASK_ON_RQ_QUEUED;
Peter Zijlstrac2f71152011-04-13 13:28:56 +02001741
1742 /* if a worker is waking up, notify workqueue */
1743 if (p->flags & PF_WQ_WORKER)
1744 wq_worker_waking_up(p, cpu_of(rq));
Tejun Heo9ed38112009-12-03 15:08:03 +09001745}
1746
Peter Zijlstra23f41ee2011-04-05 17:23:56 +02001747/*
1748 * Mark the task runnable and perform wakeup-preemption.
1749 */
Peter Zijlstrae7904a22015-08-01 19:25:08 +02001750static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
Matt Fleming5a91d732016-09-21 14:38:10 +01001751 struct rq_flags *rf)
Tejun Heo9ed38112009-12-03 15:08:03 +09001752{
Tejun Heo9ed38112009-12-03 15:08:03 +09001753 check_preempt_curr(rq, p, wake_flags);
Tejun Heo9ed38112009-12-03 15:08:03 +09001754 p->state = TASK_RUNNING;
Peter Zijlstrafbd705a2015-06-09 11:13:36 +02001755 trace_sched_wakeup(p);
1756
Tejun Heo9ed38112009-12-03 15:08:03 +09001757#ifdef CONFIG_SMP
Peter Zijlstra4c9a4bc2015-06-11 14:46:39 +02001758 if (p->sched_class->task_woken) {
1759 /*
Peter Zijlstracbce1a62015-06-11 14:46:54 +02001760		 * Our task @p is fully woken up and running; so it's safe to
1761 * drop the rq->lock, hereafter rq is only used for statistics.
Peter Zijlstra4c9a4bc2015-06-11 14:46:39 +02001762 */
Matt Fleming5a91d732016-09-21 14:38:10 +01001763 rq_unpin_lock(rq, rf);
Tejun Heo9ed38112009-12-03 15:08:03 +09001764 p->sched_class->task_woken(rq, p);
Matt Fleming5a91d732016-09-21 14:38:10 +01001765 rq_repin_lock(rq, rf);
Peter Zijlstra4c9a4bc2015-06-11 14:46:39 +02001766 }
Tejun Heo9ed38112009-12-03 15:08:03 +09001767
Steven Rostedte69c6342010-12-06 17:10:31 -05001768 if (rq->idle_stamp) {
Frederic Weisbecker78becc22013-04-12 01:51:02 +02001769 u64 delta = rq_clock(rq) - rq->idle_stamp;
Jason Low9bd721c2013-09-13 11:26:52 -07001770 u64 max = 2*rq->max_idle_balance_cost;
Tejun Heo9ed38112009-12-03 15:08:03 +09001771
Jason Lowabfafa52013-09-13 11:26:51 -07001772 update_avg(&rq->avg_idle, delta);
1773
1774 if (rq->avg_idle > max)
Tejun Heo9ed38112009-12-03 15:08:03 +09001775 rq->avg_idle = max;
Jason Lowabfafa52013-09-13 11:26:51 -07001776
Tejun Heo9ed38112009-12-03 15:08:03 +09001777 rq->idle_stamp = 0;
1778 }
1779#endif
1780}
1781
Peter Zijlstrac05fbaf2011-04-05 17:23:57 +02001782static void
Peter Zijlstrae7904a22015-08-01 19:25:08 +02001783ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
Matt Fleming5a91d732016-09-21 14:38:10 +01001784 struct rq_flags *rf)
Peter Zijlstrac05fbaf2011-04-05 17:23:57 +02001785{
Peter Zijlstrab5179ac2016-05-11 16:10:34 +02001786 int en_flags = ENQUEUE_WAKEUP;
1787
Peter Zijlstracbce1a62015-06-11 14:46:54 +02001788 lockdep_assert_held(&rq->lock);
1789
Peter Zijlstrac05fbaf2011-04-05 17:23:57 +02001790#ifdef CONFIG_SMP
1791 if (p->sched_contributes_to_load)
1792 rq->nr_uninterruptible--;
Peter Zijlstrab5179ac2016-05-11 16:10:34 +02001793
Peter Zijlstrab5179ac2016-05-11 16:10:34 +02001794 if (wake_flags & WF_MIGRATED)
Peter Zijlstra59efa0b2016-05-10 18:24:37 +02001795 en_flags |= ENQUEUE_MIGRATED;
Peter Zijlstrac05fbaf2011-04-05 17:23:57 +02001796#endif
1797
Peter Zijlstrab5179ac2016-05-11 16:10:34 +02001798 ttwu_activate(rq, p, en_flags);
Matt Fleming5a91d732016-09-21 14:38:10 +01001799 ttwu_do_wakeup(rq, p, wake_flags, rf);
Peter Zijlstrac05fbaf2011-04-05 17:23:57 +02001800}
1801
1802/*
 1803 * Called in case the task @p isn't fully descheduled from its runqueue;
 1804 * in this case we must do a remote wakeup. It's a 'light' wakeup though,
 1805 * since all we need to do is flip p->state to TASK_RUNNING, because
 1806 * the task is still ->on_rq.
1807 */
1808static int ttwu_remote(struct task_struct *p, int wake_flags)
1809{
Peter Zijlstraeb580752015-07-31 21:28:18 +02001810 struct rq_flags rf;
Peter Zijlstrac05fbaf2011-04-05 17:23:57 +02001811 struct rq *rq;
1812 int ret = 0;
1813
Peter Zijlstraeb580752015-07-31 21:28:18 +02001814 rq = __task_rq_lock(p, &rf);
Kirill Tkhaida0c1e62014-08-20 13:47:32 +04001815 if (task_on_rq_queued(p)) {
Frederic Weisbecker1ad4ec02013-04-12 01:51:00 +02001816 /* check_preempt_curr() may use rq clock */
1817 update_rq_clock(rq);
Matt Fleming5a91d732016-09-21 14:38:10 +01001818 ttwu_do_wakeup(rq, p, wake_flags, &rf);
Peter Zijlstrac05fbaf2011-04-05 17:23:57 +02001819 ret = 1;
1820 }
Peter Zijlstraeb580752015-07-31 21:28:18 +02001821 __task_rq_unlock(rq, &rf);
Peter Zijlstrac05fbaf2011-04-05 17:23:57 +02001822
1823 return ret;
1824}
1825
Peter Zijlstra317f3942011-04-05 17:23:58 +02001826#ifdef CONFIG_SMP
Peter Zijlstrae3baac42014-06-04 10:31:18 -07001827void sched_ttwu_pending(void)
Peter Zijlstra317f3942011-04-05 17:23:58 +02001828{
1829 struct rq *rq = this_rq();
Peter Zijlstrafa14ff42011-09-12 13:06:17 +02001830 struct llist_node *llist = llist_del_all(&rq->wake_list);
1831 struct task_struct *p;
Peter Zijlstrae3baac42014-06-04 10:31:18 -07001832 unsigned long flags;
Matt Fleming5a91d732016-09-21 14:38:10 +01001833 struct rq_flags rf;
Peter Zijlstra317f3942011-04-05 17:23:58 +02001834
Peter Zijlstrae3baac42014-06-04 10:31:18 -07001835 if (!llist)
1836 return;
1837
1838 raw_spin_lock_irqsave(&rq->lock, flags);
Matt Fleming5a91d732016-09-21 14:38:10 +01001839 rq_pin_lock(rq, &rf);
Peter Zijlstra317f3942011-04-05 17:23:58 +02001840
Peter Zijlstrafa14ff42011-09-12 13:06:17 +02001841 while (llist) {
Peter Zijlstrab7e7ade2016-05-23 11:19:07 +02001842 int wake_flags = 0;
1843
Peter Zijlstrafa14ff42011-09-12 13:06:17 +02001844 p = llist_entry(llist, struct task_struct, wake_entry);
1845 llist = llist_next(llist);
Peter Zijlstrab7e7ade2016-05-23 11:19:07 +02001846
1847 if (p->sched_remote_wakeup)
1848 wake_flags = WF_MIGRATED;
1849
Matt Fleming5a91d732016-09-21 14:38:10 +01001850 ttwu_do_activate(rq, p, wake_flags, &rf);
Peter Zijlstra317f3942011-04-05 17:23:58 +02001851 }
1852
Matt Fleming5a91d732016-09-21 14:38:10 +01001853 rq_unpin_lock(rq, &rf);
Peter Zijlstrae3baac42014-06-04 10:31:18 -07001854 raw_spin_unlock_irqrestore(&rq->lock, flags);
Peter Zijlstra317f3942011-04-05 17:23:58 +02001855}
1856
1857void scheduler_ipi(void)
1858{
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07001859 int cpu = smp_processor_id();
1860
Peter Zijlstraf27dde82013-08-14 14:55:31 +02001861 /*
1862 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
1863 * TIF_NEED_RESCHED remotely (for the first time) will also send
1864 * this IPI.
1865 */
Peter Zijlstra8cb75e02013-11-20 12:22:37 +01001866 preempt_fold_need_resched();
Peter Zijlstraf27dde82013-08-14 14:55:31 +02001867
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07001868 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick() &&
1869 !got_boost_kick())
Peter Zijlstrac5d753a2011-07-19 15:07:25 -07001870 return;
1871
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07001872 if (got_boost_kick()) {
1873 struct rq *rq = cpu_rq(cpu);
1874
1875 if (rq->curr->sched_class == &fair_sched_class)
1876 check_for_migration(rq, rq->curr);
1877 clear_boost_kick(cpu);
1878 }
1879
Peter Zijlstrac5d753a2011-07-19 15:07:25 -07001880 /*
1881 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
1882 * traditionally all their work was done from the interrupt return
1883 * path. Now that we actually do some work, we need to make sure
1884 * we do call them.
1885 *
1886 * Some archs already do call them, luckily irq_enter/exit nest
1887 * properly.
1888 *
1889 * Arguably we should visit all archs and update all handlers,
1890 * however a fair share of IPIs are still resched only so this would
1891 * somewhat pessimize the simple resched case.
1892 */
1893 irq_enter();
Peter Zijlstrafa14ff42011-09-12 13:06:17 +02001894 sched_ttwu_pending();
Suresh Siddhaca380622011-10-03 15:09:00 -07001895
1896 /*
1897 * Check if someone kicked us for doing the nohz idle load balance.
1898 */
Olav Haugand67250b2016-11-01 17:30:36 -07001899 if (unlikely(got_nohz_idle_kick()) && !cpu_isolated(cpu)) {
Suresh Siddha6eb57e02011-10-03 15:09:01 -07001900 this_rq()->idle_balance = 1;
Suresh Siddhaca380622011-10-03 15:09:00 -07001901 raise_softirq_irqoff(SCHED_SOFTIRQ);
Suresh Siddha6eb57e02011-10-03 15:09:01 -07001902 }
Peter Zijlstrac5d753a2011-07-19 15:07:25 -07001903 irq_exit();
Peter Zijlstra317f3942011-04-05 17:23:58 +02001904}
1905
Peter Zijlstrab7e7ade2016-05-23 11:19:07 +02001906static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
Peter Zijlstra317f3942011-04-05 17:23:58 +02001907{
Peter Zijlstrae3baac42014-06-04 10:31:18 -07001908 struct rq *rq = cpu_rq(cpu);
1909
Peter Zijlstrab7e7ade2016-05-23 11:19:07 +02001910 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
1911
Peter Zijlstrae3baac42014-06-04 10:31:18 -07001912 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
1913 if (!set_nr_if_polling(rq->idle))
1914 smp_send_reschedule(cpu);
1915 else
1916 trace_sched_wake_idle_without_ipi(cpu);
1917 }
Peter Zijlstra317f3942011-04-05 17:23:58 +02001918}
Peter Zijlstrad6aa8f82011-05-26 14:21:33 +02001919
Chuansheng Liuf6be8af2014-09-04 15:17:53 +08001920void wake_up_if_idle(int cpu)
1921{
1922 struct rq *rq = cpu_rq(cpu);
1923 unsigned long flags;
1924
Andy Lutomirskifd7de1e2014-11-29 08:13:51 -08001925 rcu_read_lock();
1926
1927 if (!is_idle_task(rcu_dereference(rq->curr)))
1928 goto out;
Chuansheng Liuf6be8af2014-09-04 15:17:53 +08001929
1930 if (set_nr_if_polling(rq->idle)) {
1931 trace_sched_wake_idle_without_ipi(cpu);
1932 } else {
1933 raw_spin_lock_irqsave(&rq->lock, flags);
1934 if (is_idle_task(rq->curr))
1935 smp_send_reschedule(cpu);
1936 /* Else cpu is not in idle, do nothing here */
1937 raw_spin_unlock_irqrestore(&rq->lock, flags);
1938 }
Andy Lutomirskifd7de1e2014-11-29 08:13:51 -08001939
1940out:
1941 rcu_read_unlock();
Chuansheng Liuf6be8af2014-09-04 15:17:53 +08001942}
1943
Peter Zijlstra39be3502012-01-26 12:44:34 +01001944bool cpus_share_cache(int this_cpu, int that_cpu)
Peter Zijlstra518cd622011-12-07 15:07:31 +01001945{
1946 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1947}
Peter Zijlstrad6aa8f82011-05-26 14:21:33 +02001948#endif /* CONFIG_SMP */
Peter Zijlstra317f3942011-04-05 17:23:58 +02001949
Peter Zijlstrab5179ac2016-05-11 16:10:34 +02001950static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
Peter Zijlstrac05fbaf2011-04-05 17:23:57 +02001951{
1952 struct rq *rq = cpu_rq(cpu);
Matt Fleming5a91d732016-09-21 14:38:10 +01001953 struct rq_flags rf;
Peter Zijlstrac05fbaf2011-04-05 17:23:57 +02001954
Daniel Hellstrom17d9f312011-05-20 04:01:10 +00001955#if defined(CONFIG_SMP)
Peter Zijlstra39be3502012-01-26 12:44:34 +01001956 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
Peter Zijlstraf01114c2011-05-31 12:26:55 +02001957 sched_clock_cpu(cpu); /* sync clocks x-cpu */
Peter Zijlstrab7e7ade2016-05-23 11:19:07 +02001958 ttwu_queue_remote(p, cpu, wake_flags);
Peter Zijlstra317f3942011-04-05 17:23:58 +02001959 return;
1960 }
1961#endif
1962
Peter Zijlstrac05fbaf2011-04-05 17:23:57 +02001963 raw_spin_lock(&rq->lock);
Matt Fleming5a91d732016-09-21 14:38:10 +01001964 rq_pin_lock(rq, &rf);
1965 ttwu_do_activate(rq, p, wake_flags, &rf);
1966 rq_unpin_lock(rq, &rf);
Peter Zijlstrac05fbaf2011-04-05 17:23:57 +02001967 raw_spin_unlock(&rq->lock);
Tejun Heo9ed38112009-12-03 15:08:03 +09001968}
1969
Peter Zijlstra8643cda2015-11-17 19:01:11 +01001970/*
1971 * Notes on Program-Order guarantees on SMP systems.
1972 *
1973 * MIGRATION
1974 *
1975 * The basic program-order guarantee on SMP systems is that when a task [t]
1976 * migrates, all its activity on its old cpu [c0] happens-before any subsequent
1977 * execution on its new cpu [c1].
1978 *
1979 * For migration (of runnable tasks) this is provided by the following means:
1980 *
1981 * A) UNLOCK of the rq(c0)->lock scheduling out task t
1982 * B) migration for t is required to synchronize *both* rq(c0)->lock and
1983 * rq(c1)->lock (if not at the same time, then in that order).
1984 * C) LOCK of the rq(c1)->lock scheduling in task
1985 *
1986 * Transitivity guarantees that B happens after A and C after B.
1987 * Note: we only require RCpc transitivity.
1988 * Note: the cpu doing B need not be c0 or c1
1989 *
1990 * Example:
1991 *
1992 * CPU0 CPU1 CPU2
1993 *
1994 * LOCK rq(0)->lock
1995 * sched-out X
1996 * sched-in Y
1997 * UNLOCK rq(0)->lock
1998 *
1999 * LOCK rq(0)->lock // orders against CPU0
2000 * dequeue X
2001 * UNLOCK rq(0)->lock
2002 *
2003 * LOCK rq(1)->lock
2004 * enqueue X
2005 * UNLOCK rq(1)->lock
2006 *
2007 * LOCK rq(1)->lock // orders against CPU2
2008 * sched-out Z
2009 * sched-in X
2010 * UNLOCK rq(1)->lock
2011 *
2012 *
2013 * BLOCKING -- aka. SLEEP + WAKEUP
2014 *
2015 * For blocking we (obviously) need to provide the same guarantee as for
2016 * migration. However the means are completely different as there is no lock
2017 * chain to provide order. Instead we do:
2018 *
2019 * 1) smp_store_release(X->on_cpu, 0)
Peter Zijlstra1f03e8d2016-04-04 10:57:12 +02002020 * 2) smp_cond_load_acquire(!X->on_cpu)
Peter Zijlstra8643cda2015-11-17 19:01:11 +01002021 *
2022 * Example:
2023 *
2024 * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule)
2025 *
2026 * LOCK rq(0)->lock LOCK X->pi_lock
2027 * dequeue X
2028 * sched-out X
2029 * smp_store_release(X->on_cpu, 0);
2030 *
Peter Zijlstra1f03e8d2016-04-04 10:57:12 +02002031 * smp_cond_load_acquire(&X->on_cpu, !VAL);
Peter Zijlstra8643cda2015-11-17 19:01:11 +01002032 * X->state = WAKING
2033 * set_task_cpu(X,2)
2034 *
2035 * LOCK rq(2)->lock
2036 * enqueue X
2037 * X->state = RUNNING
2038 * UNLOCK rq(2)->lock
2039 *
2040 * LOCK rq(2)->lock // orders against CPU1
2041 * sched-out Z
2042 * sched-in X
2043 * UNLOCK rq(2)->lock
2044 *
2045 * UNLOCK X->pi_lock
2046 * UNLOCK rq(0)->lock
2047 *
2048 *
2049 * However; for wakeups there is a second guarantee we must provide, namely we
 2050 * must observe the state that led to our wakeup. That is, not only must our
2051 * task observe its own prior state, it must also observe the stores prior to
2052 * its wakeup.
2053 *
2054 * This means that any means of doing remote wakeups must order the CPU doing
2055 * the wakeup against the CPU the task is going to end up running on. This,
2056 * however, is already required for the regular Program-Order guarantee above,
Peter Zijlstra1f03e8d2016-04-04 10:57:12 +02002057 * since the waking CPU is the one issuing the ACQUIRE (smp_cond_load_acquire).
Peter Zijlstra8643cda2015-11-17 19:01:11 +01002058 *
2059 */
2060
Tejun Heo9ed38112009-12-03 15:08:03 +09002061/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07002062 * try_to_wake_up - wake up a thread
Tejun Heo9ed38112009-12-03 15:08:03 +09002063 * @p: the thread to be awakened
Linus Torvalds1da177e2005-04-16 15:20:36 -07002064 * @state: the mask of task states that can be woken
Tejun Heo9ed38112009-12-03 15:08:03 +09002065 * @wake_flags: wake modifier flags (WF_*)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002066 *
2067 * Put it on the run-queue if it's not already there. The "current"
2068 * thread is always on the run-queue (except when the actual
2069 * re-schedule is in progress), and as such you're allowed to do
2070 * the simpler "current->state = TASK_RUNNING" to mark yourself
2071 * runnable without the overhead of this.
2072 *
Yacine Belkadie69f6182013-07-12 20:45:47 +02002073 * Return: %true if @p was woken up, %false if it was already running
Tejun Heo9ed38112009-12-03 15:08:03 +09002074 * or @state didn't match @p's state.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002075 */
Peter Zijlstrae4a52bc2011-04-05 17:23:54 +02002076static int
2077try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002078{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002079 unsigned long flags;
Peter Zijlstrac05fbaf2011-04-05 17:23:57 +02002080 int cpu, success = 0;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07002081#ifdef CONFIG_SMP
2082 unsigned int old_load;
2083 struct rq *rq;
2084 u64 wallclock;
2085 struct related_thread_group *grp = NULL;
2086 int src_cpu;
2087 bool notif_required = false;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07002088 bool check_group = false;
Joonwoo Park432cf0bb2016-11-30 15:00:16 -08002089#endif
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07002090
Oleg Nesterove0acd0a2013-08-12 18:14:00 +02002091 /*
2092 * If we are going to wake up a thread waiting for CONDITION we
 2093	 * need to ensure that CONDITION=1 done by the caller cannot be
 2094	 * reordered with the p->state check below. This pairs with the mb() in
 2095	 * set_current_state() that the waiting thread does.
2096 */
2097 smp_mb__before_spinlock();
Peter Zijlstra013fdb82011-04-05 17:23:45 +02002098 raw_spin_lock_irqsave(&p->pi_lock, flags);
Peter Zijlstrae9c84312009-09-15 14:43:03 +02002099 if (!(p->state & state))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002100 goto out;
2101
Peter Zijlstrafbd705a2015-06-09 11:13:36 +02002102 trace_sched_waking(p);
2103
Peter Zijlstrac05fbaf2011-04-05 17:23:57 +02002104 success = 1; /* we're going to change ->state */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002105 cpu = task_cpu(p);
Peter Zijlstrad7c01d22011-04-05 17:23:43 +02002106
Balbir Singh135e8c92016-09-05 13:16:40 +10002107 /*
2108 * Ensure we load p->on_rq _after_ p->state, otherwise it would
2109 * be possible to, falsely, observe p->on_rq == 0 and get stuck
2110 * in smp_cond_load_acquire() below.
2111 *
2112 * sched_ttwu_pending() try_to_wake_up()
2113 * [S] p->on_rq = 1; [L] P->state
2114 * UNLOCK rq->lock -----.
2115 * \
2116 * +--- RMB
2117 * schedule() /
2118 * LOCK rq->lock -----'
2119 * UNLOCK rq->lock
2120 *
2121 * [task p]
2122 * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq
2123 *
2124 * Pairs with the UNLOCK+LOCK on rq->lock from the
2125 * last wakeup of our task and the schedule that got our task
2126 * current.
2127 */
2128 smp_rmb();
Peter Zijlstrac05fbaf2011-04-05 17:23:57 +02002129 if (p->on_rq && ttwu_remote(p, wake_flags))
2130 goto stat;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002131
2132#ifdef CONFIG_SMP
Peter Zijlstrae9c84312009-09-15 14:43:03 +02002133 /*
Peter Zijlstraecf7d012015-10-07 14:14:13 +02002134 * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
2135 * possible to, falsely, observe p->on_cpu == 0.
2136 *
2137 * One must be running (->on_cpu == 1) in order to remove oneself
2138 * from the runqueue.
2139 *
2140 * [S] ->on_cpu = 1; [L] ->on_rq
2141 * UNLOCK rq->lock
2142 * RMB
2143 * LOCK rq->lock
2144 * [S] ->on_rq = 0; [L] ->on_cpu
2145 *
2146 * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock
2147 * from the consecutive calls to schedule(); the first switching to our
2148 * task, the second putting it to sleep.
2149 */
2150 smp_rmb();
2151
2152 /*
Peter Zijlstrac05fbaf2011-04-05 17:23:57 +02002153 * If the owning (remote) cpu is still in the middle of schedule() with
 2154	 * this task as prev, wait until it's done referencing the task.
Peter Zijlstrab75a2252015-10-06 14:36:17 +02002155 *
2156 * Pairs with the smp_store_release() in finish_lock_switch().
2157 *
2158 * This ensures that tasks getting woken will be fully ordered against
2159 * their previous state and preserve Program Order.
Peter Zijlstrae4a52bc2011-04-05 17:23:54 +02002160 */
Peter Zijlstra1f03e8d2016-04-04 10:57:12 +02002161 smp_cond_load_acquire(&p->on_cpu, !VAL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002162
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07002163 rq = cpu_rq(task_cpu(p));
2164 raw_spin_lock(&rq->lock);
2165 old_load = task_load(p);
Pavankumar Kondetifaa04442018-06-25 16:13:39 +05302166 wallclock = sched_ktime_clock();
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07002167 update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
2168 update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
2169 raw_spin_unlock(&rq->lock);
2170
2171 rcu_read_lock();
2172 grp = task_related_thread_group(p);
2173 if (update_preferred_cluster(grp, p, old_load))
2174 set_preferred_cluster(grp);
2175 rcu_read_unlock();
2176 check_group = grp != NULL;
2177
Peter Zijlstraa8e4f2e2011-04-05 17:23:49 +02002178 p->sched_contributes_to_load = !!task_contributes_to_load(p);
Peter Zijlstrae9c84312009-09-15 14:43:03 +02002179 p->state = TASK_WAKING;
Peter Zijlstraefbbd052009-12-16 18:04:40 +01002180
Peter Zijlstraac66f542013-10-07 11:29:16 +01002181 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07002182 src_cpu = task_cpu(p);
2183 if (src_cpu != cpu) {
Peter Zijlstraf339b9d2011-05-31 10:49:20 +02002184 wake_flags |= WF_MIGRATED;
Johannes Weiner3df0e592018-10-26 15:06:27 -07002185 psi_ttwu_dequeue(p);
Mike Galbraith055a0082009-11-12 11:07:44 +01002186 set_task_cpu(p, cpu);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07002187 notif_required = true;
Peter Zijlstraf339b9d2011-05-31 10:49:20 +02002188 }
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07002189
Srivatsa Vaddagiri52465c42016-09-09 19:50:27 +05302190 note_task_waking(p, wallclock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002191#endif /* CONFIG_SMP */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002192
Peter Zijlstrab5179ac2016-05-11 16:10:34 +02002193 ttwu_queue(p, cpu, wake_flags);
Peter Zijlstrac05fbaf2011-04-05 17:23:57 +02002194stat:
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05002195 ttwu_stat(p, cpu, wake_flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002196out:
Peter Zijlstra013fdb82011-04-05 17:23:45 +02002197 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002198
Syed Rameez Mustafae14a2332017-05-19 14:42:35 -07002199 if (success && sched_predl) {
2200 raw_spin_lock_irqsave(&cpu_rq(cpu)->lock, flags);
2201 if (do_pl_notif(cpu_rq(cpu)))
2202 cpufreq_update_util(cpu_rq(cpu),
2203 SCHED_CPUFREQ_WALT |
2204 SCHED_CPUFREQ_PL);
2205 raw_spin_unlock_irqrestore(&cpu_rq(cpu)->lock, flags);
2206 }
2207
Linus Torvalds1da177e2005-04-16 15:20:36 -07002208 return success;
2209}
2210
David Howells50fa6102009-04-28 15:01:38 +01002211/**
Tejun Heo21aa9af2010-06-08 21:40:37 +02002212 * try_to_wake_up_local - try to wake up a local task with rq lock held
2213 * @p: the thread to be awakened
Luis de Bethencourt9279e0d2016-07-10 15:00:26 +01002214 * @rf: rq_flags used for lock pinning
Tejun Heo21aa9af2010-06-08 21:40:37 +02002215 *
Peter Zijlstra2acca552011-04-05 17:23:50 +02002216 * Put @p on the run-queue if it's not already there. The caller must
Tejun Heo21aa9af2010-06-08 21:40:37 +02002217 * ensure that this_rq() is locked, @p is bound to this_rq() and not
Peter Zijlstra2acca552011-04-05 17:23:50 +02002218 * the current task.
Tejun Heo21aa9af2010-06-08 21:40:37 +02002219 */
Matt Fleming5a91d732016-09-21 14:38:10 +01002220static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
Tejun Heo21aa9af2010-06-08 21:40:37 +02002221{
2222 struct rq *rq = task_rq(p);
Tejun Heo21aa9af2010-06-08 21:40:37 +02002223
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07002224 if (rq != this_rq() || p == current) {
2225 printk_deferred("%s: Failed to wakeup task %d (%s), rq = %p,"
2226 " this_rq = %p, p = %p, current = %p\n", __func__,
2227 task_pid_nr(p), p->comm, rq, this_rq(), p, current);
2228
Tejun Heo383efcd2013-03-18 12:22:34 -07002229 return;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07002230 }
Tejun Heo383efcd2013-03-18 12:22:34 -07002231
Tejun Heo21aa9af2010-06-08 21:40:37 +02002232 lockdep_assert_held(&rq->lock);
2233
Peter Zijlstra2acca552011-04-05 17:23:50 +02002234 if (!raw_spin_trylock(&p->pi_lock)) {
Peter Zijlstracbce1a62015-06-11 14:46:54 +02002235 /*
 2236		 * This is OK, because current is on_cpu, which avoids it being
 2237		 * picked for load-balance; preemption/IRQs are still disabled,
 2238		 * avoiding further scheduler activity on it, and we've not yet
 2239		 * picked a replacement task.
2240 */
Matt Fleming5a91d732016-09-21 14:38:10 +01002241 rq_unpin_lock(rq, rf);
Peter Zijlstra2acca552011-04-05 17:23:50 +02002242 raw_spin_unlock(&rq->lock);
2243 raw_spin_lock(&p->pi_lock);
2244 raw_spin_lock(&rq->lock);
Matt Fleming5a91d732016-09-21 14:38:10 +01002245 rq_repin_lock(rq, rf);
Tejun Heo21aa9af2010-06-08 21:40:37 +02002246 }
Peter Zijlstra2acca552011-04-05 17:23:50 +02002247
Tejun Heo21aa9af2010-06-08 21:40:37 +02002248 if (!(p->state & TASK_NORMAL))
Peter Zijlstra2acca552011-04-05 17:23:50 +02002249 goto out;
Tejun Heo21aa9af2010-06-08 21:40:37 +02002250
Peter Zijlstrafbd705a2015-06-09 11:13:36 +02002251 trace_sched_waking(p);
2252
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07002253 if (!task_on_rq_queued(p)) {
Pavankumar Kondetifaa04442018-06-25 16:13:39 +05302254 u64 wallclock = sched_ktime_clock();
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07002255
2256 update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
2257 update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
Peter Zijlstrad7c01d22011-04-05 17:23:43 +02002258 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
Srivatsa Vaddagiri52465c42016-09-09 19:50:27 +05302259 note_task_waking(p, wallclock);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07002260 }
Peter Zijlstrad7c01d22011-04-05 17:23:43 +02002261
Matt Fleming5a91d732016-09-21 14:38:10 +01002262 ttwu_do_wakeup(rq, p, 0, rf);
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05002263 ttwu_stat(p, smp_processor_id(), 0);
Peter Zijlstra2acca552011-04-05 17:23:50 +02002264out:
2265 raw_spin_unlock(&p->pi_lock);
Tejun Heo21aa9af2010-06-08 21:40:37 +02002266}
2267
2268/**
David Howells50fa6102009-04-28 15:01:38 +01002269 * wake_up_process - Wake up a specific process
2270 * @p: The process to be woken up.
2271 *
2272 * Attempt to wake up the nominated process and move it to the set of runnable
Yacine Belkadie69f6182013-07-12 20:45:47 +02002273 * processes.
2274 *
2275 * Return: 1 if the process was woken up, 0 if it was already running.
David Howells50fa6102009-04-28 15:01:38 +01002276 *
2277 * It may be assumed that this function implies a write memory barrier before
2278 * changing the task state if and only if any tasks are woken up.
2279 */
Harvey Harrison7ad5b3a2008-02-08 04:19:53 -08002280int wake_up_process(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002281{
Oleg Nesterov9067ac82013-01-21 20:48:17 +01002282 return try_to_wake_up(p, TASK_NORMAL, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002283}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002284EXPORT_SYMBOL(wake_up_process);
2285
Harvey Harrison7ad5b3a2008-02-08 04:19:53 -08002286int wake_up_state(struct task_struct *p, unsigned int state)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002287{
2288 return try_to_wake_up(p, state, 0);
2289}
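
/*
 * Illustrative sketch (generic pattern, not from this file): the
 * sleep/wake pairing that try_to_wake_up() implements one half of.
 * "done" and "waiter" are placeholders shared between the two sides.
 *
 *	// waiter
 *	for (;;) {
 *		set_current_state(TASK_INTERRUPTIBLE);
 *		if (done)
 *			break;
 *		schedule();
 *	}
 *	__set_current_state(TASK_RUNNING);
 *
 *	// waker
 *	done = true;
 *	wake_up_process(waiter);
 *
 * The barrier in set_current_state() pairs with the one implied by
 * try_to_wake_up() (see the smp_mb__before_spinlock() comment above), so
 * the waiter cannot miss the waker's store to "done".
 */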
2290
Linus Torvalds1da177e2005-04-16 15:20:36 -07002291/*
Juri Lellia5e7be32014-09-19 10:22:39 +01002292 * This function clears the sched_dl_entity static params.
2293 */
2294void __dl_clear_params(struct task_struct *p)
2295{
2296 struct sched_dl_entity *dl_se = &p->dl;
2297
2298 dl_se->dl_runtime = 0;
2299 dl_se->dl_deadline = 0;
2300 dl_se->dl_period = 0;
2301 dl_se->flags = 0;
2302 dl_se->dl_bw = 0;
Daniel Bristot de Oliveira0559ea32017-05-29 16:24:03 +02002303 dl_se->dl_density = 0;
Peter Zijlstra40767b02015-01-28 15:08:03 +01002304
2305 dl_se->dl_throttled = 0;
Peter Zijlstra40767b02015-01-28 15:08:03 +01002306 dl_se->dl_yielded = 0;
Juri Lellia5e7be32014-09-19 10:22:39 +01002307}
2308
2309/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07002310 * Perform scheduler related setup for a newly forked process p.
2311 * p is forked by current.
Ingo Molnardd41f592007-07-09 18:51:59 +02002312 *
2313 * __sched_fork() is basic setup used by init_idle() too:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002314 */
Rik van Riel5e1576e2013-10-07 11:29:26 +01002315static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002316{
Peter Zijlstrafd2f4412011-04-05 17:23:44 +02002317 p->on_rq = 0;
2318
2319 p->se.on_rq = 0;
Ingo Molnardd41f592007-07-09 18:51:59 +02002320 p->se.exec_start = 0;
2321 p->se.sum_exec_runtime = 0;
Ingo Molnarf6cf8912007-08-28 12:53:24 +02002322 p->se.prev_sum_exec_runtime = 0;
Ingo Molnar6c594c22008-12-14 12:34:15 +01002323 p->se.nr_migrations = 0;
Peter Zijlstrada7a7352011-01-17 17:03:27 +01002324 p->se.vruntime = 0;
Joonwoo Park84a80882017-02-03 11:15:31 -08002325 p->last_sleep_ts = 0;
Pavankumar Kondeti7cc02922018-03-23 11:15:21 +05302326 p->last_cpu_selected_ts = 0;
Joonwoo Park84a80882017-02-03 11:15:31 -08002327
Peter Zijlstrafd2f4412011-04-05 17:23:44 +02002328 INIT_LIST_HEAD(&p->se.group_node);
Ingo Molnar6cfb0d52007-08-02 17:41:40 +02002329
Byungchul Parkad936d82015-10-24 01:16:19 +09002330#ifdef CONFIG_FAIR_GROUP_SCHED
2331 p->se.cfs_rq = NULL;
2332#endif
2333
Ingo Molnar6cfb0d52007-08-02 17:41:40 +02002334#ifdef CONFIG_SCHEDSTATS
Mel Gormancb251762016-02-05 09:08:36 +00002335 /* Even if schedstat is disabled, there should not be garbage */
Lucas De Marchi41acab82010-03-10 23:37:45 -03002336 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
Ingo Molnar6cfb0d52007-08-02 17:41:40 +02002337#endif
Nick Piggin476d1392005-06-25 14:57:29 -07002338
Dario Faggioliaab03e02013-11-28 11:14:43 +01002339 RB_CLEAR_NODE(&p->dl.rb_node);
Peter Zijlstra40767b02015-01-28 15:08:03 +01002340 init_dl_task_timer(&p->dl);
Juri Lellia5e7be32014-09-19 10:22:39 +01002341 __dl_clear_params(p);
Dario Faggioliaab03e02013-11-28 11:14:43 +01002342
Peter Zijlstrafa717062008-01-25 21:08:27 +01002343 INIT_LIST_HEAD(&p->rt.run_list);
Peter Zijlstraff77e462016-01-18 15:27:07 +01002344 p->rt.timeout = 0;
2345 p->rt.time_slice = sched_rr_timeslice;
2346 p->rt.on_rq = 0;
2347 p->rt.on_list = 0;
Nick Piggin476d1392005-06-25 14:57:29 -07002348
Avi Kivitye107be32007-07-26 13:40:43 +02002349#ifdef CONFIG_PREEMPT_NOTIFIERS
2350 INIT_HLIST_HEAD(&p->preempt_notifiers);
2351#endif
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002352
2353#ifdef CONFIG_NUMA_BALANCING
2354 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
Mel Gorman7e8d16b2013-10-07 11:28:54 +01002355 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002356 p->mm->numa_scan_seq = 0;
2357 }
2358
Rik van Riel5e1576e2013-10-07 11:29:26 +01002359 if (clone_flags & CLONE_VM)
2360 p->numa_preferred_nid = current->numa_preferred_nid;
2361 else
2362 p->numa_preferred_nid = -1;
2363
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002364 p->node_stamp = 0ULL;
2365 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
Peter Zijlstra4b96a29b2012-10-25 14:16:47 +02002366 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002367 p->numa_work.next = &p->numa_work;
Iulia Manda44dba3d2014-10-31 02:13:31 +02002368 p->numa_faults = NULL;
Rik van Riel7e2703e2014-01-27 17:03:45 -05002369 p->last_task_numa_placement = 0;
2370 p->last_sum_exec_runtime = 0;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002371
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002372 p->numa_group = NULL;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002373#endif /* CONFIG_NUMA_BALANCING */
Ingo Molnardd41f592007-07-09 18:51:59 +02002374}
2375
Srikar Dronamraju2a595722015-08-11 21:54:21 +05302376DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
2377
Mel Gorman1a687c22012-11-22 11:16:36 +00002378#ifdef CONFIG_NUMA_BALANCING
Mel Gorman3105b862012-11-23 11:23:49 +00002379
2380void set_numabalancing_state(bool enabled)
2381{
Srikar Dronamraju2a595722015-08-11 21:54:21 +05302382 if (enabled)
2383 static_branch_enable(&sched_numa_balancing);
2384 else
2385 static_branch_disable(&sched_numa_balancing);
Mel Gorman3105b862012-11-23 11:23:49 +00002386}
Andi Kleen54a43d52014-01-23 15:53:13 -08002387
2388#ifdef CONFIG_PROC_SYSCTL
2389int sysctl_numa_balancing(struct ctl_table *table, int write,
2390 void __user *buffer, size_t *lenp, loff_t *ppos)
2391{
2392 struct ctl_table t;
2393 int err;
Srikar Dronamraju2a595722015-08-11 21:54:21 +05302394 int state = static_branch_likely(&sched_numa_balancing);
Andi Kleen54a43d52014-01-23 15:53:13 -08002395
2396 if (write && !capable(CAP_SYS_ADMIN))
2397 return -EPERM;
2398
2399 t = *table;
2400 t.data = &state;
2401 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
2402 if (err < 0)
2403 return err;
2404 if (write)
2405 set_numabalancing_state(state);
2406 return err;
2407}
2408#endif
2409#endif
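
/*
 * Example (runtime knob, assuming CONFIG_NUMA_BALANCING and
 * CONFIG_PROC_SYSCTL): sysctl_numa_balancing() above backs the
 * kernel.numa_balancing sysctl, so
 *
 *	echo 0 > /proc/sys/kernel/numa_balancing
 *
 * disables automatic NUMA balancing at run time and echoing 1 re-enables it.
 */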
Mel Gorman1a687c22012-11-22 11:16:36 +00002410
Mel Gormancb251762016-02-05 09:08:36 +00002411#ifdef CONFIG_SCHEDSTATS
Josh Poimboeuf4698f882016-06-07 14:43:16 -05002412
2413DEFINE_STATIC_KEY_FALSE(sched_schedstats);
2414static bool __initdata __sched_schedstats = false;
2415
Mel Gormancb251762016-02-05 09:08:36 +00002416static void set_schedstats(bool enabled)
2417{
2418 if (enabled)
2419 static_branch_enable(&sched_schedstats);
2420 else
2421 static_branch_disable(&sched_schedstats);
2422}
2423
2424void force_schedstat_enabled(void)
2425{
2426 if (!schedstat_enabled()) {
2427 pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
2428 static_branch_enable(&sched_schedstats);
2429 }
2430}
2431
2432static int __init setup_schedstats(char *str)
2433{
2434 int ret = 0;
2435 if (!str)
2436 goto out;
2437
Josh Poimboeuf4698f882016-06-07 14:43:16 -05002438 /*
2439 * This code is called before jump labels have been set up, so we can't
2440 * change the static branch directly just yet. Instead set a temporary
2441 * variable so init_schedstats() can do it later.
2442 */
Mel Gormancb251762016-02-05 09:08:36 +00002443 if (!strcmp(str, "enable")) {
Josh Poimboeuf4698f882016-06-07 14:43:16 -05002444 __sched_schedstats = true;
Mel Gormancb251762016-02-05 09:08:36 +00002445 ret = 1;
2446 } else if (!strcmp(str, "disable")) {
Josh Poimboeuf4698f882016-06-07 14:43:16 -05002447 __sched_schedstats = false;
Mel Gormancb251762016-02-05 09:08:36 +00002448 ret = 1;
2449 }
2450out:
2451 if (!ret)
2452 pr_warn("Unable to parse schedstats=\n");
2453
2454 return ret;
2455}
2456__setup("schedstats=", setup_schedstats);
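
/*
 * Example (boot parameter): schedstats collection can be chosen at boot
 * with "schedstats=enable" or "schedstats=disable" on the kernel command
 * line; at run time the same switch is reachable through the
 * kernel.sched_schedstats sysctl handled by sysctl_schedstats() below.
 */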
2457
Josh Poimboeuf4698f882016-06-07 14:43:16 -05002458static void __init init_schedstats(void)
2459{
2460 set_schedstats(__sched_schedstats);
2461}
2462
Mel Gormancb251762016-02-05 09:08:36 +00002463#ifdef CONFIG_PROC_SYSCTL
2464int sysctl_schedstats(struct ctl_table *table, int write,
2465 void __user *buffer, size_t *lenp, loff_t *ppos)
2466{
2467 struct ctl_table t;
2468 int err;
2469 int state = static_branch_likely(&sched_schedstats);
2470
2471 if (write && !capable(CAP_SYS_ADMIN))
2472 return -EPERM;
2473
2474 t = *table;
2475 t.data = &state;
2476 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
2477 if (err < 0)
2478 return err;
2479 if (write)
2480 set_schedstats(state);
2481 return err;
2482}
Josh Poimboeuf4698f882016-06-07 14:43:16 -05002483#endif /* CONFIG_PROC_SYSCTL */
2484#else /* !CONFIG_SCHEDSTATS */
2485static inline void init_schedstats(void) {}
2486#endif /* CONFIG_SCHEDSTATS */
Mel Gormancb251762016-02-05 09:08:36 +00002487
Ingo Molnardd41f592007-07-09 18:51:59 +02002488/*
2489 * fork()/clone()-time setup:
2490 */
Dario Faggioliaab03e02013-11-28 11:14:43 +01002491int sched_fork(unsigned long clone_flags, struct task_struct *p)
Ingo Molnardd41f592007-07-09 18:51:59 +02002492{
Peter Zijlstra0122ec52011-04-05 17:23:51 +02002493 unsigned long flags;
Syed Rameez Mustafae5aaa482016-11-01 18:13:36 -07002494 int cpu;
2495
Pavankumar Kondeti736630c2018-09-20 15:31:36 +05302496 init_new_task_load(p);
Syed Rameez Mustafae5aaa482016-11-01 18:13:36 -07002497 cpu = get_cpu();
Ingo Molnardd41f592007-07-09 18:51:59 +02002498
Rik van Riel5e1576e2013-10-07 11:29:26 +01002499 __sched_fork(clone_flags, p);
Peter Zijlstra06b83b52009-12-16 18:04:35 +01002500 /*
Peter Zijlstra7dc603c2016-06-16 13:29:28 +02002501 * We mark the process as NEW here. This guarantees that
Peter Zijlstra06b83b52009-12-16 18:04:35 +01002502 * nobody will actually run it, and a signal or other external
2503 * event cannot wake it up and insert it on the runqueue either.
2504 */
Peter Zijlstra7dc603c2016-06-16 13:29:28 +02002505 p->state = TASK_NEW;
Ingo Molnardd41f592007-07-09 18:51:59 +02002506
Ingo Molnarb29739f2006-06-27 02:54:51 -07002507 /*
Mike Galbraithc350a042011-07-27 17:14:55 +02002508 * Make sure we do not leak PI boosting priority to the child.
2509 */
2510 p->prio = current->normal_prio;
2511
2512 /*
Mike Galbraithb9dc29e2009-06-17 10:46:01 +02002513 * Revert to default priority/policy on fork if requested.
2514 */
2515 if (unlikely(p->sched_reset_on_fork)) {
Dario Faggioliaab03e02013-11-28 11:14:43 +01002516 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
Mike Galbraithb9dc29e2009-06-17 10:46:01 +02002517 p->policy = SCHED_NORMAL;
Mike Galbraith6c697bd2009-06-17 10:48:02 +02002518 p->static_prio = NICE_TO_PRIO(0);
Mike Galbraithc350a042011-07-27 17:14:55 +02002519 p->rt_priority = 0;
2520 } else if (PRIO_TO_NICE(p->static_prio) < 0)
2521 p->static_prio = NICE_TO_PRIO(0);
2522
2523 p->prio = p->normal_prio = __normal_prio(p);
2524 set_load_weight(p);
Mike Galbraith6c697bd2009-06-17 10:48:02 +02002525
Mike Galbraithb9dc29e2009-06-17 10:46:01 +02002526 /*
2527 * We don't need the reset flag anymore after the fork. It has
2528 * fulfilled its duty:
2529 */
2530 p->sched_reset_on_fork = 0;
2531 }
Lennart Poetteringca94c442009-06-15 17:17:47 +02002532
Dario Faggioliaab03e02013-11-28 11:14:43 +01002533 if (dl_prio(p->prio)) {
2534 put_cpu();
2535 return -EAGAIN;
2536 } else if (rt_prio(p->prio)) {
2537 p->sched_class = &rt_sched_class;
2538 } else {
Hiroshi Shimamoto2ddbf952007-10-15 17:00:11 +02002539 p->sched_class = &fair_sched_class;
Dario Faggioliaab03e02013-11-28 11:14:43 +01002540 }
Ingo Molnarb29739f2006-06-27 02:54:51 -07002541
Peter Zijlstra7dc603c2016-06-16 13:29:28 +02002542 init_entity_runnable_average(&p->se);
Peter Zijlstracd29fe62009-11-27 17:32:46 +01002543
Peter Zijlstra86951592010-06-22 11:44:53 +02002544 /*
2545 * The child is not yet in the pid-hash so no cgroup attach races,
2546 * and the cgroup is pinned to this child because cgroup_fork()
2547 * is run before sched_fork().
2548 *
2549 * Silence PROVE_RCU.
2550 */
Peter Zijlstra0122ec52011-04-05 17:23:51 +02002551 raw_spin_lock_irqsave(&p->pi_lock, flags);
Peter Zijlstrae210bff2016-06-16 18:51:48 +02002552 /*
2553 * We're setting the cpu for the first time and we don't migrate,
2554 * so use __set_task_cpu().
2555 */
2556 __set_task_cpu(p, cpu);
2557 if (p->sched_class->task_fork)
2558 p->sched_class->task_fork(p);
Peter Zijlstra0122ec52011-04-05 17:23:51 +02002559 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
Peter Zijlstra5f3edc12009-09-10 13:42:00 +02002560
Naveen N. Raof6db8342015-06-25 23:53:37 +05302561#ifdef CONFIG_SCHED_INFO
Ingo Molnardd41f592007-07-09 18:51:59 +02002562 if (likely(sched_info_on()))
Chandra Seetharaman52f17b62006-07-14 00:24:38 -07002563 memset(&p->sched_info, 0, sizeof(p->sched_info));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002564#endif
Peter Zijlstra3ca7a442011-04-05 17:23:40 +02002565#if defined(CONFIG_SMP)
2566 p->on_cpu = 0;
Nick Piggin4866cde2005-06-25 14:57:23 -07002567#endif
Peter Zijlstra01028742013-08-14 14:55:46 +02002568 init_task_preempt_count(p);
Dario Faggioli806c09a2010-11-30 19:51:33 +01002569#ifdef CONFIG_SMP
Gregory Haskins917b6272008-12-29 09:39:53 -05002570 plist_node_init(&p->pushable_tasks, MAX_PRIO);
Juri Lelli1baca4c2013-11-07 14:43:38 +01002571 RB_CLEAR_NODE(&p->pushable_dl_tasks);
Dario Faggioli806c09a2010-11-30 19:51:33 +01002572#endif
Gregory Haskins917b6272008-12-29 09:39:53 -05002573
Nick Piggin476d1392005-06-25 14:57:29 -07002574 put_cpu();
Dario Faggioliaab03e02013-11-28 11:14:43 +01002575 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002576}
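
/*
 * Userspace view of the sched_reset_on_fork handling above (a minimal
 * sketch, assuming a libc that exposes SCHED_RESET_ON_FORK):
 *
 *	struct sched_param sp = { .sched_priority = 10 };
 *
 *	sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp);
 *	if (fork() == 0) {
 *		// the child reverts to SCHED_NORMAL / nice 0, courtesy of
 *		// the p->sched_reset_on_fork branch in sched_fork()
 *	}
 */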
2577
Dario Faggioli332ac172013-11-07 14:43:45 +01002578unsigned long to_ratio(u64 period, u64 runtime)
2579{
2580 if (runtime == RUNTIME_INF)
2581 return 1ULL << 20;
2582
2583 /*
2584 * Doing this here saves a lot of checks in all
2585 * the calling paths, and returning zero seems
2586 * safe for them anyway.
2587 */
2588 if (period == 0)
2589 return 0;
2590
2591 return div64_u64(runtime << 20, period);
2592}
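
/*
 * Worked example: a -deadline task with runtime = 10ms and period = 100ms
 * (values in nanoseconds) yields
 *
 *	to_ratio(100000000, 10000000) = (10000000 << 20) / 100000000 ~= 104857
 *
 * i.e. roughly 10% of 1 << 20, the fixed-point representation of one
 * fully utilized CPU.
 */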
2593
2594#ifdef CONFIG_SMP
2595inline struct dl_bw *dl_bw_of(int i)
2596{
Paul E. McKenneyf78f5b92015-06-18 15:50:02 -07002597 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
2598 "sched RCU must be held");
Dario Faggioli332ac172013-11-07 14:43:45 +01002599 return &cpu_rq(i)->rd->dl_bw;
2600}
2601
Peter Zijlstrade212f12013-12-19 11:54:45 +01002602static inline int dl_bw_cpus(int i)
Dario Faggioli332ac172013-11-07 14:43:45 +01002603{
Peter Zijlstrade212f12013-12-19 11:54:45 +01002604 struct root_domain *rd = cpu_rq(i)->rd;
2605 int cpus = 0;
2606
Paul E. McKenneyf78f5b92015-06-18 15:50:02 -07002607 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
2608 "sched RCU must be held");
Peter Zijlstrade212f12013-12-19 11:54:45 +01002609 for_each_cpu_and(i, rd->span, cpu_active_mask)
2610 cpus++;
2611
2612 return cpus;
Dario Faggioli332ac172013-11-07 14:43:45 +01002613}
2614#else
2615inline struct dl_bw *dl_bw_of(int i)
2616{
2617 return &cpu_rq(i)->dl.dl_bw;
2618}
2619
Peter Zijlstrade212f12013-12-19 11:54:45 +01002620static inline int dl_bw_cpus(int i)
Dario Faggioli332ac172013-11-07 14:43:45 +01002621{
2622 return 1;
2623}
2624#endif
2625
Dario Faggioli332ac172013-11-07 14:43:45 +01002626/*
2627 * We must be sure that accepting a new task (or allowing changing the
2628 * parameters of an existing one) is consistent with the bandwidth
2629 * constraints. If so, this function also updates the currently
2630 * allocated bandwidth to reflect the new situation.
2631 *
2632 * This function is called while holding p's rq->lock.
Peter Zijlstra40767b02015-01-28 15:08:03 +01002633 *
2634 * XXX we should delay bw change until the task's 0-lag point, see
2635 * __setparam_dl().
Dario Faggioli332ac172013-11-07 14:43:45 +01002636 */
2637static int dl_overflow(struct task_struct *p, int policy,
2638 const struct sched_attr *attr)
2639{
2640
2641 struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
Steven Rostedt4df16382014-02-19 13:53:35 -05002642 u64 period = attr->sched_period ?: attr->sched_deadline;
Dario Faggioli332ac172013-11-07 14:43:45 +01002643 u64 runtime = attr->sched_runtime;
2644 u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
Peter Zijlstrade212f12013-12-19 11:54:45 +01002645 int cpus, err = -1;
Dario Faggioli332ac172013-11-07 14:43:45 +01002646
Xunlei Pangfec148c2016-04-14 20:19:28 +08002647 /* !deadline task may carry old deadline bandwidth */
2648 if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
Dario Faggioli332ac172013-11-07 14:43:45 +01002649 return 0;
2650
2651 /*
2652 * Whether a task enters, leaves, or stays -deadline but changes
2653 * its parameters, we may need to update the total allocated
2654 * bandwidth of the container accordingly.
2655 */
2656 raw_spin_lock(&dl_b->lock);
Peter Zijlstrade212f12013-12-19 11:54:45 +01002657 cpus = dl_bw_cpus(task_cpu(p));
Dario Faggioli332ac172013-11-07 14:43:45 +01002658 if (dl_policy(policy) && !task_has_dl_policy(p) &&
2659 !__dl_overflow(dl_b, cpus, 0, new_bw)) {
2660 __dl_add(dl_b, new_bw);
2661 err = 0;
2662 } else if (dl_policy(policy) && task_has_dl_policy(p) &&
2663 !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
2664 __dl_clear(dl_b, p->dl.dl_bw);
2665 __dl_add(dl_b, new_bw);
2666 err = 0;
2667 } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
2668 __dl_clear(dl_b, p->dl.dl_bw);
2669 err = 0;
2670 }
2671 raw_spin_unlock(&dl_b->lock);
2672
2673 return err;
2674}
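
/*
 * Rough example of the admission test above: with the 20-bit fixed point
 * of to_ratio(), a request for 10ms of runtime every 100ms contributes
 * new_bw ~= 0.1 << 20. It is accepted only while the sum of all such
 * per-task bandwidths still fits the root domain's limit, which scales
 * with dl_bw_cpus(); a task that merely changes its -deadline parameters
 * first has its old p->dl.dl_bw cleared before the new value is added
 * (the second branch above).
 */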
2675
2676extern void init_dl_bw(struct dl_bw *dl_b);
2677
Linus Torvalds1da177e2005-04-16 15:20:36 -07002678/*
2679 * wake_up_new_task - wake up a newly created task for the first time.
2680 *
2681 * This function will do some initial scheduler statistics housekeeping
2682 * that must be done for every newly created context, then put the task
2683 * on the runqueue and wake it.
2684 */
Samir Bellabes3e51e3e2011-05-11 18:18:05 +02002685void wake_up_new_task(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002686{
Peter Zijlstraeb580752015-07-31 21:28:18 +02002687 struct rq_flags rf;
Ingo Molnardd41f592007-07-09 18:51:59 +02002688 struct rq *rq;
Peter Zijlstrafabf3182010-01-21 21:04:57 +01002689
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07002690 add_new_task_to_grp(p);
Syed Rameez Mustafaa15ad1f2016-10-24 17:29:40 -07002691 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
Srivatsa Vaddagiri26c21542016-05-31 09:08:38 -07002692
Peter Zijlstra7dc603c2016-06-16 13:29:28 +02002693 p->state = TASK_RUNNING;
Peter Zijlstrafabf3182010-01-21 21:04:57 +01002694#ifdef CONFIG_SMP
2695 /*
2696 * Fork balancing, do it here and not earlier because:
2697 * - cpus_allowed can change in the fork path
2698 * - any previously selected cpu might disappear through hotplug
Peter Zijlstrae210bff2016-06-16 18:51:48 +02002699 *
2700 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
2701 * as we're not fully set-up yet.
Peter Zijlstrafabf3182010-01-21 21:04:57 +01002702 */
Peter Zijlstrae210bff2016-06-16 18:51:48 +02002703 __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
Peter Zijlstrafabf3182010-01-21 21:04:57 +01002704#endif
Peter Zijlstrab7fa30c2016-06-09 15:07:50 +02002705 rq = __task_rq_lock(p, &rf);
Yuyang Du2b8c41d2016-03-30 04:30:56 +08002706 post_init_entity_util_avg(&p->se);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002707
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07002708 mark_task_starting(p);
Juri Lelli43aac892015-06-26 12:14:23 +01002709 activate_task(rq, p, ENQUEUE_WAKEUP_NEW);
Kirill Tkhaida0c1e62014-08-20 13:47:32 +04002710 p->on_rq = TASK_ON_RQ_QUEUED;
Peter Zijlstrafbd705a2015-06-09 11:13:36 +02002711 trace_sched_wakeup_new(p);
Peter Zijlstraa7558e02009-09-14 20:02:34 +02002712 check_preempt_curr(rq, p, WF_FORK);
Steven Rostedt9a897c52008-01-25 21:08:22 +01002713#ifdef CONFIG_SMP
Peter Zijlstra0aaafaa2015-10-23 11:50:08 +02002714 if (p->sched_class->task_woken) {
2715 /*
2716		 * Nothing relies on rq->lock after this, so it's fine to
2717 * drop it.
2718 */
Matt Fleming5a91d732016-09-21 14:38:10 +01002719 rq_unpin_lock(rq, &rf);
Peter Zijlstraefbbd052009-12-16 18:04:40 +01002720 p->sched_class->task_woken(rq, p);
Matt Fleming5a91d732016-09-21 14:38:10 +01002721 rq_repin_lock(rq, &rf);
Peter Zijlstra0aaafaa2015-10-23 11:50:08 +02002722 }
Steven Rostedt9a897c52008-01-25 21:08:22 +01002723#endif
Peter Zijlstraeb580752015-07-31 21:28:18 +02002724 task_rq_unlock(rq, p, &rf);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002725}
2726
Avi Kivitye107be32007-07-26 13:40:43 +02002727#ifdef CONFIG_PREEMPT_NOTIFIERS
2728
Peter Zijlstra1cde2932015-06-08 16:00:30 +02002729static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
2730
Peter Zijlstra2ecd9d22015-07-03 18:53:58 +02002731void preempt_notifier_inc(void)
2732{
2733 static_key_slow_inc(&preempt_notifier_key);
2734}
2735EXPORT_SYMBOL_GPL(preempt_notifier_inc);
2736
2737void preempt_notifier_dec(void)
2738{
2739 static_key_slow_dec(&preempt_notifier_key);
2740}
2741EXPORT_SYMBOL_GPL(preempt_notifier_dec);
2742
Avi Kivitye107be32007-07-26 13:40:43 +02002743/**
Luis Henriques80dd99b2009-03-16 19:58:09 +00002744 * preempt_notifier_register - tell me when current is being preempted & rescheduled
Randy Dunlap421cee22007-07-31 00:37:50 -07002745 * @notifier: notifier struct to register
Avi Kivitye107be32007-07-26 13:40:43 +02002746 */
2747void preempt_notifier_register(struct preempt_notifier *notifier)
2748{
Peter Zijlstra2ecd9d22015-07-03 18:53:58 +02002749 if (!static_key_false(&preempt_notifier_key))
2750 WARN(1, "registering preempt_notifier while notifiers disabled\n");
2751
Avi Kivitye107be32007-07-26 13:40:43 +02002752 hlist_add_head(&notifier->link, &current->preempt_notifiers);
2753}
2754EXPORT_SYMBOL_GPL(preempt_notifier_register);
2755
2756/**
2757 * preempt_notifier_unregister - no longer interested in preemption notifications
Randy Dunlap421cee22007-07-31 00:37:50 -07002758 * @notifier: notifier struct to unregister
Avi Kivitye107be32007-07-26 13:40:43 +02002759 *
Mathieu Desnoyersd84525a2015-05-17 12:53:10 -04002760 * This is *not* safe to call from within a preemption notifier.
Avi Kivitye107be32007-07-26 13:40:43 +02002761 */
2762void preempt_notifier_unregister(struct preempt_notifier *notifier)
2763{
2764 hlist_del(&notifier->link);
2765}
2766EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2767
Peter Zijlstra1cde2932015-06-08 16:00:30 +02002768static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
Avi Kivitye107be32007-07-26 13:40:43 +02002769{
2770 struct preempt_notifier *notifier;
Avi Kivitye107be32007-07-26 13:40:43 +02002771
Sasha Levinb67bfe02013-02-27 17:06:00 -08002772 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
Avi Kivitye107be32007-07-26 13:40:43 +02002773 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2774}
2775
Peter Zijlstra1cde2932015-06-08 16:00:30 +02002776static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2777{
2778 if (static_key_false(&preempt_notifier_key))
2779 __fire_sched_in_preempt_notifiers(curr);
2780}
2781
Avi Kivitye107be32007-07-26 13:40:43 +02002782static void
Peter Zijlstra1cde2932015-06-08 16:00:30 +02002783__fire_sched_out_preempt_notifiers(struct task_struct *curr,
2784 struct task_struct *next)
Avi Kivitye107be32007-07-26 13:40:43 +02002785{
2786 struct preempt_notifier *notifier;
Avi Kivitye107be32007-07-26 13:40:43 +02002787
Sasha Levinb67bfe02013-02-27 17:06:00 -08002788 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
Avi Kivitye107be32007-07-26 13:40:43 +02002789 notifier->ops->sched_out(notifier, next);
2790}
2791
Peter Zijlstra1cde2932015-06-08 16:00:30 +02002792static __always_inline void
2793fire_sched_out_preempt_notifiers(struct task_struct *curr,
2794 struct task_struct *next)
2795{
2796 if (static_key_false(&preempt_notifier_key))
2797 __fire_sched_out_preempt_notifiers(curr, next);
2798}
2799
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02002800#else /* !CONFIG_PREEMPT_NOTIFIERS */
Avi Kivitye107be32007-07-26 13:40:43 +02002801
Peter Zijlstra1cde2932015-06-08 16:00:30 +02002802static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
Avi Kivitye107be32007-07-26 13:40:43 +02002803{
2804}
2805
Peter Zijlstra1cde2932015-06-08 16:00:30 +02002806static inline void
Avi Kivitye107be32007-07-26 13:40:43 +02002807fire_sched_out_preempt_notifiers(struct task_struct *curr,
2808 struct task_struct *next)
2809{
2810}
2811
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02002812#endif /* CONFIG_PREEMPT_NOTIFIERS */
Avi Kivitye107be32007-07-26 13:40:43 +02002813
Linus Torvalds1da177e2005-04-16 15:20:36 -07002814/**
Nick Piggin4866cde2005-06-25 14:57:23 -07002815 * prepare_task_switch - prepare to switch tasks
2816 * @rq: the runqueue preparing to switch
Randy Dunlap421cee22007-07-31 00:37:50 -07002817 * @prev: the current task that is being switched out
Nick Piggin4866cde2005-06-25 14:57:23 -07002818 * @next: the task we are going to switch to.
2819 *
2820 * This is called with the rq lock held and interrupts off. It must
2821 * be paired with a subsequent finish_task_switch after the context
2822 * switch.
2823 *
2824 * prepare_task_switch sets up locking and calls architecture specific
2825 * hooks.
2826 */
Avi Kivitye107be32007-07-26 13:40:43 +02002827static inline void
2828prepare_task_switch(struct rq *rq, struct task_struct *prev,
2829 struct task_struct *next)
Nick Piggin4866cde2005-06-25 14:57:23 -07002830{
Michael S. Tsirkin43148952013-09-22 17:20:54 +03002831 sched_info_switch(rq, prev, next);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002832 perf_event_task_sched_out(prev, next);
Avi Kivitye107be32007-07-26 13:40:43 +02002833 fire_sched_out_preempt_notifiers(prev, next);
Nick Piggin4866cde2005-06-25 14:57:23 -07002834 prepare_lock_switch(rq, next);
2835 prepare_arch_switch(next);
2836}
2837
2838/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07002839 * finish_task_switch - clean up after a task-switch
2840 * @prev: the thread we just switched away from.
2841 *
Nick Piggin4866cde2005-06-25 14:57:23 -07002842 * finish_task_switch must be called after the context switch, paired
2843 * with a prepare_task_switch call before the context switch.
2844 * finish_task_switch will reconcile locking set up by prepare_task_switch,
2845 * and do any other architecture-specific cleanup actions.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002846 *
2847 * Note that we may have delayed dropping an mm in context_switch(). If
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01002848 * so, we finish that here outside of the runqueue lock. (Doing it
Linus Torvalds1da177e2005-04-16 15:20:36 -07002849 * with the lock held can cause deadlocks; see schedule() for
2850 * details.)
Oleg Nesterovdfa50b62014-10-09 21:32:32 +02002851 *
2852 * The context switch has flipped the stack from under us and restored the
2853 * local variables which were saved when this task called schedule() in the
2854 * past. prev == current is still correct but we need to recalculate this_rq
2855 * because prev may have moved to another CPU.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002856 */
Oleg Nesterovdfa50b62014-10-09 21:32:32 +02002857static struct rq *finish_task_switch(struct task_struct *prev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002858 __releases(rq->lock)
2859{
Oleg Nesterovdfa50b62014-10-09 21:32:32 +02002860 struct rq *rq = this_rq();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002861 struct mm_struct *mm = rq->prev_mm;
Oleg Nesterov55a101f2006-09-29 02:01:10 -07002862 long prev_state;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002863
Peter Zijlstra609ca062015-09-28 17:52:18 +02002864 /*
2865 * The previous task will have left us with a preempt_count of 2
2866 * because it left us after:
2867 *
2868 * schedule()
2869 * preempt_disable(); // 1
2870 * __schedule()
2871 * raw_spin_lock_irq(&rq->lock) // 2
2872 *
2873 * Also, see FORK_PREEMPT_COUNT.
2874 */
Peter Zijlstrae2bf1c42015-09-29 12:18:46 +02002875 if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
2876 "corrupted preempt_count: %s/%d/0x%x\n",
2877 current->comm, current->pid, preempt_count()))
2878 preempt_count_set(FORK_PREEMPT_COUNT);
Peter Zijlstra609ca062015-09-28 17:52:18 +02002879
Linus Torvalds1da177e2005-04-16 15:20:36 -07002880 rq->prev_mm = NULL;
2881
2882 /*
2883 * A task struct has one reference for the use as "current".
Oleg Nesterovc394cc92006-09-29 02:01:11 -07002884 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
Oleg Nesterov55a101f2006-09-29 02:01:10 -07002885 * schedule one last time. The schedule call will never return, and
2886 * the scheduled task must drop that reference.
Peter Zijlstra95913d92015-09-29 14:45:09 +02002887 *
2888 * We must observe prev->state before clearing prev->on_cpu (in
2889 * finish_lock_switch), otherwise a concurrent wakeup can get prev
2890 * running on another CPU and we could race with its RUNNING -> DEAD
2891 * transition, resulting in a double drop.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002892 */
Oleg Nesterov55a101f2006-09-29 02:01:10 -07002893 prev_state = prev->state;
Frederic Weisbeckerbf9fae92012-09-08 15:23:11 +02002894 vtime_task_switch(prev);
Stephane Eraniana8d757e2011-08-25 15:58:03 +02002895 perf_event_task_sched_in(prev, current);
Nick Piggin4866cde2005-06-25 14:57:23 -07002896 finish_lock_switch(rq, prev);
Catalin Marinas01f23e12011-11-27 21:43:10 +00002897 finish_arch_post_lock_switch();
Steven Rostedte8fa1362008-01-25 21:08:05 +01002898
Avi Kivitye107be32007-07-26 13:40:43 +02002899 fire_sched_in_preempt_notifiers(current);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002900 if (mm)
2901 mmdrop(mm);
Oleg Nesterovc394cc92006-09-29 02:01:11 -07002902 if (unlikely(prev_state == TASK_DEAD)) {
Dario Faggiolie6c390f2013-11-07 14:43:35 +01002903 if (prev->sched_class->task_dead)
2904 prev->sched_class->task_dead(prev);
2905
bibo maoc6fd91f2006-03-26 01:38:20 -08002906 /*
2907 * Remove function-return probe instances associated with this
2908 * task and put them back on the free list.
Ingo Molnar9761eea2007-07-09 18:52:00 +02002909 */
bibo maoc6fd91f2006-03-26 01:38:20 -08002910 kprobe_flush_task(prev);
Andy Lutomirski68f24b082016-09-15 22:45:48 -07002911
2912 /* Task is done with its stack. */
2913 put_task_stack(prev);
2914
Linus Torvalds1da177e2005-04-16 15:20:36 -07002915 put_task_struct(prev);
bibo maoc6fd91f2006-03-26 01:38:20 -08002916 }
Frederic Weisbecker99e5ada2013-04-20 17:11:50 +02002917
Frederic Weisbeckerde734f82015-06-11 18:07:12 +02002918 tick_nohz_task_switch();
Oleg Nesterovdfa50b62014-10-09 21:32:32 +02002919 return rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002920}
2921
Gregory Haskins3f029d32009-07-29 11:08:47 -04002922#ifdef CONFIG_SMP
2923
Gregory Haskins3f029d32009-07-29 11:08:47 -04002924/* rq->lock is NOT held, but preemption is disabled */
Peter Zijlstrae3fca9e2015-06-11 14:46:37 +02002925static void __balance_callback(struct rq *rq)
Gregory Haskins3f029d32009-07-29 11:08:47 -04002926{
Peter Zijlstrae3fca9e2015-06-11 14:46:37 +02002927 struct callback_head *head, *next;
2928 void (*func)(struct rq *rq);
2929 unsigned long flags;
Gregory Haskins3f029d32009-07-29 11:08:47 -04002930
Peter Zijlstrae3fca9e2015-06-11 14:46:37 +02002931 raw_spin_lock_irqsave(&rq->lock, flags);
2932 head = rq->balance_callback;
2933 rq->balance_callback = NULL;
2934 while (head) {
2935 func = (void (*)(struct rq *))head->func;
2936 next = head->next;
2937 head->next = NULL;
2938 head = next;
Gregory Haskins3f029d32009-07-29 11:08:47 -04002939
Peter Zijlstrae3fca9e2015-06-11 14:46:37 +02002940 func(rq);
Gregory Haskins3f029d32009-07-29 11:08:47 -04002941 }
Peter Zijlstrae3fca9e2015-06-11 14:46:37 +02002942 raw_spin_unlock_irqrestore(&rq->lock, flags);
2943}
2944
2945static inline void balance_callback(struct rq *rq)
2946{
2947 if (unlikely(rq->balance_callback))
2948 __balance_callback(rq);
Gregory Haskins3f029d32009-07-29 11:08:47 -04002949}
2950
2951#else
2952
Peter Zijlstrae3fca9e2015-06-11 14:46:37 +02002953static inline void balance_callback(struct rq *rq)
Gregory Haskins3f029d32009-07-29 11:08:47 -04002954{
2955}
2956
2957#endif
2958
Linus Torvalds1da177e2005-04-16 15:20:36 -07002959/**
2960 * schedule_tail - first thing a freshly forked thread must call.
2961 * @prev: the thread we just switched away from.
2962 */
Andi Kleen722a9f92014-05-02 00:44:38 +02002963asmlinkage __visible void schedule_tail(struct task_struct *prev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002964 __releases(rq->lock)
2965{
Oleg Nesterov1a43a142014-10-08 21:36:44 +02002966 struct rq *rq;
Ingo Molnar70b97a72006-07-03 00:25:42 -07002967
Peter Zijlstra609ca062015-09-28 17:52:18 +02002968 /*
2969 * New tasks start with FORK_PREEMPT_COUNT, see there and
2970 * finish_task_switch() for details.
2971 *
2972	 * finish_task_switch() will drop rq->lock and lower preempt_count
2973 * and the preempt_enable() will end up enabling preemption (on
2974 * PREEMPT_COUNT kernels).
2975 */
2976
Oleg Nesterovdfa50b62014-10-09 21:32:32 +02002977 rq = finish_task_switch(prev);
Peter Zijlstrae3fca9e2015-06-11 14:46:37 +02002978 balance_callback(rq);
Oleg Nesterov1a43a142014-10-08 21:36:44 +02002979 preempt_enable();
Steven Rostedtda19ab52009-07-29 00:21:22 -04002980
Linus Torvalds1da177e2005-04-16 15:20:36 -07002981 if (current->set_child_tid)
Pavel Emelyanovb4888932007-10-18 23:40:14 -07002982 put_user(task_pid_vnr(current), current->set_child_tid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002983}
2984
2985/*
Oleg Nesterovdfa50b62014-10-09 21:32:32 +02002986 * context_switch - switch to the new MM and the new thread's register state.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002987 */
Josh Poimboeuf04936942016-02-28 22:22:39 -06002988static __always_inline struct rq *
Ingo Molnar70b97a72006-07-03 00:25:42 -07002989context_switch(struct rq *rq, struct task_struct *prev,
Matt Fleming5a91d732016-09-21 14:38:10 +01002990 struct task_struct *next, struct rq_flags *rf)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002991{
Ingo Molnardd41f592007-07-09 18:51:59 +02002992 struct mm_struct *mm, *oldmm;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002993
Avi Kivitye107be32007-07-26 13:40:43 +02002994 prepare_task_switch(rq, prev, next);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01002995
Ingo Molnardd41f592007-07-09 18:51:59 +02002996 mm = next->mm;
2997 oldmm = prev->active_mm;
Zachary Amsden9226d122007-02-13 13:26:21 +01002998 /*
2999 * For paravirt, this is coupled with an exit in switch_to to
3000 * combine the page table reload and the switch backend into
3001 * one hypercall.
3002 */
Jeremy Fitzhardinge224101e2009-02-18 11:18:57 -08003003 arch_start_context_switch(prev);
Zachary Amsden9226d122007-02-13 13:26:21 +01003004
Heiko Carstens31915ab2010-09-16 14:42:25 +02003005 if (!mm) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003006 next->active_mm = oldmm;
3007 atomic_inc(&oldmm->mm_count);
3008 enter_lazy_tlb(oldmm, next);
3009 } else
Andy Lutomirskif98db602016-04-26 09:39:06 -07003010 switch_mm_irqs_off(oldmm, mm, next);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003011
Heiko Carstens31915ab2010-09-16 14:42:25 +02003012 if (!prev->mm) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003013 prev->active_mm = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003014 rq->prev_mm = oldmm;
3015 }
Ingo Molnar3a5f5e42006-07-14 00:24:27 -07003016 /*
3017	 * The runqueue lock will be released by the next
3018	 * task (which is an invalid locking op but in the case
3019	 * of the scheduler it's an obvious special-case), so we
3020	 * do an early lockdep release here:
3021 */
Matt Fleming5a91d732016-09-21 14:38:10 +01003022 rq_unpin_lock(rq, rf);
Ingo Molnar8a25d5d2006-07-03 00:24:54 -07003023 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003024
3025 /* Here we just switch the register state and the stack. */
3026 switch_to(prev, next, prev);
Ingo Molnardd41f592007-07-09 18:51:59 +02003027 barrier();
Oleg Nesterovdfa50b62014-10-09 21:32:32 +02003028
3029 return finish_task_switch(prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003030}
3031
3032/*
Sha Zhengju1c3e8262013-02-20 17:14:38 +08003033 * nr_running and nr_context_switches:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003034 *
3035 * externally visible scheduler statistics: current number of runnable
Sha Zhengju1c3e8262013-02-20 17:14:38 +08003036 * threads, total number of context switches performed since bootup.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003037 */
3038unsigned long nr_running(void)
3039{
3040 unsigned long i, sum = 0;
3041
3042 for_each_online_cpu(i)
3043 sum += cpu_rq(i)->nr_running;
3044
3045 return sum;
3046}
3047
Tim Chen2ee507c2014-07-31 10:29:48 -07003048/*
3049 * Check if only the current task is running on the cpu.
Dominik Dingel00cc1632015-09-18 11:27:45 +02003050 *
3051 * Caution: this function does not check that the caller has disabled
3052 * preemption, thus the result might have a time-of-check-to-time-of-use
3053 * race. The caller is responsible for using it correctly, for example:
3054 *
3055 * - from a non-preemptable section (of course)
3056 *
3057 * - from a thread that is bound to a single CPU
3058 *
3059 * - in a loop with very short iterations (e.g. a polling loop)
Tim Chen2ee507c2014-07-31 10:29:48 -07003060 */
3061bool single_task_running(void)
3062{
Dominik Dingel00cc1632015-09-18 11:27:45 +02003063 return raw_rq()->nr_running == 1;
Tim Chen2ee507c2014-07-31 10:29:48 -07003064}
3065EXPORT_SYMBOL(single_task_running);
3066
Linus Torvalds1da177e2005-04-16 15:20:36 -07003067unsigned long long nr_context_switches(void)
3068{
Steven Rostedtcc94abf2006-06-27 02:54:31 -07003069 int i;
3070 unsigned long long sum = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003071
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08003072 for_each_possible_cpu(i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003073 sum += cpu_rq(i)->nr_switches;
3074
3075 return sum;
3076}
3077
3078unsigned long nr_iowait(void)
3079{
3080 unsigned long i, sum = 0;
3081
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08003082 for_each_possible_cpu(i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003083 sum += atomic_read(&cpu_rq(i)->nr_iowait);
3084
3085 return sum;
3086}
3087
Peter Zijlstra8c215bd2010-07-01 09:07:17 +02003088unsigned long nr_iowait_cpu(int cpu)
Arjan van de Ven69d25872009-09-21 17:04:08 -07003089{
Peter Zijlstra8c215bd2010-07-01 09:07:17 +02003090 struct rq *this = cpu_rq(cpu);
Arjan van de Ven69d25872009-09-21 17:04:08 -07003091 return atomic_read(&this->nr_iowait);
3092}
3093
Joseph Lo77501862013-04-22 14:39:18 +08003094#ifdef CONFIG_CPU_QUIET
3095u64 nr_running_integral(unsigned int cpu)
3096{
3097 unsigned int seqcnt;
3098 u64 integral;
3099 struct rq *q;
3100
3101 if (cpu >= nr_cpu_ids)
3102 return 0;
3103
3104 q = cpu_rq(cpu);
3105
3106 /*
3107	 * Update the average to avoid reading a stale value if there were
3108	 * no run-queue changes for a long time. On the other hand, if
3109	 * the changes are happening right now, just read the current value
3110 * directly.
3111 */
3112
3113 seqcnt = read_seqcount_begin(&q->ave_seqcnt);
3114 integral = do_nr_running_integral(q);
3115 if (read_seqcount_retry(&q->ave_seqcnt, seqcnt)) {
3116 read_seqcount_begin(&q->ave_seqcnt);
3117 integral = q->nr_running_integral;
3118 }
3119
3120 return integral;
3121}
3122#endif
3123
Mel Gorman372ba8c2014-08-06 14:19:21 +01003124void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
3125{
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02003126 struct rq *rq = this_rq();
3127 *nr_waiters = atomic_read(&rq->nr_iowait);
3128 *load = rq->load.weight;
Mel Gorman372ba8c2014-08-06 14:19:21 +01003129}
3130
Puja Gupta487dec62017-06-27 10:13:50 -07003131#ifdef CONFIG_SMP
Ingo Molnardd41f592007-07-09 18:51:59 +02003132
Ingo Molnar48f24c42006-07-03 00:25:40 -07003133/*
Peter Zijlstra38022902009-12-16 18:04:37 +01003134 * sched_exec - execve() is a valuable balancing opportunity, because at
3135 * this point the task has the smallest effective memory and cache footprint.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003136 */
Peter Zijlstra38022902009-12-16 18:04:37 +01003137void sched_exec(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003138{
Peter Zijlstra38022902009-12-16 18:04:37 +01003139 struct task_struct *p = current;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003140 unsigned long flags;
Peter Zijlstra0017d732010-03-24 18:34:10 +01003141 int dest_cpu;
Peter Zijlstra38022902009-12-16 18:04:37 +01003142
Peter Zijlstra8f42ced2011-04-05 17:23:53 +02003143 raw_spin_lock_irqsave(&p->pi_lock, flags);
Peter Zijlstraac66f542013-10-07 11:29:16 +01003144 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
Peter Zijlstra0017d732010-03-24 18:34:10 +01003145 if (dest_cpu == smp_processor_id())
3146 goto unlock;
Peter Zijlstra38022902009-12-16 18:04:37 +01003147
Olav Haugan3f2cb302016-05-31 14:34:46 -07003148 if (likely(cpu_active(dest_cpu) && likely(!cpu_isolated(dest_cpu)))) {
Tejun Heo969c7922010-05-06 18:49:21 +02003149 struct migration_arg arg = { p, dest_cpu };
Ingo Molnar36c8b582006-07-03 00:25:41 -07003150
Peter Zijlstra8f42ced2011-04-05 17:23:53 +02003151 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3152 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003153 return;
3154 }
Peter Zijlstra0017d732010-03-24 18:34:10 +01003155unlock:
Peter Zijlstra8f42ced2011-04-05 17:23:53 +02003156 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003157}
3158
Linus Torvalds1da177e2005-04-16 15:20:36 -07003159#endif
3160
Linus Torvalds1da177e2005-04-16 15:20:36 -07003161DEFINE_PER_CPU(struct kernel_stat, kstat);
Glauber Costa3292beb2011-11-28 14:45:17 -02003162DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003163
3164EXPORT_PER_CPU_SYMBOL(kstat);
Glauber Costa3292beb2011-11-28 14:45:17 -02003165EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003166
3167/*
Giovanni Gherdovich60756202016-08-05 10:21:56 +02003168 * The function fair_sched_class.update_curr accesses the struct curr
3169 * and its field curr->exec_start; when called from task_sched_runtime(),
3170 * we observe a high rate of cache misses in practice.
3171 * Prefetching this data results in improved performance.
3172 */
3173static inline void prefetch_curr_exec_start(struct task_struct *p)
3174{
3175#ifdef CONFIG_FAIR_GROUP_SCHED
3176 struct sched_entity *curr = (&p->se)->cfs_rq->curr;
3177#else
3178 struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
3179#endif
3180 prefetch(curr);
3181 prefetch(&curr->exec_start);
3182}
3183
3184/*
Hidetoshi Setoc5f8d992009-03-31 16:56:03 +09003185 * Return accounted runtime for the task.
3186 * In case the task is currently running, return the runtime plus current's
3187 * pending runtime that has not been accounted yet.
3188 */
3189unsigned long long task_sched_runtime(struct task_struct *p)
3190{
Peter Zijlstraeb580752015-07-31 21:28:18 +02003191 struct rq_flags rf;
Hidetoshi Setoc5f8d992009-03-31 16:56:03 +09003192 struct rq *rq;
Stanislaw Gruszka6e998912014-11-12 16:58:44 +01003193 u64 ns;
Ingo Molnar48f24c42006-07-03 00:25:40 -07003194
Peter Zijlstra911b2892013-11-11 18:21:56 +01003195#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
3196 /*
3197 * 64-bit doesn't need locks to atomically read a 64bit value.
3198	 * So we have an optimization chance when the task's delta_exec is 0.
3199 * Reading ->on_cpu is racy, but this is ok.
3200 *
3201 * If we race with it leaving cpu, we'll take a lock. So we're correct.
3202 * If we race with it entering cpu, unaccounted time is 0. This is
3203 * indistinguishable from the read occurring a few cycles earlier.
Mike Galbraith4036ac12014-06-24 07:49:40 +02003204 * If we see ->on_cpu without ->on_rq, the task is leaving, and has
3205 * been accounted, so we're correct here as well.
Peter Zijlstra911b2892013-11-11 18:21:56 +01003206 */
Kirill Tkhaida0c1e62014-08-20 13:47:32 +04003207 if (!p->on_cpu || !task_on_rq_queued(p))
Peter Zijlstra911b2892013-11-11 18:21:56 +01003208 return p->se.sum_exec_runtime;
3209#endif
3210
Peter Zijlstraeb580752015-07-31 21:28:18 +02003211 rq = task_rq_lock(p, &rf);
Stanislaw Gruszka6e998912014-11-12 16:58:44 +01003212 /*
3213 * Must be ->curr _and_ ->on_rq. If dequeued, we would
3214 * project cycles that may never be accounted to this
3215 * thread, breaking clock_gettime().
3216 */
3217 if (task_current(rq, p) && task_on_rq_queued(p)) {
Giovanni Gherdovich60756202016-08-05 10:21:56 +02003218 prefetch_curr_exec_start(p);
Stanislaw Gruszka6e998912014-11-12 16:58:44 +01003219 update_rq_clock(rq);
3220 p->sched_class->update_curr(rq);
3221 }
3222 ns = p->se.sum_exec_runtime;
Peter Zijlstraeb580752015-07-31 21:28:18 +02003223 task_rq_unlock(rq, p, &rf);
Hidetoshi Setoc5f8d992009-03-31 16:56:03 +09003224
3225 return ns;
3226}
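
/*
 * For reference, userspace typically hits this path through the POSIX
 * CPU-time clocks (a sketch, assuming the usual posix-cpu-timers wiring):
 *
 *	struct timespec ts;
 *	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);	// per-thread runtime
 *
 * which is why the ->curr/->on_rq check above matters: projecting cycles
 * that are never accounted could make successive reads go backwards.
 */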
3227
Joonwoo Park01388ef2017-01-20 10:54:34 -08003228unsigned int capacity_margin_freq = 1280; /* ~20% margin */
3229
Balbir Singh49048622008-09-05 18:12:23 +02003230/*
Christoph Lameter7835b982006-12-10 02:20:22 -08003231 * This function gets called by the timer code, with HZ frequency.
3232 * We call it with interrupts disabled.
Christoph Lameter7835b982006-12-10 02:20:22 -08003233 */
3234void scheduler_tick(void)
3235{
Christoph Lameter7835b982006-12-10 02:20:22 -08003236 int cpu = smp_processor_id();
3237 struct rq *rq = cpu_rq(cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02003238 struct task_struct *curr = rq->curr;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003239 u64 wallclock;
3240 bool early_notif;
3241 u32 old_load;
3242 struct related_thread_group *grp;
Puja Gupta8cd9db42017-09-21 10:58:56 -07003243 unsigned int flag = 0;
Peter Zijlstra3e51f332008-05-03 18:29:28 +02003244
3245 sched_clock_tick();
Christoph Lameter7835b982006-12-10 02:20:22 -08003246
Thomas Gleixner05fa7852009-11-17 14:28:38 +01003247 raw_spin_lock(&rq->lock);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003248
3249 old_load = task_load(curr);
3250 set_window_start(rq);
3251
Pavankumar Kondetifaa04442018-06-25 16:13:39 +05303252 wallclock = sched_ktime_clock();
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003253 update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
Vikram Mulukutla4b54aae2017-03-20 13:41:37 -07003254
Joonwoo Parkf995dd12016-12-22 12:08:50 -08003255 update_rq_clock(rq);
3256 curr->sched_class->task_tick(rq, curr, 0);
3257 cpu_load_update_active(rq);
3258 calc_global_load_tick(rq);
Syed Rameez Mustafa20acfe72017-01-30 09:35:46 +05303259
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003260 early_notif = early_detection_notify(rq, wallclock);
Puja Gupta8cd9db42017-09-21 10:58:56 -07003261 if (early_notif)
3262 flag = SCHED_CPUFREQ_WALT | SCHED_CPUFREQ_EARLY_DET;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003263
Puja Gupta8cd9db42017-09-21 10:58:56 -07003264 cpufreq_update_util(rq, flag);
jianzhoub82a5df2019-04-28 13:43:53 +08003265
Johannes Weiner3df0e592018-10-26 15:06:27 -07003266 psi_task_tick(rq);
jianzhoub82a5df2019-04-28 13:43:53 +08003267
Thomas Gleixner05fa7852009-11-17 14:28:38 +01003268 raw_spin_unlock(&rq->lock);
Ingo Molnardd41f592007-07-09 18:51:59 +02003269
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003270 if (early_notif)
3271 atomic_notifier_call_chain(&load_alert_notifier_head,
3272 0, (void *)(long)cpu);
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02003273 perf_event_task_tick();
Peter Zijlstrae220d2d2009-05-23 18:28:55 +02003274
Christoph Lametere418e1c2006-12-10 02:20:23 -08003275#ifdef CONFIG_SMP
Suresh Siddha6eb57e02011-10-03 15:09:01 -07003276 rq->idle_balance = idle_cpu(cpu);
Daniel Lezcano7caff662014-01-06 12:34:38 +01003277 trigger_load_balance(rq);
Christoph Lametere418e1c2006-12-10 02:20:23 -08003278#endif
Frederic Weisbecker265f22a2013-05-03 03:39:05 +02003279 rq_last_tick_reset(rq);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003280
3281 rcu_read_lock();
3282 grp = task_related_thread_group(curr);
3283 if (update_preferred_cluster(grp, curr, old_load))
3284 set_preferred_cluster(grp);
3285 rcu_read_unlock();
3286
3287 if (curr->sched_class == &fair_sched_class)
3288 check_for_migration(rq, curr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003289}
3290
Frederic Weisbecker265f22a2013-05-03 03:39:05 +02003291#ifdef CONFIG_NO_HZ_FULL
3292/**
3293 * scheduler_tick_max_deferment
3294 *
3295 * Keep at least one tick per second when a single
3296 * active task is running because the scheduler doesn't
3297 * yet completely support a full dynticks environment.
3298 *
3299 * This makes sure that uptime, CFS vruntime, load
3300 * balancing, etc... continue to move forward, even
3301 * with a very low granularity.
Yacine Belkadie69f6182013-07-12 20:45:47 +02003302 *
3303 * Return: Maximum deferment in nanoseconds.
Frederic Weisbecker265f22a2013-05-03 03:39:05 +02003304 */
3305u64 scheduler_tick_max_deferment(void)
3306{
3307 struct rq *rq = this_rq();
Jason Low316c1608d2015-04-28 13:00:20 -07003308 unsigned long next, now = READ_ONCE(jiffies);
Frederic Weisbecker265f22a2013-05-03 03:39:05 +02003309
3310 next = rq->last_sched_tick + HZ;
3311
3312 if (time_before_eq(next, now))
3313 return 0;
3314
Kevin Hilman8fe8ff02014-01-15 14:51:38 +01003315 return jiffies_to_nsecs(next - now);
Frederic Weisbecker265f22a2013-05-03 03:39:05 +02003316}
3317#endif
3318
Steven Rostedt7e49fcc2009-01-22 19:01:40 -05003319#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
3320 defined(CONFIG_PREEMPT_TRACER))
Steven Rostedt47252cf2016-03-21 11:23:39 -04003321/*
Pavankumar Kondeti97f08d42018-01-09 11:02:30 +05303322 * preemptoff stack tracing threshold in ns.
3323 * default: 1ms
3324 */
3325unsigned int sysctl_preemptoff_tracing_threshold_ns = 1000000UL;
3326
3327struct preempt_store {
3328 u64 ts;
3329 unsigned long caddr[4];
3330 bool irqs_disabled;
3331};
3332
3333static DEFINE_PER_CPU(struct preempt_store, the_ps);
3334/*
Steven Rostedt47252cf2016-03-21 11:23:39 -04003335 * If the value passed in is equal to the current preempt count
3336 * then we just disabled preemption. Start timing the latency.
3337 */
3338static inline void preempt_latency_start(int val)
3339{
Pavankumar Kondeti97f08d42018-01-09 11:02:30 +05303340 struct preempt_store *ps = &per_cpu(the_ps, raw_smp_processor_id());
3341
Steven Rostedt47252cf2016-03-21 11:23:39 -04003342 if (preempt_count() == val) {
3343 unsigned long ip = get_lock_parent_ip();
3344#ifdef CONFIG_DEBUG_PREEMPT
3345 current->preempt_disable_ip = ip;
3346#endif
Pavankumar Kondeti97f08d42018-01-09 11:02:30 +05303347 ps->ts = sched_clock();
3348 ps->caddr[0] = CALLER_ADDR0;
3349 ps->caddr[1] = CALLER_ADDR1;
3350 ps->caddr[2] = CALLER_ADDR2;
3351 ps->caddr[3] = CALLER_ADDR3;
3352 ps->irqs_disabled = irqs_disabled();
3353
Steven Rostedt47252cf2016-03-21 11:23:39 -04003354 trace_preempt_off(CALLER_ADDR0, ip);
3355 }
3356}
Steven Rostedt7e49fcc2009-01-22 19:01:40 -05003357
Masami Hiramatsuedafe3a2014-04-17 17:18:42 +09003358void preempt_count_add(int val)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003359{
Steven Rostedt6cd8a4b2008-05-12 21:20:42 +02003360#ifdef CONFIG_DEBUG_PREEMPT
Linus Torvalds1da177e2005-04-16 15:20:36 -07003361 /*
3362 * Underflow?
3363 */
Ingo Molnar9a11b49a2006-07-03 00:24:33 -07003364 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3365 return;
Steven Rostedt6cd8a4b2008-05-12 21:20:42 +02003366#endif
Peter Zijlstrabdb43802013-09-10 12:15:23 +02003367 __preempt_count_add(val);
Steven Rostedt6cd8a4b2008-05-12 21:20:42 +02003368#ifdef CONFIG_DEBUG_PREEMPT
Linus Torvalds1da177e2005-04-16 15:20:36 -07003369 /*
3370 * Spinlock count overflowing soon?
3371 */
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08003372 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3373 PREEMPT_MASK - 10);
Steven Rostedt6cd8a4b2008-05-12 21:20:42 +02003374#endif
Steven Rostedt47252cf2016-03-21 11:23:39 -04003375 preempt_latency_start(val);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003376}
Peter Zijlstrabdb43802013-09-10 12:15:23 +02003377EXPORT_SYMBOL(preempt_count_add);
Masami Hiramatsuedafe3a2014-04-17 17:18:42 +09003378NOKPROBE_SYMBOL(preempt_count_add);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003379
Steven Rostedt47252cf2016-03-21 11:23:39 -04003380/*
3381 * If the value passed in is equal to the current preempt count
3382 * then we just enabled preemption. Stop timing the latency.
3383 */
3384static inline void preempt_latency_stop(int val)
3385{
Pavankumar Kondeti97f08d42018-01-09 11:02:30 +05303386 if (preempt_count() == val) {
3387 struct preempt_store *ps = &per_cpu(the_ps,
3388 raw_smp_processor_id());
3389 u64 delta = sched_clock() - ps->ts;
3390
3391 /*
3392 * Trace preempt disable stack if preemption
3393 * is disabled for more than the threshold.
3394 */
3395 if (delta > sysctl_preemptoff_tracing_threshold_ns)
3396 trace_sched_preempt_disable(delta, ps->irqs_disabled,
3397 ps->caddr[0], ps->caddr[1],
3398 ps->caddr[2], ps->caddr[3]);
Steven Rostedt47252cf2016-03-21 11:23:39 -04003399 trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
Pavankumar Kondeti97f08d42018-01-09 11:02:30 +05303400 }
Steven Rostedt47252cf2016-03-21 11:23:39 -04003401}
3402
Masami Hiramatsuedafe3a2014-04-17 17:18:42 +09003403void preempt_count_sub(int val)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003404{
Steven Rostedt6cd8a4b2008-05-12 21:20:42 +02003405#ifdef CONFIG_DEBUG_PREEMPT
Linus Torvalds1da177e2005-04-16 15:20:36 -07003406 /*
3407 * Underflow?
3408 */
Ingo Molnar01e3eb82009-01-12 13:00:50 +01003409 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
Ingo Molnar9a11b49a2006-07-03 00:24:33 -07003410 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003411 /*
3412 * Is the spinlock portion underflowing?
3413 */
Ingo Molnar9a11b49a2006-07-03 00:24:33 -07003414 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3415 !(preempt_count() & PREEMPT_MASK)))
3416 return;
Steven Rostedt6cd8a4b2008-05-12 21:20:42 +02003417#endif
Ingo Molnar9a11b49a2006-07-03 00:24:33 -07003418
Steven Rostedt47252cf2016-03-21 11:23:39 -04003419 preempt_latency_stop(val);
Peter Zijlstrabdb43802013-09-10 12:15:23 +02003420 __preempt_count_sub(val);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003421}
Peter Zijlstrabdb43802013-09-10 12:15:23 +02003422EXPORT_SYMBOL(preempt_count_sub);
Masami Hiramatsuedafe3a2014-04-17 17:18:42 +09003423NOKPROBE_SYMBOL(preempt_count_sub);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003424
Steven Rostedt47252cf2016-03-21 11:23:39 -04003425#else
3426static inline void preempt_latency_start(int val) { }
3427static inline void preempt_latency_stop(int val) { }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003428#endif
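
/*
 * Worked example of the start/stop pairing above, with nesting (under
 * CONFIG_DEBUG_PREEMPT or CONFIG_PREEMPT_TRACER, preempt_disable() maps
 * to preempt_count_add(1)):
 *
 *	preempt_disable();	// count 0 -> 1 == val: preempt_latency_start()
 *	preempt_disable();	// count 1 -> 2 != val: ignored
 *	...
 *	preempt_enable();	// count 2 != val: ignored
 *	preempt_enable();	// count 1 == val: preempt_latency_stop()
 *
 * so the reported latency covers the whole outermost preempt-disabled
 * region.
 */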
3429
3430/*
Ingo Molnardd41f592007-07-09 18:51:59 +02003431 * Print scheduling while atomic bug:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003432 */
Ingo Molnardd41f592007-07-09 18:51:59 +02003433static noinline void __schedule_bug(struct task_struct *prev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003434{
Vegard Nossumd1c6d142016-07-23 09:46:39 +02003435 /* Save this before calling printk(), since that will clobber it */
3436 unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
3437
Dave Jones664dfa62011-12-22 16:39:30 -05003438 if (oops_in_progress)
3439 return;
3440
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01003441 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3442 prev->comm, prev->pid, preempt_count());
Satyam Sharma838225b2007-10-24 18:23:50 +02003443
Ingo Molnardd41f592007-07-09 18:51:59 +02003444 debug_show_held_locks(prev);
Arjan van de Vene21f5b12008-05-23 09:05:58 -07003445 print_modules();
Ingo Molnardd41f592007-07-09 18:51:59 +02003446 if (irqs_disabled())
3447 print_irqtrace_events(prev);
Vegard Nossumd1c6d142016-07-23 09:46:39 +02003448 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
3449 && in_atomic_preempt_off()) {
Thomas Gleixner8f47b182014-02-07 20:58:39 +01003450 pr_err("Preemption disabled at:");
Vegard Nossumd1c6d142016-07-23 09:46:39 +02003451 print_ip_sym(preempt_disable_ip);
Thomas Gleixner8f47b182014-02-07 20:58:39 +01003452 pr_cont("\n");
3453 }
Daniel Bristot de Oliveira748c7202016-06-03 17:10:18 -03003454 if (panic_on_warn)
3455 panic("scheduling while atomic\n");
3456
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003457#ifdef CONFIG_PANIC_ON_SCHED_BUG
3458 BUG();
3459#endif
Stephen Boyd6135fc12012-03-28 17:10:47 -07003460 dump_stack();
Rusty Russell373d4d02013-01-21 17:17:39 +10303461 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
Ingo Molnardd41f592007-07-09 18:51:59 +02003462}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003463
Ingo Molnardd41f592007-07-09 18:51:59 +02003464/*
3465 * Various schedule()-time debugging checks and statistics:
3466 */
3467static inline void schedule_debug(struct task_struct *prev)
3468{
Aaron Tomlin0d9e2632014-09-12 14:16:19 +01003469#ifdef CONFIG_SCHED_STACK_END_CHECK
Jann Horn29d64552016-06-01 11:55:07 +02003470 if (task_stack_end_corrupted(prev))
3471 panic("corrupted stack end detected inside scheduler\n");
Aaron Tomlin0d9e2632014-09-12 14:16:19 +01003472#endif
Peter Zijlstrab99def82015-09-28 18:02:03 +02003473
Peter Zijlstra1dc0fff2015-09-28 17:57:39 +02003474 if (unlikely(in_atomic_preempt_off())) {
Ingo Molnardd41f592007-07-09 18:51:59 +02003475 __schedule_bug(prev);
Peter Zijlstra1dc0fff2015-09-28 17:57:39 +02003476 preempt_count_set(PREEMPT_DISABLED);
3477 }
Paul E. McKenneyb3fbab02011-05-24 08:31:09 -07003478 rcu_sleep_check();
Ingo Molnardd41f592007-07-09 18:51:59 +02003479
Linus Torvalds1da177e2005-04-16 15:20:36 -07003480 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3481
Josh Poimboeufae928822016-06-17 12:43:24 -05003482 schedstat_inc(this_rq()->sched_count);
Ingo Molnardd41f592007-07-09 18:51:59 +02003483}
3484
3485/*
3486 * Pick up the highest-prio task:
3487 */
3488static inline struct task_struct *
Matt Fleming5a91d732016-09-21 14:38:10 +01003489pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
Ingo Molnardd41f592007-07-09 18:51:59 +02003490{
Peter Zijlstra37e117c2014-02-14 12:25:08 +01003491 const struct sched_class *class = &fair_sched_class;
Ingo Molnardd41f592007-07-09 18:51:59 +02003492 struct task_struct *p;
3493
3494 /*
3495 * Optimization: we know that if all tasks are in
3496 * the fair class we can call that function directly:
3497 */
Peter Zijlstra37e117c2014-02-14 12:25:08 +01003498 if (likely(prev->sched_class == class &&
Peter Zijlstra38033c32014-01-23 20:32:21 +01003499 rq->nr_running == rq->cfs.h_nr_running)) {
Matt Fleming5a91d732016-09-21 14:38:10 +01003500 p = fair_sched_class.pick_next_task(rq, prev, rf);
Peter Zijlstra6ccdc842014-04-24 12:00:47 +02003501 if (unlikely(p == RETRY_TASK))
3502 goto again;
3503
3504 /* assumes fair_sched_class->next == idle_sched_class */
3505 if (unlikely(!p))
Matt Fleming5a91d732016-09-21 14:38:10 +01003506 p = idle_sched_class.pick_next_task(rq, prev, rf);
Peter Zijlstra6ccdc842014-04-24 12:00:47 +02003507
3508 return p;
Ingo Molnardd41f592007-07-09 18:51:59 +02003509 }
3510
Peter Zijlstra37e117c2014-02-14 12:25:08 +01003511again:
Peter Zijlstra34f971f2010-09-22 13:53:15 +02003512 for_each_class(class) {
Matt Fleming5a91d732016-09-21 14:38:10 +01003513 p = class->pick_next_task(rq, prev, rf);
Peter Zijlstra37e117c2014-02-14 12:25:08 +01003514 if (p) {
3515 if (unlikely(p == RETRY_TASK))
3516 goto again;
Ingo Molnardd41f592007-07-09 18:51:59 +02003517 return p;
Peter Zijlstra37e117c2014-02-14 12:25:08 +01003518 }
Ingo Molnardd41f592007-07-09 18:51:59 +02003519 }
Peter Zijlstra34f971f2010-09-22 13:53:15 +02003520
3521 BUG(); /* the idle class will always have a runnable task */
Ingo Molnardd41f592007-07-09 18:51:59 +02003522}
3523
3524/*
Thomas Gleixnerc259e012011-06-22 19:47:00 +02003525 * __schedule() is the main scheduler function.
Pekka Enbergedde96e2012-08-04 11:49:47 +03003526 *
3527 * The main means of driving the scheduler and thus entering this function are:
3528 *
3529 * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
3530 *
3531 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
3532 * paths. For example, see arch/x86/entry_64.S.
3533 *
3534 * To drive preemption between tasks, the scheduler sets the flag in timer
3535 * interrupt handler scheduler_tick().
3536 *
3537 * 3. Wakeups don't really cause entry into schedule(). They add a
3538 * task to the run-queue and that's it.
3539 *
3540 * Now, if the new task added to the run-queue preempts the current
3541 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
3542 * called on the nearest possible occasion:
3543 *
3544 * - If the kernel is preemptible (CONFIG_PREEMPT=y):
3545 *
3546 *	   - in syscall or exception context, at the next outermost
3547 * preempt_enable(). (this might be as soon as the wake_up()'s
3548 * spin_unlock()!)
3549 *
3550 * - in IRQ context, return from interrupt-handler to
3551 * preemptible context
3552 *
3553 * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
3554 * then at the next:
3555 *
3556 * - cond_resched() call
3557 * - explicit schedule() call
3558 * - return from syscall or exception to user-space
3559 * - return from interrupt-handler to user-space
Frederic Weisbeckerbfd9b2b2015-01-28 01:24:09 +01003560 *
Frederic Weisbeckerb30f0e32015-05-12 16:41:49 +02003561 * WARNING: must be called with preemption disabled!
Ingo Molnardd41f592007-07-09 18:51:59 +02003562 */
Peter Zijlstra499d7952015-09-28 18:52:36 +02003563static void __sched notrace __schedule(bool preempt)
Ingo Molnardd41f592007-07-09 18:51:59 +02003564{
3565 struct task_struct *prev, *next;
Harvey Harrison67ca7bd2008-02-15 09:56:36 -08003566 unsigned long *switch_count;
Matt Fleming5a91d732016-09-21 14:38:10 +01003567 struct rq_flags rf;
Ingo Molnardd41f592007-07-09 18:51:59 +02003568 struct rq *rq;
Peter Zijlstra31656512008-07-18 18:01:23 +02003569 int cpu;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07003570 u64 wallclock;
Ingo Molnardd41f592007-07-09 18:51:59 +02003571
Ingo Molnardd41f592007-07-09 18:51:59 +02003572 cpu = smp_processor_id();
3573 rq = cpu_rq(cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02003574 prev = rq->curr;
Ingo Molnardd41f592007-07-09 18:51:59 +02003575
Ingo Molnardd41f592007-07-09 18:51:59 +02003576 schedule_debug(prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003577
Peter Zijlstra31656512008-07-18 18:01:23 +02003578 if (sched_feat(HRTICK))
Mike Galbraithf333fdc2008-05-12 21:20:55 +02003579 hrtick_clear(rq);
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01003580
Paul E. McKenney46a5d162015-10-07 09:10:48 -07003581 local_irq_disable();
3582 rcu_note_context_switch();
3583
Oleg Nesterove0acd0a2013-08-12 18:14:00 +02003584 /*
3585 * Make sure that signal_pending_state()->signal_pending() below
3586 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
3587 * done by the caller to avoid the race with signal_wake_up().
3588 */
3589 smp_mb__before_spinlock();
Paul E. McKenney46a5d162015-10-07 09:10:48 -07003590 raw_spin_lock(&rq->lock);
Matt Fleming5a91d732016-09-21 14:38:10 +01003591 rq_pin_lock(rq, &rf);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003592
Peter Zijlstra9edfbfe2015-01-05 11:18:11 +01003593 rq->clock_skip_update <<= 1; /* promote REQ to ACT */
3594
Oleg Nesterov246d86b2010-05-19 14:57:11 +02003595 switch_count = &prev->nivcsw;
Peter Zijlstrafc13aeb2015-09-28 18:05:34 +02003596 if (!preempt && prev->state) {
Tejun Heo21aa9af2010-06-08 21:40:37 +02003597 if (unlikely(signal_pending_state(prev->state, prev))) {
Ingo Molnardd41f592007-07-09 18:51:59 +02003598 prev->state = TASK_RUNNING;
Tejun Heo21aa9af2010-06-08 21:40:37 +02003599 } else {
Peter Zijlstra2acca552011-04-05 17:23:50 +02003600 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3601 prev->on_rq = 0;
3602
Tejun Heo21aa9af2010-06-08 21:40:37 +02003603 /*
Peter Zijlstra2acca552011-04-05 17:23:50 +02003604 * If a worker went to sleep, notify and ask workqueue
3605 * whether it wants to wake up a task to maintain
3606 * concurrency.
Tejun Heo21aa9af2010-06-08 21:40:37 +02003607 */
3608 if (prev->flags & PF_WQ_WORKER) {
3609 struct task_struct *to_wakeup;
3610
Alexander Gordeev9b7f6592016-03-02 12:53:31 +01003611 to_wakeup = wq_worker_sleeping(prev);
Tejun Heo21aa9af2010-06-08 21:40:37 +02003612 if (to_wakeup)
Matt Fleming5a91d732016-09-21 14:38:10 +01003613 try_to_wake_up_local(to_wakeup, &rf);
Tejun Heo21aa9af2010-06-08 21:40:37 +02003614 }
Tejun Heo21aa9af2010-06-08 21:40:37 +02003615 }
Ingo Molnardd41f592007-07-09 18:51:59 +02003616 switch_count = &prev->nvcsw;
3617 }
3618
Peter Zijlstra9edfbfe2015-01-05 11:18:11 +01003619 if (task_on_rq_queued(prev))
Peter Zijlstra606dba22012-02-11 06:05:00 +01003620 update_rq_clock(rq);
3621
Matt Fleming5a91d732016-09-21 14:38:10 +01003622 next = pick_next_task(rq, prev, &rf);
jianzhoub82a5df2019-04-28 13:43:53 +08003623 wallclock = sched_ktime_clock();
Mike Galbraithf26f9af2010-12-08 11:05:42 +01003624 clear_tsk_need_resched(prev);
Peter Zijlstraf27dde82013-08-14 14:55:31 +02003625 clear_preempt_need_resched();
Peter Zijlstra9edfbfe2015-01-05 11:18:11 +01003626 rq->clock_skip_update = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003627
Linus Torvalds1da177e2005-04-16 15:20:36 -07003628 if (likely(prev != next)) {
Joonwoo Park84a80882017-02-03 11:15:31 -08003629 if (!prev->on_rq)
3630 prev->last_sleep_ts = wallclock;
3631
Pavankumar Kondeti601ebbd2016-09-30 19:48:22 +05303632 update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
3633 update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003634 rq->nr_switches++;
3635 rq->curr = next;
3636 ++*switch_count;
3637
Peter Zijlstrac73464b2015-09-28 18:06:56 +02003638 trace_sched_switch(preempt, prev, next);
Matt Fleming5a91d732016-09-21 14:38:10 +01003639 rq = context_switch(rq, prev, next, &rf); /* unlocks the rq */
Peter Zijlstracbce1a62015-06-11 14:46:54 +02003640 } else {
Pavankumar Kondeti601ebbd2016-09-30 19:48:22 +05303641 update_task_ravg(prev, rq, TASK_UPDATE, wallclock, 0);
Matt Fleming5a91d732016-09-21 14:38:10 +01003642 rq_unpin_lock(rq, &rf);
Thomas Gleixner05fa7852009-11-17 14:28:38 +01003643 raw_spin_unlock_irq(&rq->lock);
Peter Zijlstracbce1a62015-06-11 14:46:54 +02003644 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003645
Peter Zijlstrae3fca9e2015-06-11 14:46:37 +02003646 balance_callback(rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003647}
Thomas Gleixnerc259e012011-06-22 19:47:00 +02003648
Peter Zijlstra9af65282016-09-13 18:37:29 +02003649void __noreturn do_task_dead(void)
3650{
3651 /*
3652 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
3653 * when the following two conditions become true.
3654	 * - There is a race condition on mmap_sem (it is acquired by
3655	 *   exit_mm()), and
3656	 * - an SMI occurs before setting TASK_RUNNING
3657	 *   (or the hypervisor of a virtual machine switches to another guest).
3658 * As a result, we may become TASK_RUNNING after becoming TASK_DEAD
3659 *
3660	 * To avoid this, we have to wait until tsk->pi_lock, which is held
3661	 * by try_to_wake_up(), is released.
3662 */
3663 smp_mb();
3664 raw_spin_unlock_wait(&current->pi_lock);
3665
3666 /* causes final put_task_struct in finish_task_switch(). */
3667 __set_current_state(TASK_DEAD);
3668 current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
3669 __schedule(false);
3670 BUG();
3671 /* Avoid "noreturn function does return". */
3672 for (;;)
3673 cpu_relax(); /* For when BUG is null */
3674}
3675
Thomas Gleixner9c40cef2011-06-22 19:47:01 +02003676static inline void sched_submit_work(struct task_struct *tsk)
3677{
Thomas Gleixner3c7d5182011-07-17 20:46:52 +02003678 if (!tsk->state || tsk_is_pi_blocked(tsk))
Thomas Gleixner9c40cef2011-06-22 19:47:01 +02003679 return;
3680 /*
3681 * If we are going to sleep and we have plugged IO queued,
3682 * make sure to submit it to avoid deadlocks.
3683 */
3684 if (blk_needs_flush_plug(tsk))
3685 blk_schedule_flush_plug(tsk);
3686}
3687
Andi Kleen722a9f92014-05-02 00:44:38 +02003688asmlinkage __visible void __sched schedule(void)
Thomas Gleixnerc259e012011-06-22 19:47:00 +02003689{
Thomas Gleixner9c40cef2011-06-22 19:47:01 +02003690 struct task_struct *tsk = current;
3691
3692 sched_submit_work(tsk);
Frederic Weisbeckerbfd9b2b2015-01-28 01:24:09 +01003693 do {
Frederic Weisbeckerb30f0e32015-05-12 16:41:49 +02003694 preempt_disable();
Peter Zijlstrafc13aeb2015-09-28 18:05:34 +02003695 __schedule(false);
Frederic Weisbeckerb30f0e32015-05-12 16:41:49 +02003696 sched_preempt_enable_no_resched();
Frederic Weisbeckerbfd9b2b2015-01-28 01:24:09 +01003697 } while (need_resched());
Thomas Gleixnerc259e012011-06-22 19:47:00 +02003698}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003699EXPORT_SYMBOL(schedule);
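
/*
 * Illustrative sketch (not part of this file) of the canonical sleep
 * pattern that the ordering requirements discussed in __schedule() above
 * rely on; "condition" is a made-up placeholder and the test is normally
 * done in a loop:
 *
 *	set_current_state(TASK_INTERRUPTIBLE);
 *	if (!condition)
 *		schedule();
 *	__set_current_state(TASK_RUNNING);
 *
 * The state must be set before the condition is tested, so that a
 * concurrent waker which sets the condition and then calls wake_up()
 * cannot be missed.
 */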
3700
Frederic Weisbecker91d1aa432012-11-27 19:33:25 +01003701#ifdef CONFIG_CONTEXT_TRACKING
Andi Kleen722a9f92014-05-02 00:44:38 +02003702asmlinkage __visible void __sched schedule_user(void)
Frederic Weisbecker20ab65e32012-07-11 20:26:37 +02003703{
3704 /*
3705 * If we come here after a random call to set_need_resched(),
3706 * or we have been woken up remotely but the IPI has not yet arrived,
3707 * we haven't yet exited the RCU idle mode. Do it here manually until
3708 * we find a better solution.
Andy Lutomirski7cc78f82014-12-03 15:37:08 -08003709 *
3710 * NB: There are buggy callers of this function. Ideally we
Frederic Weisbeckerc467ea72015-03-04 18:06:33 +01003711 * should warn if prev_state != CONTEXT_USER, but that will trigger
Andy Lutomirski7cc78f82014-12-03 15:37:08 -08003712 * too frequently to make sense yet.
Frederic Weisbecker20ab65e32012-07-11 20:26:37 +02003713 */
Andy Lutomirski7cc78f82014-12-03 15:37:08 -08003714 enum ctx_state prev_state = exception_enter();
Frederic Weisbecker20ab65e32012-07-11 20:26:37 +02003715 schedule();
Andy Lutomirski7cc78f82014-12-03 15:37:08 -08003716 exception_exit(prev_state);
Frederic Weisbecker20ab65e32012-07-11 20:26:37 +02003717}
3718#endif
3719
Thomas Gleixnerc5491ea2011-03-21 12:09:35 +01003720/**
3721 * schedule_preempt_disabled - called with preemption disabled
3722 *
3723 * Returns with preemption disabled. Note: preempt_count must be 1
3724 */
3725void __sched schedule_preempt_disabled(void)
3726{
Thomas Gleixnerba74c142011-03-21 13:32:17 +01003727 sched_preempt_enable_no_resched();
Thomas Gleixnerc5491ea2011-03-21 12:09:35 +01003728 schedule();
3729 preempt_disable();
3730}
3731
Frederic Weisbecker06b1f802015-02-16 19:20:07 +01003732static void __sched notrace preempt_schedule_common(void)
Frederic Weisbeckera18b5d02015-01-22 18:08:04 +01003733{
3734 do {
Steven Rostedt47252cf2016-03-21 11:23:39 -04003735 /*
3736 * Because the function tracer can trace preempt_count_sub()
3737 * and it also uses preempt_enable/disable_notrace(), if
3738 * NEED_RESCHED is set, the preempt_enable_notrace() called
3739 * by the function tracer will call this function again and
3740 * cause infinite recursion.
3741 *
3742 * Preemption must be disabled here before the function
3743 * tracer can trace. Break up preempt_disable() into two
3744 * calls. One to disable preemption without fear of being
3745 * traced. The other to still record the preemption latency,
3746 * which can also be traced by the function tracer.
3747 */
Peter Zijlstra499d7952015-09-28 18:52:36 +02003748 preempt_disable_notrace();
Steven Rostedt47252cf2016-03-21 11:23:39 -04003749 preempt_latency_start(1);
Peter Zijlstrafc13aeb2015-09-28 18:05:34 +02003750 __schedule(true);
Steven Rostedt47252cf2016-03-21 11:23:39 -04003751 preempt_latency_stop(1);
Peter Zijlstra499d7952015-09-28 18:52:36 +02003752 preempt_enable_no_resched_notrace();
Frederic Weisbeckera18b5d02015-01-22 18:08:04 +01003753
3754 /*
3755 * Check again in case we missed a preemption opportunity
3756 * between schedule and now.
3757 */
Frederic Weisbeckera18b5d02015-01-22 18:08:04 +01003758 } while (need_resched());
3759}
3760
Linus Torvalds1da177e2005-04-16 15:20:36 -07003761#ifdef CONFIG_PREEMPT
3762/*
Andreas Mohr2ed6e342006-07-10 04:43:52 -07003763 * This is the entry point to schedule() for in-kernel preemption
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01003764 * triggered by preempt_enable(). Kernel preemptions off the
Linus Torvalds1da177e2005-04-16 15:20:36 -07003765 * return-from-interrupt path go through preempt_schedule_irq() instead.
3766 */
Andi Kleen722a9f92014-05-02 00:44:38 +02003767asmlinkage __visible void __sched notrace preempt_schedule(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003768{
Linus Torvalds1da177e2005-04-16 15:20:36 -07003769 /*
3770 * If there is a non-zero preempt_count or interrupts are disabled,
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01003771 * we do not want to preempt the current task. Just return..
Linus Torvalds1da177e2005-04-16 15:20:36 -07003772 */
Frederic Weisbeckerfbb00b52013-06-19 23:56:22 +02003773 if (likely(!preemptible()))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003774 return;
3775
Frederic Weisbeckera18b5d02015-01-22 18:08:04 +01003776 preempt_schedule_common();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003777}
Masami Hiramatsu376e2422014-04-17 17:17:05 +09003778NOKPROBE_SYMBOL(preempt_schedule);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003779EXPORT_SYMBOL(preempt_schedule);
Oleg Nesterov009f60e2014-10-05 22:23:22 +02003780
Oleg Nesterov009f60e2014-10-05 22:23:22 +02003781/**
Frederic Weisbecker4eaca0a2015-06-04 17:39:08 +02003782 * preempt_schedule_notrace - preempt_schedule called by tracing
Oleg Nesterov009f60e2014-10-05 22:23:22 +02003783 *
3784 * The tracing infrastructure uses preempt_enable_notrace to prevent
3785 * recursion and tracing preempt enabling caused by the tracing
3786 * infrastructure itself. But as tracing can happen in areas coming
3787 * from userspace or just about to enter userspace, a preempt enable
3788 * can occur before user_exit() is called. This will cause the scheduler
3789 * to be called when the system is still in usermode.
3790 *
3791 * To prevent this, the preempt_enable_notrace will use this function
3792 * instead of preempt_schedule() to exit user context if needed before
3793 * calling the scheduler.
3794 */
Frederic Weisbecker4eaca0a2015-06-04 17:39:08 +02003795asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
Oleg Nesterov009f60e2014-10-05 22:23:22 +02003796{
3797 enum ctx_state prev_ctx;
3798
3799 if (likely(!preemptible()))
3800 return;
3801
3802 do {
Steven Rostedt47252cf2016-03-21 11:23:39 -04003803 /*
3804 * Because the function tracer can trace preempt_count_sub()
3805 * and it also uses preempt_enable/disable_notrace(), if
3806 * NEED_RESCHED is set, the preempt_enable_notrace() called
3807 * by the function tracer will call this function again and
3808 * cause infinite recursion.
3809 *
3810 * Preemption must be disabled here before the function
3811 * tracer can trace. Break up preempt_disable() into two
3812 * calls. One to disable preemption without fear of being
3813 * traced. The other to still record the preemption latency,
3814 * which can also be traced by the function tracer.
3815 */
Peter Zijlstra3d8f74d2015-09-28 18:09:19 +02003816 preempt_disable_notrace();
Steven Rostedt47252cf2016-03-21 11:23:39 -04003817 preempt_latency_start(1);
Oleg Nesterov009f60e2014-10-05 22:23:22 +02003818 /*
3819 * Needs preempt disabled in case user_exit() is traced
3820 * and the tracer calls preempt_enable_notrace() causing
3821 * an infinite recursion.
3822 */
3823 prev_ctx = exception_enter();
Peter Zijlstrafc13aeb2015-09-28 18:05:34 +02003824 __schedule(true);
Oleg Nesterov009f60e2014-10-05 22:23:22 +02003825 exception_exit(prev_ctx);
3826
Steven Rostedt47252cf2016-03-21 11:23:39 -04003827 preempt_latency_stop(1);
Peter Zijlstra3d8f74d2015-09-28 18:09:19 +02003828 preempt_enable_no_resched_notrace();
Oleg Nesterov009f60e2014-10-05 22:23:22 +02003829 } while (need_resched());
3830}
Frederic Weisbecker4eaca0a2015-06-04 17:39:08 +02003831EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
Oleg Nesterov009f60e2014-10-05 22:23:22 +02003832
Thomas Gleixner32e475d2013-11-21 12:41:44 +01003833#endif /* CONFIG_PREEMPT */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003834
3835/*
Andreas Mohr2ed6e342006-07-10 04:43:52 -07003836 * this is the entry point to schedule() from kernel preemption
Linus Torvalds1da177e2005-04-16 15:20:36 -07003837 * off of irq context.
3838	 * Note that this is called and returns with irqs disabled. This
3839	 * protects us against recursive calls from irq context.
3840 */
Andi Kleen722a9f92014-05-02 00:44:38 +02003841asmlinkage __visible void __sched preempt_schedule_irq(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003842{
Frederic Weisbeckerb22366c2013-02-24 12:59:30 +01003843 enum ctx_state prev_state;
Ingo Molnar6478d882008-01-25 21:08:33 +01003844
Andreas Mohr2ed6e342006-07-10 04:43:52 -07003845 /* Catch callers which need to be fixed */
Peter Zijlstraf27dde82013-08-14 14:55:31 +02003846 BUG_ON(preempt_count() || !irqs_disabled());
Linus Torvalds1da177e2005-04-16 15:20:36 -07003847
Frederic Weisbeckerb22366c2013-02-24 12:59:30 +01003848 prev_state = exception_enter();
3849
Andi Kleen3a5c3592007-10-15 17:00:14 +02003850 do {
Peter Zijlstra3d8f74d2015-09-28 18:09:19 +02003851 preempt_disable();
Andi Kleen3a5c3592007-10-15 17:00:14 +02003852 local_irq_enable();
Peter Zijlstrafc13aeb2015-09-28 18:05:34 +02003853 __schedule(true);
Andi Kleen3a5c3592007-10-15 17:00:14 +02003854 local_irq_disable();
Peter Zijlstra3d8f74d2015-09-28 18:09:19 +02003855 sched_preempt_enable_no_resched();
Lai Jiangshan5ed0cec2009-03-06 19:40:20 +08003856 } while (need_resched());
Frederic Weisbeckerb22366c2013-02-24 12:59:30 +01003857
3858 exception_exit(prev_state);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003859}
3860
Peter Zijlstra63859d42009-09-15 19:14:42 +02003861int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07003862 void *key)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003863{
Peter Zijlstra63859d42009-09-15 19:14:42 +02003864 return try_to_wake_up(curr->private, mode, wake_flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003865}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003866EXPORT_SYMBOL(default_wake_function);
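
/*
 * Illustrative sketch (not part of this file): default_wake_function() is
 * the ->func callback installed by init_waitqueue_entry(), so an ordinary
 * waitqueue waiter is ultimately woken through it. Roughly ("my_wq" and
 * "condition" are made-up names):
 *
 *	DECLARE_WAIT_QUEUE_HEAD(my_wq);
 *	DEFINE_WAIT_FUNC(wait, default_wake_function);
 *
 *	for (;;) {
 *		prepare_to_wait(&my_wq, &wait, TASK_INTERRUPTIBLE);
 *		if (condition)
 *			break;
 *		schedule();
 *	}
 *	finish_wait(&my_wq, &wait);
 */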
3867
Ingo Molnarb29739f2006-06-27 02:54:51 -07003868#ifdef CONFIG_RT_MUTEXES
3869
3870/*
3871 * rt_mutex_setprio - set the current priority of a task
3872 * @p: task
3873 * @prio: prio value (kernel-internal form)
3874 *
3875 * This function changes the 'effective' priority of a task. It does
3876 * not touch ->normal_prio like __setscheduler().
3877 *
Thomas Gleixnerc365c292014-02-07 20:58:42 +01003878 * Used by the rt_mutex code to implement priority inheritance
3879	 * logic. The call site only calls this if the priority of the task changed.
Ingo Molnarb29739f2006-06-27 02:54:51 -07003880 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07003881void rt_mutex_setprio(struct task_struct *p, int prio)
Ingo Molnarb29739f2006-06-27 02:54:51 -07003882{
Peter Zijlstraff77e462016-01-18 15:27:07 +01003883 int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
Thomas Gleixner83ab0aa2010-02-17 09:05:48 +01003884 const struct sched_class *prev_class;
Peter Zijlstraeb580752015-07-31 21:28:18 +02003885 struct rq_flags rf;
3886 struct rq *rq;
Ingo Molnarb29739f2006-06-27 02:54:51 -07003887
Dario Faggioliaab03e02013-11-28 11:14:43 +01003888 BUG_ON(prio > MAX_PRIO);
Ingo Molnarb29739f2006-06-27 02:54:51 -07003889
Peter Zijlstraeb580752015-07-31 21:28:18 +02003890 rq = __task_rq_lock(p, &rf);
Ingo Molnarb29739f2006-06-27 02:54:51 -07003891
Thomas Gleixner1c4dd992011-06-06 20:07:38 +02003892 /*
3893	 * Idle task boosting is a no-no in general. There is one
3894	 * exception, when PREEMPT_RT and NOHZ are active:
3895 *
3896 * The idle task calls get_next_timer_interrupt() and holds
3897 * the timer wheel base->lock on the CPU and another CPU wants
3898 * to access the timer (probably to cancel it). We can safely
3899 * ignore the boosting request, as the idle CPU runs this code
3900 * with interrupts disabled and will complete the lock
3901 * protected section without being interrupted. So there is no
3902 * real need to boost.
3903 */
3904 if (unlikely(p == rq->idle)) {
3905 WARN_ON(p != rq->curr);
3906 WARN_ON(p->pi_blocked_on);
3907 goto out_unlock;
3908 }
3909
Steven Rostedta8027072010-09-20 15:13:34 -04003910 trace_sched_pi_setprio(p, prio);
Andrew Mortond5f9f942007-05-08 20:27:06 -07003911 oldprio = p->prio;
Peter Zijlstraff77e462016-01-18 15:27:07 +01003912
3913 if (oldprio == prio)
3914 queue_flag &= ~DEQUEUE_MOVE;
3915
Thomas Gleixner83ab0aa2010-02-17 09:05:48 +01003916 prev_class = p->sched_class;
Kirill Tkhaida0c1e62014-08-20 13:47:32 +04003917 queued = task_on_rq_queued(p);
Dmitry Adamushko051a1d12007-12-18 15:21:13 +01003918 running = task_current(rq, p);
Kirill Tkhaida0c1e62014-08-20 13:47:32 +04003919 if (queued)
Peter Zijlstraff77e462016-01-18 15:27:07 +01003920 dequeue_task(rq, p, queue_flag);
Hiroshi Shimamoto0e1f3482008-03-10 11:01:20 -07003921 if (running)
Kirill Tkhaif3cd1c42014-09-12 17:41:40 +04003922 put_prev_task(rq, p);
Ingo Molnardd41f592007-07-09 18:51:59 +02003923
Dario Faggioli2d3d8912013-11-07 14:43:44 +01003924 /*
3925	 * Boosting conditions are:
3926 * 1. -rt task is running and holds mutex A
3927 * --> -dl task blocks on mutex A
3928 *
3929 * 2. -dl task is running and holds mutex A
3930 * --> -dl task blocks on mutex A and could preempt the
3931 * running task
3932 */
3933 if (dl_prio(prio)) {
Oleg Nesterov466af292014-06-06 18:52:06 +02003934 struct task_struct *pi_task = rt_mutex_get_top_task(p);
3935 if (!dl_prio(p->normal_prio) ||
3936 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
Dario Faggioli2d3d8912013-11-07 14:43:44 +01003937 p->dl.dl_boosted = 1;
Peter Zijlstraff77e462016-01-18 15:27:07 +01003938 queue_flag |= ENQUEUE_REPLENISH;
Dario Faggioli2d3d8912013-11-07 14:43:44 +01003939 } else
3940 p->dl.dl_boosted = 0;
Dario Faggioliaab03e02013-11-28 11:14:43 +01003941 p->sched_class = &dl_sched_class;
Dario Faggioli2d3d8912013-11-07 14:43:44 +01003942 } else if (rt_prio(prio)) {
3943 if (dl_prio(oldprio))
3944 p->dl.dl_boosted = 0;
3945 if (oldprio < prio)
Peter Zijlstraff77e462016-01-18 15:27:07 +01003946 queue_flag |= ENQUEUE_HEAD;
Ingo Molnardd41f592007-07-09 18:51:59 +02003947 p->sched_class = &rt_sched_class;
Dario Faggioli2d3d8912013-11-07 14:43:44 +01003948 } else {
3949 if (dl_prio(oldprio))
3950 p->dl.dl_boosted = 0;
Brian Silverman746db942015-02-18 16:23:56 -08003951 if (rt_prio(oldprio))
3952 p->rt.timeout = 0;
Ingo Molnardd41f592007-07-09 18:51:59 +02003953 p->sched_class = &fair_sched_class;
Dario Faggioli2d3d8912013-11-07 14:43:44 +01003954 }
Ingo Molnardd41f592007-07-09 18:51:59 +02003955
Ingo Molnarb29739f2006-06-27 02:54:51 -07003956 p->prio = prio;
3957
Kirill Tkhaida0c1e62014-08-20 13:47:32 +04003958 if (queued)
Peter Zijlstraff77e462016-01-18 15:27:07 +01003959 enqueue_task(rq, p, queue_flag);
Vincent Guittota399d232016-09-12 09:47:52 +02003960 if (running)
Peter Zijlstrab2bf6c32016-09-20 22:00:38 +02003961 set_curr_task(rq, p);
Steven Rostedtcb469842008-01-25 21:08:22 +01003962
Peter Zijlstrada7a7352011-01-17 17:03:27 +01003963 check_class_changed(rq, p, prev_class, oldprio);
Thomas Gleixner1c4dd992011-06-06 20:07:38 +02003964out_unlock:
Peter Zijlstra4c9a4bc2015-06-11 14:46:39 +02003965 preempt_disable(); /* avoid rq from going away on us */
Peter Zijlstraeb580752015-07-31 21:28:18 +02003966 __task_rq_unlock(rq, &rf);
Peter Zijlstra4c9a4bc2015-06-11 14:46:39 +02003967
3968 balance_callback(rq);
3969 preempt_enable();
Ingo Molnarb29739f2006-06-27 02:54:51 -07003970}
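
/*
 * Worked example (illustration only): if a SCHED_NORMAL task owns an
 * rt_mutex and a SCHED_FIFO task with user priority 90 blocks on it, the
 * rt_mutex code ends up calling rt_mutex_setprio(owner, 9) (the
 * kernel-internal prio for RT 90), the owner temporarily runs in the
 * rt_sched_class, and a later rt_mutex_setprio() call with the owner's
 * normal prio deboosts it once the lock is released.
 */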
Ingo Molnarb29739f2006-06-27 02:54:51 -07003971#endif
Dario Faggiolid50dde52013-11-07 14:43:36 +01003972
Ingo Molnar36c8b582006-07-03 00:25:41 -07003973void set_user_nice(struct task_struct *p, long nice)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003974{
Peter Zijlstra49bd21e2016-09-20 22:06:01 +02003975 bool queued, running;
3976 int old_prio, delta;
Peter Zijlstraeb580752015-07-31 21:28:18 +02003977 struct rq_flags rf;
Ingo Molnar70b97a72006-07-03 00:25:42 -07003978 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003979
Dongsheng Yang75e45d52014-02-11 15:34:50 +08003980 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003981 return;
3982 /*
3983 * We have to be careful, if called from sys_setpriority(),
3984 * the task might be in the middle of scheduling on another CPU.
3985 */
Peter Zijlstraeb580752015-07-31 21:28:18 +02003986 rq = task_rq_lock(p, &rf);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003987 /*
3988 * The RT priorities are set via sched_setscheduler(), but we still
3989 * allow the 'normal' nice value to be set - but as expected
3990	 * it won't have any effect on scheduling until the task is
Dario Faggioliaab03e02013-11-28 11:14:43 +01003991 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003992 */
Dario Faggioliaab03e02013-11-28 11:14:43 +01003993 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003994 p->static_prio = NICE_TO_PRIO(nice);
3995 goto out_unlock;
3996 }
Kirill Tkhaida0c1e62014-08-20 13:47:32 +04003997 queued = task_on_rq_queued(p);
Peter Zijlstra49bd21e2016-09-20 22:06:01 +02003998 running = task_current(rq, p);
Kirill Tkhaida0c1e62014-08-20 13:47:32 +04003999 if (queued)
Peter Zijlstra1de64442015-09-30 17:44:13 +02004000 dequeue_task(rq, p, DEQUEUE_SAVE);
Peter Zijlstra49bd21e2016-09-20 22:06:01 +02004001 if (running)
4002 put_prev_task(rq, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004003
Linus Torvalds1da177e2005-04-16 15:20:36 -07004004 p->static_prio = NICE_TO_PRIO(nice);
Peter Williams2dd73a42006-06-27 02:54:34 -07004005 set_load_weight(p);
Ingo Molnarb29739f2006-06-27 02:54:51 -07004006 old_prio = p->prio;
4007 p->prio = effective_prio(p);
4008 delta = p->prio - old_prio;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004009
Kirill Tkhaida0c1e62014-08-20 13:47:32 +04004010 if (queued) {
Peter Zijlstra1de64442015-09-30 17:44:13 +02004011 enqueue_task(rq, p, ENQUEUE_RESTORE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004012 /*
Andrew Mortond5f9f942007-05-08 20:27:06 -07004013 * If the task increased its priority or is running and
4014 * lowered its priority, then reschedule its CPU:
Linus Torvalds1da177e2005-04-16 15:20:36 -07004015 */
Andrew Mortond5f9f942007-05-08 20:27:06 -07004016 if (delta < 0 || (delta > 0 && task_running(rq, p)))
Kirill Tkhai88751252014-06-29 00:03:57 +04004017 resched_curr(rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004018 }
Peter Zijlstra49bd21e2016-09-20 22:06:01 +02004019 if (running)
4020 set_curr_task(rq, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004021out_unlock:
Peter Zijlstraeb580752015-07-31 21:28:18 +02004022 task_rq_unlock(rq, p, &rf);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004023}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004024EXPORT_SYMBOL(set_user_nice);
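
/*
 * Illustrative sketch (not part of this file): kernel threads commonly
 * call set_user_nice() on themselves to run at a background priority;
 * "my_worker_thread" and "do_background_work" are made-up names and the
 * nice value 10 is arbitrary:
 *
 *	static int my_worker_thread(void *unused)
 *	{
 *		set_user_nice(current, 10);
 *		while (!kthread_should_stop())
 *			do_background_work();
 *		return 0;
 *	}
 */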
4025
Matt Mackalle43379f2005-05-01 08:59:00 -07004026/*
4027 * can_nice - check if a task can reduce its nice value
4028 * @p: task
4029 * @nice: nice value
4030 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07004031int can_nice(const struct task_struct *p, const int nice)
Matt Mackalle43379f2005-05-01 08:59:00 -07004032{
Matt Mackall024f4742005-08-18 11:24:19 -07004033 /* convert nice value [19,-20] to rlimit style value [1,40] */
Dongsheng Yang7aa2c012014-05-08 18:33:49 +09004034 int nice_rlim = nice_to_rlimit(nice);
Ingo Molnar48f24c42006-07-03 00:25:40 -07004035
Jiri Slaby78d7d402010-03-05 13:42:54 -08004036 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
Matt Mackalle43379f2005-05-01 08:59:00 -07004037 capable(CAP_SYS_NICE));
4038}
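
/*
 * Worked example (illustration only): nice_to_rlimit() maps nice 19 -> 1,
 * nice 0 -> 20 and nice -20 -> 40 (i.e. 20 - nice). An unprivileged task
 * with RLIMIT_NICE = 25 may therefore lower its nice value down to -5
 * (since 20 - (-5) = 25); going any lower requires CAP_SYS_NICE.
 */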
4039
Linus Torvalds1da177e2005-04-16 15:20:36 -07004040#ifdef __ARCH_WANT_SYS_NICE
4041
4042/*
4043 * sys_nice - change the priority of the current process.
4044 * @increment: priority increment
4045 *
4046 * sys_setpriority is a more generic, but much slower function that
4047 * does similar things.
4048 */
Heiko Carstens5add95d2009-01-14 14:14:08 +01004049SYSCALL_DEFINE1(nice, int, increment)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004050{
Ingo Molnar48f24c42006-07-03 00:25:40 -07004051 long nice, retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004052
4053 /*
4054 * Setpriority might change our priority at the same moment.
4055 * We don't have to worry. Conceptually one call occurs first
4056 * and we have a single winner.
4057 */
Dongsheng Yanga9467fa2014-05-08 18:35:15 +09004058 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
Dongsheng Yangd0ea0262014-01-27 22:00:45 -05004059 nice = task_nice(current) + increment;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004060
Dongsheng Yanga9467fa2014-05-08 18:35:15 +09004061 nice = clamp_val(nice, MIN_NICE, MAX_NICE);
Matt Mackalle43379f2005-05-01 08:59:00 -07004062 if (increment < 0 && !can_nice(current, nice))
4063 return -EPERM;
4064
Linus Torvalds1da177e2005-04-16 15:20:36 -07004065 retval = security_task_setnice(current, nice);
4066 if (retval)
4067 return retval;
4068
4069 set_user_nice(current, nice);
4070 return 0;
4071}
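
/*
 * Illustrative userspace sketch (not part of this file), assuming the
 * glibc nice(2) wrapper; the increment of 5 is arbitrary:
 *
 *	#include <unistd.h>
 *	#include <errno.h>
 *
 *	errno = 0;
 *	if (nice(5) == -1 && errno)	// -1 is also a valid return value
 *		perror("nice");
 *
 * A negative increment (raising priority) additionally requires
 * CAP_SYS_NICE or a sufficient RLIMIT_NICE, as checked by can_nice().
 */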
4072
4073#endif
4074
4075/**
4076 * task_prio - return the priority value of a given task.
4077 * @p: the task in question.
4078 *
Yacine Belkadie69f6182013-07-12 20:45:47 +02004079 * Return: The priority value as seen by users in /proc.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004080 * RT tasks are offset by -200. Normal tasks are centered
4081 * around 0, value goes from -16 to +15.
4082 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07004083int task_prio(const struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004084{
4085 return p->prio - MAX_RT_PRIO;
4086}
4087
4088/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07004089 * idle_cpu - is a given cpu idle currently?
4090 * @cpu: the processor in question.
Yacine Belkadie69f6182013-07-12 20:45:47 +02004091 *
4092 * Return: 1 if the CPU is currently idle. 0 otherwise.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004093 */
4094int idle_cpu(int cpu)
4095{
Thomas Gleixner908a3282011-09-15 15:32:06 +02004096 struct rq *rq = cpu_rq(cpu);
4097
4098 if (rq->curr != rq->idle)
4099 return 0;
4100
4101 if (rq->nr_running)
4102 return 0;
4103
4104#ifdef CONFIG_SMP
4105 if (!llist_empty(&rq->wake_list))
4106 return 0;
4107#endif
4108
4109 return 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004110}
4111
Linus Torvalds1da177e2005-04-16 15:20:36 -07004112/**
4113 * idle_task - return the idle task for a given cpu.
4114 * @cpu: the processor in question.
Yacine Belkadie69f6182013-07-12 20:45:47 +02004115 *
4116 * Return: The idle task for the cpu @cpu.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004117 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07004118struct task_struct *idle_task(int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004119{
4120 return cpu_rq(cpu)->idle;
4121}
4122
4123/**
4124 * find_process_by_pid - find a process with a matching PID value.
4125 * @pid: the pid in question.
Yacine Belkadie69f6182013-07-12 20:45:47 +02004126 *
4127 * The task of @pid, if found. %NULL otherwise.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004128 */
Alexey Dobriyana9957442007-10-15 17:00:13 +02004129static struct task_struct *find_process_by_pid(pid_t pid)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004130{
Pavel Emelyanov228ebcb2007-10-18 23:40:16 -07004131 return pid ? find_task_by_vpid(pid) : current;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004132}
4133
Dario Faggioliaab03e02013-11-28 11:14:43 +01004134/*
4135	 * This function initializes the sched_dl_entity of a task which is
4136	 * becoming a SCHED_DEADLINE task.
4137 *
4138 * Only the static values are considered here, the actual runtime and the
4139 * absolute deadline will be properly calculated when the task is enqueued
4140 * for the first time with its new policy.
4141 */
4142static void
4143__setparam_dl(struct task_struct *p, const struct sched_attr *attr)
4144{
4145 struct sched_dl_entity *dl_se = &p->dl;
4146
Dario Faggioliaab03e02013-11-28 11:14:43 +01004147 dl_se->dl_runtime = attr->sched_runtime;
4148 dl_se->dl_deadline = attr->sched_deadline;
Harald Gustafsson755378a2013-11-07 14:43:40 +01004149 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
Dario Faggioliaab03e02013-11-28 11:14:43 +01004150 dl_se->flags = attr->sched_flags;
Dario Faggioli332ac172013-11-07 14:43:45 +01004151 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
Daniel Bristot de Oliveira0559ea32017-05-29 16:24:03 +02004152 dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
Peter Zijlstra40767b02015-01-28 15:08:03 +01004153
4154 /*
4155 * Changing the parameters of a task is 'tricky' and we're not doing
4156 * the correct thing -- also see task_dead_dl() and switched_from_dl().
4157 *
4158 * What we SHOULD do is delay the bandwidth release until the 0-lag
4159 * point. This would include retaining the task_struct until that time
4160 * and change dl_overflow() to not immediately decrement the current
4161 * amount.
4162 *
4163 * Instead we retain the current runtime/deadline and let the new
4164 * parameters take effect after the current reservation period lapses.
4165 * This is safe (albeit pessimistic) because the 0-lag point is always
4166 * before the current scheduling deadline.
4167 *
4168 * We can still have temporary overloads because we do not delay the
4169 * change in bandwidth until that time; so admission control is
4170 * not on the safe side. It does however guarantee tasks will never
4171 * consume more than promised.
4172 */
Dario Faggioliaab03e02013-11-28 11:14:43 +01004173}
4174
Steven Rostedtc13db6b2014-07-23 11:28:26 -04004175/*
4176 * sched_setparam() passes in -1 for its policy, to let the functions
4177 * it calls know not to change it.
4178 */
4179#define SETPARAM_POLICY -1
4180
Thomas Gleixnerc365c292014-02-07 20:58:42 +01004181static void __setscheduler_params(struct task_struct *p,
4182 const struct sched_attr *attr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004183{
Dario Faggiolid50dde52013-11-07 14:43:36 +01004184 int policy = attr->sched_policy;
4185
Steven Rostedtc13db6b2014-07-23 11:28:26 -04004186 if (policy == SETPARAM_POLICY)
Peter Zijlstra39fd8fd2014-01-15 16:33:20 +01004187 policy = p->policy;
4188
Linus Torvalds1da177e2005-04-16 15:20:36 -07004189 p->policy = policy;
Dario Faggiolid50dde52013-11-07 14:43:36 +01004190
Dario Faggioliaab03e02013-11-28 11:14:43 +01004191 if (dl_policy(policy))
4192 __setparam_dl(p, attr);
Peter Zijlstra39fd8fd2014-01-15 16:33:20 +01004193 else if (fair_policy(policy))
Dario Faggiolid50dde52013-11-07 14:43:36 +01004194 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
4195
Peter Zijlstra39fd8fd2014-01-15 16:33:20 +01004196 /*
4197 * __sched_setscheduler() ensures attr->sched_priority == 0 when
4198 * !rt_policy. Always setting this ensures that things like
4199 * getparam()/getattr() don't report silly values for !rt tasks.
4200 */
4201 p->rt_priority = attr->sched_priority;
Steven Rostedt383afd02014-03-11 19:24:20 -04004202 p->normal_prio = normal_prio(p);
Thomas Gleixnerc365c292014-02-07 20:58:42 +01004203 set_load_weight(p);
4204}
Peter Zijlstra39fd8fd2014-01-15 16:33:20 +01004205
Thomas Gleixnerc365c292014-02-07 20:58:42 +01004206/* Actually do priority change: must hold pi & rq lock. */
4207static void __setscheduler(struct rq *rq, struct task_struct *p,
Thomas Gleixner0782e632015-05-05 19:49:49 +02004208 const struct sched_attr *attr, bool keep_boost)
Thomas Gleixnerc365c292014-02-07 20:58:42 +01004209{
4210 __setscheduler_params(p, attr);
Dario Faggiolid50dde52013-11-07 14:43:36 +01004211
Steven Rostedt383afd02014-03-11 19:24:20 -04004212 /*
Thomas Gleixner0782e632015-05-05 19:49:49 +02004213 * Keep a potential priority boosting if called from
4214 * sched_setscheduler().
Steven Rostedt383afd02014-03-11 19:24:20 -04004215 */
Thomas Gleixner0782e632015-05-05 19:49:49 +02004216 if (keep_boost)
4217 p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
4218 else
4219 p->prio = normal_prio(p);
Steven Rostedt383afd02014-03-11 19:24:20 -04004220
Dario Faggioliaab03e02013-11-28 11:14:43 +01004221 if (dl_prio(p->prio))
4222 p->sched_class = &dl_sched_class;
4223 else if (rt_prio(p->prio))
Peter Zijlstraffd44db2009-11-10 20:12:01 +01004224 p->sched_class = &rt_sched_class;
4225 else
4226 p->sched_class = &fair_sched_class;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004227}
Dario Faggioliaab03e02013-11-28 11:14:43 +01004228
4229static void
4230__getparam_dl(struct task_struct *p, struct sched_attr *attr)
4231{
4232 struct sched_dl_entity *dl_se = &p->dl;
4233
4234 attr->sched_priority = p->rt_priority;
4235 attr->sched_runtime = dl_se->dl_runtime;
4236 attr->sched_deadline = dl_se->dl_deadline;
Harald Gustafsson755378a2013-11-07 14:43:40 +01004237 attr->sched_period = dl_se->dl_period;
Dario Faggioliaab03e02013-11-28 11:14:43 +01004238 attr->sched_flags = dl_se->flags;
4239}
4240
4241/*
4242 * This function validates the new parameters of a -deadline task.
4243	 * We ask for the deadline to be non-zero and greater than or equal
Harald Gustafsson755378a2013-11-07 14:43:40 +01004244 * to the runtime, and for the period to be either zero or greater
Dario Faggioli332ac172013-11-07 14:43:45 +01004245 * than or equal to the deadline. Furthermore, we have to be sure that
Juri Lellib08278192014-05-13 14:11:31 +02004246 * user parameters are above the internal resolution of 1us (we
4247 * check sched_runtime only since it is always the smaller one) and
4248 * below 2^63 ns (we have to check both sched_deadline and
4249 * sched_period, as the latter can be zero).
Dario Faggioliaab03e02013-11-28 11:14:43 +01004250 */
4251static bool
4252__checkparam_dl(const struct sched_attr *attr)
4253{
Juri Lellib08278192014-05-13 14:11:31 +02004254 /* deadline != 0 */
4255 if (attr->sched_deadline == 0)
4256 return false;
4257
4258 /*
4259 * Since we truncate DL_SCALE bits, make sure we're at least
4260 * that big.
4261 */
4262 if (attr->sched_runtime < (1ULL << DL_SCALE))
4263 return false;
4264
4265 /*
4266 * Since we use the MSB for wrap-around and sign issues, make
4267 * sure it's not set (mind that period can be equal to zero).
4268 */
4269 if (attr->sched_deadline & (1ULL << 63) ||
4270 attr->sched_period & (1ULL << 63))
4271 return false;
4272
4273 /* runtime <= deadline <= period (if period != 0) */
4274 if ((attr->sched_period != 0 &&
4275 attr->sched_period < attr->sched_deadline) ||
4276 attr->sched_deadline < attr->sched_runtime)
4277 return false;
4278
4279 return true;
Dario Faggioliaab03e02013-11-28 11:14:43 +01004280}
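
/*
 * Worked example (illustration only): parameters accepted by
 * __checkparam_dl() could be sched_runtime = 10ms, sched_deadline = 30ms
 * and sched_period = 100ms (all expressed in nanoseconds), since
 * 1us <= runtime <= deadline <= period < 2^63 ns then holds. Swapping
 * runtime and deadline, or using a runtime below the 1us resolution,
 * would be rejected.
 */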
4281
David Howellsc69e8d92008-11-14 10:39:19 +11004282/*
4283	 * Check whether the target process has a UID that matches the current process's.
4284 */
4285static bool check_same_owner(struct task_struct *p)
4286{
4287 const struct cred *cred = current_cred(), *pcred;
4288 bool match;
4289
4290 rcu_read_lock();
4291 pcred = __task_cred(p);
Eric W. Biederman9c806aa2012-02-02 18:54:02 -08004292 match = (uid_eq(cred->euid, pcred->euid) ||
4293 uid_eq(cred->euid, pcred->uid));
David Howellsc69e8d92008-11-14 10:39:19 +11004294 rcu_read_unlock();
4295 return match;
4296}
4297
Wanpeng Li75381602014-11-26 08:44:04 +08004298static bool dl_param_changed(struct task_struct *p,
4299 const struct sched_attr *attr)
4300{
4301 struct sched_dl_entity *dl_se = &p->dl;
4302
4303 if (dl_se->dl_runtime != attr->sched_runtime ||
4304 dl_se->dl_deadline != attr->sched_deadline ||
4305 dl_se->dl_period != attr->sched_period ||
4306 dl_se->flags != attr->sched_flags)
4307 return true;
4308
4309 return false;
4310}
4311
Dario Faggiolid50dde52013-11-07 14:43:36 +01004312static int __sched_setscheduler(struct task_struct *p,
4313 const struct sched_attr *attr,
Peter Zijlstradbc7f062015-06-11 14:46:38 +02004314 bool user, bool pi)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004315{
Steven Rostedt383afd02014-03-11 19:24:20 -04004316 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
4317 MAX_RT_PRIO - 1 - attr->sched_priority;
Kirill Tkhaida0c1e62014-08-20 13:47:32 +04004318 int retval, oldprio, oldpolicy = -1, queued, running;
Thomas Gleixner0782e632015-05-05 19:49:49 +02004319 int new_effective_prio, policy = attr->sched_policy;
Thomas Gleixner83ab0aa2010-02-17 09:05:48 +01004320 const struct sched_class *prev_class;
Peter Zijlstraeb580752015-07-31 21:28:18 +02004321 struct rq_flags rf;
Lennart Poetteringca94c442009-06-15 17:17:47 +02004322 int reset_on_fork;
Peter Zijlstraff77e462016-01-18 15:27:07 +01004323 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
Peter Zijlstraeb580752015-07-31 21:28:18 +02004324 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004325
Steven Rostedt (VMware)5a416ed2017-03-09 10:18:42 -05004326 /* The pi code expects interrupts enabled */
4327 BUG_ON(pi && in_interrupt());
jianzhouf41ab602019-01-03 16:41:57 +08004328
Linus Torvalds1da177e2005-04-16 15:20:36 -07004329recheck:
4330 /* double check policy once rq lock held */
Lennart Poetteringca94c442009-06-15 17:17:47 +02004331 if (policy < 0) {
4332 reset_on_fork = p->sched_reset_on_fork;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004333 policy = oldpolicy = p->policy;
Lennart Poetteringca94c442009-06-15 17:17:47 +02004334 } else {
Peter Zijlstra7479f3c2014-01-15 17:05:04 +01004335 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
Lennart Poetteringca94c442009-06-15 17:17:47 +02004336
Henrik Austad20f9cd22015-09-09 17:00:41 +02004337 if (!valid_policy(policy))
Lennart Poetteringca94c442009-06-15 17:17:47 +02004338 return -EINVAL;
4339 }
4340
Peter Zijlstra7479f3c2014-01-15 17:05:04 +01004341 if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
4342 return -EINVAL;
4343
Linus Torvalds1da177e2005-04-16 15:20:36 -07004344 /*
4345 * Valid priorities for SCHED_FIFO and SCHED_RR are
Ingo Molnardd41f592007-07-09 18:51:59 +02004346 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4347 * SCHED_BATCH and SCHED_IDLE is 0.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004348 */
Peter Zijlstra0bb040a2014-01-15 17:15:13 +01004349 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
Dario Faggiolid50dde52013-11-07 14:43:36 +01004350 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004351 return -EINVAL;
Dario Faggioliaab03e02013-11-28 11:14:43 +01004352 if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
4353 (rt_policy(policy) != (attr->sched_priority != 0)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004354 return -EINVAL;
4355
Olivier Croquette37e4ab32005-06-25 14:57:32 -07004356 /*
4357 * Allow unprivileged RT tasks to decrease priority:
4358 */
Rusty Russell961ccdd2008-06-23 13:55:38 +10004359 if (user && !capable(CAP_SYS_NICE)) {
Dario Faggiolid50dde52013-11-07 14:43:36 +01004360 if (fair_policy(policy)) {
Dongsheng Yangd0ea0262014-01-27 22:00:45 -05004361 if (attr->sched_nice < task_nice(p) &&
Peter Zijlstraeaad4512014-01-16 17:54:25 +01004362 !can_nice(p, attr->sched_nice))
Dario Faggiolid50dde52013-11-07 14:43:36 +01004363 return -EPERM;
4364 }
4365
Ingo Molnare05606d2007-07-09 18:51:59 +02004366 if (rt_policy(policy)) {
Oleg Nesterova44702e2010-06-11 01:09:44 +02004367 unsigned long rlim_rtprio =
4368 task_rlimit(p, RLIMIT_RTPRIO);
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004369
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07004370 /* can't set/change the rt policy */
4371 if (policy != p->policy && !rlim_rtprio)
4372 return -EPERM;
4373
4374 /* can't increase priority */
Dario Faggiolid50dde52013-11-07 14:43:36 +01004375 if (attr->sched_priority > p->rt_priority &&
4376 attr->sched_priority > rlim_rtprio)
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07004377 return -EPERM;
4378 }
Darren Hartc02aa732011-02-17 15:37:07 -08004379
Juri Lellid44753b2014-03-03 12:09:21 +01004380 /*
4381 * Can't set/change SCHED_DEADLINE policy at all for now
4382 * (safest behavior); in the future we would like to allow
4383 * unprivileged DL tasks to increase their relative deadline
4384 * or reduce their runtime (both ways reducing utilization)
4385 */
4386 if (dl_policy(policy))
4387 return -EPERM;
4388
Ingo Molnardd41f592007-07-09 18:51:59 +02004389 /*
Darren Hartc02aa732011-02-17 15:37:07 -08004390 * Treat SCHED_IDLE as nice 20. Only allow a switch to
4391 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
Ingo Molnardd41f592007-07-09 18:51:59 +02004392 */
Henrik Austad20f9cd22015-09-09 17:00:41 +02004393 if (idle_policy(p->policy) && !idle_policy(policy)) {
Dongsheng Yangd0ea0262014-01-27 22:00:45 -05004394 if (!can_nice(p, task_nice(p)))
Darren Hartc02aa732011-02-17 15:37:07 -08004395 return -EPERM;
4396 }
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07004397
Olivier Croquette37e4ab32005-06-25 14:57:32 -07004398 /* can't change other user's priorities */
David Howellsc69e8d92008-11-14 10:39:19 +11004399 if (!check_same_owner(p))
Olivier Croquette37e4ab32005-06-25 14:57:32 -07004400 return -EPERM;
Lennart Poetteringca94c442009-06-15 17:17:47 +02004401
4402 /* Normal users shall not reset the sched_reset_on_fork flag */
4403 if (p->sched_reset_on_fork && !reset_on_fork)
4404 return -EPERM;
Olivier Croquette37e4ab32005-06-25 14:57:32 -07004405 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004406
Jeremy Fitzhardinge725aad22008-08-03 09:33:03 -07004407 if (user) {
KOSAKI Motohirob0ae1982010-10-15 04:21:18 +09004408 retval = security_task_setscheduler(p);
Jeremy Fitzhardinge725aad22008-08-03 09:33:03 -07004409 if (retval)
4410 return retval;
4411 }
4412
Linus Torvalds1da177e2005-04-16 15:20:36 -07004413 /*
Ingo Molnarb29739f2006-06-27 02:54:51 -07004414 * make sure no PI-waiters arrive (or leave) while we are
4415 * changing the priority of the task:
Peter Zijlstra0122ec52011-04-05 17:23:51 +02004416 *
Lucas De Marchi25985ed2011-03-30 22:57:33 -03004417 * To be able to change p->policy safely, the appropriate
Linus Torvalds1da177e2005-04-16 15:20:36 -07004418 * runqueue lock must be held.
4419 */
Peter Zijlstraeb580752015-07-31 21:28:18 +02004420 rq = task_rq_lock(p, &rf);
Peter Zijlstradc61b1d2010-06-08 11:40:42 +02004421
Peter Zijlstra34f971f2010-09-22 13:53:15 +02004422 /*
4423	 * Changing the policy of the stop threads is a very bad idea.
4424 */
4425 if (p == rq->stop) {
Peter Zijlstraeb580752015-07-31 21:28:18 +02004426 task_rq_unlock(rq, p, &rf);
Peter Zijlstra34f971f2010-09-22 13:53:15 +02004427 return -EINVAL;
4428 }
4429
Dario Faggiolia51e9192011-03-24 14:00:18 +01004430 /*
Thomas Gleixnerd6b1e912014-02-07 20:58:40 +01004431 * If not changing anything there's no need to proceed further,
4432 * but store a possible modification of reset_on_fork.
Dario Faggiolia51e9192011-03-24 14:00:18 +01004433 */
Dario Faggiolid50dde52013-11-07 14:43:36 +01004434 if (unlikely(policy == p->policy)) {
Dongsheng Yangd0ea0262014-01-27 22:00:45 -05004435 if (fair_policy(policy) && attr->sched_nice != task_nice(p))
Dario Faggiolid50dde52013-11-07 14:43:36 +01004436 goto change;
4437 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
4438 goto change;
Wanpeng Li75381602014-11-26 08:44:04 +08004439 if (dl_policy(policy) && dl_param_changed(p, attr))
Dario Faggioliaab03e02013-11-28 11:14:43 +01004440 goto change;
Dario Faggiolid50dde52013-11-07 14:43:36 +01004441
Thomas Gleixnerd6b1e912014-02-07 20:58:40 +01004442 p->sched_reset_on_fork = reset_on_fork;
Peter Zijlstraeb580752015-07-31 21:28:18 +02004443 task_rq_unlock(rq, p, &rf);
Dario Faggiolia51e9192011-03-24 14:00:18 +01004444 return 0;
4445 }
Dario Faggiolid50dde52013-11-07 14:43:36 +01004446change:
Dario Faggiolia51e9192011-03-24 14:00:18 +01004447
Peter Zijlstradc61b1d2010-06-08 11:40:42 +02004448 if (user) {
Dario Faggioli332ac172013-11-07 14:43:45 +01004449#ifdef CONFIG_RT_GROUP_SCHED
Peter Zijlstradc61b1d2010-06-08 11:40:42 +02004450 /*
4451 * Do not allow realtime tasks into groups that have no runtime
4452 * assigned.
4453 */
4454 if (rt_bandwidth_enabled() && rt_policy(policy) &&
Mike Galbraithf4493772011-01-13 04:54:50 +01004455 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4456 !task_group_is_autogroup(task_group(p))) {
Peter Zijlstraeb580752015-07-31 21:28:18 +02004457 task_rq_unlock(rq, p, &rf);
Peter Zijlstradc61b1d2010-06-08 11:40:42 +02004458 return -EPERM;
4459 }
Peter Zijlstradc61b1d2010-06-08 11:40:42 +02004460#endif
Dario Faggioli332ac172013-11-07 14:43:45 +01004461#ifdef CONFIG_SMP
4462 if (dl_bandwidth_enabled() && dl_policy(policy)) {
4463 cpumask_t *span = rq->rd->span;
Dario Faggioli332ac172013-11-07 14:43:45 +01004464
4465 /*
4466 * Don't allow tasks with an affinity mask smaller than
4467 * the entire root_domain to become SCHED_DEADLINE. We
4468 * will also fail if there's no bandwidth available.
4469 */
Peter Zijlstrae4099a52013-12-17 10:03:34 +01004470 if (!cpumask_subset(span, &p->cpus_allowed) ||
4471 rq->rd->dl_bw.bw == 0) {
Peter Zijlstraeb580752015-07-31 21:28:18 +02004472 task_rq_unlock(rq, p, &rf);
Dario Faggioli332ac172013-11-07 14:43:45 +01004473 return -EPERM;
4474 }
4475 }
4476#endif
4477 }
Peter Zijlstradc61b1d2010-06-08 11:40:42 +02004478
Linus Torvalds1da177e2005-04-16 15:20:36 -07004479 /* recheck policy now with rq lock held */
4480 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4481 policy = oldpolicy = -1;
Peter Zijlstraeb580752015-07-31 21:28:18 +02004482 task_rq_unlock(rq, p, &rf);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004483 goto recheck;
4484 }
Dario Faggioli332ac172013-11-07 14:43:45 +01004485
4486 /*
4487 * If setscheduling to SCHED_DEADLINE (or changing the parameters
4488 * of a SCHED_DEADLINE task) we need to check if enough bandwidth
4489 * is available.
4490 */
Peter Zijlstrae4099a52013-12-17 10:03:34 +01004491 if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
Peter Zijlstraeb580752015-07-31 21:28:18 +02004492 task_rq_unlock(rq, p, &rf);
Dario Faggioli332ac172013-11-07 14:43:45 +01004493 return -EBUSY;
4494 }
4495
Thomas Gleixnerc365c292014-02-07 20:58:42 +01004496 p->sched_reset_on_fork = reset_on_fork;
4497 oldprio = p->prio;
4498
Peter Zijlstradbc7f062015-06-11 14:46:38 +02004499 if (pi) {
4500 /*
4501 * Take priority boosted tasks into account. If the new
4502 * effective priority is unchanged, we just store the new
4503 * normal parameters and do not touch the scheduler class and
4504	 * the runqueue. This will be done when the task deboosts
4505 * itself.
4506 */
4507 new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
Peter Zijlstraff77e462016-01-18 15:27:07 +01004508 if (new_effective_prio == oldprio)
4509 queue_flags &= ~DEQUEUE_MOVE;
Thomas Gleixnerc365c292014-02-07 20:58:42 +01004510 }
4511
Kirill Tkhaida0c1e62014-08-20 13:47:32 +04004512 queued = task_on_rq_queued(p);
Dmitry Adamushko051a1d12007-12-18 15:21:13 +01004513 running = task_current(rq, p);
Kirill Tkhaida0c1e62014-08-20 13:47:32 +04004514 if (queued)
Peter Zijlstraff77e462016-01-18 15:27:07 +01004515 dequeue_task(rq, p, queue_flags);
Hiroshi Shimamoto0e1f3482008-03-10 11:01:20 -07004516 if (running)
Kirill Tkhaif3cd1c42014-09-12 17:41:40 +04004517 put_prev_task(rq, p);
Dmitry Adamushkof6b53202007-10-15 17:00:08 +02004518
Thomas Gleixner83ab0aa2010-02-17 09:05:48 +01004519 prev_class = p->sched_class;
Peter Zijlstradbc7f062015-06-11 14:46:38 +02004520 __setscheduler(rq, p, attr, pi);
Dmitry Adamushkof6b53202007-10-15 17:00:08 +02004521
Kirill Tkhaida0c1e62014-08-20 13:47:32 +04004522 if (queued) {
Thomas Gleixner81a44c52014-02-07 20:58:41 +01004523 /*
4524 * We enqueue to tail when the priority of a task is
4525 * increased (user space view).
4526 */
Peter Zijlstraff77e462016-01-18 15:27:07 +01004527 if (oldprio < p->prio)
4528 queue_flags |= ENQUEUE_HEAD;
Peter Zijlstra1de64442015-09-30 17:44:13 +02004529
Peter Zijlstraff77e462016-01-18 15:27:07 +01004530 enqueue_task(rq, p, queue_flags);
Thomas Gleixner81a44c52014-02-07 20:58:41 +01004531 }
Vincent Guittota399d232016-09-12 09:47:52 +02004532 if (running)
Peter Zijlstrab2bf6c32016-09-20 22:00:38 +02004533 set_curr_task(rq, p);
Steven Rostedtcb469842008-01-25 21:08:22 +01004534
Peter Zijlstrada7a7352011-01-17 17:03:27 +01004535 check_class_changed(rq, p, prev_class, oldprio);
Peter Zijlstra4c9a4bc2015-06-11 14:46:39 +02004536 preempt_disable(); /* avoid rq from going away on us */
Peter Zijlstraeb580752015-07-31 21:28:18 +02004537 task_rq_unlock(rq, p, &rf);
Ingo Molnarb29739f2006-06-27 02:54:51 -07004538
Peter Zijlstradbc7f062015-06-11 14:46:38 +02004539 if (pi)
4540 rt_mutex_adjust_pi(p);
Thomas Gleixner95e02ca2006-06-27 02:55:02 -07004541
Peter Zijlstra4c9a4bc2015-06-11 14:46:39 +02004542 /*
4543 * Run balance callbacks after we've adjusted the PI chain.
4544 */
4545 balance_callback(rq);
4546 preempt_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004547
4548 return 0;
4549}
Rusty Russell961ccdd2008-06-23 13:55:38 +10004550
Peter Zijlstra7479f3c2014-01-15 17:05:04 +01004551static int _sched_setscheduler(struct task_struct *p, int policy,
4552 const struct sched_param *param, bool check)
4553{
4554 struct sched_attr attr = {
4555 .sched_policy = policy,
4556 .sched_priority = param->sched_priority,
4557 .sched_nice = PRIO_TO_NICE(p->static_prio),
4558 };
4559
Steven Rostedtc13db6b2014-07-23 11:28:26 -04004560 /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
4561 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
Peter Zijlstra7479f3c2014-01-15 17:05:04 +01004562 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
4563 policy &= ~SCHED_RESET_ON_FORK;
4564 attr.sched_policy = policy;
4565 }
4566
Peter Zijlstradbc7f062015-06-11 14:46:38 +02004567 return __sched_setscheduler(p, &attr, check, true);
Peter Zijlstra7479f3c2014-01-15 17:05:04 +01004568}
Rusty Russell961ccdd2008-06-23 13:55:38 +10004569/**
4570 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4571 * @p: the task in question.
4572 * @policy: new policy.
4573 * @param: structure containing the new RT priority.
4574 *
Yacine Belkadie69f6182013-07-12 20:45:47 +02004575 * Return: 0 on success. An error code otherwise.
4576 *
Rusty Russell961ccdd2008-06-23 13:55:38 +10004577 * NOTE that the task may be already dead.
4578 */
4579int sched_setscheduler(struct task_struct *p, int policy,
KOSAKI Motohirofe7de492010-10-20 16:01:12 -07004580 const struct sched_param *param)
Rusty Russell961ccdd2008-06-23 13:55:38 +10004581{
Peter Zijlstra7479f3c2014-01-15 17:05:04 +01004582 return _sched_setscheduler(p, policy, param, true);
Rusty Russell961ccdd2008-06-23 13:55:38 +10004583}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004584EXPORT_SYMBOL_GPL(sched_setscheduler);
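
/*
 * Illustrative in-kernel sketch (not part of this file): giving a task a
 * real-time policy; "task" stands for some struct task_struct pointer and
 * the priority 50 is arbitrary:
 *
 *	struct sched_param sp = { .sched_priority = 50 };
 *
 *	if (sched_setscheduler(task, SCHED_FIFO, &sp))
 *		pr_warn("could not switch task to SCHED_FIFO\n");
 *
 * Callers that must bypass the capability and security checks (e.g. when
 * setting up their own kthreads) use sched_setscheduler_nocheck() instead.
 */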
4585
Dario Faggiolid50dde52013-11-07 14:43:36 +01004586int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
4587{
Peter Zijlstradbc7f062015-06-11 14:46:38 +02004588 return __sched_setscheduler(p, attr, true, true);
Dario Faggiolid50dde52013-11-07 14:43:36 +01004589}
4590EXPORT_SYMBOL_GPL(sched_setattr);
4591
Rusty Russell961ccdd2008-06-23 13:55:38 +10004592/**
4593 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
4594 * @p: the task in question.
4595 * @policy: new policy.
4596 * @param: structure containing the new RT priority.
4597 *
4598 * Just like sched_setscheduler, only don't bother checking if the
4599 * current context has permission. For example, this is needed in
4600 * stop_machine(): we create temporary high priority worker threads,
4601 * but our caller might not have that capability.
Yacine Belkadie69f6182013-07-12 20:45:47 +02004602 *
4603 * Return: 0 on success. An error code otherwise.
Rusty Russell961ccdd2008-06-23 13:55:38 +10004604 */
4605int sched_setscheduler_nocheck(struct task_struct *p, int policy,
KOSAKI Motohirofe7de492010-10-20 16:01:12 -07004606 const struct sched_param *param)
Rusty Russell961ccdd2008-06-23 13:55:38 +10004607{
Peter Zijlstra7479f3c2014-01-15 17:05:04 +01004608 return _sched_setscheduler(p, policy, param, false);
Rusty Russell961ccdd2008-06-23 13:55:38 +10004609}
Davidlohr Bueso84778472015-09-02 01:28:44 -07004610EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
Rusty Russell961ccdd2008-06-23 13:55:38 +10004611
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004612static int
4613do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004614{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004615 struct sched_param lparam;
4616 struct task_struct *p;
Ingo Molnar36c8b582006-07-03 00:25:41 -07004617 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004618
4619 if (!param || pid < 0)
4620 return -EINVAL;
4621 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4622 return -EFAULT;
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004623
4624 rcu_read_lock();
4625 retval = -ESRCH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004626 p = find_process_by_pid(pid);
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004627 if (p != NULL)
4628 retval = sched_setscheduler(p, policy, &lparam);
4629 rcu_read_unlock();
Ingo Molnar36c8b582006-07-03 00:25:41 -07004630
Linus Torvalds1da177e2005-04-16 15:20:36 -07004631 return retval;
4632}
4633
Dario Faggiolid50dde52013-11-07 14:43:36 +01004634/*
4635 * Mimics kernel/events/core.c perf_copy_attr().
4636 */
4637static int sched_copy_attr(struct sched_attr __user *uattr,
4638 struct sched_attr *attr)
4639{
4640 u32 size;
4641 int ret;
4642
4643 if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
4644 return -EFAULT;
4645
4646 /*
4647	 * Zero the full structure, so that a short copy leaves the tail zeroed.
4648 */
4649 memset(attr, 0, sizeof(*attr));
4650
4651 ret = get_user(size, &uattr->size);
4652 if (ret)
4653 return ret;
4654
4655 if (size > PAGE_SIZE) /* silly large */
4656 goto err_size;
4657
4658 if (!size) /* abi compat */
4659 size = SCHED_ATTR_SIZE_VER0;
4660
4661 if (size < SCHED_ATTR_SIZE_VER0)
4662 goto err_size;
4663
4664 /*
4665 * If we're handed a bigger struct than we know of,
4666 * ensure all the unknown bits are 0 - i.e. new
4667 * user-space does not rely on any kernel feature
4668	 * extensions we don't know about yet.
4669 */
4670 if (size > sizeof(*attr)) {
4671 unsigned char __user *addr;
4672 unsigned char __user *end;
4673 unsigned char val;
4674
4675 addr = (void __user *)uattr + sizeof(*attr);
4676 end = (void __user *)uattr + size;
4677
4678 for (; addr < end; addr++) {
4679 ret = get_user(val, addr);
4680 if (ret)
4681 return ret;
4682 if (val)
4683 goto err_size;
4684 }
4685 size = sizeof(*attr);
4686 }
4687
4688 ret = copy_from_user(attr, uattr, size);
4689 if (ret)
4690 return -EFAULT;
4691
4692 /*
4693 * XXX: do we want to be lenient like existing syscalls; or do we want
4694 * to be strict and return an error on out-of-bounds values?
4695 */
Dongsheng Yang75e45d52014-02-11 15:34:50 +08004696 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
Dario Faggiolid50dde52013-11-07 14:43:36 +01004697
Michael Kerriske78c7bc2014-05-09 16:54:28 +02004698 return 0;
Dario Faggiolid50dde52013-11-07 14:43:36 +01004699
4700err_size:
4701 put_user(sizeof(*attr), &uattr->size);
Michael Kerriske78c7bc2014-05-09 16:54:28 +02004702 return -E2BIG;
Dario Faggiolid50dde52013-11-07 14:43:36 +01004703}
4704
Linus Torvalds1da177e2005-04-16 15:20:36 -07004705/**
4706 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4707 * @pid: the pid in question.
4708 * @policy: new policy.
4709 * @param: structure containing the new RT priority.
Yacine Belkadie69f6182013-07-12 20:45:47 +02004710 *
4711 * Return: 0 on success. An error code otherwise.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004712 */
Heiko Carstens5add95d2009-01-14 14:14:08 +01004713SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
4714 struct sched_param __user *, param)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004715{
Jason Baronc21761f2006-01-18 17:43:03 -08004716 /* negative values for policy are not valid */
4717 if (policy < 0)
4718 return -EINVAL;
4719
Linus Torvalds1da177e2005-04-16 15:20:36 -07004720 return do_sched_setscheduler(pid, policy, param);
4721}
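
/*
 * Illustrative userspace sketch (not part of this file), assuming the
 * glibc sched_setscheduler() wrapper from <sched.h>; SCHED_FIFO with
 * priority 10 is just an example:
 *
 *	struct sched_param sp = { .sched_priority = 10 };
 *
 *	if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1)	// 0 == calling thread
 *		perror("sched_setscheduler");
 *
 * Unprivileged callers need CAP_SYS_NICE or a large enough RLIMIT_RTPRIO,
 * as enforced by __sched_setscheduler() above.
 */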
4722
4723/**
4724 * sys_sched_setparam - set/change the RT priority of a thread
4725 * @pid: the pid in question.
4726 * @param: structure containing the new RT priority.
Yacine Belkadie69f6182013-07-12 20:45:47 +02004727 *
4728 * Return: 0 on success. An error code otherwise.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004729 */
Heiko Carstens5add95d2009-01-14 14:14:08 +01004730SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004731{
Steven Rostedtc13db6b2014-07-23 11:28:26 -04004732 return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004733}
4734
4735/**
Dario Faggiolid50dde52013-11-07 14:43:36 +01004736 * sys_sched_setattr - same as above, but with extended sched_attr
4737 * @pid: the pid in question.
Juri Lelli5778fcc2014-01-14 16:10:39 +01004738 * @uattr: structure containing the extended parameters.
Masanari Iidadb66d752014-04-18 01:59:15 +09004739 * @flags: for future extension.
Dario Faggiolid50dde52013-11-07 14:43:36 +01004740 */
Peter Zijlstra6d35ab42014-02-14 17:19:29 +01004741SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
4742 unsigned int, flags)
Dario Faggiolid50dde52013-11-07 14:43:36 +01004743{
4744 struct sched_attr attr;
4745 struct task_struct *p;
4746 int retval;
4747
Peter Zijlstra6d35ab42014-02-14 17:19:29 +01004748 if (!uattr || pid < 0 || flags)
Dario Faggiolid50dde52013-11-07 14:43:36 +01004749 return -EINVAL;
4750
Michael Kerrisk143cf232014-05-09 16:54:15 +02004751 retval = sched_copy_attr(uattr, &attr);
4752 if (retval)
4753 return retval;
Dario Faggiolid50dde52013-11-07 14:43:36 +01004754
Richard Weinbergerb14ed2c2014-06-02 22:38:34 +02004755 if ((int)attr.sched_policy < 0)
Peter Zijlstradbdb2272014-05-09 10:49:03 +02004756 return -EINVAL;
Dario Faggiolid50dde52013-11-07 14:43:36 +01004757
4758 rcu_read_lock();
4759 retval = -ESRCH;
4760 p = find_process_by_pid(pid);
4761 if (p != NULL)
4762 retval = sched_setattr(p, &attr);
4763 rcu_read_unlock();
4764
4765 return retval;
4766}
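
/*
 * Illustrative userspace sketch (not part of this file). glibc has
 * traditionally not wrapped this syscall, so callers usually go through
 * syscall(2); SYS_sched_setattr and a userspace definition of
 * struct sched_attr are assumed to be available:
 *
 *	struct sched_attr attr = {
 *		.size		= sizeof(attr),
 *		.sched_policy	= SCHED_DEADLINE,
 *		.sched_runtime	= 10 * 1000 * 1000,	//  10ms
 *		.sched_deadline	= 30 * 1000 * 1000,	//  30ms
 *		.sched_period	= 100 * 1000 * 1000,	// 100ms
 *	};
 *
 *	if (syscall(SYS_sched_setattr, 0, &attr, 0) == -1)
 *		perror("sched_setattr");
 *
 * The .size field is what lets older and newer kernels and userspace
 * interoperate, as implemented by sched_copy_attr() above.
 */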
4767
4768/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07004769 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4770 * @pid: the pid in question.
Yacine Belkadie69f6182013-07-12 20:45:47 +02004771 *
4772 * Return: On success, the policy of the thread. Otherwise, a negative error
4773 * code.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004774 */
Heiko Carstens5add95d2009-01-14 14:14:08 +01004775SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004776{
Ingo Molnar36c8b582006-07-03 00:25:41 -07004777 struct task_struct *p;
Andi Kleen3a5c3592007-10-15 17:00:14 +02004778 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004779
4780 if (pid < 0)
Andi Kleen3a5c3592007-10-15 17:00:14 +02004781 return -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004782
4783 retval = -ESRCH;
Thomas Gleixner5fe85be2009-12-09 10:14:58 +00004784 rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004785 p = find_process_by_pid(pid);
4786 if (p) {
4787 retval = security_task_getscheduler(p);
4788 if (!retval)
Lennart Poetteringca94c442009-06-15 17:17:47 +02004789 retval = p->policy
4790 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004791 }
Thomas Gleixner5fe85be2009-12-09 10:14:58 +00004792 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004793 return retval;
4794}
4795
4796/**
Lennart Poetteringca94c442009-06-15 17:17:47 +02004797 * sys_sched_getparam - get the RT priority of a thread
Linus Torvalds1da177e2005-04-16 15:20:36 -07004798 * @pid: the pid in question.
4799 * @param: structure containing the RT priority.
Yacine Belkadie69f6182013-07-12 20:45:47 +02004800 *
4801 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
4802 * code.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004803 */
Heiko Carstens5add95d2009-01-14 14:14:08 +01004804SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004805{
Peter Zijlstrace5f7f82014-05-12 22:50:34 +02004806 struct sched_param lp = { .sched_priority = 0 };
Ingo Molnar36c8b582006-07-03 00:25:41 -07004807 struct task_struct *p;
Andi Kleen3a5c3592007-10-15 17:00:14 +02004808 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004809
4810 if (!param || pid < 0)
Andi Kleen3a5c3592007-10-15 17:00:14 +02004811 return -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004812
Thomas Gleixner5fe85be2009-12-09 10:14:58 +00004813 rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004814 p = find_process_by_pid(pid);
4815 retval = -ESRCH;
4816 if (!p)
4817 goto out_unlock;
4818
4819 retval = security_task_getscheduler(p);
4820 if (retval)
4821 goto out_unlock;
4822
Peter Zijlstrace5f7f82014-05-12 22:50:34 +02004823 if (task_has_rt_policy(p))
4824 lp.sched_priority = p->rt_priority;
Thomas Gleixner5fe85be2009-12-09 10:14:58 +00004825 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004826
4827 /*
 4828	 * This one might sleep; we cannot do it with a spinlock held ...
4829 */
4830 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4831
Linus Torvalds1da177e2005-04-16 15:20:36 -07004832 return retval;
4833
4834out_unlock:
Thomas Gleixner5fe85be2009-12-09 10:14:58 +00004835 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004836 return retval;
4837}
4838
Dario Faggiolid50dde52013-11-07 14:43:36 +01004839static int sched_read_attr(struct sched_attr __user *uattr,
4840 struct sched_attr *attr,
4841 unsigned int usize)
4842{
4843 int ret;
4844
4845 if (!access_ok(VERIFY_WRITE, uattr, usize))
4846 return -EFAULT;
4847
4848 /*
4849 * If we're handed a smaller struct than we know of,
4850 * ensure all the unknown bits are 0 - i.e. old
 4851	 * user-space does not get incomplete information.
4852 */
4853 if (usize < sizeof(*attr)) {
4854 unsigned char *addr;
4855 unsigned char *end;
4856
4857 addr = (void *)attr + usize;
4858 end = (void *)attr + sizeof(*attr);
4859
4860 for (; addr < end; addr++) {
4861 if (*addr)
Michael Kerrisk22400672014-05-09 16:54:33 +02004862 return -EFBIG;
Dario Faggiolid50dde52013-11-07 14:43:36 +01004863 }
4864
4865 attr->size = usize;
4866 }
4867
Vegard Nossum4efbc452014-02-16 22:24:17 +01004868 ret = copy_to_user(uattr, attr, attr->size);
Dario Faggiolid50dde52013-11-07 14:43:36 +01004869 if (ret)
4870 return -EFAULT;
4871
Michael Kerrisk22400672014-05-09 16:54:33 +02004872 return 0;
Dario Faggiolid50dde52013-11-07 14:43:36 +01004873}
4874
4875/**
Dario Faggioliaab03e02013-11-28 11:14:43 +01004876 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
Dario Faggiolid50dde52013-11-07 14:43:36 +01004877 * @pid: the pid in question.
Juri Lelli5778fcc2014-01-14 16:10:39 +01004878 * @uattr: structure containing the extended parameters.
Dario Faggiolid50dde52013-11-07 14:43:36 +01004879 * @size: sizeof(attr) for fwd/bwd comp.
Masanari Iidadb66d752014-04-18 01:59:15 +09004880 * @flags: for future extension.
Dario Faggiolid50dde52013-11-07 14:43:36 +01004881 */
Peter Zijlstra6d35ab42014-02-14 17:19:29 +01004882SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
4883 unsigned int, size, unsigned int, flags)
Dario Faggiolid50dde52013-11-07 14:43:36 +01004884{
4885 struct sched_attr attr = {
4886 .size = sizeof(struct sched_attr),
4887 };
4888 struct task_struct *p;
4889 int retval;
4890
4891 if (!uattr || pid < 0 || size > PAGE_SIZE ||
Peter Zijlstra6d35ab42014-02-14 17:19:29 +01004892 size < SCHED_ATTR_SIZE_VER0 || flags)
Dario Faggiolid50dde52013-11-07 14:43:36 +01004893 return -EINVAL;
4894
4895 rcu_read_lock();
4896 p = find_process_by_pid(pid);
4897 retval = -ESRCH;
4898 if (!p)
4899 goto out_unlock;
4900
4901 retval = security_task_getscheduler(p);
4902 if (retval)
4903 goto out_unlock;
4904
4905 attr.sched_policy = p->policy;
Peter Zijlstra7479f3c2014-01-15 17:05:04 +01004906 if (p->sched_reset_on_fork)
4907 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
Dario Faggioliaab03e02013-11-28 11:14:43 +01004908 if (task_has_dl_policy(p))
4909 __getparam_dl(p, &attr);
4910 else if (task_has_rt_policy(p))
Dario Faggiolid50dde52013-11-07 14:43:36 +01004911 attr.sched_priority = p->rt_priority;
4912 else
Dongsheng Yangd0ea0262014-01-27 22:00:45 -05004913 attr.sched_nice = task_nice(p);
Dario Faggiolid50dde52013-11-07 14:43:36 +01004914
4915 rcu_read_unlock();
4916
4917 retval = sched_read_attr(uattr, &attr, size);
4918 return retval;
4919
4920out_unlock:
4921 rcu_read_unlock();
4922 return retval;
4923}
4924
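/*
 * Illustrative user-space sketch, not part of this file: reading the
 * extended parameters back through the syscall above.  It reuses the
 * locally mirrored struct sched_attr and headers from the sched_setattr()
 * sketch earlier; pid 0 means the calling thread and flags must be 0.
 */
#if 0	/* user-space example, not compiled with the kernel */
static void show_my_sched_attr(void)
{
	struct sched_attr attr;

	if (syscall(SYS_sched_getattr, 0, &attr, sizeof(attr), 0)) {
		perror("sched_getattr");
		return;
	}
	printf("policy=%u nice=%d rt_prio=%u\n",
	       attr.sched_policy, attr.sched_nice, attr.sched_priority);
}
#endif
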
Rusty Russell96f874e2008-11-25 02:35:14 +10304925long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004926{
Rusty Russell5a16f3d2008-11-25 02:35:11 +10304927 cpumask_var_t cpus_allowed, new_mask;
Ingo Molnar36c8b582006-07-03 00:25:41 -07004928 struct task_struct *p;
4929 int retval;
Olav Haugandc10c3f2016-12-07 16:34:49 -08004930 int dest_cpu;
4931 cpumask_t allowed_mask;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004932
Thomas Gleixner23f5d142009-12-09 10:15:01 +00004933 rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004934
4935 p = find_process_by_pid(pid);
4936 if (!p) {
Thomas Gleixner23f5d142009-12-09 10:15:01 +00004937 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004938 return -ESRCH;
4939 }
4940
Thomas Gleixner23f5d142009-12-09 10:15:01 +00004941 /* Prevent p going away */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004942 get_task_struct(p);
Thomas Gleixner23f5d142009-12-09 10:15:01 +00004943 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004944
Tejun Heo14a40ff2013-03-19 13:45:20 -07004945 if (p->flags & PF_NO_SETAFFINITY) {
4946 retval = -EINVAL;
4947 goto out_put_task;
4948 }
Rusty Russell5a16f3d2008-11-25 02:35:11 +10304949 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4950 retval = -ENOMEM;
4951 goto out_put_task;
4952 }
4953 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
4954 retval = -ENOMEM;
4955 goto out_free_cpus_allowed;
4956 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004957 retval = -EPERM;
Eric W. Biederman4c44aaa2012-07-26 05:05:21 -07004958 if (!check_same_owner(p)) {
4959 rcu_read_lock();
4960 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
4961 rcu_read_unlock();
Kirill Tkhai16303ab2014-09-22 22:36:30 +04004962 goto out_free_new_mask;
Eric W. Biederman4c44aaa2012-07-26 05:05:21 -07004963 }
4964 rcu_read_unlock();
4965 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004966
KOSAKI Motohirob0ae1982010-10-15 04:21:18 +09004967 retval = security_task_setscheduler(p);
David Quigleye7834f82006-06-23 02:03:59 -07004968 if (retval)
Kirill Tkhai16303ab2014-09-22 22:36:30 +04004969 goto out_free_new_mask;
David Quigleye7834f82006-06-23 02:03:59 -07004970
Peter Zijlstrae4099a52013-12-17 10:03:34 +01004971
4972 cpuset_cpus_allowed(p, cpus_allowed);
4973 cpumask_and(new_mask, in_mask, cpus_allowed);
4974
Dario Faggioli332ac172013-11-07 14:43:45 +01004975 /*
4976 * Since bandwidth control happens on root_domain basis,
4977 * if admission test is enabled, we only admit -deadline
4978 * tasks allowed to run on all the CPUs in the task's
4979 * root_domain.
4980 */
4981#ifdef CONFIG_SMP
Kirill Tkhaif1e3a092014-09-22 22:36:36 +04004982 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
4983 rcu_read_lock();
4984 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
Dario Faggioli332ac172013-11-07 14:43:45 +01004985 retval = -EBUSY;
Kirill Tkhaif1e3a092014-09-22 22:36:36 +04004986 rcu_read_unlock();
Kirill Tkhai16303ab2014-09-22 22:36:30 +04004987 goto out_free_new_mask;
Dario Faggioli332ac172013-11-07 14:43:45 +01004988 }
Kirill Tkhaif1e3a092014-09-22 22:36:36 +04004989 rcu_read_unlock();
Dario Faggioli332ac172013-11-07 14:43:45 +01004990 }
4991#endif
Peter Zijlstra49246272010-10-17 21:46:10 +02004992again:
Olav Haugandc10c3f2016-12-07 16:34:49 -08004993 cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask);
4994 dest_cpu = cpumask_any_and(cpu_active_mask, &allowed_mask);
4995 if (dest_cpu < nr_cpu_ids) {
4996 retval = __set_cpus_allowed_ptr(p, new_mask, true);
4997 if (!retval) {
4998 cpuset_cpus_allowed(p, cpus_allowed);
4999 if (!cpumask_subset(new_mask, cpus_allowed)) {
5000 /*
5001 * We must have raced with a concurrent cpuset
5002 * update. Just reset the cpus_allowed to the
5003 * cpuset's cpus_allowed
5004 */
5005 cpumask_copy(new_mask, cpus_allowed);
5006 goto again;
5007 }
Paul Menage8707d8b2007-10-18 23:40:22 -07005008 }
Olav Haugandc10c3f2016-12-07 16:34:49 -08005009 } else {
5010 retval = -EINVAL;
Paul Menage8707d8b2007-10-18 23:40:22 -07005011 }
Olav Haugandc10c3f2016-12-07 16:34:49 -08005012
Pavankumar Kondeti435eea92019-02-28 10:40:39 +05305013 if (!retval && !(p->flags & PF_KTHREAD))
5014 cpumask_and(&p->cpus_requested, in_mask, cpu_possible_mask);
5015
Kirill Tkhai16303ab2014-09-22 22:36:30 +04005016out_free_new_mask:
Rusty Russell5a16f3d2008-11-25 02:35:11 +10305017 free_cpumask_var(new_mask);
5018out_free_cpus_allowed:
5019 free_cpumask_var(cpus_allowed);
5020out_put_task:
Linus Torvalds1da177e2005-04-16 15:20:36 -07005021 put_task_struct(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005022 return retval;
5023}
5024
5025static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
Rusty Russell96f874e2008-11-25 02:35:14 +10305026 struct cpumask *new_mask)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005027{
Rusty Russell96f874e2008-11-25 02:35:14 +10305028 if (len < cpumask_size())
5029 cpumask_clear(new_mask);
5030 else if (len > cpumask_size())
5031 len = cpumask_size();
5032
Linus Torvalds1da177e2005-04-16 15:20:36 -07005033 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5034}
5035
5036/**
5037 * sys_sched_setaffinity - set the cpu affinity of a process
5038 * @pid: pid of the process
5039 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
5040 * @user_mask_ptr: user-space pointer to the new cpu mask
Yacine Belkadie69f6182013-07-12 20:45:47 +02005041 *
5042 * Return: 0 on success. An error code otherwise.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005043 */
Heiko Carstens5add95d2009-01-14 14:14:08 +01005044SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
5045 unsigned long __user *, user_mask_ptr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005046{
Rusty Russell5a16f3d2008-11-25 02:35:11 +10305047 cpumask_var_t new_mask;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005048 int retval;
5049
Rusty Russell5a16f3d2008-11-25 02:35:11 +10305050 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
5051 return -ENOMEM;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005052
Rusty Russell5a16f3d2008-11-25 02:35:11 +10305053 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
5054 if (retval == 0)
5055 retval = sched_setaffinity(pid, new_mask);
5056 free_cpumask_var(new_mask);
5057 return retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005058}
5059
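/*
 * Illustrative user-space sketch, not part of this file: pinning the
 * calling process to CPUs 0 and 2 through the glibc wrapper, which lands
 * in the syscall above.  Assumes _GNU_SOURCE for the cpu_set_t helpers and
 * that CPUs 0 and 2 exist; error handling is minimal.
 */
#if 0	/* user-space example, not compiled with the kernel */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);
	CPU_SET(2, &set);

	/* pid 0 means "the calling process" */
	if (sched_setaffinity(0, sizeof(set), &set))
		perror("sched_setaffinity");
	return 0;
}
#endif
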
Rusty Russell96f874e2008-11-25 02:35:14 +10305060long sched_getaffinity(pid_t pid, struct cpumask *mask)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005061{
Ingo Molnar36c8b582006-07-03 00:25:41 -07005062 struct task_struct *p;
Thomas Gleixner31605682009-12-08 20:24:16 +00005063 unsigned long flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005064 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005065
Thomas Gleixner23f5d142009-12-09 10:15:01 +00005066 rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005067
5068 retval = -ESRCH;
5069 p = find_process_by_pid(pid);
5070 if (!p)
5071 goto out_unlock;
5072
David Quigleye7834f82006-06-23 02:03:59 -07005073 retval = security_task_getscheduler(p);
5074 if (retval)
5075 goto out_unlock;
5076
Peter Zijlstra013fdb82011-04-05 17:23:45 +02005077 raw_spin_lock_irqsave(&p->pi_lock, flags);
Peter Zijlstra6acce3e2013-10-11 14:38:20 +02005078 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
Lingutla Chandrasekhar189f8c02017-12-22 17:22:25 +05305079
 5080	/* Userspace tasks are forbidden to run on isolated
 5081	 * CPUs, so exclude isolated CPUs from the affinity
 5082	 * mask returned by getaffinity.
5083 */
5084 if (!(p->flags & PF_KTHREAD))
5085 cpumask_andnot(mask, mask, cpu_isolated_mask);
5086
Peter Zijlstra013fdb82011-04-05 17:23:45 +02005087 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005088
5089out_unlock:
Thomas Gleixner23f5d142009-12-09 10:15:01 +00005090 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005091
Ulrich Drepper9531b622007-08-09 11:16:46 +02005092 return retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005093}
5094
5095/**
5096 * sys_sched_getaffinity - get the cpu affinity of a process
5097 * @pid: pid of the process
5098 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
5099 * @user_mask_ptr: user-space pointer to hold the current cpu mask
Yacine Belkadie69f6182013-07-12 20:45:47 +02005100 *
Zev Weiss599b4842016-06-26 16:13:23 -05005101 * Return: size of CPU mask copied to user_mask_ptr on success. An
5102 * error code otherwise.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005103 */
Heiko Carstens5add95d2009-01-14 14:14:08 +01005104SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
5105 unsigned long __user *, user_mask_ptr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005106{
5107 int ret;
Rusty Russellf17c8602008-11-25 02:35:11 +10305108 cpumask_var_t mask;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005109
Anton Blanchard84fba5e2010-04-06 17:02:19 +10005110 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
KOSAKI Motohirocd3d8032010-03-12 16:15:36 +09005111 return -EINVAL;
5112 if (len & (sizeof(unsigned long)-1))
Linus Torvalds1da177e2005-04-16 15:20:36 -07005113 return -EINVAL;
5114
Rusty Russellf17c8602008-11-25 02:35:11 +10305115 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
5116 return -ENOMEM;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005117
Rusty Russellf17c8602008-11-25 02:35:11 +10305118 ret = sched_getaffinity(pid, mask);
5119 if (ret == 0) {
KOSAKI Motohiro8bc037f2010-03-17 09:36:58 +09005120 size_t retlen = min_t(size_t, len, cpumask_size());
KOSAKI Motohirocd3d8032010-03-12 16:15:36 +09005121
5122 if (copy_to_user(user_mask_ptr, mask, retlen))
Rusty Russellf17c8602008-11-25 02:35:11 +10305123 ret = -EFAULT;
5124 else
KOSAKI Motohirocd3d8032010-03-12 16:15:36 +09005125 ret = retlen;
Rusty Russellf17c8602008-11-25 02:35:11 +10305126 }
5127 free_cpumask_var(mask);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005128
Rusty Russellf17c8602008-11-25 02:35:11 +10305129 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005130}
5131
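/*
 * Illustrative user-space sketch, not part of this file: reading the
 * affinity mask back and counting the usable CPUs.  Note that on this
 * kernel the syscall above filters out isolated CPUs for ordinary tasks,
 * so the count can be smaller than the online CPU count.
 */
#if 0	/* user-space example, not compiled with the kernel */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;

	if (sched_getaffinity(0 /* calling process */, sizeof(set), &set)) {
		perror("sched_getaffinity");
		return 1;
	}
	printf("runnable on %d CPUs\n", CPU_COUNT(&set));
	return 0;
}
#endif
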
5132/**
5133 * sys_sched_yield - yield the current processor to other threads.
5134 *
Ingo Molnardd41f592007-07-09 18:51:59 +02005135 * This function yields the current CPU to other tasks. If there are no
5136 * other threads running on this CPU then this function will return.
Yacine Belkadie69f6182013-07-12 20:45:47 +02005137 *
5138 * Return: 0.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005139 */
Heiko Carstens5add95d2009-01-14 14:14:08 +01005140SYSCALL_DEFINE0(sched_yield)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005141{
Johannes Weiner6a99eb12018-10-26 15:06:23 -07005142 struct rq_flags rf;
5143 struct rq *rq;
5144
5145 rq = this_rq_lock_irq(&rf);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005146
Josh Poimboeufae928822016-06-17 12:43:24 -05005147 schedstat_inc(rq->yld_count);
Dmitry Adamushko4530d7a2007-10-15 17:00:08 +02005148 current->sched_class->yield_task(rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005149
5150 /*
5151 * Since we are going to call schedule() anyway, there's
5152 * no need to preempt or enable interrupts:
5153 */
Johannes Weiner6a99eb12018-10-26 15:06:23 -07005154 preempt_disable();
5155 rq_unlock(rq, &rf);
Thomas Gleixnerba74c142011-03-21 13:32:17 +01005156 sched_preempt_enable_no_resched();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005157
5158 schedule();
5159
5160 return 0;
5161}
5162
Peter Zijlstra35a773a2016-09-19 12:57:53 +02005163#ifndef CONFIG_PREEMPT
Herbert Xu02b67cc32008-01-25 21:08:28 +01005164int __sched _cond_resched(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005165{
Konstantin Khlebnikovfe32d3c2015-07-15 12:52:04 +03005166 if (should_resched(0)) {
Frederic Weisbeckera18b5d02015-01-22 18:08:04 +01005167 preempt_schedule_common();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005168 return 1;
5169 }
5170 return 0;
5171}
Herbert Xu02b67cc32008-01-25 21:08:28 +01005172EXPORT_SYMBOL(_cond_resched);
Peter Zijlstra35a773a2016-09-19 12:57:53 +02005173#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07005174
5175/*
Frederic Weisbecker613afbf2009-07-16 15:44:29 +02005176 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
Linus Torvalds1da177e2005-04-16 15:20:36 -07005177 * call schedule, and on return reacquire the lock.
5178 *
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01005179 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
Linus Torvalds1da177e2005-04-16 15:20:36 -07005180 * operations here to prevent schedule() from being called twice (once via
5181 * spin_unlock(), once by hand).
5182 */
Frederic Weisbecker613afbf2009-07-16 15:44:29 +02005183int __cond_resched_lock(spinlock_t *lock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005184{
Konstantin Khlebnikovfe32d3c2015-07-15 12:52:04 +03005185 int resched = should_resched(PREEMPT_LOCK_OFFSET);
Jan Kara6df3cec2005-06-13 15:52:32 -07005186 int ret = 0;
5187
Peter Zijlstraf607c662009-07-20 19:16:29 +02005188 lockdep_assert_held(lock);
5189
Paul E. McKenney4a81e832014-06-20 16:49:01 -07005190 if (spin_needbreak(lock) || resched) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005191 spin_unlock(lock);
Peter Zijlstrad86ee482009-07-10 14:57:57 +02005192 if (resched)
Frederic Weisbeckera18b5d02015-01-22 18:08:04 +01005193 preempt_schedule_common();
Nick Piggin95c354f2008-01-30 13:31:20 +01005194 else
5195 cpu_relax();
Jan Kara6df3cec2005-06-13 15:52:32 -07005196 ret = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005197 spin_lock(lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005198 }
Jan Kara6df3cec2005-06-13 15:52:32 -07005199 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005200}
Frederic Weisbecker613afbf2009-07-16 15:44:29 +02005201EXPORT_SYMBOL(__cond_resched_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005202
Frederic Weisbecker613afbf2009-07-16 15:44:29 +02005203int __sched __cond_resched_softirq(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005204{
5205 BUG_ON(!in_softirq());
5206
Konstantin Khlebnikovfe32d3c2015-07-15 12:52:04 +03005207 if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
Thomas Gleixner98d825672007-05-23 13:58:18 -07005208 local_bh_enable();
Frederic Weisbeckera18b5d02015-01-22 18:08:04 +01005209 preempt_schedule_common();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005210 local_bh_disable();
5211 return 1;
5212 }
5213 return 0;
5214}
Frederic Weisbecker613afbf2009-07-16 15:44:29 +02005215EXPORT_SYMBOL(__cond_resched_softirq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005216
Linus Torvalds1da177e2005-04-16 15:20:36 -07005217/**
5218 * yield - yield the current processor to other threads.
5219 *
Peter Zijlstra8e3fabf2012-03-06 18:54:26 +01005220 * Do not ever use this function, there's a 99% chance you're doing it wrong.
5221 *
5222 * The scheduler is at all times free to pick the calling task as the most
 5223 * eligible task to run; if removing the yield() call from your code breaks
 5224 * it, it's already broken.
5225 *
5226 * Typical broken usage is:
5227 *
5228 * while (!event)
5229 * yield();
5230 *
5231 * where one assumes that yield() will let 'the other' process run that will
5232 * make event true. If the current task is a SCHED_FIFO task that will never
5233 * happen. Never use yield() as a progress guarantee!!
5234 *
5235 * If you want to use yield() to wait for something, use wait_event().
5236 * If you want to use yield() to be 'nice' for others, use cond_resched().
5237 * If you still want to use yield(), do not!
Linus Torvalds1da177e2005-04-16 15:20:36 -07005238 */
5239void __sched yield(void)
5240{
5241 set_current_state(TASK_RUNNING);
5242 sys_sched_yield();
5243}
Linus Torvalds1da177e2005-04-16 15:20:36 -07005244EXPORT_SYMBOL(yield);
5245
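/*
 * Minimal kernel-side sketch, not used by this file, of the wait_event()
 * pattern recommended in the comment above instead of a yield() loop.
 * The waitqueue, flag and function names are invented for the example;
 * real code also has to order updates to the condition (locking, barriers).
 */
#if 0
static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static bool example_event;

static void example_waiter(void)
{
	/* sleeps until example_event becomes true; no busy yielding */
	wait_event(example_wq, example_event);
}

static void example_signaller(void)
{
	example_event = true;
	wake_up(&example_wq);		/* wakes example_waiter() */
}
#endif
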
Mike Galbraithd95f4122011-02-01 09:50:51 -05005246/**
5247 * yield_to - yield the current processor to another thread in
5248 * your thread group, or accelerate that thread toward the
5249 * processor it's on.
Randy Dunlap16addf92011-03-18 09:34:53 -07005250 * @p: target task
5251 * @preempt: whether task preemption is allowed or not
Mike Galbraithd95f4122011-02-01 09:50:51 -05005252 *
5253 * It's the caller's job to ensure that the target task struct
5254 * can't go away on us before we can do any checks.
5255 *
Yacine Belkadie69f6182013-07-12 20:45:47 +02005256 * Return:
Peter Zijlstra7b270f62013-01-22 13:09:13 +05305257 * true (>0) if we indeed boosted the target task.
5258 * false (0) if we failed to boost the target.
5259 * -ESRCH if there's no task to yield to.
Mike Galbraithd95f4122011-02-01 09:50:51 -05005260 */
Dan Carpenterfa933842014-05-23 13:20:42 +03005261int __sched yield_to(struct task_struct *p, bool preempt)
Mike Galbraithd95f4122011-02-01 09:50:51 -05005262{
5263 struct task_struct *curr = current;
5264 struct rq *rq, *p_rq;
5265 unsigned long flags;
Dan Carpenterc3c18642013-02-05 14:37:51 +03005266 int yielded = 0;
Mike Galbraithd95f4122011-02-01 09:50:51 -05005267
5268 local_irq_save(flags);
5269 rq = this_rq();
5270
5271again:
5272 p_rq = task_rq(p);
Peter Zijlstra7b270f62013-01-22 13:09:13 +05305273 /*
5274 * If we're the only runnable task on the rq and target rq also
5275 * has only one task, there's absolutely no point in yielding.
5276 */
5277 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
5278 yielded = -ESRCH;
5279 goto out_irq;
5280 }
5281
Mike Galbraithd95f4122011-02-01 09:50:51 -05005282 double_rq_lock(rq, p_rq);
Shigeru Yoshida39e24d8f2013-11-23 18:38:01 +09005283 if (task_rq(p) != p_rq) {
Mike Galbraithd95f4122011-02-01 09:50:51 -05005284 double_rq_unlock(rq, p_rq);
5285 goto again;
5286 }
5287
5288 if (!curr->sched_class->yield_to_task)
Peter Zijlstra7b270f62013-01-22 13:09:13 +05305289 goto out_unlock;
Mike Galbraithd95f4122011-02-01 09:50:51 -05005290
5291 if (curr->sched_class != p->sched_class)
Peter Zijlstra7b270f62013-01-22 13:09:13 +05305292 goto out_unlock;
Mike Galbraithd95f4122011-02-01 09:50:51 -05005293
5294 if (task_running(p_rq, p) || p->state)
Peter Zijlstra7b270f62013-01-22 13:09:13 +05305295 goto out_unlock;
Mike Galbraithd95f4122011-02-01 09:50:51 -05005296
5297 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
Venkatesh Pallipadi6d1cafd2011-03-01 16:28:21 -08005298 if (yielded) {
Josh Poimboeufae928822016-06-17 12:43:24 -05005299 schedstat_inc(rq->yld_count);
Venkatesh Pallipadi6d1cafd2011-03-01 16:28:21 -08005300 /*
5301 * Make p's CPU reschedule; pick_next_entity takes care of
5302 * fairness.
5303 */
5304 if (preempt && rq != p_rq)
Kirill Tkhai88751252014-06-29 00:03:57 +04005305 resched_curr(p_rq);
Venkatesh Pallipadi6d1cafd2011-03-01 16:28:21 -08005306 }
Mike Galbraithd95f4122011-02-01 09:50:51 -05005307
Peter Zijlstra7b270f62013-01-22 13:09:13 +05305308out_unlock:
Mike Galbraithd95f4122011-02-01 09:50:51 -05005309 double_rq_unlock(rq, p_rq);
Peter Zijlstra7b270f62013-01-22 13:09:13 +05305310out_irq:
Mike Galbraithd95f4122011-02-01 09:50:51 -05005311 local_irq_restore(flags);
5312
Peter Zijlstra7b270f62013-01-22 13:09:13 +05305313 if (yielded > 0)
Mike Galbraithd95f4122011-02-01 09:50:51 -05005314 schedule();
5315
5316 return yielded;
5317}
5318EXPORT_SYMBOL_GPL(yield_to);
5319
Linus Torvalds1da177e2005-04-16 15:20:36 -07005320/*
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01005321 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
Linus Torvalds1da177e2005-04-16 15:20:36 -07005322 * that process accounting knows that this is a task in IO wait state.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005323 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005324long __sched io_schedule_timeout(long timeout)
5325{
NeilBrown9cff8ad2015-02-13 15:49:17 +11005326 int old_iowait = current->in_iowait;
5327 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005328 long ret;
5329
Arjan van de Ven8f0dfc32009-07-20 11:26:58 -07005330 current->in_iowait = 1;
Shaohua Li10d784e2015-05-08 10:51:29 -07005331 blk_schedule_flush_plug(current);
NeilBrown9cff8ad2015-02-13 15:49:17 +11005332
5333 delayacct_blkio_start();
5334 rq = raw_rq();
5335 atomic_inc(&rq->nr_iowait);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005336 ret = schedule_timeout(timeout);
NeilBrown9cff8ad2015-02-13 15:49:17 +11005337 current->in_iowait = old_iowait;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005338 atomic_dec(&rq->nr_iowait);
Shailabh Nagar0ff92242006-07-14 00:24:37 -07005339 delayacct_blkio_end();
NeilBrown9cff8ad2015-02-13 15:49:17 +11005340
Linus Torvalds1da177e2005-04-16 15:20:36 -07005341 return ret;
5342}
NeilBrown9cff8ad2015-02-13 15:49:17 +11005343EXPORT_SYMBOL(io_schedule_timeout);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005344
5345/**
5346 * sys_sched_get_priority_max - return maximum RT priority.
5347 * @policy: scheduling class.
5348 *
Yacine Belkadie69f6182013-07-12 20:45:47 +02005349 * Return: On success, this syscall returns the maximum
5350 * rt_priority that can be used by a given scheduling class.
5351 * On failure, a negative error code is returned.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005352 */
Heiko Carstens5add95d2009-01-14 14:14:08 +01005353SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005354{
5355 int ret = -EINVAL;
5356
5357 switch (policy) {
5358 case SCHED_FIFO:
5359 case SCHED_RR:
5360 ret = MAX_USER_RT_PRIO-1;
5361 break;
Dario Faggioliaab03e02013-11-28 11:14:43 +01005362 case SCHED_DEADLINE:
Linus Torvalds1da177e2005-04-16 15:20:36 -07005363 case SCHED_NORMAL:
Ingo Molnarb0a94992006-01-14 13:20:41 -08005364 case SCHED_BATCH:
Ingo Molnardd41f592007-07-09 18:51:59 +02005365 case SCHED_IDLE:
Linus Torvalds1da177e2005-04-16 15:20:36 -07005366 ret = 0;
5367 break;
5368 }
5369 return ret;
5370}
5371
5372/**
5373 * sys_sched_get_priority_min - return minimum RT priority.
5374 * @policy: scheduling class.
5375 *
Yacine Belkadie69f6182013-07-12 20:45:47 +02005376 * Return: On success, this syscall returns the minimum
5377 * rt_priority that can be used by a given scheduling class.
5378 * On failure, a negative error code is returned.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005379 */
Heiko Carstens5add95d2009-01-14 14:14:08 +01005380SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005381{
5382 int ret = -EINVAL;
5383
5384 switch (policy) {
5385 case SCHED_FIFO:
5386 case SCHED_RR:
5387 ret = 1;
5388 break;
Dario Faggioliaab03e02013-11-28 11:14:43 +01005389 case SCHED_DEADLINE:
Linus Torvalds1da177e2005-04-16 15:20:36 -07005390 case SCHED_NORMAL:
Ingo Molnarb0a94992006-01-14 13:20:41 -08005391 case SCHED_BATCH:
Ingo Molnardd41f592007-07-09 18:51:59 +02005392 case SCHED_IDLE:
Linus Torvalds1da177e2005-04-16 15:20:36 -07005393 ret = 0;
5394 }
5395 return ret;
5396}
5397
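/*
 * Illustrative user-space sketch, not part of this file: querying the
 * static priority range for SCHED_FIFO through the glibc wrappers of the
 * two syscalls above.
 */
#if 0	/* user-space example, not compiled with the kernel */
#include <sched.h>
#include <stdio.h>

int main(void)
{
	printf("SCHED_FIFO priorities: %d..%d\n",
	       sched_get_priority_min(SCHED_FIFO),
	       sched_get_priority_max(SCHED_FIFO));
	return 0;
}
#endif
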
5398/**
5399 * sys_sched_rr_get_interval - return the default timeslice of a process.
5400 * @pid: pid of the process.
5401 * @interval: userspace pointer to the timeslice value.
5402 *
5403 * this syscall writes the default timeslice value of a given process
5404 * into the user-space timespec buffer. A value of '0' means infinity.
Yacine Belkadie69f6182013-07-12 20:45:47 +02005405 *
5406 * Return: On success, 0 and the timeslice is in @interval. Otherwise,
5407 * an error code.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005408 */
Heiko Carstens17da2bd2009-01-14 14:14:10 +01005409SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
Heiko Carstens754fe8d2009-01-14 14:14:09 +01005410 struct timespec __user *, interval)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005411{
Ingo Molnar36c8b582006-07-03 00:25:41 -07005412 struct task_struct *p;
Dmitry Adamushkoa4ec24b2007-10-15 17:00:13 +02005413 unsigned int time_slice;
Peter Zijlstraeb580752015-07-31 21:28:18 +02005414 struct rq_flags rf;
5415 struct timespec t;
Thomas Gleixnerdba091b2009-12-09 09:32:03 +01005416 struct rq *rq;
Andi Kleen3a5c3592007-10-15 17:00:14 +02005417 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005418
5419 if (pid < 0)
Andi Kleen3a5c3592007-10-15 17:00:14 +02005420 return -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005421
5422 retval = -ESRCH;
Thomas Gleixner1a551ae2009-12-09 10:15:11 +00005423 rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005424 p = find_process_by_pid(pid);
5425 if (!p)
5426 goto out_unlock;
5427
5428 retval = security_task_getscheduler(p);
5429 if (retval)
5430 goto out_unlock;
5431
Peter Zijlstraeb580752015-07-31 21:28:18 +02005432 rq = task_rq_lock(p, &rf);
Peter Zijlstraa57beec2014-01-27 11:54:13 +01005433 time_slice = 0;
5434 if (p->sched_class->get_rr_interval)
5435 time_slice = p->sched_class->get_rr_interval(rq, p);
Peter Zijlstraeb580752015-07-31 21:28:18 +02005436 task_rq_unlock(rq, p, &rf);
Dmitry Adamushkoa4ec24b2007-10-15 17:00:13 +02005437
Thomas Gleixner1a551ae2009-12-09 10:15:11 +00005438 rcu_read_unlock();
Dmitry Adamushkoa4ec24b2007-10-15 17:00:13 +02005439 jiffies_to_timespec(time_slice, &t);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005440 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005441 return retval;
Andi Kleen3a5c3592007-10-15 17:00:14 +02005442
Linus Torvalds1da177e2005-04-16 15:20:36 -07005443out_unlock:
Thomas Gleixner1a551ae2009-12-09 10:15:11 +00005444 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005445 return retval;
5446}
5447
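/*
 * Illustrative user-space sketch, not part of this file: reading the
 * round-robin timeslice of the calling process through the glibc wrapper
 * of the syscall above.  Per the comment above, a reported 0 means an
 * infinite timeslice.
 */
#if 0	/* user-space example, not compiled with the kernel */
#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0 /* calling process */, &ts))
		perror("sched_rr_get_interval");
	else
		printf("timeslice: %ld.%09ld s\n",
		       (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}
#endif
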
Steven Rostedt7c731e02008-05-12 21:20:41 +02005448static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
Ingo Molnar36c8b582006-07-03 00:25:41 -07005449
Ingo Molnar82a1fcb2008-01-25 21:08:02 +01005450void sched_show_task(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005451{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005452 unsigned long free = 0;
Paul E. McKenney4e797522012-11-07 13:35:32 -08005453 int ppid;
Tetsuo Handa1f8a7632014-12-05 21:22:22 +09005454 unsigned long state = p->state;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005455
Tetsuo Handa38200502016-11-02 19:50:29 +09005456 if (!try_get_task_stack(p))
5457 return;
Tetsuo Handa1f8a7632014-12-05 21:22:22 +09005458 if (state)
5459 state = __ffs(state) + 1;
Erik Gilling28d06862010-11-19 18:08:51 -08005460 printk(KERN_INFO "%-15.15s %c", p->comm,
Andreas Mohr2ed6e342006-07-10 04:43:52 -07005461 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
Linus Torvalds1da177e2005-04-16 15:20:36 -07005462 if (state == TASK_RUNNING)
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01005463 printk(KERN_CONT " running task ");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005464#ifdef CONFIG_DEBUG_STACK_USAGE
Eric Sandeen7c9f8862008-04-22 16:38:23 -05005465 free = stack_not_used(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005466#endif
Oleg Nesterova90e9842014-12-10 15:45:21 -08005467 ppid = 0;
Paul E. McKenney4e797522012-11-07 13:35:32 -08005468 rcu_read_lock();
Oleg Nesterova90e9842014-12-10 15:45:21 -08005469 if (pid_alive(p))
5470 ppid = task_pid_nr(rcu_dereference(p->real_parent));
Paul E. McKenney4e797522012-11-07 13:35:32 -08005471 rcu_read_unlock();
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01005472 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
Paul E. McKenney4e797522012-11-07 13:35:32 -08005473 task_pid_nr(p), ppid,
David Rientjesaa47b7e2009-05-04 01:38:05 -07005474 (unsigned long)task_thread_info(p)->flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005475
Tejun Heo3d1cb202013-04-30 15:27:22 -07005476 print_worker_info(KERN_INFO, p);
Nick Piggin5fb5e6d2008-01-25 21:08:34 +01005477 show_stack(p, NULL);
Tetsuo Handa38200502016-11-02 19:50:29 +09005478 put_task_stack(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005479}
5480
Ingo Molnare59e2ae2006-12-06 20:35:59 -08005481void show_state_filter(unsigned long state_filter)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005482{
Ingo Molnar36c8b582006-07-03 00:25:41 -07005483 struct task_struct *g, *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005484
Ingo Molnar4bd77322007-07-11 21:21:47 +02005485#if BITS_PER_LONG == 32
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01005486 printk(KERN_INFO
5487 " task PC stack pid father\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005488#else
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01005489 printk(KERN_INFO
5490 " task PC stack pid father\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005491#endif
Thomas Gleixner510f5ac2011-07-17 20:47:54 +02005492 rcu_read_lock();
Oleg Nesterov5d07f422014-08-13 21:19:53 +02005493 for_each_process_thread(g, p) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005494 /*
 5495		 * reset the NMI-timeout, listing all tasks on a slow
Lucas De Marchi25985ed2011-03-30 22:57:33 -03005496 * console might take a lot of time:
Andrey Ryabinin57675cb2016-06-09 15:20:05 +03005497 * Also, reset softlockup watchdogs on all CPUs, because
5498 * another CPU might be blocked waiting for us to process
5499 * an IPI.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005500 */
5501 touch_nmi_watchdog();
Andrey Ryabinin57675cb2016-06-09 15:20:05 +03005502 touch_all_softlockup_watchdogs();
Ingo Molnar39bc89f2007-04-25 20:50:03 -07005503 if (!state_filter || (p->state & state_filter))
Ingo Molnar82a1fcb2008-01-25 21:08:02 +01005504 sched_show_task(p);
Oleg Nesterov5d07f422014-08-13 21:19:53 +02005505 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005506
Ingo Molnardd41f592007-07-09 18:51:59 +02005507#ifdef CONFIG_SCHED_DEBUG
Rabin Vincentfb90a6e2016-04-04 15:42:02 +02005508 if (!state_filter)
5509 sysrq_sched_debug_show();
Ingo Molnardd41f592007-07-09 18:51:59 +02005510#endif
Thomas Gleixner510f5ac2011-07-17 20:47:54 +02005511 rcu_read_unlock();
Ingo Molnare59e2ae2006-12-06 20:35:59 -08005512 /*
5513 * Only show locks if all tasks are dumped:
5514 */
Shmulik Ladkani93335a22009-11-25 15:23:41 +02005515 if (!state_filter)
Ingo Molnare59e2ae2006-12-06 20:35:59 -08005516 debug_show_all_locks();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005517}
5518
Paul Gortmaker0db06282013-06-19 14:53:51 -04005519void init_idle_bootup_task(struct task_struct *idle)
Ingo Molnar1df21052007-07-09 18:51:58 +02005520{
Ingo Molnardd41f592007-07-09 18:51:59 +02005521 idle->sched_class = &idle_sched_class;
Ingo Molnar1df21052007-07-09 18:51:58 +02005522}
5523
Ingo Molnarf340c0d2005-06-28 16:40:42 +02005524/**
5525 * init_idle - set up an idle thread for a given CPU
5526 * @idle: task in question
5527 * @cpu: cpu the idle task belongs to
5528 *
5529 * NOTE: this function does not set the idle thread's NEED_RESCHED
5530 * flag, to make booting more robust.
5531 */
Pavankumar Kondeti736630c2018-09-20 15:31:36 +05305532void init_idle(struct task_struct *idle, int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005533{
Ingo Molnar70b97a72006-07-03 00:25:42 -07005534 struct rq *rq = cpu_rq(cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005535 unsigned long flags;
5536
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07005537 __sched_fork(0, idle);
5538
Peter Zijlstra25834c72015-05-15 17:43:34 +02005539 raw_spin_lock_irqsave(&idle->pi_lock, flags);
5540 raw_spin_lock(&rq->lock);
Ingo Molnar5cbd54e2008-11-12 20:05:50 +01005541
Peter Zijlstra06b83b52009-12-16 18:04:35 +01005542 idle->state = TASK_RUNNING;
Ingo Molnardd41f592007-07-09 18:51:59 +02005543 idle->se.exec_start = sched_clock();
5544
Mark Rutlande1b77c92016-03-09 14:08:18 -08005545 kasan_unpoison_task_stack(idle);
5546
Peter Zijlstrade9b8f52015-08-13 23:09:29 +02005547#ifdef CONFIG_SMP
5548 /*
 5549	 * It's possible that init_idle() gets called multiple times on a task,
 5550	 * in which case do_set_cpus_allowed() will not do the right thing.
5551 *
5552 * And since this is boot we can forgo the serialization.
5553 */
5554 set_cpus_allowed_common(idle, cpumask_of(cpu));
5555#endif
Peter Zijlstra6506cf6c2010-09-16 17:50:31 +02005556 /*
 5557	 * We're having a chicken and egg problem: even though we are
5558 * holding rq->lock, the cpu isn't yet set to this cpu so the
5559 * lockdep check in task_group() will fail.
5560 *
5561 * Similar case to sched_fork(). / Alternatively we could
5562 * use task_rq_lock() here and obtain the other rq->lock.
5563 *
5564 * Silence PROVE_RCU
5565 */
5566 rcu_read_lock();
Ingo Molnardd41f592007-07-09 18:51:59 +02005567 __set_task_cpu(idle, cpu);
Peter Zijlstra6506cf6c2010-09-16 17:50:31 +02005568 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005569
Linus Torvalds1da177e2005-04-16 15:20:36 -07005570 rq->curr = rq->idle = idle;
Kirill Tkhaida0c1e62014-08-20 13:47:32 +04005571 idle->on_rq = TASK_ON_RQ_QUEUED;
Peter Zijlstrade9b8f52015-08-13 23:09:29 +02005572#ifdef CONFIG_SMP
Peter Zijlstra3ca7a442011-04-05 17:23:40 +02005573 idle->on_cpu = 1;
Nick Piggin4866cde2005-06-25 14:57:23 -07005574#endif
Peter Zijlstra25834c72015-05-15 17:43:34 +02005575 raw_spin_unlock(&rq->lock);
5576 raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005577
5578 /* Set the preempt count _outside_ the spinlocks! */
Peter Zijlstra01028742013-08-14 14:55:46 +02005579 init_idle_preempt_count(idle, cpu);
Jonathan Corbet625f2a32011-04-22 11:19:10 -06005580
Ingo Molnardd41f592007-07-09 18:51:59 +02005581 /*
5582 * The idle tasks have their own, simple scheduling class:
5583 */
5584 idle->sched_class = &idle_sched_class;
Steven Rostedt868baf02011-02-10 21:26:13 -05005585 ftrace_graph_init_idle_task(idle, cpu);
Frederic Weisbecker45eacc62013-05-15 22:16:32 +02005586 vtime_init_idle(idle, cpu);
Peter Zijlstrade9b8f52015-08-13 23:09:29 +02005587#ifdef CONFIG_SMP
Carsten Emdef1c6f1a2011-10-26 23:14:16 +02005588 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
5589#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07005590}
5591
Juri Lellif82f8042014-10-07 09:52:11 +01005592int cpuset_cpumask_can_shrink(const struct cpumask *cur,
5593 const struct cpumask *trial)
5594{
5595 int ret = 1, trial_cpus;
5596 struct dl_bw *cur_dl_b;
5597 unsigned long flags;
5598
Mike Galbraithbb2bc552015-01-28 04:53:55 +01005599 if (!cpumask_weight(cur))
5600 return ret;
5601
Juri Lelli75e23e42014-10-28 11:54:46 +00005602 rcu_read_lock_sched();
Juri Lellif82f8042014-10-07 09:52:11 +01005603 cur_dl_b = dl_bw_of(cpumask_any(cur));
5604 trial_cpus = cpumask_weight(trial);
5605
5606 raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
5607 if (cur_dl_b->bw != -1 &&
5608 cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
5609 ret = 0;
5610 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
Juri Lelli75e23e42014-10-28 11:54:46 +00005611 rcu_read_unlock_sched();
Juri Lellif82f8042014-10-07 09:52:11 +01005612
5613 return ret;
5614}
5615
Juri Lelli7f514122014-09-19 10:22:40 +01005616int task_can_attach(struct task_struct *p,
5617 const struct cpumask *cs_cpus_allowed)
5618{
5619 int ret = 0;
5620
5621 /*
5622 * Kthreads which disallow setaffinity shouldn't be moved
5623 * to a new cpuset; we don't want to change their cpu
5624 * affinity and isolating such threads by their set of
5625 * allowed nodes is unnecessary. Thus, cpusets are not
5626 * applicable for such threads. This prevents checking for
5627 * success of set_cpus_allowed_ptr() on all attached tasks
5628 * before cpus_allowed may be changed.
5629 */
5630 if (p->flags & PF_NO_SETAFFINITY) {
5631 ret = -EINVAL;
5632 goto out;
5633 }
5634
5635#ifdef CONFIG_SMP
5636 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
5637 cs_cpus_allowed)) {
5638 unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
5639 cs_cpus_allowed);
Juri Lelli75e23e42014-10-28 11:54:46 +00005640 struct dl_bw *dl_b;
Juri Lelli7f514122014-09-19 10:22:40 +01005641 bool overflow;
5642 int cpus;
5643 unsigned long flags;
5644
Juri Lelli75e23e42014-10-28 11:54:46 +00005645 rcu_read_lock_sched();
5646 dl_b = dl_bw_of(dest_cpu);
Juri Lelli7f514122014-09-19 10:22:40 +01005647 raw_spin_lock_irqsave(&dl_b->lock, flags);
5648 cpus = dl_bw_cpus(dest_cpu);
5649 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
5650 if (overflow)
5651 ret = -EBUSY;
5652 else {
5653 /*
5654 * We reserve space for this task in the destination
5655 * root_domain, as we can't fail after this point.
5656 * We will free resources in the source root_domain
5657 * later on (see set_cpus_allowed_dl()).
5658 */
5659 __dl_add(dl_b, p->dl.dl_bw);
5660 }
5661 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
Juri Lelli75e23e42014-10-28 11:54:46 +00005662 rcu_read_unlock_sched();
Juri Lelli7f514122014-09-19 10:22:40 +01005663
5664 }
5665#endif
5666out:
5667 return ret;
5668}
5669
Linus Torvalds1da177e2005-04-16 15:20:36 -07005670#ifdef CONFIG_SMP
Linus Torvalds1da177e2005-04-16 15:20:36 -07005671
Thomas Gleixnere26fbff2016-03-10 12:54:10 +01005672static bool sched_smp_initialized __read_mostly;
5673
Mel Gormane6628d52013-10-07 11:29:02 +01005674#ifdef CONFIG_NUMA_BALANCING
5675/* Migrate current task p to target_cpu */
5676int migrate_task_to(struct task_struct *p, int target_cpu)
5677{
5678 struct migration_arg arg = { p, target_cpu };
5679 int curr_cpu = task_cpu(p);
5680
5681 if (curr_cpu == target_cpu)
5682 return 0;
5683
5684 if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
5685 return -EINVAL;
5686
5687 /* TODO: This is not properly updating schedstats */
5688
Mel Gorman286549d2014-01-21 15:51:03 -08005689 trace_sched_move_numa(p, curr_cpu, target_cpu);
Mel Gormane6628d52013-10-07 11:29:02 +01005690 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
5691}
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01005692
5693/*
5694 * Requeue a task on a given node and accurately track the number of NUMA
5695 * tasks on the runqueues
5696 */
5697void sched_setnuma(struct task_struct *p, int nid)
5698{
Kirill Tkhaida0c1e62014-08-20 13:47:32 +04005699 bool queued, running;
Peter Zijlstraeb580752015-07-31 21:28:18 +02005700 struct rq_flags rf;
5701 struct rq *rq;
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01005702
Peter Zijlstraeb580752015-07-31 21:28:18 +02005703 rq = task_rq_lock(p, &rf);
Kirill Tkhaida0c1e62014-08-20 13:47:32 +04005704 queued = task_on_rq_queued(p);
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01005705 running = task_current(rq, p);
5706
Kirill Tkhaida0c1e62014-08-20 13:47:32 +04005707 if (queued)
Peter Zijlstra1de64442015-09-30 17:44:13 +02005708 dequeue_task(rq, p, DEQUEUE_SAVE);
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01005709 if (running)
Kirill Tkhaif3cd1c42014-09-12 17:41:40 +04005710 put_prev_task(rq, p);
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01005711
5712 p->numa_preferred_nid = nid;
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01005713
Kirill Tkhaida0c1e62014-08-20 13:47:32 +04005714 if (queued)
Peter Zijlstra1de64442015-09-30 17:44:13 +02005715 enqueue_task(rq, p, ENQUEUE_RESTORE);
Vincent Guittota399d232016-09-12 09:47:52 +02005716 if (running)
Peter Zijlstrab2bf6c32016-09-20 22:00:38 +02005717 set_curr_task(rq, p);
Peter Zijlstraeb580752015-07-31 21:28:18 +02005718 task_rq_unlock(rq, p, &rf);
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01005719}
Peter Zijlstra5cc389b2015-06-11 14:46:50 +02005720#endif /* CONFIG_NUMA_BALANCING */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005721
5722#ifdef CONFIG_HOTPLUG_CPU
Ingo Molnar48f24c42006-07-03 00:25:40 -07005723/*
5724 * Ensures that the idle task is using init_mm right before its cpu goes
Linus Torvalds1da177e2005-04-16 15:20:36 -07005725 * offline.
5726 */
5727void idle_task_exit(void)
5728{
5729 struct mm_struct *mm = current->active_mm;
5730
5731 BUG_ON(cpu_online(smp_processor_id()));
5732
Martin Schwidefskya53efe52012-10-26 17:17:44 +02005733 if (mm != &init_mm) {
Andy Lutomirski8a48b7e2017-06-09 11:49:15 -07005734 switch_mm(mm, &init_mm, current);
Martin Schwidefskya53efe52012-10-26 17:17:44 +02005735 finish_arch_post_lock_switch();
5736 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005737 mmdrop(mm);
5738}
5739
Peter Zijlstra48c5cca2010-11-13 19:32:29 +01005740/*
Peter Zijlstra5d180232012-08-20 11:26:57 +02005741 * Since this CPU is going 'away' for a while, fold any nr_active delta
5742 * we might have. Assumes we're called after migrate_tasks() so that the
Thomas Gleixnerd60585c2016-07-12 18:33:56 +02005743 * nr_active count is stable. We need to take the teardown thread which
5744 * is calling this into account, so we hand in adjust = 1 to the load
5745 * calculation.
Peter Zijlstra5d180232012-08-20 11:26:57 +02005746 *
5747 * Also see the comment "Global load-average calculations".
Peter Zijlstra48c5cca2010-11-13 19:32:29 +01005748 */
Peter Zijlstra5d180232012-08-20 11:26:57 +02005749static void calc_load_migrate(struct rq *rq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005750{
Thomas Gleixnerd60585c2016-07-12 18:33:56 +02005751 long delta = calc_load_fold_active(rq, 1);
Peter Zijlstra5d180232012-08-20 11:26:57 +02005752 if (delta)
5753 atomic_long_add(delta, &calc_load_tasks);
Thomas Gleixnerdce48a82009-04-11 10:43:41 +02005754}
Peter Zijlstra48c5cca2010-11-13 19:32:29 +01005755
Peter Zijlstra3f1d2a32014-02-12 10:49:30 +01005756static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
5757{
5758}
5759
5760static const struct sched_class fake_sched_class = {
5761 .put_prev_task = put_prev_task_fake,
5762};
5763
5764static struct task_struct fake_task = {
5765 /*
5766 * Avoid pull_{rt,dl}_task()
5767 */
5768 .prio = MAX_PRIO + 1,
5769 .sched_class = &fake_sched_class,
5770};
5771
Peter Zijlstra48c5cca2010-11-13 19:32:29 +01005772/*
Syed Rameez Mustafac3c39522016-12-07 17:00:27 -08005773 * Remove a task from the runqueue and pretend that it's migrating. This
5774 * should prevent migrations for the detached task and disallow further
5775 * changes to tsk_cpus_allowed.
5776 */
5777static void
5778detach_one_task(struct task_struct *p, struct rq *rq, struct list_head *tasks)
5779{
5780 lockdep_assert_held(&rq->lock);
5781
5782 p->on_rq = TASK_ON_RQ_MIGRATING;
5783 deactivate_task(rq, p, 0);
5784 list_add(&p->se.group_node, tasks);
5785}
5786
5787static void attach_tasks(struct list_head *tasks, struct rq *rq)
5788{
5789 struct task_struct *p;
5790
5791 lockdep_assert_held(&rq->lock);
5792
5793 while (!list_empty(tasks)) {
5794 p = list_first_entry(tasks, struct task_struct, se.group_node);
5795 list_del_init(&p->se.group_node);
5796
5797 BUG_ON(task_rq(p) != rq);
5798 activate_task(rq, p, 0);
5799 p->on_rq = TASK_ON_RQ_QUEUED;
5800 }
5801}
5802
5803/*
Olav Haugan3f2cb302016-05-31 14:34:46 -07005804 * Migrate all tasks from the rq (excluding pinned ones when the argument says so);
5805 * sleeping tasks will be migrated by try_to_wake_up()->select_task_rq().
Peter Zijlstra48c5cca2010-11-13 19:32:29 +01005806 *
5807 * Called with rq->lock held even though we'er in stop_machine() and
5808 * there's no concurrency possible, we hold the required locks anyway
5809 * because of lock validation efforts.
5810 */
Olav Haugan3f2cb302016-05-31 14:34:46 -07005811static void migrate_tasks(struct rq *dead_rq, bool migrate_pinned_tasks)
Peter Zijlstra48c5cca2010-11-13 19:32:29 +01005812{
Peter Zijlstra5e16bbc2015-06-11 14:46:51 +02005813 struct rq *rq = dead_rq;
Peter Zijlstra48c5cca2010-11-13 19:32:29 +01005814 struct task_struct *next, *stop = rq->stop;
Matt Fleming5a91d732016-09-21 14:38:10 +01005815 struct rq_flags rf;
Peter Zijlstra48c5cca2010-11-13 19:32:29 +01005816 int dest_cpu;
Olav Haugan3f2cb302016-05-31 14:34:46 -07005817 unsigned int num_pinned_kthreads = 1; /* this thread */
Syed Rameez Mustafac3c39522016-12-07 17:00:27 -08005818 LIST_HEAD(tasks);
Olav Haugan3f2cb302016-05-31 14:34:46 -07005819 cpumask_t avail_cpus;
5820
5821 cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask);
Peter Zijlstra48c5cca2010-11-13 19:32:29 +01005822
5823 /*
5824 * Fudge the rq selection such that the below task selection loop
5825 * doesn't get stuck on the currently eligible stop task.
5826 *
5827 * We're currently inside stop_machine() and the rq is either stuck
5828 * in the stop_machine_cpu_stop() loop, or we're executing this code,
5829 * either way we should never end up calling schedule() until we're
5830 * done here.
5831 */
5832 rq->stop = NULL;
5833
Frederic Weisbecker77bd3972013-04-12 01:50:58 +02005834 /*
5835 * put_prev_task() and pick_next_task() sched
5836 * class method both need to have an up-to-date
5837 * value of rq->clock[_task]
5838 */
5839 update_rq_clock(rq);
5840
Peter Zijlstra5e16bbc2015-06-11 14:46:51 +02005841 for (;;) {
Peter Zijlstra48c5cca2010-11-13 19:32:29 +01005842 /*
Syed Rameez Mustafac3c39522016-12-07 17:00:27 -08005843		 * There's this thread running; bail when that's the only
5844 * remaining thread.
Peter Zijlstra48c5cca2010-11-13 19:32:29 +01005845 */
Syed Rameez Mustafac3c39522016-12-07 17:00:27 -08005846 if (rq->nr_running == 1)
Peter Zijlstra48c5cca2010-11-13 19:32:29 +01005847 break;
5848
Peter Zijlstracbce1a62015-06-11 14:46:54 +02005849 /*
Wanpeng Li5473e0cc2015-08-28 14:55:56 +08005850 * pick_next_task assumes pinned rq->lock.
Peter Zijlstracbce1a62015-06-11 14:46:54 +02005851 */
Matt Fleming5a91d732016-09-21 14:38:10 +01005852 rq_pin_lock(rq, &rf);
5853 next = pick_next_task(rq, &fake_task, &rf);
Peter Zijlstra48c5cca2010-11-13 19:32:29 +01005854 BUG_ON(!next);
5855 next->sched_class->put_prev_task(rq, next);
5856
Olav Haugan3f2cb302016-05-31 14:34:46 -07005857 if (!migrate_pinned_tasks && next->flags & PF_KTHREAD &&
5858 !cpumask_intersects(&avail_cpus, &next->cpus_allowed)) {
Syed Rameez Mustafac3c39522016-12-07 17:00:27 -08005859 detach_one_task(next, rq, &tasks);
Olav Haugan3f2cb302016-05-31 14:34:46 -07005860 num_pinned_kthreads += 1;
jianzhoub82a5df2019-04-28 13:43:53 +08005861 rq_unpin_lock(rq, &rf);
Olav Haugan3f2cb302016-05-31 14:34:46 -07005862 continue;
5863 }
5864
Wanpeng Li5473e0cc2015-08-28 14:55:56 +08005865 /*
5866 * Rules for changing task_struct::cpus_allowed are holding
5867 * both pi_lock and rq->lock, such that holding either
5868 * stabilizes the mask.
5869 *
 5870		 * Dropping rq->lock is not quite as disastrous as it usually is
5871 * because !cpu_active at this point, which means load-balance
5872 * will not interfere. Also, stop-machine.
5873 */
Matt Fleming5a91d732016-09-21 14:38:10 +01005874 rq_unpin_lock(rq, &rf);
Wanpeng Li5473e0cc2015-08-28 14:55:56 +08005875 raw_spin_unlock(&rq->lock);
5876 raw_spin_lock(&next->pi_lock);
5877 raw_spin_lock(&rq->lock);
5878
5879 /*
5880 * Since we're inside stop-machine, _nothing_ should have
5881 * changed the task, WARN if weird stuff happened, because in
5882 * that case the above rq->lock drop is a fail too.
Olav Haugan78bafee2016-11-08 17:06:21 -08005883 * However, during cpu isolation the load balancer might have
5884 * interferred since we don't stop all CPUs. Ignore warning for
5885 * this case.
Wanpeng Li5473e0cc2015-08-28 14:55:56 +08005886 */
Vikram Mulukutla5fd8c052016-11-11 16:05:24 -08005887 if (task_rq(next) != rq || !task_on_rq_queued(next)) {
5888 WARN_ON(migrate_pinned_tasks);
Wanpeng Li5473e0cc2015-08-28 14:55:56 +08005889 raw_spin_unlock(&next->pi_lock);
5890 continue;
5891 }
5892
Peter Zijlstra48c5cca2010-11-13 19:32:29 +01005893 /* Find suitable destination for @next, with force if needed. */
Olav Haugan3f2cb302016-05-31 14:34:46 -07005894 dest_cpu = select_fallback_rq(dead_rq->cpu, next, false);
Peter Zijlstra48c5cca2010-11-13 19:32:29 +01005895
Peter Zijlstra5e16bbc2015-06-11 14:46:51 +02005896 rq = __migrate_task(rq, next, dest_cpu);
5897 if (rq != dead_rq) {
5898 raw_spin_unlock(&rq->lock);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07005899 raw_spin_unlock(&next->pi_lock);
Peter Zijlstra5e16bbc2015-06-11 14:46:51 +02005900 rq = dead_rq;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07005901 raw_spin_lock(&next->pi_lock);
Peter Zijlstra5e16bbc2015-06-11 14:46:51 +02005902 raw_spin_lock(&rq->lock);
5903 }
Wanpeng Li5473e0cc2015-08-28 14:55:56 +08005904 raw_spin_unlock(&next->pi_lock);
Peter Zijlstra48c5cca2010-11-13 19:32:29 +01005905 }
5906
5907 rq->stop = stop;
Syed Rameez Mustafac3c39522016-12-07 17:00:27 -08005908
5909 if (num_pinned_kthreads > 1)
5910 attach_tasks(&tasks, rq);
Peter Zijlstra48c5cca2010-11-13 19:32:29 +01005911}
Olav Haugan3f2cb302016-05-31 14:34:46 -07005912
5913static void set_rq_online(struct rq *rq);
5914static void set_rq_offline(struct rq *rq);
5915
5916int do_isolation_work_cpu_stop(void *data)
5917{
Olav Haugan3f2cb302016-05-31 14:34:46 -07005918 unsigned int cpu = smp_processor_id();
5919 struct rq *rq = cpu_rq(cpu);
5920
5921 watchdog_disable(cpu);
5922
5923 irq_migrate_all_off_this_cpu();
5924
Olav Haugand67250b2016-11-01 17:30:36 -07005925 local_irq_disable();
5926
Olav Haugan3f2cb302016-05-31 14:34:46 -07005927 sched_ttwu_pending();
Olav Haugand67250b2016-11-01 17:30:36 -07005928
Olav Haugand67250b2016-11-01 17:30:36 -07005929 raw_spin_lock(&rq->lock);
Olav Haugan3f2cb302016-05-31 14:34:46 -07005930
Olav Hauganbf6b7fa2016-11-03 13:39:23 -07005931 /*
5932 * Temporarily mark the rq as offline. This will allow us to
5933 * move tasks off the CPU.
5934 */
Olav Haugan3f2cb302016-05-31 14:34:46 -07005935 if (rq->rd) {
5936 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5937 set_rq_offline(rq);
5938 }
5939
5940 migrate_tasks(rq, false);
Olav Hauganbf6b7fa2016-11-03 13:39:23 -07005941
5942 if (rq->rd)
5943 set_rq_online(rq);
Olav Haugand67250b2016-11-01 17:30:36 -07005944 raw_spin_unlock(&rq->lock);
Olav Haugan3f2cb302016-05-31 14:34:46 -07005945
Pavankumar Kondeti84f72d72017-07-20 11:00:45 +05305946 clear_walt_request(cpu);
Olav Haugand67250b2016-11-01 17:30:36 -07005947 local_irq_enable();
Olav Haugan3f2cb302016-05-31 14:34:46 -07005948 return 0;
5949}
5950
5951int do_unisolation_work_cpu_stop(void *data)
5952{
5953 watchdog_enable(smp_processor_id());
5954 return 0;
5955}
5956
5957static void init_sched_groups_capacity(int cpu, struct sched_domain *sd);
5958
5959static void sched_update_group_capacities(int cpu)
5960{
5961 struct sched_domain *sd;
5962
5963 mutex_lock(&sched_domains_mutex);
5964 rcu_read_lock();
5965
5966 for_each_domain(cpu, sd) {
5967 int balance_cpu = group_balance_cpu(sd->groups);
5968
5969 init_sched_groups_capacity(cpu, sd);
5970 /*
5971 * Need to ensure this is also called with balancing
5972 * cpu.
5973 */
5974 if (cpu != balance_cpu)
5975 init_sched_groups_capacity(balance_cpu, sd);
5976 }
5977
5978 rcu_read_unlock();
5979 mutex_unlock(&sched_domains_mutex);
5980}
5981
5982static unsigned int cpu_isolation_vote[NR_CPUS];
5983
5984int sched_isolate_count(const cpumask_t *mask, bool include_offline)
5985{
5986 cpumask_t count_mask = CPU_MASK_NONE;
5987
5988 if (include_offline) {
5989 cpumask_complement(&count_mask, cpu_online_mask);
5990 cpumask_or(&count_mask, &count_mask, cpu_isolated_mask);
5991 cpumask_and(&count_mask, &count_mask, mask);
5992 } else {
5993 cpumask_and(&count_mask, mask, cpu_isolated_mask);
5994 }
5995
5996 return cpumask_weight(&count_mask);
5997}
5998
5999/*
6000 * 1) CPU is isolated and cpu is offlined:
6001 * Unisolate the core.
6002 * 2) CPU is not isolated and CPU is offlined:
6003 * No action taken.
 6004 * 3) CPU is offline and request to isolate:
 6005 *	Request ignored.
 6006 * 4) CPU is offline and isolated:
 6007 *	Not a possible state.
 6008 * 5) CPU is online and request to isolate:
 6009 *	Normal case: Isolate the CPU.
 6010 * 6) CPU is not isolated and comes back online:
6011 * Nothing to do
6012 *
 6013 * Note: The client calling sched_isolate_cpu() is responsible for ONLY
 6014 * calling sched_unisolate_cpu() on a CPU that the client previously isolated.
 6015 * The client is also responsible for unisolating when a core goes offline
6016 * (after CPU is marked offline).
6017 */
6018int sched_isolate_cpu(int cpu)
6019{
6020 struct rq *rq = cpu_rq(cpu);
6021 cpumask_t avail_cpus;
6022 int ret_code = 0;
Olav Haugan7763f342017-01-13 17:04:36 -08006023 u64 start_time = 0;
Olav Haugan39432ea2016-06-12 13:57:05 -07006024
6025 if (trace_sched_isolate_enabled())
6026 start_time = sched_clock();
Olav Haugan3f2cb302016-05-31 14:34:46 -07006027
Olav Haugan1e8a44c2016-11-17 18:31:33 -08006028 cpu_maps_update_begin();
Olav Haugan3f2cb302016-05-31 14:34:46 -07006029
6030 cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask);
6031
6032 /* We cannot isolate ALL cpus in the system */
6033 if (cpumask_weight(&avail_cpus) == 1) {
6034 ret_code = -EINVAL;
6035 goto out;
6036 }
6037
6038 if (!cpu_online(cpu)) {
6039 ret_code = -EINVAL;
6040 goto out;
6041 }
6042
6043 if (++cpu_isolation_vote[cpu] > 1)
6044 goto out;
6045
Olav Haugan11113472016-11-03 15:10:57 -07006046 /*
6047 * There is a race between watchdog being enabled by hotplug and
6048 * core isolation disabling the watchdog. When a CPU is hotplugged in
6049	 * and the hotplug lock has been released, the watchdog thread might
6050 * not have run yet to enable the watchdog.
6051 * We have to wait for the watchdog to be enabled before proceeding.
6052 */
6053 if (!watchdog_configured(cpu)) {
6054 msleep(20);
6055 if (!watchdog_configured(cpu)) {
6056 --cpu_isolation_vote[cpu];
6057 ret_code = -EBUSY;
6058 goto out;
6059 }
6060 }
6061
Olav Haugan3f2cb302016-05-31 14:34:46 -07006062 set_cpu_isolated(cpu, true);
6063 cpumask_clear_cpu(cpu, &avail_cpus);
6064
6065 /* Migrate timers */
6066 smp_call_function_any(&avail_cpus, hrtimer_quiesce_cpu, &cpu, 1);
6067 smp_call_function_any(&avail_cpus, timer_quiesce_cpu, &cpu, 1);
6068
Olav Haugan3f2cb302016-05-31 14:34:46 -07006069 stop_cpus(cpumask_of(cpu), do_isolation_work_cpu_stop, 0);
6070
Olav Haugan3f2cb302016-05-31 14:34:46 -07006071 calc_load_migrate(rq);
6072 update_max_interval();
6073 sched_update_group_capacities(cpu);
6074
6075out:
Olav Haugan1e8a44c2016-11-17 18:31:33 -08006076 cpu_maps_update_done();
Olav Haugan39432ea2016-06-12 13:57:05 -07006077 trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0],
6078 start_time, 1);
Olav Haugan3f2cb302016-05-31 14:34:46 -07006079 return ret_code;
6080}
6081
6082/*
6083 * Note: The client calling sched_isolate_cpu() is responsible for ONLY
6084 * calling sched_unisolate_cpu() on a CPU that the client previously isolated.
6085 * Client is also responsible for unisolating when a core goes offline
6086 * (after CPU is marked offline).
6087 */
6088int sched_unisolate_cpu_unlocked(int cpu)
6089{
6090 int ret_code = 0;
6091 struct rq *rq = cpu_rq(cpu);
Olav Haugan7763f342017-01-13 17:04:36 -08006092 u64 start_time = 0;
Olav Haugan39432ea2016-06-12 13:57:05 -07006093
6094 if (trace_sched_isolate_enabled())
6095 start_time = sched_clock();
Olav Haugan3f2cb302016-05-31 14:34:46 -07006096
Olav Haugan3f2cb302016-05-31 14:34:46 -07006097 if (!cpu_isolation_vote[cpu]) {
6098 ret_code = -EINVAL;
6099 goto out;
6100 }
6101
6102 if (--cpu_isolation_vote[cpu])
6103 goto out;
6104
6105 if (cpu_online(cpu)) {
6106 unsigned long flags;
6107
6108 raw_spin_lock_irqsave(&rq->lock, flags);
6109 rq->age_stamp = sched_clock_cpu(cpu);
Olav Haugan3f2cb302016-05-31 14:34:46 -07006110 raw_spin_unlock_irqrestore(&rq->lock, flags);
6111 }
6112
6113 set_cpu_isolated(cpu, false);
6114 update_max_interval();
6115 sched_update_group_capacities(cpu);
6116
6117 if (cpu_online(cpu)) {
6118 stop_cpus(cpumask_of(cpu), do_unisolation_work_cpu_stop, 0);
6119
6120 /* Kick CPU to immediately do load balancing */
6121 if (!test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
6122 smp_send_reschedule(cpu);
6123 }
6124
6125out:
Olav Haugan39432ea2016-06-12 13:57:05 -07006126 trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0],
6127 start_time, 0);
Olav Haugan3f2cb302016-05-31 14:34:46 -07006128 return ret_code;
6129}
6130
6131int sched_unisolate_cpu(int cpu)
6132{
6133 int ret_code;
6134
Olav Haugan1e8a44c2016-11-17 18:31:33 -08006135 cpu_maps_update_begin();
Olav Haugan3f2cb302016-05-31 14:34:46 -07006136 ret_code = sched_unisolate_cpu_unlocked(cpu);
Olav Haugan1e8a44c2016-11-17 18:31:33 -08006137 cpu_maps_update_done();
Olav Haugan3f2cb302016-05-31 14:34:46 -07006138 return ret_code;
6139}
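/*
 * Illustrative pairing sketch (an assumption about client usage, not code
 * from this file; do_exclusive_work() is a placeholder): isolation requests
 * are reference counted per CPU through cpu_isolation_vote[], so each
 * successful sched_isolate_cpu() must be matched by exactly one
 * sched_unisolate_cpu() from the same client:
 *
 *	if (!sched_isolate_cpu(cpu)) {
 *		do_exclusive_work();
 *		sched_unisolate_cpu(cpu);
 *	}
 */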
6140
Linus Torvalds1da177e2005-04-16 15:20:36 -07006141#endif /* CONFIG_HOTPLUG_CPU */
6142
Gregory Haskins1f11eb62008-06-04 15:04:05 -04006143static void set_rq_online(struct rq *rq)
6144{
6145 if (!rq->online) {
6146 const struct sched_class *class;
6147
Rusty Russellc6c49272008-11-25 02:35:05 +10306148 cpumask_set_cpu(rq->cpu, rq->rd->online);
Gregory Haskins1f11eb62008-06-04 15:04:05 -04006149 rq->online = 1;
6150
6151 for_each_class(class) {
6152 if (class->rq_online)
6153 class->rq_online(rq);
6154 }
6155 }
6156}
6157
6158static void set_rq_offline(struct rq *rq)
6159{
6160 if (rq->online) {
6161 const struct sched_class *class;
6162
6163 for_each_class(class) {
6164 if (class->rq_offline)
6165 class->rq_offline(rq);
6166 }
6167
Rusty Russellc6c49272008-11-25 02:35:05 +10306168 cpumask_clear_cpu(rq->cpu, rq->rd->online);
Gregory Haskins1f11eb62008-06-04 15:04:05 -04006169 rq->online = 0;
6170 }
6171}
6172
Thomas Gleixner9cf72432016-03-10 12:54:09 +01006173static void set_cpu_rq_start_time(unsigned int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006174{
Tejun Heo969c7922010-05-06 18:49:21 +02006175 struct rq *rq = cpu_rq(cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006176
Corey Minyarda803f022014-05-08 13:47:39 -05006177 rq->age_stamp = sched_clock_cpu(cpu);
6178}
6179
Peter Zijlstra4cb98832011-04-07 14:09:58 +02006180static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
6181
Ingo Molnar3e9830d2007-10-15 17:00:13 +02006182#ifdef CONFIG_SCHED_DEBUG
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006183
Peter Zijlstrad039ac62012-05-31 21:20:16 +02006184static __read_mostly int sched_debug_enabled;
Mike Travisf6630112009-11-17 18:22:15 -06006185
Peter Zijlstrad039ac62012-05-31 21:20:16 +02006186static int __init sched_debug_setup(char *str)
Mike Travisf6630112009-11-17 18:22:15 -06006187{
Peter Zijlstrad039ac62012-05-31 21:20:16 +02006188 sched_debug_enabled = 1;
Mike Travisf6630112009-11-17 18:22:15 -06006189
6190 return 0;
6191}
Peter Zijlstrad039ac62012-05-31 21:20:16 +02006192early_param("sched_debug", sched_debug_setup);
6193
6194static inline bool sched_debug(void)
6195{
6196 return sched_debug_enabled;
6197}
Mike Travisf6630112009-11-17 18:22:15 -06006198
Mike Travis7c16ec52008-04-04 18:11:11 -07006199static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
Rusty Russell96f874e2008-11-25 02:35:14 +10306200 struct cpumask *groupmask)
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006201{
6202 struct sched_group *group = sd->groups;
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006203
Rusty Russell96f874e2008-11-25 02:35:14 +10306204 cpumask_clear(groupmask);
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006205
6206 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
6207
6208 if (!(sd->flags & SD_LOAD_BALANCE)) {
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006209 printk("does not load-balance\n");
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006210 return -1;
6211 }
6212
Tejun Heo333470e2015-02-13 14:37:28 -08006213 printk(KERN_CONT "span %*pbl level %s\n",
6214 cpumask_pr_args(sched_domain_span(sd)), sd->name);
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006215
Rusty Russell758b2cd2008-11-25 02:35:04 +10306216 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006217 printk(KERN_ERR "ERROR: domain->span does not contain "
6218 "CPU%d\n", cpu);
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006219 }
Rusty Russell758b2cd2008-11-25 02:35:04 +10306220 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006221 printk(KERN_ERR "ERROR: domain->groups does not contain"
6222 " CPU%d\n", cpu);
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006223 }
6224
6225 printk(KERN_DEBUG "%*s groups:", level + 1, "");
6226 do {
6227 if (!group) {
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006228 printk("\n");
6229 printk(KERN_ERR "ERROR: group is NULL\n");
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006230 break;
6231 }
6232
Rusty Russell758b2cd2008-11-25 02:35:04 +10306233 if (!cpumask_weight(sched_group_cpus(group))) {
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006234 printk(KERN_CONT "\n");
6235 printk(KERN_ERR "ERROR: empty group\n");
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006236 break;
6237 }
6238
Peter Zijlstracb83b622012-04-17 15:49:36 +02006239 if (!(sd->flags & SD_OVERLAP) &&
6240 cpumask_intersects(groupmask, sched_group_cpus(group))) {
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006241 printk(KERN_CONT "\n");
6242 printk(KERN_ERR "ERROR: repeated CPUs\n");
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006243 break;
6244 }
6245
Rusty Russell758b2cd2008-11-25 02:35:04 +10306246 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006247
Tejun Heo333470e2015-02-13 14:37:28 -08006248 printk(KERN_CONT " %*pbl",
6249 cpumask_pr_args(sched_group_cpus(group)));
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04006250 if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
Morten Rasmussen5cdeb5f2016-02-25 12:43:49 +00006251 printk(KERN_CONT " (cpu_capacity = %lu)",
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04006252 group->sgc->capacity);
Gautham R Shenoy381512c2009-04-14 09:09:36 +05306253 }
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006254
6255 group = group->next;
6256 } while (group != sd->groups);
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006257 printk(KERN_CONT "\n");
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006258
Rusty Russell758b2cd2008-11-25 02:35:04 +10306259 if (!cpumask_equal(sched_domain_span(sd), groupmask))
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006260 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006261
Rusty Russell758b2cd2008-11-25 02:35:04 +10306262 if (sd->parent &&
6263 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006264 printk(KERN_ERR "ERROR: parent span is not a superset "
6265 "of domain->span\n");
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006266 return 0;
6267}
6268
Linus Torvalds1da177e2005-04-16 15:20:36 -07006269static void sched_domain_debug(struct sched_domain *sd, int cpu)
6270{
6271 int level = 0;
6272
Peter Zijlstrad039ac62012-05-31 21:20:16 +02006273 if (!sched_debug_enabled)
Mike Travisf6630112009-11-17 18:22:15 -06006274 return;
6275
Nick Piggin41c7ce92005-06-25 14:57:24 -07006276 if (!sd) {
6277 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
6278 return;
6279 }
6280
Linus Torvalds1da177e2005-04-16 15:20:36 -07006281 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6282
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006283 for (;;) {
Peter Zijlstra4cb98832011-04-07 14:09:58 +02006284 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
Linus Torvalds1da177e2005-04-16 15:20:36 -07006285 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006286 level++;
6287 sd = sd->parent;
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08006288 if (!sd)
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006289 break;
6290 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006291}
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02006292#else /* !CONFIG_SCHED_DEBUG */
Peter Zijlstraa18a5792016-09-20 11:05:31 +02006293
6294# define sched_debug_enabled 0
Ingo Molnar48f24c42006-07-03 00:25:40 -07006295# define sched_domain_debug(sd, cpu) do { } while (0)
Peter Zijlstrad039ac62012-05-31 21:20:16 +02006296static inline bool sched_debug(void)
6297{
6298 return false;
6299}
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02006300#endif /* CONFIG_SCHED_DEBUG */
Linus Torvalds1da177e2005-04-16 15:20:36 -07006301
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006302static int sd_degenerate(struct sched_domain *sd)
Suresh Siddha245af2c2005-06-25 14:57:25 -07006303{
Dietmar Eggemann06654992015-07-30 16:53:30 +01006304 if (cpumask_weight(sched_domain_span(sd)) == 1) {
6305 if (sd->groups->sge)
6306 sd->flags &= ~SD_LOAD_BALANCE;
6307 else
6308 return 1;
6309 }
Suresh Siddha245af2c2005-06-25 14:57:25 -07006310
6311 /* Following flags need at least 2 groups */
6312 if (sd->flags & (SD_LOAD_BALANCE |
6313 SD_BALANCE_NEWIDLE |
6314 SD_BALANCE_FORK |
Siddha, Suresh B89c47102006-10-03 01:14:09 -07006315 SD_BALANCE_EXEC |
Nicolas Pitre5d4dfdd2014-05-27 13:50:41 -04006316 SD_SHARE_CPUCAPACITY |
Morten Rasmussen1f6e6c72016-07-25 14:34:22 +01006317 SD_ASYM_CPUCAPACITY |
Vincent Guittotd77b3ed2014-04-11 11:44:40 +02006318 SD_SHARE_PKG_RESOURCES |
Morten Rasmussen858d7182015-01-13 13:50:46 +00006319 SD_SHARE_POWERDOMAIN |
6320 SD_SHARE_CAP_STATES)) {
Suresh Siddha245af2c2005-06-25 14:57:25 -07006321 if (sd->groups != sd->groups->next)
6322 return 0;
6323 }
6324
6325 /* Following flags don't use groups */
Peter Zijlstrac88d5912009-09-10 13:50:02 +02006326 if (sd->flags & (SD_WAKE_AFFINE))
Suresh Siddha245af2c2005-06-25 14:57:25 -07006327 return 0;
6328
6329 return 1;
6330}
6331
Ingo Molnar48f24c42006-07-03 00:25:40 -07006332static int
6333sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
Suresh Siddha245af2c2005-06-25 14:57:25 -07006334{
6335 unsigned long cflags = sd->flags, pflags = parent->flags;
6336
6337 if (sd_degenerate(parent))
6338 return 1;
6339
Rusty Russell758b2cd2008-11-25 02:35:04 +10306340 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
Suresh Siddha245af2c2005-06-25 14:57:25 -07006341 return 0;
6342
Suresh Siddha245af2c2005-06-25 14:57:25 -07006343 /* Flags needing groups don't count if only 1 group in parent */
6344 if (parent->groups == parent->groups->next) {
6345 pflags &= ~(SD_LOAD_BALANCE |
6346 SD_BALANCE_NEWIDLE |
6347 SD_BALANCE_FORK |
Siddha, Suresh B89c47102006-10-03 01:14:09 -07006348 SD_BALANCE_EXEC |
Morten Rasmussen1f6e6c72016-07-25 14:34:22 +01006349 SD_ASYM_CPUCAPACITY |
Nicolas Pitre5d4dfdd2014-05-27 13:50:41 -04006350 SD_SHARE_CPUCAPACITY |
Peter Zijlstra10866e622013-08-19 16:57:04 +02006351 SD_SHARE_PKG_RESOURCES |
Vincent Guittotd77b3ed2014-04-11 11:44:40 +02006352 SD_PREFER_SIBLING |
Morten Rasmussen858d7182015-01-13 13:50:46 +00006353 SD_SHARE_POWERDOMAIN |
6354 SD_SHARE_CAP_STATES);
Dietmar Eggemann06654992015-07-30 16:53:30 +01006355 if (parent->groups->sge) {
6356 parent->flags &= ~SD_LOAD_BALANCE;
6357 return 0;
6358 }
Ken Chen54364992008-12-07 18:47:37 -08006359 if (nr_node_ids == 1)
6360 pflags &= ~SD_SERIALIZE;
Suresh Siddha245af2c2005-06-25 14:57:25 -07006361 }
6362 if (~cflags & pflags)
6363 return 0;
6364
6365 return 1;
6366}
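/*
 * Worked example (illustrative values) for the "~cflags & pflags" test
 * above: with cflags = SD_LOAD_BALANCE and pflags = SD_LOAD_BALANCE |
 * SD_SERIALIZE, (~cflags & pflags) still has SD_SERIALIZE set, i.e. the
 * parent provides a flag the child lacks, so sd_parent_degenerate()
 * returns 0 and the parent domain is kept.
 */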
6367
Peter Zijlstradce840a2011-04-07 14:09:50 +02006368static void free_rootdomain(struct rcu_head *rcu)
Rusty Russellc6c49272008-11-25 02:35:05 +10306369{
Peter Zijlstradce840a2011-04-07 14:09:50 +02006370 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
Peter Zijlstra047106a2009-11-16 10:28:09 +01006371
Rusty Russell68e74562008-11-25 02:35:13 +10306372 cpupri_cleanup(&rd->cpupri);
Juri Lelli6bfd6d72013-11-07 14:43:47 +01006373 cpudl_cleanup(&rd->cpudl);
Juri Lelli1baca4c2013-11-07 14:43:38 +01006374 free_cpumask_var(rd->dlo_mask);
Rusty Russellc6c49272008-11-25 02:35:05 +10306375 free_cpumask_var(rd->rto_mask);
6376 free_cpumask_var(rd->online);
6377 free_cpumask_var(rd->span);
6378 kfree(rd);
6379}
6380
Gregory Haskins57d885f2008-01-25 21:08:18 +01006381static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6382{
Ingo Molnara0490fa2009-02-12 11:35:40 +01006383 struct root_domain *old_rd = NULL;
Gregory Haskins57d885f2008-01-25 21:08:18 +01006384 unsigned long flags;
Gregory Haskins57d885f2008-01-25 21:08:18 +01006385
Thomas Gleixner05fa7852009-11-17 14:28:38 +01006386 raw_spin_lock_irqsave(&rq->lock, flags);
Gregory Haskins57d885f2008-01-25 21:08:18 +01006387
6388 if (rq->rd) {
Ingo Molnara0490fa2009-02-12 11:35:40 +01006389 old_rd = rq->rd;
Gregory Haskins57d885f2008-01-25 21:08:18 +01006390
Rusty Russellc6c49272008-11-25 02:35:05 +10306391 if (cpumask_test_cpu(rq->cpu, old_rd->online))
Gregory Haskins1f11eb62008-06-04 15:04:05 -04006392 set_rq_offline(rq);
Gregory Haskins57d885f2008-01-25 21:08:18 +01006393
Rusty Russellc6c49272008-11-25 02:35:05 +10306394 cpumask_clear_cpu(rq->cpu, old_rd->span);
Gregory Haskinsdc938522008-01-25 21:08:26 +01006395
Ingo Molnara0490fa2009-02-12 11:35:40 +01006396 /*
Shigeru Yoshida05159732013-11-17 12:12:36 +09006397	 * If we don't want to free the old_rd yet then
Ingo Molnara0490fa2009-02-12 11:35:40 +01006398 * set old_rd to NULL to skip the freeing later
6399 * in this function:
6400 */
6401 if (!atomic_dec_and_test(&old_rd->refcount))
6402 old_rd = NULL;
Gregory Haskins57d885f2008-01-25 21:08:18 +01006403 }
6404
6405 atomic_inc(&rd->refcount);
6406 rq->rd = rd;
6407
Rusty Russellc6c49272008-11-25 02:35:05 +10306408 cpumask_set_cpu(rq->cpu, rd->span);
Gregory Haskins00aec932009-07-30 10:57:23 -04006409 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
Gregory Haskins1f11eb62008-06-04 15:04:05 -04006410 set_rq_online(rq);
Gregory Haskins57d885f2008-01-25 21:08:18 +01006411
Thomas Gleixner05fa7852009-11-17 14:28:38 +01006412 raw_spin_unlock_irqrestore(&rq->lock, flags);
Ingo Molnara0490fa2009-02-12 11:35:40 +01006413
6414 if (old_rd)
Peter Zijlstradce840a2011-04-07 14:09:50 +02006415 call_rcu_sched(&old_rd->rcu, free_rootdomain);
Gregory Haskins57d885f2008-01-25 21:08:18 +01006416}
6417
Steven Rostedt (VMware)a384e542018-01-23 20:45:38 -05006418void sched_get_rd(struct root_domain *rd)
6419{
6420 atomic_inc(&rd->refcount);
6421}
6422
6423void sched_put_rd(struct root_domain *rd)
6424{
6425 if (!atomic_dec_and_test(&rd->refcount))
6426 return;
6427
6428 call_rcu_sched(&rd->rcu, free_rootdomain);
6429}
6430
Pekka Enberg68c38fc2010-07-15 23:18:22 +03006431static int init_rootdomain(struct root_domain *rd)
Gregory Haskins57d885f2008-01-25 21:08:18 +01006432{
6433 memset(rd, 0, sizeof(*rd));
6434
Xunlei Pang8295c692015-12-02 19:52:59 +08006435 if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
Li Zefan0c910d22009-01-06 17:39:06 +08006436 goto out;
Xunlei Pang8295c692015-12-02 19:52:59 +08006437 if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
Rusty Russellc6c49272008-11-25 02:35:05 +10306438 goto free_span;
Xunlei Pang8295c692015-12-02 19:52:59 +08006439 if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
Rusty Russellc6c49272008-11-25 02:35:05 +10306440 goto free_online;
Xunlei Pang8295c692015-12-02 19:52:59 +08006441 if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
Juri Lelli1baca4c2013-11-07 14:43:38 +01006442 goto free_dlo_mask;
Gregory Haskins6e0534f2008-05-12 21:21:01 +02006443
Steven Rostedt (Red Hat)1c37ff72017-10-06 14:05:04 -04006444#ifdef HAVE_RT_PUSH_IPI
6445 rd->rto_cpu = -1;
6446 raw_spin_lock_init(&rd->rto_lock);
6447 init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
6448#endif
6449
Dario Faggioli332ac172013-11-07 14:43:45 +01006450 init_dl_bw(&rd->dl_bw);
Juri Lelli6bfd6d72013-11-07 14:43:47 +01006451 if (cpudl_init(&rd->cpudl) != 0)
6452 goto free_dlo_mask;
Dario Faggioli332ac172013-11-07 14:43:45 +01006453
Pekka Enberg68c38fc2010-07-15 23:18:22 +03006454 if (cpupri_init(&rd->cpupri) != 0)
Rusty Russell68e74562008-11-25 02:35:13 +10306455 goto free_rto_mask;
Dietmar Eggemannbbb138b2015-09-26 18:19:54 +01006456
6457 init_max_cpu_capacity(&rd->max_cpu_capacity);
Dietmar Eggemann14774e72017-01-08 16:16:59 +00006458
6459 rd->max_cap_orig_cpu = rd->min_cap_orig_cpu = -1;
6460
Rusty Russellc6c49272008-11-25 02:35:05 +10306461 return 0;
6462
Rusty Russell68e74562008-11-25 02:35:13 +10306463free_rto_mask:
6464 free_cpumask_var(rd->rto_mask);
Juri Lelli1baca4c2013-11-07 14:43:38 +01006465free_dlo_mask:
6466 free_cpumask_var(rd->dlo_mask);
Rusty Russellc6c49272008-11-25 02:35:05 +10306467free_online:
6468 free_cpumask_var(rd->online);
6469free_span:
6470 free_cpumask_var(rd->span);
Li Zefan0c910d22009-01-06 17:39:06 +08006471out:
Rusty Russellc6c49272008-11-25 02:35:05 +10306472 return -ENOMEM;
Gregory Haskins57d885f2008-01-25 21:08:18 +01006473}
6474
Peter Zijlstra029632f2011-10-25 10:00:11 +02006475/*
6476 * By default the system creates a single root-domain with all cpus as
6477 * members (mimicking the global state we have today).
6478 */
6479struct root_domain def_root_domain;
6480
Gregory Haskins57d885f2008-01-25 21:08:18 +01006481static void init_defrootdomain(void)
6482{
Pekka Enberg68c38fc2010-07-15 23:18:22 +03006483 init_rootdomain(&def_root_domain);
Rusty Russellc6c49272008-11-25 02:35:05 +10306484
Gregory Haskins57d885f2008-01-25 21:08:18 +01006485 atomic_set(&def_root_domain.refcount, 1);
6486}
6487
Gregory Haskinsdc938522008-01-25 21:08:26 +01006488static struct root_domain *alloc_rootdomain(void)
Gregory Haskins57d885f2008-01-25 21:08:18 +01006489{
6490 struct root_domain *rd;
6491
6492 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
6493 if (!rd)
6494 return NULL;
6495
Pekka Enberg68c38fc2010-07-15 23:18:22 +03006496 if (init_rootdomain(rd) != 0) {
Rusty Russellc6c49272008-11-25 02:35:05 +10306497 kfree(rd);
6498 return NULL;
6499 }
Gregory Haskins57d885f2008-01-25 21:08:18 +01006500
6501 return rd;
6502}
6503
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04006504static void free_sched_groups(struct sched_group *sg, int free_sgc)
Peter Zijlstrae3589f62011-07-15 10:35:52 +02006505{
6506 struct sched_group *tmp, *first;
6507
6508 if (!sg)
6509 return;
6510
6511 first = sg;
6512 do {
6513 tmp = sg->next;
6514
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04006515 if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
6516 kfree(sg->sgc);
Peter Zijlstrae3589f62011-07-15 10:35:52 +02006517
6518 kfree(sg);
6519 sg = tmp;
6520 } while (sg != first);
6521}
6522
Peter Zijlstra16f3ef42016-05-09 10:37:57 +02006523static void destroy_sched_domain(struct sched_domain *sd)
Peter Zijlstradce840a2011-04-07 14:09:50 +02006524{
Peter Zijlstrae3589f62011-07-15 10:35:52 +02006525 /*
6526 * If its an overlapping domain it has private groups, iterate and
6527 * nuke them all.
6528 */
6529 if (sd->flags & SD_OVERLAP) {
6530 free_sched_groups(sd->groups, 1);
6531 } else if (atomic_dec_and_test(&sd->groups->ref)) {
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04006532 kfree(sd->groups->sgc);
Peter Zijlstradce840a2011-04-07 14:09:50 +02006533 kfree(sd->groups);
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02006534 }
Peter Zijlstra24fc7ed2016-05-09 10:37:59 +02006535 if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
6536 kfree(sd->shared);
Peter Zijlstradce840a2011-04-07 14:09:50 +02006537 kfree(sd);
6538}
6539
Peter Zijlstra16f3ef42016-05-09 10:37:57 +02006540static void destroy_sched_domains_rcu(struct rcu_head *rcu)
Peter Zijlstradce840a2011-04-07 14:09:50 +02006541{
Peter Zijlstra16f3ef42016-05-09 10:37:57 +02006542 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
6543
6544 while (sd) {
6545 struct sched_domain *parent = sd->parent;
6546 destroy_sched_domain(sd);
6547 sd = parent;
6548 }
Peter Zijlstradce840a2011-04-07 14:09:50 +02006549}
6550
Peter Zijlstraf39180e2016-05-09 10:37:54 +02006551static void destroy_sched_domains(struct sched_domain *sd)
Peter Zijlstradce840a2011-04-07 14:09:50 +02006552{
Peter Zijlstra16f3ef42016-05-09 10:37:57 +02006553 if (sd)
6554 call_rcu(&sd->rcu, destroy_sched_domains_rcu);
Peter Zijlstradce840a2011-04-07 14:09:50 +02006555}
6556
Linus Torvalds1da177e2005-04-16 15:20:36 -07006557/*
Peter Zijlstra518cd622011-12-07 15:07:31 +01006558 * Keep a special pointer to the highest sched_domain that has
6559 * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain) for this CPU; this
6560 * allows us to avoid some pointer chasing in select_idle_sibling().
6561 *
6562 * Also keep a unique ID per domain (we use the first cpu number in
6563 * the cpumask of the domain), this allows us to quickly tell if
Peter Zijlstra39be3502012-01-26 12:44:34 +01006564 * two cpus are in the same cache domain, see cpus_share_cache().
Peter Zijlstra518cd622011-12-07 15:07:31 +01006565 */
6566DEFINE_PER_CPU(struct sched_domain *, sd_llc);
Peter Zijlstra7d9ffa82013-07-04 12:56:46 +08006567DEFINE_PER_CPU(int, sd_llc_size);
Peter Zijlstra518cd622011-12-07 15:07:31 +01006568DEFINE_PER_CPU(int, sd_llc_id);
Peter Zijlstra0e369d72016-05-09 10:38:01 +02006569DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
Mel Gormanfb13c7e2013-10-07 11:29:17 +01006570DEFINE_PER_CPU(struct sched_domain *, sd_numa);
Preeti U Murthy37dc6b52013-10-30 08:42:52 +05306571DEFINE_PER_CPU(struct sched_domain *, sd_asym);
Morten Rasmussen30786a02015-01-02 17:08:52 +00006572DEFINE_PER_CPU(struct sched_domain *, sd_ea);
Morten Rasmussen61bf6252014-12-18 14:47:18 +00006573DEFINE_PER_CPU(struct sched_domain *, sd_scs);
Peter Zijlstra518cd622011-12-07 15:07:31 +01006574
6575static void update_top_cache_domain(int cpu)
6576{
Peter Zijlstra0e369d72016-05-09 10:38:01 +02006577 struct sched_domain_shared *sds = NULL;
Peter Zijlstra518cd622011-12-07 15:07:31 +01006578 struct sched_domain *sd;
Morten Rasmussen30786a02015-01-02 17:08:52 +00006579 struct sched_domain *ea_sd = NULL;
Peter Zijlstra518cd622011-12-07 15:07:31 +01006580 int id = cpu;
Peter Zijlstra7d9ffa82013-07-04 12:56:46 +08006581 int size = 1;
Peter Zijlstra518cd622011-12-07 15:07:31 +01006582
6583 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
Peter Zijlstra7d9ffa82013-07-04 12:56:46 +08006584 if (sd) {
Peter Zijlstra518cd622011-12-07 15:07:31 +01006585 id = cpumask_first(sched_domain_span(sd));
Peter Zijlstra7d9ffa82013-07-04 12:56:46 +08006586 size = cpumask_weight(sched_domain_span(sd));
Peter Zijlstra0e369d72016-05-09 10:38:01 +02006587 sds = sd->shared;
Peter Zijlstra7d9ffa82013-07-04 12:56:46 +08006588 }
Peter Zijlstra518cd622011-12-07 15:07:31 +01006589
6590 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
Peter Zijlstra7d9ffa82013-07-04 12:56:46 +08006591 per_cpu(sd_llc_size, cpu) = size;
Peter Zijlstra518cd622011-12-07 15:07:31 +01006592 per_cpu(sd_llc_id, cpu) = id;
Peter Zijlstra0e369d72016-05-09 10:38:01 +02006593 rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
Mel Gormanfb13c7e2013-10-07 11:29:17 +01006594
6595 sd = lowest_flag_domain(cpu, SD_NUMA);
6596 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
Preeti U Murthy37dc6b52013-10-30 08:42:52 +05306597
6598 sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
6599 rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
Morten Rasmussen30786a02015-01-02 17:08:52 +00006600
6601 for_each_domain(cpu, sd) {
6602 if (sd->groups->sge)
6603 ea_sd = sd;
6604 else
6605 break;
6606 }
6607 rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd);
Morten Rasmussen61bf6252014-12-18 14:47:18 +00006608
6609 sd = highest_flag_domain(cpu, SD_SHARE_CAP_STATES);
6610 rcu_assign_pointer(per_cpu(sd_scs, cpu), sd);
Peter Zijlstra518cd622011-12-07 15:07:31 +01006611}
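/*
 * Illustrative note, a sketch rather than code from this function: because
 * sd_llc_id caches the first CPU of the LLC domain span, deciding whether
 * two CPUs share a last-level cache reduces to a per-cpu integer compare,
 * roughly:
 *
 *	per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu)
 *
 * which is what cpus_share_cache() relies on.
 */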
6612
6613/*
Ingo Molnar0eab9142008-01-25 21:08:19 +01006614 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
Linus Torvalds1da177e2005-04-16 15:20:36 -07006615 * hold the hotplug lock.
6616 */
Ingo Molnar0eab9142008-01-25 21:08:19 +01006617static void
6618cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006619{
Ingo Molnar70b97a72006-07-03 00:25:42 -07006620 struct rq *rq = cpu_rq(cpu);
Suresh Siddha245af2c2005-06-25 14:57:25 -07006621 struct sched_domain *tmp;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07006622 unsigned long next_balance = rq->next_balance;
Suresh Siddha245af2c2005-06-25 14:57:25 -07006623
6624 /* Remove the sched domains which do not contribute to scheduling. */
Li Zefanf29c9b12008-11-06 09:45:16 +08006625 for (tmp = sd; tmp; ) {
Suresh Siddha245af2c2005-06-25 14:57:25 -07006626 struct sched_domain *parent = tmp->parent;
6627 if (!parent)
6628 break;
Li Zefanf29c9b12008-11-06 09:45:16 +08006629
Siddha, Suresh B1a848872006-10-03 01:14:08 -07006630 if (sd_parent_degenerate(tmp, parent)) {
Suresh Siddha245af2c2005-06-25 14:57:25 -07006631 tmp->parent = parent->parent;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07006632 if (parent->parent)
6633 parent->parent->child = tmp;
Peter Zijlstra10866e622013-08-19 16:57:04 +02006634 /*
6635 * Transfer SD_PREFER_SIBLING down in case of a
6636 * degenerate parent; the spans match for this
6637 * so the property transfers.
6638 */
6639 if (parent->flags & SD_PREFER_SIBLING)
6640 tmp->flags |= SD_PREFER_SIBLING;
Peter Zijlstraf39180e2016-05-09 10:37:54 +02006641 destroy_sched_domain(parent);
Li Zefanf29c9b12008-11-06 09:45:16 +08006642 } else
6643 tmp = tmp->parent;
Suresh Siddha245af2c2005-06-25 14:57:25 -07006644 }
6645
Siddha, Suresh B1a848872006-10-03 01:14:08 -07006646 if (sd && sd_degenerate(sd)) {
Peter Zijlstradce840a2011-04-07 14:09:50 +02006647 tmp = sd;
Suresh Siddha245af2c2005-06-25 14:57:25 -07006648 sd = sd->parent;
Peter Zijlstraf39180e2016-05-09 10:37:54 +02006649 destroy_sched_domain(tmp);
Siddha, Suresh B1a848872006-10-03 01:14:08 -07006650 if (sd)
6651 sd->child = NULL;
6652 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006653
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07006654 for (tmp = sd; tmp; ) {
6655 unsigned long interval;
6656
6657 interval = msecs_to_jiffies(tmp->balance_interval);
6658 if (time_after(next_balance, tmp->last_balance + interval))
6659 next_balance = tmp->last_balance + interval;
6660
6661 tmp = tmp->parent;
6662 }
6663 rq->next_balance = next_balance;
6664
Peter Zijlstra4cb98832011-04-07 14:09:58 +02006665 sched_domain_debug(sd, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006666
Gregory Haskins57d885f2008-01-25 21:08:18 +01006667 rq_attach_root(rq, rd);
Peter Zijlstradce840a2011-04-07 14:09:50 +02006668 tmp = rq->sd;
Nick Piggin674311d2005-06-25 14:57:27 -07006669 rcu_assign_pointer(rq->sd, sd);
Peter Zijlstraf39180e2016-05-09 10:37:54 +02006670 destroy_sched_domains(tmp);
Peter Zijlstra518cd622011-12-07 15:07:31 +01006671
6672 update_top_cache_domain(cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006673}
6674
Linus Torvalds1da177e2005-04-16 15:20:36 -07006675/* Setup the mask of cpus configured for isolated domains */
6676static int __init isolated_cpu_setup(char *str)
6677{
Prarit Bhargavaa6e44912016-02-04 09:38:00 -05006678 int ret;
6679
Rusty Russellbdddd292009-12-02 14:09:16 +10306680 alloc_bootmem_cpumask_var(&cpu_isolated_map);
Prarit Bhargavaa6e44912016-02-04 09:38:00 -05006681 ret = cpulist_parse(str, cpu_isolated_map);
6682 if (ret) {
6683 pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
6684 return 0;
6685 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006686 return 1;
6687}
Ingo Molnar8927f492007-10-15 17:00:13 +02006688__setup("isolcpus=", isolated_cpu_setup);
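/*
 * Example (illustrative): booting with "isolcpus=2-3" parses CPUs 2 and 3
 * into cpu_isolated_map, so they are excluded from the scheduler domains
 * built below and only run tasks explicitly affined to them.
 */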
Linus Torvalds1da177e2005-04-16 15:20:36 -07006689
Andreas Herrmann49a02c52009-08-18 12:51:52 +02006690struct s_data {
Peter Zijlstra21d42cc2011-04-07 14:09:48 +02006691 struct sched_domain ** __percpu sd;
Andreas Herrmann49a02c52009-08-18 12:51:52 +02006692 struct root_domain *rd;
6693};
6694
Andreas Herrmann2109b992009-08-18 12:53:00 +02006695enum s_alloc {
Andreas Herrmann2109b992009-08-18 12:53:00 +02006696 sa_rootdomain,
Peter Zijlstra21d42cc2011-04-07 14:09:48 +02006697 sa_sd,
Peter Zijlstradce840a2011-04-07 14:09:50 +02006698 sa_sd_storage,
Andreas Herrmann2109b992009-08-18 12:53:00 +02006699 sa_none,
6700};
6701
Peter Zijlstrac1174872012-05-31 14:47:33 +02006702/*
6703 * Build an iteration mask that can exclude certain CPUs from the upwards
6704 * domain traversal.
6705 *
Peter Zijlstra758dc6a2017-04-25 14:00:49 +02006706 * Only CPUs that can arrive at this group should be considered to continue
6707 * balancing.
6708 *
Peter Zijlstrac1174872012-05-31 14:47:33 +02006709 * Asymmetric node setups can result in situations where the domain tree is of
6710 * unequal depth, make sure to skip domains that already cover the entire
6711 * range.
6712 *
6713 * In that case build_sched_domains() will have terminated the iteration early
6714 * and our sibling sd spans will be empty. Domains should always include the
6715 * cpu they're built on, so check that.
6716 *
6717 */
6718static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
6719{
Lauro Ramos Venancio3e165b22017-04-20 16:51:40 -03006720 const struct cpumask *sg_span = sched_group_cpus(sg);
Peter Zijlstrac1174872012-05-31 14:47:33 +02006721 struct sd_data *sdd = sd->private;
6722 struct sched_domain *sibling;
6723 int i;
6724
Lauro Ramos Venancio3e165b22017-04-20 16:51:40 -03006725 for_each_cpu(i, sg_span) {
Peter Zijlstrac1174872012-05-31 14:47:33 +02006726 sibling = *per_cpu_ptr(sdd->sd, i);
Peter Zijlstra758dc6a2017-04-25 14:00:49 +02006727
6728 /*
6729 * Can happen in the asymmetric case, where these siblings are
6730 * unused. The mask will not be empty because those CPUs that
6731 * do have the top domain _should_ span the domain.
6732 */
6733 if (!sibling->child)
6734 continue;
6735
6736 /* If we would not end up here, we can't continue from here */
6737 if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
Peter Zijlstrac1174872012-05-31 14:47:33 +02006738 continue;
6739
6740 cpumask_set_cpu(i, sched_group_mask(sg));
6741 }
Peter Zijlstra758dc6a2017-04-25 14:00:49 +02006742
6743 /* We must not have empty masks here */
6744 WARN_ON_ONCE(cpumask_empty(sched_group_mask(sg)));
Peter Zijlstrac1174872012-05-31 14:47:33 +02006745}
6746
6747/*
6748 * Return the canonical balance cpu for this group, this is the first cpu
6749 * of this group that's also in the iteration mask.
6750 */
6751int group_balance_cpu(struct sched_group *sg)
6752{
6753 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
6754}
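/*
 * Illustrative note (sketch, not original code): for non-overlapping
 * domains build_sched_groups() sets the whole group mask, so this reduces
 * to
 *
 *	group_balance_cpu(sg) == cpumask_first(sched_group_cpus(sg))
 *
 * and only SD_OVERLAP domains get a restricted iteration mask via
 * build_group_mask().
 */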
6755
Peter Zijlstrae3589f62011-07-15 10:35:52 +02006756static int
6757build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6758{
6759 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
6760 const struct cpumask *span = sched_domain_span(sd);
6761 struct cpumask *covered = sched_domains_tmpmask;
6762 struct sd_data *sdd = sd->private;
Zhihui Zhangaaecac42014-08-01 21:18:03 -04006763 struct sched_domain *sibling;
Peter Zijlstrae3589f62011-07-15 10:35:52 +02006764 int i;
6765
6766 cpumask_clear(covered);
6767
Peter Zijlstra7c3f08e2017-04-14 17:24:02 +02006768 for_each_cpu_wrap(i, span, cpu) {
Peter Zijlstrae3589f62011-07-15 10:35:52 +02006769 struct cpumask *sg_span;
6770
6771 if (cpumask_test_cpu(i, covered))
6772 continue;
6773
Zhihui Zhangaaecac42014-08-01 21:18:03 -04006774 sibling = *per_cpu_ptr(sdd->sd, i);
Peter Zijlstrac1174872012-05-31 14:47:33 +02006775
6776 /* See the comment near build_group_mask(). */
Zhihui Zhangaaecac42014-08-01 21:18:03 -04006777 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
Peter Zijlstrac1174872012-05-31 14:47:33 +02006778 continue;
6779
Peter Zijlstrae3589f62011-07-15 10:35:52 +02006780 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
Suresh Siddha4d78a222011-11-18 15:03:29 -08006781 GFP_KERNEL, cpu_to_node(cpu));
Peter Zijlstrae3589f62011-07-15 10:35:52 +02006782
6783 if (!sg)
6784 goto fail;
6785
6786 sg_span = sched_group_cpus(sg);
Zhihui Zhangaaecac42014-08-01 21:18:03 -04006787 if (sibling->child)
6788 cpumask_copy(sg_span, sched_domain_span(sibling->child));
6789 else
Peter Zijlstrae3589f62011-07-15 10:35:52 +02006790 cpumask_set_cpu(i, sg_span);
6791
6792 cpumask_or(covered, covered, sg_span);
6793
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04006794 sg->sgc = *per_cpu_ptr(sdd->sgc, i);
6795 if (atomic_inc_return(&sg->sgc->ref) == 1)
Peter Zijlstrac1174872012-05-31 14:47:33 +02006796 build_group_mask(sd, sg);
Peter Zijlstrae3589f62011-07-15 10:35:52 +02006797
Peter Zijlstrac3decf02012-05-31 12:05:32 +02006798 /*
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04006799 * Initialize sgc->capacity such that even if we mess up the
Peter Zijlstrac3decf02012-05-31 12:05:32 +02006800 * domains and no possible iteration will get us here, we won't
6801 * die on a /0 trap.
6802 */
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04006803 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
Morten Rasmussen5cdeb5f2016-02-25 12:43:49 +00006804 sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
Morten Rasmussen3d8cb902016-10-14 14:41:09 +01006805 sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
Peter Zijlstrac1174872012-05-31 14:47:33 +02006806
6807 /*
6808 * Make sure the first group of this domain contains the
6809 * canonical balance cpu. Otherwise the sched_domain iteration
6810 * breaks. See update_sg_lb_stats().
6811 */
Peter Zijlstra74a5ce22012-05-23 18:00:43 +02006812 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
Peter Zijlstrac1174872012-05-31 14:47:33 +02006813 group_balance_cpu(sg) == cpu)
Peter Zijlstrae3589f62011-07-15 10:35:52 +02006814 groups = sg;
6815
6816 if (!first)
6817 first = sg;
6818 if (last)
6819 last->next = sg;
6820 last = sg;
6821 last->next = first;
6822 }
6823 sd->groups = groups;
6824
6825 return 0;
6826
6827fail:
6828 free_sched_groups(first, 0);
6829
6830 return -ENOMEM;
6831}
6832
Peter Zijlstradce840a2011-04-07 14:09:50 +02006833static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006834{
Peter Zijlstradce840a2011-04-07 14:09:50 +02006835 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6836 struct sched_domain *child = sd->child;
6837
6838 if (child)
6839 cpu = cpumask_first(sched_domain_span(child));
6840
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02006841 if (sg) {
Peter Zijlstradce840a2011-04-07 14:09:50 +02006842 *sg = *per_cpu_ptr(sdd->sg, cpu);
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04006843 (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
6844 atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02006845 }
Peter Zijlstradce840a2011-04-07 14:09:50 +02006846
Linus Torvalds1da177e2005-04-16 15:20:36 -07006847 return cpu;
6848}
Linus Torvalds1da177e2005-04-16 15:20:36 -07006849
Ingo Molnar48f24c42006-07-03 00:25:40 -07006850/*
Peter Zijlstradce840a2011-04-07 14:09:50 +02006851 * build_sched_groups will build a circular linked list of the groups
6852 * covered by the given span, and will set each group's ->cpumask correctly,
Nicolas Pitreced549f2014-05-26 18:19:38 -04006853 * and ->cpu_capacity to 0.
Peter Zijlstrae3589f62011-07-15 10:35:52 +02006854 *
6855 * Assumes the sched_domain tree is fully constructed
Ingo Molnar48f24c42006-07-03 00:25:40 -07006856 */
Peter Zijlstrae3589f62011-07-15 10:35:52 +02006857static int
6858build_sched_groups(struct sched_domain *sd, int cpu)
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08006859{
Peter Zijlstradce840a2011-04-07 14:09:50 +02006860 struct sched_group *first = NULL, *last = NULL;
6861 struct sd_data *sdd = sd->private;
6862 const struct cpumask *span = sched_domain_span(sd);
Peter Zijlstraf96225f2011-04-07 14:09:57 +02006863 struct cpumask *covered;
Peter Zijlstradce840a2011-04-07 14:09:50 +02006864 int i;
6865
Peter Zijlstrae3589f62011-07-15 10:35:52 +02006866 get_group(cpu, sdd, &sd->groups);
6867 atomic_inc(&sd->groups->ref);
6868
Viresh Kumar09366292013-06-11 16:32:43 +05306869 if (cpu != cpumask_first(span))
Peter Zijlstrae3589f62011-07-15 10:35:52 +02006870 return 0;
6871
Peter Zijlstraf96225f2011-04-07 14:09:57 +02006872 lockdep_assert_held(&sched_domains_mutex);
6873 covered = sched_domains_tmpmask;
6874
Peter Zijlstradce840a2011-04-07 14:09:50 +02006875 cpumask_clear(covered);
6876
6877 for_each_cpu(i, span) {
6878 struct sched_group *sg;
Viresh Kumarcd08e922013-06-11 16:32:44 +05306879 int group, j;
Peter Zijlstradce840a2011-04-07 14:09:50 +02006880
6881 if (cpumask_test_cpu(i, covered))
6882 continue;
6883
Viresh Kumarcd08e922013-06-11 16:32:44 +05306884 group = get_group(i, sdd, &sg);
Peter Zijlstrac1174872012-05-31 14:47:33 +02006885 cpumask_setall(sched_group_mask(sg));
Peter Zijlstradce840a2011-04-07 14:09:50 +02006886
6887 for_each_cpu(j, span) {
6888 if (get_group(j, sdd, NULL) != group)
6889 continue;
6890
6891 cpumask_set_cpu(j, covered);
6892 cpumask_set_cpu(j, sched_group_cpus(sg));
6893 }
6894
6895 if (!first)
6896 first = sg;
6897 if (last)
6898 last->next = sg;
6899 last = sg;
6900 }
6901 last->next = first;
Peter Zijlstrae3589f62011-07-15 10:35:52 +02006902
6903 return 0;
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08006904}
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006905
Linus Torvalds1da177e2005-04-16 15:20:36 -07006906/*
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04006907 * Initialize sched groups cpu_capacity.
Siddha, Suresh B89c47102006-10-03 01:14:09 -07006908 *
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04006909 * cpu_capacity indicates the capacity of sched group, which is used while
Siddha, Suresh B89c47102006-10-03 01:14:09 -07006910 * distributing the load between different sched groups in a sched domain.
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04006911 * Typically cpu_capacity for all the groups in a sched domain will be the same
6912 * unless there are asymmetries in the topology. If there are asymmetries,
6913 * the group having more cpu_capacity will pick up more load compared to the
6914 * group having less cpu_capacity.
Siddha, Suresh B89c47102006-10-03 01:14:09 -07006915 */
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04006916static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
Siddha, Suresh B89c47102006-10-03 01:14:09 -07006917{
Peter Zijlstrae3589f62011-07-15 10:35:52 +02006918 struct sched_group *sg = sd->groups;
Olav Haugan3f2cb302016-05-31 14:34:46 -07006919 cpumask_t avail_mask;
Siddha, Suresh B89c47102006-10-03 01:14:09 -07006920
Viresh Kumar94c95ba2013-06-11 16:32:45 +05306921 WARN_ON(!sg);
Peter Zijlstrae3589f62011-07-15 10:35:52 +02006922
6923 do {
Olav Haugan3f2cb302016-05-31 14:34:46 -07006924 cpumask_andnot(&avail_mask, sched_group_cpus(sg),
6925 cpu_isolated_mask);
6926 sg->group_weight = cpumask_weight(&avail_mask);
Peter Zijlstrae3589f62011-07-15 10:35:52 +02006927 sg = sg->next;
6928 } while (sg != sd->groups);
6929
Peter Zijlstrac1174872012-05-31 14:47:33 +02006930 if (cpu != group_balance_cpu(sg))
Siddha, Suresh B89c47102006-10-03 01:14:09 -07006931 return;
6932
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04006933 update_group_capacity(sd, cpu);
Siddha, Suresh B89c47102006-10-03 01:14:09 -07006934}
6935
6936/*
Dietmar Eggemanndd23c092014-11-14 16:20:20 +00006937 * Check that the per-cpu provided sd energy data is consistent for all cpus
6938 * within the mask.
6939 */
6940static inline void check_sched_energy_data(int cpu, sched_domain_energy_f fn,
6941 const struct cpumask *cpumask)
6942{
6943 const struct sched_group_energy * const sge = fn(cpu);
6944 struct cpumask mask;
6945 int i;
6946
6947 if (cpumask_weight(cpumask) <= 1)
6948 return;
6949
6950 cpumask_xor(&mask, cpumask, get_cpu_mask(cpu));
6951
6952 for_each_cpu(i, &mask) {
6953 const struct sched_group_energy * const e = fn(i);
6954 int y;
6955
6956 BUG_ON(e->nr_idle_states != sge->nr_idle_states);
6957
6958 for (y = 0; y < (e->nr_idle_states); y++) {
6959 BUG_ON(e->idle_states[y].power !=
6960 sge->idle_states[y].power);
6961 }
6962
6963 BUG_ON(e->nr_cap_states != sge->nr_cap_states);
6964
6965 for (y = 0; y < (e->nr_cap_states); y++) {
6966 BUG_ON(e->cap_states[y].cap != sge->cap_states[y].cap);
6967 BUG_ON(e->cap_states[y].power !=
6968 sge->cap_states[y].power);
6969 }
6970 }
6971}
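/*
 * Illustrative example (made-up numbers, not from any real platform): the
 * check above requires every CPU in the mask to report the same energy
 * model, e.g. two CPUs sharing a capacity domain must both return tables
 * equivalent to:
 *
 *	cap_states[]  = { { .cap =  430, .power = 100 },
 *			  { .cap = 1024, .power = 400 } };
 *	idle_states[] = { { .power = 10 }, { .power = 0 } };
 *
 * Any mismatch in nr_cap_states, nr_idle_states or the individual cap/power
 * values trips the BUG_ON()s.
 */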
6972
6973static void init_sched_energy(int cpu, struct sched_domain *sd,
6974 sched_domain_energy_f fn)
6975{
6976 if (!(fn && fn(cpu)))
6977 return;
6978
6979 if (cpu != group_balance_cpu(sd->groups))
6980 return;
6981
6982 if (sd->child && !sd->child->groups->sge) {
6983 pr_err("BUG: EAS setup broken for CPU%d\n", cpu);
6984#ifdef CONFIG_SCHED_DEBUG
6985 pr_err(" energy data on %s but not on %s domain\n",
6986 sd->name, sd->child->name);
6987#endif
6988 return;
6989 }
6990
6991 check_sched_energy_data(cpu, fn, sched_group_cpus(sd->groups));
6992
6993 sd->groups->sge = fn(cpu);
6994}
6995
6996/*
Mike Travis7c16ec52008-04-04 18:11:11 -07006997 * Initializers for schedule domains
6998 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
6999 */
7000
Hidetoshi Seto1d3504f2008-04-15 14:04:23 +09007001static int default_relax_domain_level = -1;
Peter Zijlstra60495e72011-04-07 14:10:04 +02007002int sched_domain_level_max;
Hidetoshi Seto1d3504f2008-04-15 14:04:23 +09007003
7004static int __init setup_relax_domain_level(char *str)
7005{
Dimitri Sivanicha841f8c2012-06-05 13:44:36 -05007006 if (kstrtoint(str, 0, &default_relax_domain_level))
7007 pr_warn("Unable to set relax_domain_level\n");
Li Zefan30e0e172008-05-13 10:27:17 +08007008
Hidetoshi Seto1d3504f2008-04-15 14:04:23 +09007009 return 1;
7010}
7011__setup("relax_domain_level=", setup_relax_domain_level);
7012
7013static void set_domain_attribute(struct sched_domain *sd,
7014 struct sched_domain_attr *attr)
7015{
7016 int request;
7017
7018 if (!attr || attr->relax_domain_level < 0) {
7019 if (default_relax_domain_level < 0)
7020 return;
7021 else
7022 request = default_relax_domain_level;
7023 } else
7024 request = attr->relax_domain_level;
7025 if (request < sd->level) {
7026 /* turn off idle balance on this domain */
Peter Zijlstrac88d5912009-09-10 13:50:02 +02007027 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
Hidetoshi Seto1d3504f2008-04-15 14:04:23 +09007028 } else {
7029 /* turn on idle balance on this domain */
Peter Zijlstrac88d5912009-09-10 13:50:02 +02007030 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
Hidetoshi Seto1d3504f2008-04-15 14:04:23 +09007031 }
7032}
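/*
 * Example (illustrative): booting with "relax_domain_level=1" makes this
 * function clear SD_BALANCE_WAKE and SD_BALANCE_NEWIDLE on every domain
 * whose level is greater than 1, so wake-up and new-idle balancing only
 * happen in the lower, closer domains, while levels 0 and 1 keep (or
 * regain) those flags.
 */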
7033
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007034static void __sdt_free(const struct cpumask *cpu_map);
7035static int __sdt_alloc(const struct cpumask *cpu_map);
7036
Andreas Herrmann2109b992009-08-18 12:53:00 +02007037static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7038 const struct cpumask *cpu_map)
7039{
7040 switch (what) {
Andreas Herrmann2109b992009-08-18 12:53:00 +02007041 case sa_rootdomain:
Peter Zijlstra822ff792011-04-07 14:09:51 +02007042 if (!atomic_read(&d->rd->refcount))
7043 free_rootdomain(&d->rd->rcu); /* fall through */
Peter Zijlstra21d42cc2011-04-07 14:09:48 +02007044 case sa_sd:
7045 free_percpu(d->sd); /* fall through */
Peter Zijlstradce840a2011-04-07 14:09:50 +02007046 case sa_sd_storage:
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007047 __sdt_free(cpu_map); /* fall through */
Andreas Herrmann2109b992009-08-18 12:53:00 +02007048 case sa_none:
7049 break;
7050 }
7051}
7052
7053static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
7054 const struct cpumask *cpu_map)
7055{
Peter Zijlstradce840a2011-04-07 14:09:50 +02007056 memset(d, 0, sizeof(*d));
7057
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007058 if (__sdt_alloc(cpu_map))
7059 return sa_sd_storage;
Peter Zijlstra21d42cc2011-04-07 14:09:48 +02007060 d->sd = alloc_percpu(struct sched_domain *);
Peter Zijlstradce840a2011-04-07 14:09:50 +02007061 if (!d->sd)
7062 return sa_sd_storage;
Andreas Herrmann2109b992009-08-18 12:53:00 +02007063 d->rd = alloc_rootdomain();
Peter Zijlstradce840a2011-04-07 14:09:50 +02007064 if (!d->rd)
Peter Zijlstra21d42cc2011-04-07 14:09:48 +02007065 return sa_sd;
Andreas Herrmann2109b992009-08-18 12:53:00 +02007066 return sa_rootdomain;
7067}
7068
Peter Zijlstradce840a2011-04-07 14:09:50 +02007069/*
7070 * NULL the sd_data elements we've used to build the sched_domain and
7071 * sched_group structure so that the subsequent __free_domain_allocs()
7072 * will not free the data we're using.
7073 */
7074static void claim_allocations(int cpu, struct sched_domain *sd)
7075{
7076 struct sd_data *sdd = sd->private;
Peter Zijlstradce840a2011-04-07 14:09:50 +02007077
7078 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
7079 *per_cpu_ptr(sdd->sd, cpu) = NULL;
7080
Peter Zijlstra24fc7ed2016-05-09 10:37:59 +02007081 if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
7082 *per_cpu_ptr(sdd->sds, cpu) = NULL;
7083
Peter Zijlstrae3589f62011-07-15 10:35:52 +02007084 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
Peter Zijlstradce840a2011-04-07 14:09:50 +02007085 *per_cpu_ptr(sdd->sg, cpu) = NULL;
Peter Zijlstrae3589f62011-07-15 10:35:52 +02007086
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04007087 if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
7088 *per_cpu_ptr(sdd->sgc, cpu) = NULL;
Peter Zijlstradce840a2011-04-07 14:09:50 +02007089}
7090
Peter Zijlstracb83b622012-04-17 15:49:36 +02007091#ifdef CONFIG_NUMA
Peter Zijlstracb83b622012-04-17 15:49:36 +02007092static int sched_domains_numa_levels;
Rik van Riele3fe70b2014-10-17 03:29:50 -04007093enum numa_topology_type sched_numa_topology_type;
Peter Zijlstracb83b622012-04-17 15:49:36 +02007094static int *sched_domains_numa_distance;
Rik van Riel9942f792014-10-17 03:29:49 -04007095int sched_max_numa_distance;
Peter Zijlstracb83b622012-04-17 15:49:36 +02007096static struct cpumask ***sched_domains_numa_masks;
7097static int sched_domains_curr_level;
Vincent Guittot143e1e22014-04-11 11:44:37 +02007098#endif
Peter Zijlstracb83b622012-04-17 15:49:36 +02007099
Vincent Guittot143e1e22014-04-11 11:44:37 +02007100/*
7101 * SD_flags allowed in topology descriptions.
7102 *
Peter Zijlstra94f438c2016-08-15 12:54:59 +02007103 * These flags are purely descriptive of the topology and do not prescribe
7104 * behaviour. Behaviour is artificial and mapped in the below sd_init()
7105 * function:
Vincent Guittot143e1e22014-04-11 11:44:37 +02007106 *
Peter Zijlstra94f438c2016-08-15 12:54:59 +02007107 * SD_SHARE_CPUCAPACITY - describes SMT topologies
7108 * SD_SHARE_PKG_RESOURCES - describes shared caches
7109 * SD_NUMA - describes NUMA topologies
7110 * SD_SHARE_POWERDOMAIN - describes shared power domain
Morten Rasmussen1f6e6c72016-07-25 14:34:22 +01007111 * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies
Morten Rasmussen858d7182015-01-13 13:50:46 +00007112 * SD_SHARE_CAP_STATES - describes shared capacity states
Peter Zijlstra94f438c2016-08-15 12:54:59 +02007113 *
7114 * Odd one out, which beside describing the topology has a quirk also
7115 * prescribes the desired behaviour that goes along with it:
7116 *
7117 * SD_ASYM_PACKING - describes SMT quirks
Vincent Guittot143e1e22014-04-11 11:44:37 +02007118 */
7119#define TOPOLOGY_SD_FLAGS \
Nicolas Pitre5d4dfdd2014-05-27 13:50:41 -04007120 (SD_SHARE_CPUCAPACITY | \
Vincent Guittot143e1e22014-04-11 11:44:37 +02007121 SD_SHARE_PKG_RESOURCES | \
7122 SD_NUMA | \
Vincent Guittotd77b3ed2014-04-11 11:44:40 +02007123 SD_ASYM_PACKING | \
Morten Rasmussen1f6e6c72016-07-25 14:34:22 +01007124 SD_ASYM_CPUCAPACITY | \
Morten Rasmussen858d7182015-01-13 13:50:46 +00007125 SD_SHARE_POWERDOMAIN | \
7126 SD_SHARE_CAP_STATES)
Peter Zijlstracb83b622012-04-17 15:49:36 +02007127
7128static struct sched_domain *
Morten Rasmussen3676b132016-07-25 14:34:23 +01007129sd_init(struct sched_domain_topology_level *tl,
Peter Zijlstra24fc7ed2016-05-09 10:37:59 +02007130 const struct cpumask *cpu_map,
Morten Rasmussen3676b132016-07-25 14:34:23 +01007131 struct sched_domain *child, int cpu)
Peter Zijlstracb83b622012-04-17 15:49:36 +02007132{
Peter Zijlstra24fc7ed2016-05-09 10:37:59 +02007133 struct sd_data *sdd = &tl->data;
7134 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
7135 int sd_id, sd_weight, sd_flags = 0;
Vincent Guittot143e1e22014-04-11 11:44:37 +02007136
7137#ifdef CONFIG_NUMA
7138 /*
7139 * Ugly hack to pass state to sd_numa_mask()...
7140 */
7141 sched_domains_curr_level = tl->numa_level;
7142#endif
7143
7144 sd_weight = cpumask_weight(tl->mask(cpu));
7145
7146 if (tl->sd_flags)
7147 sd_flags = (*tl->sd_flags)();
7148 if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
7149 "wrong sd_flags in topology description\n"))
7150 sd_flags &= ~TOPOLOGY_SD_FLAGS;
Peter Zijlstracb83b622012-04-17 15:49:36 +02007151
7152 *sd = (struct sched_domain){
7153 .min_interval = sd_weight,
7154 .max_interval = 2*sd_weight,
7155 .busy_factor = 32,
Peter Zijlstra870a0bb2012-05-11 00:26:27 +02007156 .imbalance_pct = 125,
Vincent Guittot143e1e22014-04-11 11:44:37 +02007157
7158 .cache_nice_tries = 0,
7159 .busy_idx = 0,
7160 .idle_idx = 0,
Peter Zijlstracb83b622012-04-17 15:49:36 +02007161 .newidle_idx = 0,
7162 .wake_idx = 0,
7163 .forkexec_idx = 0,
7164
7165 .flags = 1*SD_LOAD_BALANCE
7166 | 1*SD_BALANCE_NEWIDLE
Vincent Guittot143e1e22014-04-11 11:44:37 +02007167 | 1*SD_BALANCE_EXEC
7168 | 1*SD_BALANCE_FORK
Peter Zijlstracb83b622012-04-17 15:49:36 +02007169 | 0*SD_BALANCE_WAKE
Vincent Guittot143e1e22014-04-11 11:44:37 +02007170 | 1*SD_WAKE_AFFINE
Nicolas Pitre5d4dfdd2014-05-27 13:50:41 -04007171 | 0*SD_SHARE_CPUCAPACITY
Peter Zijlstracb83b622012-04-17 15:49:36 +02007172 | 0*SD_SHARE_PKG_RESOURCES
Vincent Guittot143e1e22014-04-11 11:44:37 +02007173 | 0*SD_SERIALIZE
Peter Zijlstracb83b622012-04-17 15:49:36 +02007174 | 0*SD_PREFER_SIBLING
Vincent Guittot143e1e22014-04-11 11:44:37 +02007175 | 0*SD_NUMA
7176 | sd_flags
Peter Zijlstracb83b622012-04-17 15:49:36 +02007177 ,
Vincent Guittot143e1e22014-04-11 11:44:37 +02007178
Peter Zijlstracb83b622012-04-17 15:49:36 +02007179 .last_balance = jiffies,
7180 .balance_interval = sd_weight,
Vincent Guittot143e1e22014-04-11 11:44:37 +02007181 .smt_gain = 0,
Jason Low2b4cfe62014-04-23 18:30:34 -07007182 .max_newidle_lb_cost = 0,
7183 .next_decay_max_lb_cost = jiffies,
Morten Rasmussen3676b132016-07-25 14:34:23 +01007184 .child = child,
Vincent Guittot143e1e22014-04-11 11:44:37 +02007185#ifdef CONFIG_SCHED_DEBUG
7186 .name = tl->name,
7187#endif
Peter Zijlstracb83b622012-04-17 15:49:36 +02007188 };
Peter Zijlstracb83b622012-04-17 15:49:36 +02007189
Peter Zijlstra24fc7ed2016-05-09 10:37:59 +02007190 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
7191 sd_id = cpumask_first(sched_domain_span(sd));
7192
Peter Zijlstracb83b622012-04-17 15:49:36 +02007193 /*
Vincent Guittot143e1e22014-04-11 11:44:37 +02007194 * Convert topological properties into behaviour.
Peter Zijlstracb83b622012-04-17 15:49:36 +02007195 */
Vincent Guittot143e1e22014-04-11 11:44:37 +02007196
Morten Rasmussen9ee1cda2016-07-25 14:34:24 +01007197 if (sd->flags & SD_ASYM_CPUCAPACITY) {
7198 struct sched_domain *t = sd;
7199
7200 for_each_lower_domain(t)
7201 t->flags |= SD_BALANCE_WAKE;
7202 }
7203
Nicolas Pitre5d4dfdd2014-05-27 13:50:41 -04007204 if (sd->flags & SD_SHARE_CPUCAPACITY) {
Vincent Guittotcaff37e2015-02-27 16:54:13 +01007205 sd->flags |= SD_PREFER_SIBLING;
Vincent Guittot143e1e22014-04-11 11:44:37 +02007206 sd->imbalance_pct = 110;
7207 sd->smt_gain = 1178; /* ~15% */
Vincent Guittot143e1e22014-04-11 11:44:37 +02007208
7209 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
7210 sd->imbalance_pct = 117;
7211 sd->cache_nice_tries = 1;
7212 sd->busy_idx = 2;
7213
7214#ifdef CONFIG_NUMA
7215 } else if (sd->flags & SD_NUMA) {
7216 sd->cache_nice_tries = 2;
7217 sd->busy_idx = 3;
7218 sd->idle_idx = 2;
7219
7220 sd->flags |= SD_SERIALIZE;
7221 if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
7222 sd->flags &= ~(SD_BALANCE_EXEC |
7223 SD_BALANCE_FORK |
7224 SD_WAKE_AFFINE);
7225 }
7226
7227#endif
7228 } else {
7229 sd->flags |= SD_PREFER_SIBLING;
7230 sd->cache_nice_tries = 1;
7231 sd->busy_idx = 2;
7232 sd->idle_idx = 1;
7233 }
7234
Peter Zijlstra24fc7ed2016-05-09 10:37:59 +02007235 /*
7236 * For all levels sharing cache; connect a sched_domain_shared
7237 * instance.
7238 */
7239 if (sd->flags & SD_SHARE_PKG_RESOURCES) {
7240 sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
7241 atomic_inc(&sd->shared->ref);
Peter Zijlstra0e369d72016-05-09 10:38:01 +02007242 atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
Peter Zijlstra24fc7ed2016-05-09 10:37:59 +02007243 }
7244
7245 sd->private = sdd;
Peter Zijlstracb83b622012-04-17 15:49:36 +02007246
7247 return sd;
7248}
7249
Vincent Guittot143e1e22014-04-11 11:44:37 +02007250/*
7251 * Topology list, bottom-up.
7252 */
7253static struct sched_domain_topology_level default_topology[] = {
7254#ifdef CONFIG_SCHED_SMT
7255 { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
7256#endif
7257#ifdef CONFIG_SCHED_MC
7258 { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
7259#endif
Vincent Guittot143e1e22014-04-11 11:44:37 +02007260 { cpu_cpu_mask, SD_INIT_NAME(DIE) },
7261 { NULL, },
7262};
7263
Juergen Grossc6e1e7b2015-09-22 12:48:59 +02007264static struct sched_domain_topology_level *sched_domain_topology =
7265 default_topology;
Vincent Guittot143e1e22014-04-11 11:44:37 +02007266
7267#define for_each_sd_topology(tl) \
7268 for (tl = sched_domain_topology; tl->mask; tl++)
7269
7270void set_sched_topology(struct sched_domain_topology_level *tl)
7271{
Tim Chen8f379612016-09-21 12:19:03 -07007272 if (WARN_ON_ONCE(sched_smp_initialized))
7273 return;
7274
Vincent Guittot143e1e22014-04-11 11:44:37 +02007275 sched_domain_topology = tl;
7276}
7277
7278#ifdef CONFIG_NUMA
7279
Peter Zijlstracb83b622012-04-17 15:49:36 +02007280static const struct cpumask *sd_numa_mask(int cpu)
7281{
7282 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
7283}
7284
Peter Zijlstrad039ac62012-05-31 21:20:16 +02007285static void sched_numa_warn(const char *str)
7286{
7287 static int done = false;
7288	int i, j;
7289
7290 if (done)
7291 return;
7292
7293 done = true;
7294
7295 printk(KERN_WARNING "ERROR: %s\n\n", str);
7296
7297 for (i = 0; i < nr_node_ids; i++) {
7298 printk(KERN_WARNING " ");
7299 for (j = 0; j < nr_node_ids; j++)
7300 printk(KERN_CONT "%02d ", node_distance(i,j));
7301 printk(KERN_CONT "\n");
7302 }
7303 printk(KERN_WARNING "\n");
7304}
7305
Rik van Riel9942f792014-10-17 03:29:49 -04007306bool find_numa_distance(int distance)
Peter Zijlstrad039ac62012-05-31 21:20:16 +02007307{
7308 int i;
7309
7310 if (distance == node_distance(0, 0))
7311 return true;
7312
7313 for (i = 0; i < sched_domains_numa_levels; i++) {
7314 if (sched_domains_numa_distance[i] == distance)
7315 return true;
7316 }
7317
7318 return false;
7319}
7320
Rik van Riele3fe70b2014-10-17 03:29:50 -04007321/*
7322 * A system can have three types of NUMA topology:
7323 * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
7324 * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
7325 * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
7326 *
7327 * The difference between a glueless mesh topology and a backplane
7328 * topology lies in whether communication between not directly
7329 * connected nodes goes through intermediary nodes (where programs
7330 * could run), or through backplane controllers. This affects
7331 * placement of programs.
7332 *
7333 * The type of topology can be discerned with the following tests:
7334 * - If the maximum distance between any nodes is 1 hop, the system
7335 * is directly connected.
7336 * - If for two nodes A and B, located N > 1 hops away from each other,
7337 * there is an intermediary node C, which is < N hops away from both
7338 * nodes A and B, the system is a glueless mesh.
7339 */
7340static void init_numa_topology_type(void)
7341{
7342 int a, b, c, n;
7343
7344 n = sched_max_numa_distance;
7345
Aravind Gopalakrishnane2378822015-08-10 20:20:48 -05007346 if (sched_domains_numa_levels <= 1) {
Rik van Riele3fe70b2014-10-17 03:29:50 -04007347 sched_numa_topology_type = NUMA_DIRECT;
Aravind Gopalakrishnane2378822015-08-10 20:20:48 -05007348 return;
7349 }
Rik van Riele3fe70b2014-10-17 03:29:50 -04007350
7351 for_each_online_node(a) {
7352 for_each_online_node(b) {
7353 /* Find two nodes furthest removed from each other. */
7354 if (node_distance(a, b) < n)
7355 continue;
7356
7357 /* Is there an intermediary node between a and b? */
7358 for_each_online_node(c) {
7359 if (node_distance(a, c) < n &&
7360 node_distance(b, c) < n) {
7361 sched_numa_topology_type =
7362 NUMA_GLUELESS_MESH;
7363 return;
7364 }
7365 }
7366
7367 sched_numa_topology_type = NUMA_BACKPLANE;
7368 return;
7369 }
7370 }
7371}
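/*
 * Illustrative, standalone user-space sketch (not kernel code) of the
 * classification above, run against a hypothetical 4-node distance table.
 * The node count, distances and names are made up for illustration only.
 */
#ifdef SKETCH_NUMA_TOPOLOGY_TYPE
#include <stdio.h>

#define SKETCH_NODES 4

/* Ring of 4 nodes: opposite nodes are two hops apart. */
static const int sketch_dist[SKETCH_NODES][SKETCH_NODES] = {
	{ 10, 20, 30, 20 },
	{ 20, 10, 20, 30 },
	{ 30, 20, 10, 20 },
	{ 20, 30, 20, 10 },
};

static const char *sketch_topology_type(int nr_levels, int max_dist)
{
	int a, b, c;

	/* At most one non-local distance: everything is directly connected. */
	if (nr_levels <= 1)
		return "NUMA_DIRECT";

	for (a = 0; a < SKETCH_NODES; a++) {
		for (b = 0; b < SKETCH_NODES; b++) {
			if (sketch_dist[a][b] < max_dist)
				continue;

			/* a and b are maximally distant; is there a middleman? */
			for (c = 0; c < SKETCH_NODES; c++) {
				if (sketch_dist[a][c] < max_dist &&
				    sketch_dist[b][c] < max_dist)
					return "NUMA_GLUELESS_MESH";
			}
			return "NUMA_BACKPLANE";
		}
	}
	return "NUMA_DIRECT";	/* unreachable for a well-formed table */
}

int main(void)
{
	/* Two non-local distances (20, 30); the largest is 30. */
	printf("%s\n", sketch_topology_type(2, 30));	/* NUMA_GLUELESS_MESH */
	return 0;
}
#endif /* SKETCH_NUMA_TOPOLOGY_TYPE */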
7372
Peter Zijlstracb83b622012-04-17 15:49:36 +02007373static void sched_init_numa(void)
7374{
7375 int next_distance, curr_distance = node_distance(0, 0);
7376 struct sched_domain_topology_level *tl;
7377 int level = 0;
7378 int i, j, k;
7379
Peter Zijlstracb83b622012-04-17 15:49:36 +02007380 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
7381 if (!sched_domains_numa_distance)
7382 return;
7383
7384 /*
7385 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
7386 * unique distances in the node_distance() table.
7387 *
7388 * Assumes node_distance(0,j) includes all distances in
7389 * node_distance(i,j) in order to avoid cubic time.
Peter Zijlstracb83b622012-04-17 15:49:36 +02007390 */
7391 next_distance = curr_distance;
7392 for (i = 0; i < nr_node_ids; i++) {
7393 for (j = 0; j < nr_node_ids; j++) {
Peter Zijlstrad039ac62012-05-31 21:20:16 +02007394 for (k = 0; k < nr_node_ids; k++) {
7395 int distance = node_distance(i, k);
7396
7397 if (distance > curr_distance &&
7398 (distance < next_distance ||
7399 next_distance == curr_distance))
7400 next_distance = distance;
7401
7402 /*
7403				 * While not a strong assumption, it would be nice to know
7404				 * about cases where node A is connected to B, but B is not
7405 * equally connected to A.
7406 */
7407 if (sched_debug() && node_distance(k, i) != distance)
7408 sched_numa_warn("Node-distance not symmetric");
7409
7410 if (sched_debug() && i && !find_numa_distance(distance))
7411 sched_numa_warn("Node-0 not representative");
7412 }
7413 if (next_distance != curr_distance) {
7414 sched_domains_numa_distance[level++] = next_distance;
7415 sched_domains_numa_levels = level;
7416 curr_distance = next_distance;
7417 } else break;
Peter Zijlstracb83b622012-04-17 15:49:36 +02007418 }
Peter Zijlstrad039ac62012-05-31 21:20:16 +02007419
7420 /*
7421 * In case of sched_debug() we verify the above assumption.
7422 */
7423 if (!sched_debug())
7424 break;
Peter Zijlstracb83b622012-04-17 15:49:36 +02007425 }
Andrey Ryabininc1235882014-11-07 17:53:40 +03007426
7427 if (!level)
7428 return;
7429
Peter Zijlstracb83b622012-04-17 15:49:36 +02007430 /*
7431 * 'level' contains the number of unique distances, excluding the
7432 * identity distance node_distance(i,i).
7433 *
Viresh Kumar28b4a522013-04-05 16:26:46 +05307434 * The sched_domains_numa_distance[] array includes the actual distance
Peter Zijlstracb83b622012-04-17 15:49:36 +02007435 * numbers.
7436 */
7437
Tang Chen5f7865f2012-09-25 21:12:30 +08007438 /*
7439 * Here, we should temporarily reset sched_domains_numa_levels to 0.
7440	 * If allocating memory for the array sched_domains_numa_masks[][] fails,
7441	 * the array will contain fewer than 'level' members. This could be
7442	 * dangerous when we use it to iterate the array sched_domains_numa_masks[][]
7443 * in other functions.
7444 *
7445 * We reset it to 'level' at the end of this function.
7446 */
7447 sched_domains_numa_levels = 0;
7448
Peter Zijlstracb83b622012-04-17 15:49:36 +02007449 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
7450 if (!sched_domains_numa_masks)
7451 return;
7452
7453 /*
7454 * Now for each level, construct a mask per node which contains all
7455 * cpus of nodes that are that many hops away from us.
7456 */
7457 for (i = 0; i < level; i++) {
7458 sched_domains_numa_masks[i] =
7459 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
7460 if (!sched_domains_numa_masks[i])
7461 return;
7462
7463 for (j = 0; j < nr_node_ids; j++) {
Peter Zijlstra2ea45802012-05-25 09:26:43 +02007464 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
Peter Zijlstracb83b622012-04-17 15:49:36 +02007465 if (!mask)
7466 return;
7467
7468 sched_domains_numa_masks[i][j] = mask;
7469
Raghavendra K T9c03ee12016-01-16 00:31:23 +05307470 for_each_node(k) {
Peter Zijlstradd7d8632012-05-11 00:56:20 +02007471 if (node_distance(j, k) > sched_domains_numa_distance[i])
Peter Zijlstracb83b622012-04-17 15:49:36 +02007472 continue;
7473
7474 cpumask_or(mask, mask, cpumask_of_node(k));
7475 }
7476 }
7477 }
7478
Vincent Guittot143e1e22014-04-11 11:44:37 +02007479 /* Compute default topology size */
7480 for (i = 0; sched_domain_topology[i].mask; i++);
7481
Vincent Guittotc515db82014-05-13 11:11:01 +02007482 tl = kzalloc((i + level + 1) *
Peter Zijlstracb83b622012-04-17 15:49:36 +02007483 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
7484 if (!tl)
7485 return;
7486
7487 /*
7488 * Copy the default topology bits..
7489 */
Vincent Guittot143e1e22014-04-11 11:44:37 +02007490 for (i = 0; sched_domain_topology[i].mask; i++)
7491 tl[i] = sched_domain_topology[i];
Peter Zijlstracb83b622012-04-17 15:49:36 +02007492
7493 /*
7494 * .. and append 'j' levels of NUMA goodness.
7495 */
7496 for (j = 0; j < level; i++, j++) {
7497 tl[i] = (struct sched_domain_topology_level){
Peter Zijlstracb83b622012-04-17 15:49:36 +02007498 .mask = sd_numa_mask,
Vincent Guittot143e1e22014-04-11 11:44:37 +02007499 .sd_flags = cpu_numa_flags,
Peter Zijlstracb83b622012-04-17 15:49:36 +02007500 .flags = SDTL_OVERLAP,
7501 .numa_level = j,
Vincent Guittot143e1e22014-04-11 11:44:37 +02007502 SD_INIT_NAME(NUMA)
Peter Zijlstracb83b622012-04-17 15:49:36 +02007503 };
7504 }
7505
7506 sched_domain_topology = tl;
Tang Chen5f7865f2012-09-25 21:12:30 +08007507
7508 sched_domains_numa_levels = level;
Rik van Riel9942f792014-10-17 03:29:49 -04007509 sched_max_numa_distance = sched_domains_numa_distance[level - 1];
Rik van Riele3fe70b2014-10-17 03:29:50 -04007510
7511 init_numa_topology_type();
Peter Zijlstracb83b622012-04-17 15:49:36 +02007512}
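/*
 * Illustrative, standalone user-space sketch (not kernel code) of the
 * unique-distance extraction above: repeatedly pick the smallest distance
 * strictly greater than the current one until none is left, which yields
 * the NUMA levels excluding the identity distance. The 3-node distance
 * table is hypothetical.
 */
#ifdef SKETCH_NUMA_LEVELS
#include <stdio.h>

#define SKETCH_NODES 3

static const int sketch_dist[SKETCH_NODES][SKETCH_NODES] = {
	{ 10, 20, 40 },
	{ 20, 10, 20 },
	{ 40, 20, 10 },
};

int main(void)
{
	int curr = sketch_dist[0][0];	/* identity distance: 10 */
	int levels = 0;

	for (;;) {
		int next = curr;
		int i, j;

		/* Smallest distance strictly greater than 'curr'. */
		for (i = 0; i < SKETCH_NODES; i++)
			for (j = 0; j < SKETCH_NODES; j++)
				if (sketch_dist[i][j] > curr &&
				    (next == curr || sketch_dist[i][j] < next))
					next = sketch_dist[i][j];

		if (next == curr)
			break;
		printf("level %d: distance %d\n", levels++, next);
		curr = next;
	}
	/* Prints: "level 0: distance 20" and "level 1: distance 40". */
	return 0;
}
#endif /* SKETCH_NUMA_LEVELS */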
Tang Chen301a5cb2012-09-25 21:12:31 +08007513
Thomas Gleixner135fb3e2016-03-10 12:54:11 +01007514static void sched_domains_numa_masks_set(unsigned int cpu)
Tang Chen301a5cb2012-09-25 21:12:31 +08007515{
Tang Chen301a5cb2012-09-25 21:12:31 +08007516 int node = cpu_to_node(cpu);
Thomas Gleixner135fb3e2016-03-10 12:54:11 +01007517 int i, j;
Tang Chen301a5cb2012-09-25 21:12:31 +08007518
7519 for (i = 0; i < sched_domains_numa_levels; i++) {
7520 for (j = 0; j < nr_node_ids; j++) {
7521 if (node_distance(j, node) <= sched_domains_numa_distance[i])
7522 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
7523 }
7524 }
7525}
7526
Thomas Gleixner135fb3e2016-03-10 12:54:11 +01007527static void sched_domains_numa_masks_clear(unsigned int cpu)
Tang Chen301a5cb2012-09-25 21:12:31 +08007528{
7529 int i, j;
Thomas Gleixner135fb3e2016-03-10 12:54:11 +01007530
Tang Chen301a5cb2012-09-25 21:12:31 +08007531 for (i = 0; i < sched_domains_numa_levels; i++) {
7532 for (j = 0; j < nr_node_ids; j++)
7533 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
7534 }
7535}
7536
Peter Zijlstracb83b622012-04-17 15:49:36 +02007537#else
Thomas Gleixner135fb3e2016-03-10 12:54:11 +01007538static inline void sched_init_numa(void) { }
7539static void sched_domains_numa_masks_set(unsigned int cpu) { }
7540static void sched_domains_numa_masks_clear(unsigned int cpu) { }
Peter Zijlstracb83b622012-04-17 15:49:36 +02007541#endif /* CONFIG_NUMA */
7542
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007543static int __sdt_alloc(const struct cpumask *cpu_map)
7544{
7545 struct sched_domain_topology_level *tl;
7546 int j;
7547
Viresh Kumar27723a62013-06-10 16:27:20 +05307548 for_each_sd_topology(tl) {
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007549 struct sd_data *sdd = &tl->data;
7550
7551 sdd->sd = alloc_percpu(struct sched_domain *);
7552 if (!sdd->sd)
7553 return -ENOMEM;
7554
Peter Zijlstra24fc7ed2016-05-09 10:37:59 +02007555 sdd->sds = alloc_percpu(struct sched_domain_shared *);
7556 if (!sdd->sds)
7557 return -ENOMEM;
7558
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007559 sdd->sg = alloc_percpu(struct sched_group *);
7560 if (!sdd->sg)
7561 return -ENOMEM;
7562
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04007563 sdd->sgc = alloc_percpu(struct sched_group_capacity *);
7564 if (!sdd->sgc)
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02007565 return -ENOMEM;
7566
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007567 for_each_cpu(j, cpu_map) {
7568 struct sched_domain *sd;
Peter Zijlstra24fc7ed2016-05-09 10:37:59 +02007569 struct sched_domain_shared *sds;
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007570 struct sched_group *sg;
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04007571 struct sched_group_capacity *sgc;
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007572
Peter Zijlstra5cc389b2015-06-11 14:46:50 +02007573 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007574 GFP_KERNEL, cpu_to_node(j));
7575 if (!sd)
7576 return -ENOMEM;
7577
7578 *per_cpu_ptr(sdd->sd, j) = sd;
7579
Peter Zijlstra24fc7ed2016-05-09 10:37:59 +02007580 sds = kzalloc_node(sizeof(struct sched_domain_shared),
7581 GFP_KERNEL, cpu_to_node(j));
7582 if (!sds)
7583 return -ENOMEM;
7584
7585 *per_cpu_ptr(sdd->sds, j) = sds;
7586
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007587 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7588 GFP_KERNEL, cpu_to_node(j));
7589 if (!sg)
7590 return -ENOMEM;
7591
Igor Mammedov30b4e9e2012-05-09 12:38:28 +02007592 sg->next = sg;
7593
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007594 *per_cpu_ptr(sdd->sg, j) = sg;
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02007595
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04007596 sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02007597 GFP_KERNEL, cpu_to_node(j));
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04007598 if (!sgc)
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02007599 return -ENOMEM;
7600
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04007601 *per_cpu_ptr(sdd->sgc, j) = sgc;
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007602 }
7603 }
7604
7605 return 0;
7606}
7607
7608static void __sdt_free(const struct cpumask *cpu_map)
7609{
7610 struct sched_domain_topology_level *tl;
7611 int j;
7612
Viresh Kumar27723a62013-06-10 16:27:20 +05307613 for_each_sd_topology(tl) {
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007614 struct sd_data *sdd = &tl->data;
7615
7616 for_each_cpu(j, cpu_map) {
he, bofb2cf2c2012-04-25 19:59:21 +08007617 struct sched_domain *sd;
7618
7619 if (sdd->sd) {
7620 sd = *per_cpu_ptr(sdd->sd, j);
7621 if (sd && (sd->flags & SD_OVERLAP))
7622 free_sched_groups(sd->groups, 0);
7623 kfree(*per_cpu_ptr(sdd->sd, j));
7624 }
7625
Peter Zijlstra24fc7ed2016-05-09 10:37:59 +02007626 if (sdd->sds)
7627 kfree(*per_cpu_ptr(sdd->sds, j));
he, bofb2cf2c2012-04-25 19:59:21 +08007628 if (sdd->sg)
7629 kfree(*per_cpu_ptr(sdd->sg, j));
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04007630 if (sdd->sgc)
7631 kfree(*per_cpu_ptr(sdd->sgc, j));
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007632 }
7633 free_percpu(sdd->sd);
he, bofb2cf2c2012-04-25 19:59:21 +08007634 sdd->sd = NULL;
Peter Zijlstra24fc7ed2016-05-09 10:37:59 +02007635 free_percpu(sdd->sds);
7636 sdd->sds = NULL;
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007637 free_percpu(sdd->sg);
he, bofb2cf2c2012-04-25 19:59:21 +08007638 sdd->sg = NULL;
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04007639 free_percpu(sdd->sgc);
7640 sdd->sgc = NULL;
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007641 }
7642}
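/*
 * Illustrative sketch, not part of this file, of the allocation pattern
 * used by __sdt_alloc()/__sdt_free(): a per-CPU array of pointers whose
 * objects are allocated on the owning CPU's node and torn down
 * symmetrically. 'struct sketch_obj' and the helpers are hypothetical.
 */
#ifdef SKETCH_PERCPU_PATTERN
struct sketch_obj { int dummy; };

static struct sketch_obj *__percpu *sketch_objs;

static int sketch_alloc(const struct cpumask *cpu_map)
{
	int j;

	sketch_objs = alloc_percpu(struct sketch_obj *);
	if (!sketch_objs)
		return -ENOMEM;

	for_each_cpu(j, cpu_map) {
		struct sketch_obj *o;

		o = kzalloc_node(sizeof(*o), GFP_KERNEL, cpu_to_node(j));
		if (!o)
			return -ENOMEM;	/* caller is expected to unwind */
		*per_cpu_ptr(sketch_objs, j) = o;
	}
	return 0;
}

static void sketch_free(const struct cpumask *cpu_map)
{
	int j;

	for_each_cpu(j, cpu_map)
		kfree(*per_cpu_ptr(sketch_objs, j));
	free_percpu(sketch_objs);
	sketch_objs = NULL;
}
#endif /* SKETCH_PERCPU_PATTERN */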
7643
Peter Zijlstra2c402dc2011-04-07 14:10:01 +02007644struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
Viresh Kumar4a850cb2013-06-04 16:12:43 +05307645 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7646 struct sched_domain *child, int cpu)
Peter Zijlstra2c402dc2011-04-07 14:10:01 +02007647{
Peter Zijlstra24fc7ed2016-05-09 10:37:59 +02007648 struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
Peter Zijlstra2c402dc2011-04-07 14:10:01 +02007649
Peter Zijlstra60495e72011-04-07 14:10:04 +02007650 if (child) {
7651 sd->level = child->level + 1;
7652 sched_domain_level_max = max(sched_domain_level_max, sd->level);
Peter Zijlstrad069b912011-04-07 14:10:02 +02007653 child->parent = sd;
Peter Zijlstra6ae72df2014-07-22 11:47:40 +02007654
7655 if (!cpumask_subset(sched_domain_span(child),
7656 sched_domain_span(sd))) {
7657 pr_err("BUG: arch topology borken\n");
7658#ifdef CONFIG_SCHED_DEBUG
7659 pr_err(" the %s domain not a subset of the %s domain\n",
7660 child->name, sd->name);
7661#endif
7662 /* Fixup, ensure @sd has at least @child cpus. */
7663 cpumask_or(sched_domain_span(sd),
7664 sched_domain_span(sd),
7665 sched_domain_span(child));
7666 }
7667
Peter Zijlstra60495e72011-04-07 14:10:04 +02007668 }
Dimitri Sivanicha841f8c2012-06-05 13:44:36 -05007669 set_domain_attribute(sd, attr);
Peter Zijlstra2c402dc2011-04-07 14:10:01 +02007670
7671 return sd;
7672}
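/*
 * Illustrative, standalone user-space sketch (not kernel code) of the
 * span fixup above: if a child domain's CPU span is not a subset of its
 * parent's, the parent span is widened to cover it. Plain bitmasks stand
 * in for cpumasks purely for illustration.
 */
#ifdef SKETCH_SPAN_FIXUP
#include <stdio.h>

int main(void)
{
	unsigned long child = 0x0fUL;	/* CPUs 0-3 */
	unsigned long parent = 0x33UL;	/* CPUs 0,1,4,5 - broken topology */

	if ((child & parent) != child) {	/* !cpumask_subset(child, parent) */
		printf("BUG: arch topology borken\n");
		parent |= child;		/* cpumask_or(parent, parent, child) */
	}
	printf("parent span: 0x%lx\n", parent);	/* 0x3f: CPUs 0-5 */
	return 0;
}
#endif /* SKETCH_SPAN_FIXUP */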
7673
Mike Travis7c16ec52008-04-04 18:11:11 -07007674/*
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07007675 * Build sched domains for a given set of cpus and attach the sched domains
7676 * to the individual cpus
Linus Torvalds1da177e2005-04-16 15:20:36 -07007677 */
Peter Zijlstradce840a2011-04-07 14:09:50 +02007678static int build_sched_domains(const struct cpumask *cpu_map,
7679 struct sched_domain_attr *attr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07007680{
Viresh Kumar1c632162013-06-10 16:27:18 +05307681 enum s_alloc alloc_state;
Peter Zijlstradce840a2011-04-07 14:09:50 +02007682 struct sched_domain *sd;
Andreas Herrmann49a02c52009-08-18 12:51:52 +02007683 struct s_data d;
Peter Zijlstra822ff792011-04-07 14:09:51 +02007684 int i, ret = -ENOMEM;
Rusty Russell3404c8d2008-11-25 02:35:03 +10307685
Andreas Herrmann2109b992009-08-18 12:53:00 +02007686 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7687 if (alloc_state != sa_rootdomain)
7688 goto error;
Mike Travis7c16ec52008-04-04 18:11:11 -07007689
Peter Zijlstradce840a2011-04-07 14:09:50 +02007690 /* Set up domains for cpus specified by the cpu_map. */
Rusty Russellabcd0832008-11-25 02:35:02 +10307691 for_each_cpu(i, cpu_map) {
Peter Zijlstraeb7a74e62011-04-07 14:10:00 +02007692 struct sched_domain_topology_level *tl;
7693
Peter Zijlstra3bd65a82011-04-07 14:09:54 +02007694 sd = NULL;
Viresh Kumar27723a62013-06-10 16:27:20 +05307695 for_each_sd_topology(tl) {
Viresh Kumar4a850cb2013-06-04 16:12:43 +05307696 sd = build_sched_domain(tl, cpu_map, attr, sd, i);
Viresh Kumar22da9562013-06-04 15:41:15 +05307697 if (tl == sched_domain_topology)
7698 *per_cpu_ptr(d.sd, i) = sd;
Peter Zijlstrae3589f62011-07-15 10:35:52 +02007699 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
7700 sd->flags |= SD_OVERLAP;
7701 }
Peter Zijlstradce840a2011-04-07 14:09:50 +02007702 }
Peter Zijlstra21d42cc2011-04-07 14:09:48 +02007703
Peter Zijlstradce840a2011-04-07 14:09:50 +02007704 /* Build the groups for the domains */
7705 for_each_cpu(i, cpu_map) {
7706 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7707 sd->span_weight = cpumask_weight(sched_domain_span(sd));
Peter Zijlstrae3589f62011-07-15 10:35:52 +02007708 if (sd->flags & SD_OVERLAP) {
7709 if (build_overlap_sched_groups(sd, i))
7710 goto error;
7711 } else {
7712 if (build_sched_groups(sd, i))
7713 goto error;
7714 }
Peter Zijlstra1cf519022011-04-07 14:09:47 +02007715 }
Peter Zijlstraa06dadb2011-04-07 14:09:44 +02007716 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07007717
Nicolas Pitreced549f2014-05-26 18:19:38 -04007718 /* Calculate CPU capacity for physical packages and nodes */
Peter Zijlstraa9c9a9b2011-04-07 14:09:49 +02007719 for (i = nr_cpumask_bits-1; i >= 0; i--) {
Dietmar Eggemanndd23c092014-11-14 16:20:20 +00007720 struct sched_domain_topology_level *tl = sched_domain_topology;
7721
Peter Zijlstraa9c9a9b2011-04-07 14:09:49 +02007722 if (!cpumask_test_cpu(i, cpu_map))
7723 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07007724
Dietmar Eggemanndd23c092014-11-14 16:20:20 +00007725 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) {
7726 init_sched_energy(i, sd, tl->energy);
Peter Zijlstradce840a2011-04-07 14:09:50 +02007727 claim_allocations(i, sd);
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04007728 init_sched_groups_capacity(i, sd);
Peter Zijlstradce840a2011-04-07 14:09:50 +02007729 }
Siddha, Suresh Bf712c0c2006-07-30 03:02:59 -07007730 }
John Hawkes9c1cfda2005-09-06 15:18:14 -07007731
Linus Torvalds1da177e2005-04-16 15:20:36 -07007732 /* Attach the domains */
Peter Zijlstradce840a2011-04-07 14:09:50 +02007733 rcu_read_lock();
Rusty Russellabcd0832008-11-25 02:35:02 +10307734 for_each_cpu(i, cpu_map) {
Dietmar Eggemann14774e72017-01-08 16:16:59 +00007735 int max_cpu = READ_ONCE(d.rd->max_cap_orig_cpu);
7736 int min_cpu = READ_ONCE(d.rd->min_cap_orig_cpu);
7737
7738 if ((max_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig >
7739 cpu_rq(max_cpu)->cpu_capacity_orig))
7740 WRITE_ONCE(d.rd->max_cap_orig_cpu, i);
7741
7742 if ((min_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig <
7743 cpu_rq(min_cpu)->cpu_capacity_orig))
7744 WRITE_ONCE(d.rd->min_cap_orig_cpu, i);
7745
Peter Zijlstra21d42cc2011-04-07 14:09:48 +02007746 sd = *per_cpu_ptr(d.sd, i);
Dietmar Eggemann14774e72017-01-08 16:16:59 +00007747
Andreas Herrmann49a02c52009-08-18 12:51:52 +02007748 cpu_attach_domain(sd, d.rd, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007749 }
Peter Zijlstradce840a2011-04-07 14:09:50 +02007750 rcu_read_unlock();
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07007751
Peter Zijlstra822ff792011-04-07 14:09:51 +02007752 ret = 0;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07007753error:
Andreas Herrmann2109b992009-08-18 12:53:00 +02007754 __free_domain_allocs(&d, alloc_state, cpu_map);
Peter Zijlstra822ff792011-04-07 14:09:51 +02007755 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07007756}
Paul Jackson029190c2007-10-18 23:40:20 -07007757
Rusty Russellacc3f5d2009-11-03 14:53:40 +10307758static cpumask_var_t *doms_cur; /* current sched domains */
Paul Jackson029190c2007-10-18 23:40:20 -07007759static int ndoms_cur; /* number of sched domains in 'doms_cur' */
Ingo Molnar4285f5942008-05-16 17:47:14 +02007760static struct sched_domain_attr *dattr_cur;
7761				/* attributes of custom domains in 'doms_cur' */
Paul Jackson029190c2007-10-18 23:40:20 -07007762
7763/*
7764 * Special case: If a kmalloc of a doms_cur partition (array of
Rusty Russell42128232008-11-25 02:35:12 +10307765 * cpumask) fails, then fall back to a single sched domain,
7766 * as determined by the single cpumask fallback_doms.
Paul Jackson029190c2007-10-18 23:40:20 -07007767 */
Rusty Russell42128232008-11-25 02:35:12 +10307768static cpumask_var_t fallback_doms;
Paul Jackson029190c2007-10-18 23:40:20 -07007769
Heiko Carstensee79d1b2008-12-09 18:49:50 +01007770/*
7771 * arch_update_cpu_topology lets virtualized architectures update the
7772 * cpu core maps. It is supposed to return 1 if the topology changed
7773 * or 0 if it stayed the same.
7774 */
Gideon Israel Dsouza52f5684c2014-04-07 15:39:20 -07007775int __weak arch_update_cpu_topology(void)
Heiko Carstens22e52b02008-03-12 18:31:59 +01007776{
Heiko Carstensee79d1b2008-12-09 18:49:50 +01007777 return 0;
Heiko Carstens22e52b02008-03-12 18:31:59 +01007778}
7779
Rusty Russellacc3f5d2009-11-03 14:53:40 +10307780cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
7781{
7782 int i;
7783 cpumask_var_t *doms;
7784
7785 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
7786 if (!doms)
7787 return NULL;
7788 for (i = 0; i < ndoms; i++) {
7789 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
7790 free_sched_domains(doms, i);
7791 return NULL;
7792 }
7793 }
7794 return doms;
7795}
7796
7797void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7798{
7799 unsigned int i;
7800 for (i = 0; i < ndoms; i++)
7801 free_cpumask_var(doms[i]);
7802 kfree(doms);
7803}
7804
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07007805/*
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01007806 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
Paul Jackson029190c2007-10-18 23:40:20 -07007807 * For now this just excludes isolated cpus, but could be used to
7808 * exclude other special cases in the future.
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07007809 */
Peter Zijlstrac4a88492011-04-07 14:09:42 +02007810static int init_sched_domains(const struct cpumask *cpu_map)
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07007811{
Milton Miller73785472007-10-24 18:23:48 +02007812 int err;
7813
Heiko Carstens22e52b02008-03-12 18:31:59 +01007814 arch_update_cpu_topology();
Paul Jackson029190c2007-10-18 23:40:20 -07007815 ndoms_cur = 1;
Rusty Russellacc3f5d2009-11-03 14:53:40 +10307816 doms_cur = alloc_sched_domains(ndoms_cur);
Paul Jackson029190c2007-10-18 23:40:20 -07007817 if (!doms_cur)
Rusty Russellacc3f5d2009-11-03 14:53:40 +10307818 doms_cur = &fallback_doms;
7819 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
Peter Zijlstradce840a2011-04-07 14:09:50 +02007820 err = build_sched_domains(doms_cur[0], NULL);
Milton Miller6382bc92007-10-15 17:00:19 +02007821 register_sched_domain_sysctl();
Milton Miller73785472007-10-24 18:23:48 +02007822
7823 return err;
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07007824}
7825
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07007826/*
7827 * Detach sched domains from a group of cpus specified in cpu_map
7828 * These cpus will now be attached to the NULL domain
7829 */
Rusty Russell96f874e2008-11-25 02:35:14 +10307830static void detach_destroy_domains(const struct cpumask *cpu_map)
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07007831{
7832 int i;
7833
Peter Zijlstradce840a2011-04-07 14:09:50 +02007834 rcu_read_lock();
Rusty Russellabcd0832008-11-25 02:35:02 +10307835 for_each_cpu(i, cpu_map)
Gregory Haskins57d885f2008-01-25 21:08:18 +01007836 cpu_attach_domain(NULL, &def_root_domain, i);
Peter Zijlstradce840a2011-04-07 14:09:50 +02007837 rcu_read_unlock();
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07007838}
7839
Hidetoshi Seto1d3504f2008-04-15 14:04:23 +09007840/* handle null as "default" */
7841static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7842 struct sched_domain_attr *new, int idx_new)
7843{
7844 struct sched_domain_attr tmp;
7845
7846 /* fast path */
7847 if (!new && !cur)
7848 return 1;
7849
7850 tmp = SD_ATTR_INIT;
7851 return !memcmp(cur ? (cur + idx_cur) : &tmp,
7852 new ? (new + idx_new) : &tmp,
7853 sizeof(struct sched_domain_attr));
7854}
7855
Paul Jackson029190c2007-10-18 23:40:20 -07007856/*
7857 * Partition sched domains as specified by the 'ndoms_new'
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01007858 * cpumasks in the array doms_new[] of cpumasks. This compares
Paul Jackson029190c2007-10-18 23:40:20 -07007859 * doms_new[] to the current sched domain partitioning, doms_cur[].
7860 * It destroys each deleted domain and builds each new domain.
7861 *
Rusty Russellacc3f5d2009-11-03 14:53:40 +10307862 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01007863 * The masks don't intersect (don't overlap). We should set up one
7864 * sched domain for each mask. CPUs not in any of the cpumasks will
7865 * not be load balanced. If the same cpumask appears both in the
Paul Jackson029190c2007-10-18 23:40:20 -07007866 * current 'doms_cur' domains and in the new 'doms_new', we can leave
7867 * it as it is.
7868 *
Rusty Russellacc3f5d2009-11-03 14:53:40 +10307869 * The passed in 'doms_new' should be allocated using
7870 * alloc_sched_domains. This routine takes ownership of it and will
7871 * free_sched_domains it when done with it. If the caller failed the
7872 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
7873 * and partition_sched_domains() will fall back to the single partition
7874 * 'fallback_doms', it also forces the domains to be rebuilt.
Paul Jackson029190c2007-10-18 23:40:20 -07007875 *
Rusty Russell96f874e2008-11-25 02:35:14 +10307876 * If doms_new == NULL it will be replaced with cpu_online_mask.
Li Zefan700018e2008-11-18 14:02:03 +08007877 * ndoms_new == 0 is a special case for destroying existing domains,
7878 * and it will not create the default domain.
Max Krasnyanskydfb512e2008-08-29 13:11:41 -07007879 *
Paul Jackson029190c2007-10-18 23:40:20 -07007880 * Call with hotplug lock held
7881 */
Rusty Russellacc3f5d2009-11-03 14:53:40 +10307882void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
Hidetoshi Seto1d3504f2008-04-15 14:04:23 +09007883 struct sched_domain_attr *dattr_new)
Paul Jackson029190c2007-10-18 23:40:20 -07007884{
Max Krasnyanskydfb512e2008-08-29 13:11:41 -07007885 int i, j, n;
Heiko Carstensd65bd5e2008-12-09 18:49:51 +01007886 int new_topology;
Paul Jackson029190c2007-10-18 23:40:20 -07007887
Heiko Carstens712555e2008-04-28 11:33:07 +02007888 mutex_lock(&sched_domains_mutex);
Srivatsa Vaddagiria1835612008-01-25 21:08:00 +01007889
Milton Miller73785472007-10-24 18:23:48 +02007890 /* always unregister in case we don't destroy any domains */
7891 unregister_sched_domain_sysctl();
7892
Heiko Carstensd65bd5e2008-12-09 18:49:51 +01007893 /* Let architecture update cpu core mappings. */
7894 new_topology = arch_update_cpu_topology();
7895
Max Krasnyanskydfb512e2008-08-29 13:11:41 -07007896 n = doms_new ? ndoms_new : 0;
Paul Jackson029190c2007-10-18 23:40:20 -07007897
7898 /* Destroy deleted domains */
7899 for (i = 0; i < ndoms_cur; i++) {
Heiko Carstensd65bd5e2008-12-09 18:49:51 +01007900 for (j = 0; j < n && !new_topology; j++) {
Rusty Russellacc3f5d2009-11-03 14:53:40 +10307901 if (cpumask_equal(doms_cur[i], doms_new[j])
Hidetoshi Seto1d3504f2008-04-15 14:04:23 +09007902 && dattrs_equal(dattr_cur, i, dattr_new, j))
Paul Jackson029190c2007-10-18 23:40:20 -07007903 goto match1;
7904 }
7905 /* no match - a current sched domain not in new doms_new[] */
Rusty Russellacc3f5d2009-11-03 14:53:40 +10307906 detach_destroy_domains(doms_cur[i]);
Paul Jackson029190c2007-10-18 23:40:20 -07007907match1:
7908 ;
7909 }
7910
Xiaotian Fengc8d2d472013-08-06 20:06:42 +08007911 n = ndoms_cur;
Max Krasnyanskye761b772008-07-15 04:43:49 -07007912 if (doms_new == NULL) {
Xiaotian Fengc8d2d472013-08-06 20:06:42 +08007913 n = 0;
Rusty Russellacc3f5d2009-11-03 14:53:40 +10307914 doms_new = &fallback_doms;
Peter Zijlstra6ad4c182009-11-25 13:31:39 +01007915 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
Li Zefanfaa2f982008-11-04 16:20:23 +08007916 WARN_ON_ONCE(dattr_new);
Max Krasnyanskye761b772008-07-15 04:43:49 -07007917 }
7918
Paul Jackson029190c2007-10-18 23:40:20 -07007919 /* Build new domains */
7920 for (i = 0; i < ndoms_new; i++) {
Xiaotian Fengc8d2d472013-08-06 20:06:42 +08007921 for (j = 0; j < n && !new_topology; j++) {
Rusty Russellacc3f5d2009-11-03 14:53:40 +10307922 if (cpumask_equal(doms_new[i], doms_cur[j])
Hidetoshi Seto1d3504f2008-04-15 14:04:23 +09007923 && dattrs_equal(dattr_new, i, dattr_cur, j))
Paul Jackson029190c2007-10-18 23:40:20 -07007924 goto match2;
7925 }
7926 /* no match - add a new doms_new */
Peter Zijlstradce840a2011-04-07 14:09:50 +02007927 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
Paul Jackson029190c2007-10-18 23:40:20 -07007928match2:
7929 ;
7930 }
7931
7932 /* Remember the new sched domains */
Rusty Russellacc3f5d2009-11-03 14:53:40 +10307933 if (doms_cur != &fallback_doms)
7934 free_sched_domains(doms_cur, ndoms_cur);
Hidetoshi Seto1d3504f2008-04-15 14:04:23 +09007935 kfree(dattr_cur); /* kfree(NULL) is safe */
Paul Jackson029190c2007-10-18 23:40:20 -07007936 doms_cur = doms_new;
Hidetoshi Seto1d3504f2008-04-15 14:04:23 +09007937 dattr_cur = dattr_new;
Paul Jackson029190c2007-10-18 23:40:20 -07007938 ndoms_cur = ndoms_new;
Milton Miller73785472007-10-24 18:23:48 +02007939
7940 register_sched_domain_sysctl();
Srivatsa Vaddagiria1835612008-01-25 21:08:00 +01007941
Heiko Carstens712555e2008-04-28 11:33:07 +02007942 mutex_unlock(&sched_domains_mutex);
Paul Jackson029190c2007-10-18 23:40:20 -07007943}
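/*
 * Illustrative sketch, not part of this file, of how a caller such as the
 * cpuset code hands a new partitioning to partition_sched_domains(): the
 * doms array comes from alloc_sched_domains(), ownership passes to the
 * scheduler, and the masks must not overlap. The split into CPUs 0-1 and
 * the remaining active CPUs is hypothetical.
 */
#ifdef SKETCH_PARTITION_DOMAINS
static void sketch_repartition(void)
{
	cpumask_var_t *doms = alloc_sched_domains(2);

	if (!doms) {
		/* doms_new == NULL, ndoms_new == 1 forces the fallback partition. */
		partition_sched_domains(1, NULL, NULL);
		return;
	}

	cpumask_clear(doms[0]);
	cpumask_set_cpu(0, doms[0]);
	cpumask_set_cpu(1, doms[0]);
	cpumask_andnot(doms[1], cpu_active_mask, doms[0]);

	/* Call with the hotplug lock held; takes ownership of doms. */
	partition_sched_domains(2, doms, NULL);
}
#endif /* SKETCH_PARTITION_DOMAINS */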
7944
Srivatsa S. Bhatd35be8b2012-05-24 19:46:26 +05307945static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
7946
Linus Torvalds1da177e2005-04-16 15:20:36 -07007947/*
Tejun Heo3a101d02010-06-08 21:40:36 +02007948 * Update cpusets according to cpu_active mask. If cpusets are
7949 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7950 * around partition_sched_domains().
Srivatsa S. Bhatd35be8b2012-05-24 19:46:26 +05307951 *
7952 * If we come here as part of a suspend/resume, don't touch cpusets because we
7953 * want to restore it back to its original state upon resume anyway.
Linus Torvalds1da177e2005-04-16 15:20:36 -07007954 */
Thomas Gleixner40190a72016-03-10 12:54:13 +01007955static void cpuset_cpu_active(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07007956{
Thomas Gleixner40190a72016-03-10 12:54:13 +01007957 if (cpuhp_tasks_frozen) {
Srivatsa S. Bhatd35be8b2012-05-24 19:46:26 +05307958 /*
7959 * num_cpus_frozen tracks how many CPUs are involved in suspend
7960 * resume sequence. As long as this is not the last online
7961 * operation in the resume sequence, just build a single sched
7962 * domain, ignoring cpusets.
7963 */
Peter Zijlstraba155182017-09-07 11:13:38 +02007964 partition_sched_domains(1, NULL, NULL);
7965 if (--num_cpus_frozen)
Thomas Gleixner135fb3e2016-03-10 12:54:11 +01007966 return;
Srivatsa S. Bhatd35be8b2012-05-24 19:46:26 +05307967 /*
7968 * This is the last CPU online operation. So fall through and
7969 * restore the original sched domains by considering the
7970 * cpuset configurations.
7971 */
Peter Zijlstraba155182017-09-07 11:13:38 +02007972 cpuset_force_rebuild();
Max Krasnyanskye761b772008-07-15 04:43:49 -07007973 }
Thomas Gleixner135fb3e2016-03-10 12:54:11 +01007974 cpuset_update_active_cpus(true);
Max Krasnyanskye761b772008-07-15 04:43:49 -07007975}
Tejun Heo3a101d02010-06-08 21:40:36 +02007976
Thomas Gleixner40190a72016-03-10 12:54:13 +01007977static int cpuset_cpu_inactive(unsigned int cpu)
Tejun Heo3a101d02010-06-08 21:40:36 +02007978{
Juri Lelli3c18d442015-03-31 09:53:37 +01007979 unsigned long flags;
Juri Lelli3c18d442015-03-31 09:53:37 +01007980 struct dl_bw *dl_b;
Omar Sandoval533445c2015-05-04 03:09:36 -07007981 bool overflow;
7982 int cpus;
Juri Lelli3c18d442015-03-31 09:53:37 +01007983
Thomas Gleixner40190a72016-03-10 12:54:13 +01007984 if (!cpuhp_tasks_frozen) {
Omar Sandoval533445c2015-05-04 03:09:36 -07007985 rcu_read_lock_sched();
7986 dl_b = dl_bw_of(cpu);
Juri Lelli3c18d442015-03-31 09:53:37 +01007987
Omar Sandoval533445c2015-05-04 03:09:36 -07007988 raw_spin_lock_irqsave(&dl_b->lock, flags);
7989 cpus = dl_bw_cpus(cpu);
7990 overflow = __dl_overflow(dl_b, cpus, 0, 0);
7991 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
Juri Lelli3c18d442015-03-31 09:53:37 +01007992
Omar Sandoval533445c2015-05-04 03:09:36 -07007993 rcu_read_unlock_sched();
Juri Lelli3c18d442015-03-31 09:53:37 +01007994
Omar Sandoval533445c2015-05-04 03:09:36 -07007995 if (overflow)
Thomas Gleixner135fb3e2016-03-10 12:54:11 +01007996 return -EBUSY;
Srivatsa S. Bhat7ddf96b2012-05-24 19:46:55 +05307997 cpuset_update_active_cpus(false);
Thomas Gleixner135fb3e2016-03-10 12:54:11 +01007998 } else {
Srivatsa S. Bhatd35be8b2012-05-24 19:46:26 +05307999 num_cpus_frozen++;
8000 partition_sched_domains(1, NULL, NULL);
Tejun Heo3a101d02010-06-08 21:40:36 +02008001 }
Thomas Gleixner135fb3e2016-03-10 12:54:11 +01008002 return 0;
Tejun Heo3a101d02010-06-08 21:40:36 +02008003}
Max Krasnyanskye761b772008-07-15 04:43:49 -07008004
Ben Hutchingsc8034092019-05-10 00:46:25 +01008005#ifdef CONFIG_SCHED_SMT
8006atomic_t sched_smt_present = ATOMIC_INIT(0);
8007#endif
8008
Thomas Gleixner40190a72016-03-10 12:54:13 +01008009int sched_cpu_activate(unsigned int cpu)
Thomas Gleixner135fb3e2016-03-10 12:54:11 +01008010{
Thomas Gleixner7d976692016-03-10 12:54:17 +01008011 struct rq *rq = cpu_rq(cpu);
8012 unsigned long flags;
8013
Ben Hutchingsc8034092019-05-10 00:46:25 +01008014#ifdef CONFIG_SCHED_SMT
8015 /*
8016 * When going up, increment the number of cores with SMT present.
8017 */
8018 if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
8019 atomic_inc(&sched_smt_present);
8020#endif
Thomas Gleixner40190a72016-03-10 12:54:13 +01008021 set_cpu_active(cpu, true);
Thomas Gleixner135fb3e2016-03-10 12:54:11 +01008022
Thomas Gleixner40190a72016-03-10 12:54:13 +01008023 if (sched_smp_initialized) {
Thomas Gleixner135fb3e2016-03-10 12:54:11 +01008024 sched_domains_numa_masks_set(cpu);
Thomas Gleixner40190a72016-03-10 12:54:13 +01008025 cpuset_cpu_active();
Nick Piggin5c1e1762006-10-03 01:14:04 -07008026 }
Thomas Gleixner7d976692016-03-10 12:54:17 +01008027
8028 /*
8029 * Put the rq online, if not already. This happens:
8030 *
8031 * 1) In the early boot process, because we build the real domains
8032 * after all cpus have been brought up.
8033 *
8034 * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
8035 * domains.
8036 */
8037 raw_spin_lock_irqsave(&rq->lock, flags);
8038 if (rq->rd) {
8039 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
8040 set_rq_online(rq);
8041 }
8042 raw_spin_unlock_irqrestore(&rq->lock, flags);
8043
8044 update_max_interval();
Pavankumar Kondetif51d5392018-11-28 11:57:29 +05308045 walt_update_min_max_capacity();
Thomas Gleixner7d976692016-03-10 12:54:17 +01008046
Thomas Gleixner40190a72016-03-10 12:54:13 +01008047 return 0;
Thomas Gleixner135fb3e2016-03-10 12:54:11 +01008048}
8049
Thomas Gleixner40190a72016-03-10 12:54:13 +01008050int sched_cpu_deactivate(unsigned int cpu)
Thomas Gleixner135fb3e2016-03-10 12:54:11 +01008051{
Thomas Gleixner135fb3e2016-03-10 12:54:11 +01008052 int ret;
8053
Thomas Gleixner40190a72016-03-10 12:54:13 +01008054 set_cpu_active(cpu, false);
Kyle Yana9790472017-06-19 15:01:20 -07008055 /*
8056 * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
8057 * users of this state to go away such that all new such users will
8058 * observe it.
8059 *
8060 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
8061 * not imply sync_sched(), so wait for both.
8062 *
8063 * Do sync before park smpboot threads to take care the rcu boost case.
8064 */
8065 if (IS_ENABLED(CONFIG_PREEMPT))
8066 synchronize_rcu_mult(call_rcu, call_rcu_sched);
8067 else
8068 synchronize_rcu();
Thomas Gleixner40190a72016-03-10 12:54:13 +01008069
Ben Hutchingsc8034092019-05-10 00:46:25 +01008070#ifdef CONFIG_SCHED_SMT
8071 /*
8072 * When going down, decrement the number of cores with SMT present.
8073 */
8074 if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
8075 atomic_dec(&sched_smt_present);
8076#endif
8077
Thomas Gleixner40190a72016-03-10 12:54:13 +01008078 if (!sched_smp_initialized)
8079 return 0;
8080
8081 ret = cpuset_cpu_inactive(cpu);
8082 if (ret) {
8083 set_cpu_active(cpu, true);
8084 return ret;
Thomas Gleixner135fb3e2016-03-10 12:54:11 +01008085 }
Thomas Gleixner40190a72016-03-10 12:54:13 +01008086 sched_domains_numa_masks_clear(cpu);
Pavankumar Kondetif51d5392018-11-28 11:57:29 +05308087 walt_update_min_max_capacity();
Thomas Gleixner40190a72016-03-10 12:54:13 +01008088 return 0;
Thomas Gleixner135fb3e2016-03-10 12:54:11 +01008089}
8090
Thomas Gleixner94baf7a2016-03-10 12:54:15 +01008091static void sched_rq_cpu_starting(unsigned int cpu)
8092{
8093 struct rq *rq = cpu_rq(cpu);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008094 unsigned long flags;
Thomas Gleixner94baf7a2016-03-10 12:54:15 +01008095
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008096 raw_spin_lock_irqsave(&rq->lock, flags);
8097 set_window_start(rq);
8098 raw_spin_unlock_irqrestore(&rq->lock, flags);
Thomas Gleixner94baf7a2016-03-10 12:54:15 +01008099 rq->calc_load_update = calc_load_update;
Thomas Gleixner94baf7a2016-03-10 12:54:15 +01008100 update_max_interval();
8101}
8102
Thomas Gleixner135fb3e2016-03-10 12:54:11 +01008103int sched_cpu_starting(unsigned int cpu)
8104{
8105 set_cpu_rq_start_time(cpu);
Thomas Gleixner94baf7a2016-03-10 12:54:15 +01008106 sched_rq_cpu_starting(cpu);
Maria Yua396ef72019-06-27 14:43:16 +08008107 clear_walt_request(cpu);
Thomas Gleixner135fb3e2016-03-10 12:54:11 +01008108 return 0;
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008109}
8110
Thomas Gleixnerf2785dd2016-03-10 12:54:18 +01008111#ifdef CONFIG_HOTPLUG_CPU
8112int sched_cpu_dying(unsigned int cpu)
8113{
8114 struct rq *rq = cpu_rq(cpu);
8115 unsigned long flags;
8116
8117 /* Handle pending wakeups and then migrate everything off */
8118 sched_ttwu_pending();
8119 raw_spin_lock_irqsave(&rq->lock, flags);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008120
Thomas Gleixnerf2785dd2016-03-10 12:54:18 +01008121 if (rq->rd) {
8122 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
8123 set_rq_offline(rq);
8124 }
Olav Haugan3f2cb302016-05-31 14:34:46 -07008125 migrate_tasks(rq, true);
Thomas Gleixnerf2785dd2016-03-10 12:54:18 +01008126 BUG_ON(rq->nr_running != 1);
8127 raw_spin_unlock_irqrestore(&rq->lock, flags);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008128
Pavankumar Kondeti84f72d72017-07-20 11:00:45 +05308129 clear_walt_request(cpu);
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008130
Thomas Gleixnerf2785dd2016-03-10 12:54:18 +01008131 calc_load_migrate(rq);
8132 update_max_interval();
Thomas Gleixner20a5c8c2016-03-10 12:54:20 +01008133 nohz_balance_exit_idle(cpu);
Thomas Gleixnere5ef27d2016-03-10 12:54:21 +01008134 hrtick_clear(rq);
Thomas Gleixnerf2785dd2016-03-10 12:54:18 +01008135 return 0;
8136}
8137#endif
8138
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008139void __init sched_init_smp(void)
8140{
Linus Torvalds1da177e2005-04-16 15:20:36 -07008141 cpumask_var_t non_isolated_cpus;
8142
Christoph Lameter476f3532007-05-06 14:48:58 -07008143 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
Ingo Molnardd41f592007-07-09 18:51:59 +02008144 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
Mike Travis434d53b2008-04-04 18:11:04 -07008145
8146 sched_init_numa();
8147
Peter Zijlstra6acce3e2013-10-11 14:38:20 +02008148 /*
8149 * There's no userspace yet to cause hotplug operations; hence all the
8150 * cpu masks are stable and all blatant races in the below code cannot
8151 * happen.
8152 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07008153 mutex_lock(&sched_domains_mutex);
8154 init_sched_domains(cpu_active_mask);
8155 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
8156 if (cpumask_empty(non_isolated_cpus))
8157 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
8158 mutex_unlock(&sched_domains_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07008159
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008160 update_cluster_topology();
8161
Mike Travis434d53b2008-04-04 18:11:04 -07008162 /* Move init over to a non-isolated CPU */
8163 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
8164 BUG();
Pavankumar Kondeti435eea92019-02-28 10:40:39 +05308165 cpumask_copy(&current->cpus_requested, cpu_possible_mask);
Linus Torvalds1da177e2005-04-16 15:20:36 -07008166 sched_init_granularity();
Rusty Russelldcc30a32008-11-25 02:35:12 +10308167 free_cpumask_var(non_isolated_cpus);
Rusty Russell42128232008-11-25 02:35:12 +10308168
Rusty Russell0e3900e2008-11-25 02:35:13 +10308169 init_sched_rt_class();
Juri Lelli1baca4c2013-11-07 14:43:38 +01008170 init_sched_dl_class();
Thomas Gleixnere26fbff2016-03-10 12:54:10 +01008171 sched_smp_initialized = true;
Linus Torvalds1da177e2005-04-16 15:20:36 -07008172}
Thomas Gleixnere26fbff2016-03-10 12:54:10 +01008173
8174static int __init migration_init(void)
8175{
Thomas Gleixner94baf7a2016-03-10 12:54:15 +01008176 sched_rq_cpu_starting(smp_processor_id());
Thomas Gleixnere26fbff2016-03-10 12:54:10 +01008177 return 0;
8178}
8179early_initcall(migration_init);
8180
Ingo Molnardd41f592007-07-09 18:51:59 +02008181#else
8182void __init sched_init_smp(void)
8183{
Linus Torvalds1da177e2005-04-16 15:20:36 -07008184 sched_init_granularity();
8185}
Peter Williams2dd73a42006-06-27 02:54:34 -07008186#endif /* CONFIG_SMP */
Heiko Carstensb50f60c2006-07-30 03:03:52 -07008187
Avi Kivitye107be32007-07-26 13:40:43 +02008188int in_sched_functions(unsigned long addr)
8189{
8190 return in_lock_functions(addr) ||
8191 (addr >= (unsigned long)__sched_text_start
Christoph Lameterc9819f42006-12-10 02:20:25 -08008192 && addr < (unsigned long)__sched_text_end);
Christoph Lameter476f3532007-05-06 14:48:58 -07008193}
Christoph Lameterc9819f42006-12-10 02:20:25 -08008194
Peter Zijlstra029632f2011-10-25 10:00:11 +02008195#ifdef CONFIG_CGROUP_SCHED
Li Zefan27b4b932013-03-05 16:07:52 +08008196/*
8197 * Default task group.
8198 * Every task in system belongs to this group at bootup.
8199 */
Peter Zijlstra029632f2011-10-25 10:00:11 +02008200struct task_group root_task_group;
Mike Galbraith35cf4e52012-08-07 05:00:13 +02008201LIST_HEAD(task_groups);
Waiman Longb0367622015-12-02 13:41:49 -05008202
8203/* Cacheline aligned slab cache for task_group */
8204static struct kmem_cache *task_group_cache __read_mostly;
Heiko Carstensb50f60c2006-07-30 03:03:52 -07008205#endif
8206
Joonsoo Kime6252c32013-04-23 17:27:41 +09008207DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02008208DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
Linus Torvalds1da177e2005-04-16 15:20:36 -07008209
Linus Torvalds9dcb8b62016-10-26 10:15:30 -07008210#define WAIT_TABLE_BITS 8
8211#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
8212static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
8213
8214wait_queue_head_t *bit_waitqueue(void *word, int bit)
8215{
8216 const int shift = BITS_PER_LONG == 32 ? 5 : 6;
8217 unsigned long val = (unsigned long)word << shift | bit;
8218
8219 return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
8220}
8221EXPORT_SYMBOL(bit_waitqueue);
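/*
 * Illustrative, standalone user-space sketch (not kernel code) of the
 * indexing above: the word address and bit number are folded into one
 * value and hashed down to WAIT_TABLE_BITS bits. The multiplicative hash
 * below is a stand-in for the kernel's hash_long(), included only to keep
 * the sketch self-contained; the constant is the 64-bit golden-ratio
 * value, assuming a 64-bit unsigned long.
 */
#ifdef SKETCH_BIT_WAITQUEUE
#include <stdio.h>

#define SKETCH_TABLE_BITS	8

static unsigned long sketch_hash_long(unsigned long val, unsigned int bits)
{
	return (val * 0x61C8864680B583EBUL) >> (64 - bits);
}

static unsigned long sketch_bit_index(const void *word, int bit)
{
	const int shift = sizeof(long) == 4 ? 5 : 6;
	unsigned long val = (unsigned long)word << shift | bit;

	return sketch_hash_long(val, SKETCH_TABLE_BITS);
}

int main(void)
{
	static unsigned long flags;

	/* Different bits of the same word usually land in different buckets. */
	printf("bucket for bit 0: %lu\n", sketch_bit_index(&flags, 0));
	printf("bucket for bit 1: %lu\n", sketch_bit_index(&flags, 1));
	return 0;
}
#endif /* SKETCH_BIT_WAITQUEUE */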
8222
Linus Torvalds1da177e2005-04-16 15:20:36 -07008223void __init sched_init(void)
8224{
Linus Torvalds1da177e2005-04-16 15:20:36 -07008225 int i, j;
Mike Travis434d53b2008-04-04 18:11:04 -07008226 unsigned long alloc_size = 0, ptr;
8227
Linus Torvalds9dcb8b62016-10-26 10:15:30 -07008228 for (i = 0; i < WAIT_TABLE_SIZE; i++)
8229 init_waitqueue_head(bit_wait_table + i);
8230
Syed Rameez Mustafa084075b2016-08-31 16:54:12 -07008231 sched_boost_parse_dt();
Pavankumar Kondeti14c79002016-10-01 11:06:13 +05308232 init_clusters();
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008233
Mike Travis434d53b2008-04-04 18:11:04 -07008234#ifdef CONFIG_FAIR_GROUP_SCHED
8235 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8236#endif
8237#ifdef CONFIG_RT_GROUP_SCHED
8238 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8239#endif
Mike Travis434d53b2008-04-04 18:11:04 -07008240 if (alloc_size) {
Pekka Enberg36b7b6d2009-06-10 23:42:36 +03008241 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
Mike Travis434d53b2008-04-04 18:11:04 -07008242
8243#ifdef CONFIG_FAIR_GROUP_SCHED
Yong Zhang07e06b02011-01-07 15:17:36 +08008244 root_task_group.se = (struct sched_entity **)ptr;
Mike Travis434d53b2008-04-04 18:11:04 -07008245 ptr += nr_cpu_ids * sizeof(void **);
8246
Yong Zhang07e06b02011-01-07 15:17:36 +08008247 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
Mike Travis434d53b2008-04-04 18:11:04 -07008248 ptr += nr_cpu_ids * sizeof(void **);
Peter Zijlstraeff766a2008-04-19 19:45:00 +02008249
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02008250#endif /* CONFIG_FAIR_GROUP_SCHED */
Mike Travis434d53b2008-04-04 18:11:04 -07008251#ifdef CONFIG_RT_GROUP_SCHED
Yong Zhang07e06b02011-01-07 15:17:36 +08008252 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
Mike Travis434d53b2008-04-04 18:11:04 -07008253 ptr += nr_cpu_ids * sizeof(void **);
8254
Yong Zhang07e06b02011-01-07 15:17:36 +08008255 root_task_group.rt_rq = (struct rt_rq **)ptr;
Peter Zijlstraeff766a2008-04-19 19:45:00 +02008256 ptr += nr_cpu_ids * sizeof(void **);
8257
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02008258#endif /* CONFIG_RT_GROUP_SCHED */
Mike Travis434d53b2008-04-04 18:11:04 -07008259 }
Alex Thorltonb74e6272014-12-18 12:44:30 -06008260#ifdef CONFIG_CPUMASK_OFFSTACK
8261 for_each_possible_cpu(i) {
8262 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
8263 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02008264 per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
8265 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
Alex Thorltonb74e6272014-12-18 12:44:30 -06008266 }
8267#endif /* CONFIG_CPUMASK_OFFSTACK */
Linus Torvalds1da177e2005-04-16 15:20:36 -07008268
Dario Faggioli332ac172013-11-07 14:43:45 +01008269 init_rt_bandwidth(&def_rt_bandwidth,
8270 global_rt_period(), global_rt_runtime());
8271 init_dl_bandwidth(&def_dl_bandwidth,
Peter Zijlstra17248132013-12-17 12:44:49 +01008272 global_rt_period(), global_rt_runtime());
Dario Faggioli332ac172013-11-07 14:43:45 +01008273
Gregory Haskins57d885f2008-01-25 21:08:18 +01008274#ifdef CONFIG_SMP
8275 init_defrootdomain();
8276#endif
8277
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008278#ifdef CONFIG_RT_GROUP_SCHED
Yong Zhang07e06b02011-01-07 15:17:36 +08008279 init_rt_bandwidth(&root_task_group.rt_bandwidth,
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008280 global_rt_period(), global_rt_runtime());
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02008281#endif /* CONFIG_RT_GROUP_SCHED */
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008282
Dhaval Giani7c941432010-01-20 13:26:18 +01008283#ifdef CONFIG_CGROUP_SCHED
Waiman Longb0367622015-12-02 13:41:49 -05008284 task_group_cache = KMEM_CACHE(task_group, 0);
8285
Yong Zhang07e06b02011-01-07 15:17:36 +08008286 list_add(&root_task_group.list, &task_groups);
8287 INIT_LIST_HEAD(&root_task_group.children);
Glauber Costaf4d6f6c2011-11-01 19:19:07 -02008288 INIT_LIST_HEAD(&root_task_group.siblings);
Mike Galbraith5091faa2010-11-30 14:18:03 +01008289 autogroup_init(&init_task);
Dhaval Giani7c941432010-01-20 13:26:18 +01008290#endif /* CONFIG_CGROUP_SCHED */
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008291
Ingo Molnardd41f592007-07-09 18:51:59 +02008292 for_each_possible_cpu(i) {
Ingo Molnardd41f592007-07-09 18:51:59 +02008293 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07008294
8295 rq = cpu_rq(i);
Thomas Gleixner05fa7852009-11-17 14:28:38 +01008296 raw_spin_lock_init(&rq->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07008297 rq->nr_running = 0;
Thomas Gleixnerdce48a82009-04-11 10:43:41 +02008298 rq->calc_load_active = 0;
8299 rq->calc_load_update = jiffies + LOAD_FREQ;
Jan H. Schönherracb5a9b2011-07-14 18:32:43 +02008300 init_cfs_rq(&rq->cfs);
Abel Vesa07c54f72015-03-03 13:50:27 +02008301 init_rt_rq(&rq->rt);
8302 init_dl_rq(&rq->dl);
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008303#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra029632f2011-10-25 10:00:11 +02008304 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008305 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
Vincent Guittot96956e22016-11-08 10:53:44 +01008306 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
Dhaval Giani354d60c2008-04-19 19:44:59 +02008307 /*
Yong Zhang07e06b02011-01-07 15:17:36 +08008308 * How much cpu bandwidth does root_task_group get?
Dhaval Giani354d60c2008-04-19 19:44:59 +02008309 *
8310		 * In case of task-groups formed through the cgroup filesystem, it
8311 * gets 100% of the cpu resources in the system. This overall
8312 * system cpu resource is divided among the tasks of
Yong Zhang07e06b02011-01-07 15:17:36 +08008313 * root_task_group and its child task-groups in a fair manner,
Dhaval Giani354d60c2008-04-19 19:44:59 +02008314 * based on each entity's (task or task-group's) weight
8315 * (se->load.weight).
8316 *
Yong Zhang07e06b02011-01-07 15:17:36 +08008317		 * In other words, if root_task_group has 10 tasks (of weight
Dhaval Giani354d60c2008-04-19 19:44:59 +02008318 * 1024) and two child groups A0 and A1 (of weight 1024 each),
8319 * then A0's share of the cpu resource is:
8320 *
Ingo Molnar0d905bc2009-05-04 19:13:30 +02008321 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
Dhaval Giani354d60c2008-04-19 19:44:59 +02008322 *
Yong Zhang07e06b02011-01-07 15:17:36 +08008323 * We achieve this by letting root_task_group's tasks sit
8324 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
Dhaval Giani354d60c2008-04-19 19:44:59 +02008325 */
Paul Turnerab84d312011-07-21 09:43:28 -07008326 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
Yong Zhang07e06b02011-01-07 15:17:36 +08008327 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
Dhaval Giani354d60c2008-04-19 19:44:59 +02008328#endif /* CONFIG_FAIR_GROUP_SCHED */
8329
8330 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
Peter Zijlstra052f1dc2008-02-13 15:45:40 +01008331#ifdef CONFIG_RT_GROUP_SCHED
Yong Zhang07e06b02011-01-07 15:17:36 +08008332 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008333#endif
Ingo Molnar91368d72006-03-23 03:00:54 -08008334
Linus Torvalds1da177e2005-04-16 15:20:36 -07008335 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
8336 rq->cpu_load[j] = 0;
Venkatesh Pallipadifdf3e952010-05-17 18:14:43 -07008337
Linus Torvalds1da177e2005-04-16 15:20:36 -07008338#ifdef CONFIG_SMP
Peter Zijlstraa4c410f2006-12-06 20:37:21 -08008339 rq->sd = NULL;
Gregory Haskins57d885f2008-01-25 21:08:18 +01008340 rq->rd = NULL;
Vincent Guittotca6d75e2015-02-27 16:54:09 +01008341 rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
Peter Zijlstrae3fca9e2015-06-11 14:46:37 +02008342 rq->balance_callback = NULL;
Ingo Molnar3117df02006-12-13 00:34:43 -08008343 rq->active_balance = 0;
8344 rq->next_balance = jiffies;
Linus Torvalds1da177e2005-04-16 15:20:36 -07008345 rq->push_cpu = 0;
8346 rq->cpu = i;
Gregory Haskins1f11eb62008-06-04 15:04:05 -04008347 rq->online = 0;
Mike Galbraitheae0c9d2009-11-10 03:50:02 +01008348 rq->idle_stamp = 0;
8349 rq->avg_idle = 2*sysctl_sched_migration_cost;
Jason Low9bd721c2013-09-13 11:26:52 -07008350 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008351 rq->push_task = NULL;
Vikram Mulukutlae625d402017-07-17 12:31:52 -07008352 walt_sched_init(rq);
Syed Rameez Mustafa59b5fb72016-05-31 16:40:45 -07008353
Peter Zijlstra367456c2012-02-20 21:49:09 +01008354 INIT_LIST_HEAD(&rq->cfs_tasks);
8355
Gregory Haskinsdc938522008-01-25 21:08:26 +01008356 rq_attach_root(rq, &def_root_domain);
Frederic Weisbecker3451d022011-08-10 23:21:01 +02008357#ifdef CONFIG_NO_HZ_COMMON
Frederic Weisbecker9fd81dd2016-04-19 17:36:51 +02008358 rq->last_load_update_tick = jiffies;
Suresh Siddha1c792db2011-12-01 17:07:32 -08008359 rq->nohz_flags = 0;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07008360#endif
Frederic Weisbecker265f22a2013-05-03 03:39:05 +02008361#ifdef CONFIG_NO_HZ_FULL
8362 rq->last_sched_tick = 0;
8363#endif
Frederic Weisbecker9fd81dd2016-04-19 17:36:51 +02008364#endif /* CONFIG_SMP */
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01008365 init_rq_hrtick(rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07008366 atomic_set(&rq->nr_iowait, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07008367 }
Ingo Molnara0f98a12007-06-17 18:37:45 +02008368
Joonwoo Parka976a452016-11-28 13:41:18 -08008369 i = alloc_related_thread_groups();
8370 BUG_ON(i);
8371
Ingo Molnardd41f592007-07-09 18:51:59 +02008372 set_load_weight(&init_task);
8373
Linus Torvalds1da177e2005-04-16 15:20:36 -07008374 /*
8375 * The boot idle thread does lazy MMU switching as well:
8376 */
8377 atomic_inc(&init_mm.mm_count);
8378 enter_lazy_tlb(&init_mm, current);
8379
8380 /*
8381 * Make us the idle thread. Technically, schedule() should not be
8382	 * called from this thread; however, somewhere below it might be,
8383 * but because we are the idle thread, we just pick up running again
8384 * when this runqueue becomes "idle".
8385 */
Pavankumar Kondeti736630c2018-09-20 15:31:36 +05308386 init_idle(current, smp_processor_id());
8387 init_new_task_load(current);
Thomas Gleixnerdce48a82009-04-11 10:43:41 +02008388
8389 calc_load_update = jiffies + LOAD_FREQ;
8390
Rusty Russellbf4d83f2008-11-25 09:57:51 +10308391#ifdef CONFIG_SMP
Peter Zijlstra4cb98832011-04-07 14:09:58 +02008392 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
Rusty Russellbdddd292009-12-02 14:09:16 +10308393 /* May be allocated at isolcpus cmdline parse time */
8394 if (cpu_isolated_map == NULL)
8395 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
Thomas Gleixner29d5e042012-04-20 13:05:45 +00008396 idle_thread_set_boot_cpu();
Thomas Gleixner9cf72432016-03-10 12:54:09 +01008397 set_cpu_rq_start_time(smp_processor_id());
Peter Zijlstra029632f2011-10-25 10:00:11 +02008398#endif
8399 init_sched_fair_class();
Rusty Russell6a7b3dc2008-11-25 02:35:04 +10308400
Josh Poimboeuf4698f882016-06-07 14:43:16 -05008401 init_schedstats();
8402
Johannes Weiner3df0e592018-10-26 15:06:27 -07008403 psi_init();
8404
Ingo Molnar6892b752008-02-13 14:02:36 +01008405 scheduler_running = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07008406}
8407
Frederic Weisbeckerd902db12011-06-08 19:31:56 +02008408#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
Frederic Weisbeckere4aafea2009-07-16 15:44:29 +02008409static inline int preempt_count_equals(int preempt_offset)
8410{
Peter Zijlstrada7142e2015-09-28 18:11:45 +02008411 int nested = preempt_count() + rcu_preempt_depth();
Frederic Weisbeckere4aafea2009-07-16 15:44:29 +02008412
Arnd Bergmann4ba82162011-01-25 22:52:22 +01008413 return (nested == preempt_offset);
Frederic Weisbeckere4aafea2009-07-16 15:44:29 +02008414}
8415
Arve Hjønnevåg6828a7f2008-12-10 20:06:28 -08008416static int __might_sleep_init_called;
8417int __init __might_sleep_init(void)
8418{
8419 __might_sleep_init_called = 1;
8420 return 0;
8421}
8422early_initcall(__might_sleep_init);
8423
Simon Kagstromd8948372009-12-23 11:08:18 +01008424void __might_sleep(const char *file, int line, int preempt_offset)
Linus Torvalds1da177e2005-04-16 15:20:36 -07008425{
Peter Zijlstra8eb23b92014-09-24 10:18:55 +02008426 /*
8427	 * Blocking primitives will set (and therefore destroy) current->state;
8428	 * since we will exit with TASK_RUNNING, make sure we enter with it,
8429 * otherwise we will destroy state.
8430 */
Linus Torvalds00845eb2015-02-01 12:23:32 -08008431 WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
Peter Zijlstra8eb23b92014-09-24 10:18:55 +02008432 "do not call blocking ops when !TASK_RUNNING; "
8433 "state=%lx set at [<%p>] %pS\n",
8434 current->state,
8435 (void *)current->task_state_change,
Linus Torvalds00845eb2015-02-01 12:23:32 -08008436 (void *)current->task_state_change);
Peter Zijlstra8eb23b92014-09-24 10:18:55 +02008437
Peter Zijlstra34274452014-09-24 10:18:56 +02008438 ___might_sleep(file, line, preempt_offset);
8439}
8440EXPORT_SYMBOL(__might_sleep);
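/*
 * Callers normally reach this check through the might_sleep() macro, which
 * passes __FILE__, __LINE__ and a preempt offset of 0 to __might_sleep(), so
 * the diagnostics printed by ___might_sleep() below point at the call site
 * that attempted to sleep.
 */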
8441
8442void ___might_sleep(const char *file, int line, int preempt_offset)
8443{
Linus Torvalds1da177e2005-04-16 15:20:36 -07008444 static unsigned long prev_jiffy; /* ratelimiting */
Vegard Nossumd1c6d142016-07-23 09:46:39 +02008445 unsigned long preempt_disable_ip;
Linus Torvalds1da177e2005-04-16 15:20:36 -07008446
Paul E. McKenneyb3fbab02011-05-24 08:31:09 -07008447 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
Thomas Gleixnerdb273be2014-02-07 20:58:38 +01008448 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
Arve Hjønnevåg6828a7f2008-12-10 20:06:28 -08008449 !is_idle_task(current)) || oops_in_progress)
8450 return;
8451 if (system_state != SYSTEM_RUNNING &&
8452 (!__might_sleep_init_called || system_state != SYSTEM_BOOTING))
Ingo Molnaraef745f2008-08-28 11:34:43 +02008453 return;
8454 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8455 return;
8456 prev_jiffy = jiffies;
8457
Vegard Nossumd1c6d142016-07-23 09:46:39 +02008458 /* Save this before calling printk(), since that will clobber it */
8459 preempt_disable_ip = get_preempt_disable_ip(current);
8460
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01008461 printk(KERN_ERR
8462 "BUG: sleeping function called from invalid context at %s:%d\n",
8463 file, line);
8464 printk(KERN_ERR
8465 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8466 in_atomic(), irqs_disabled(),
8467 current->pid, current->comm);
Ingo Molnaraef745f2008-08-28 11:34:43 +02008468
Eric Sandeena8b686b2014-12-16 16:25:28 -06008469 if (task_stack_end_corrupted(current))
8470 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
8471
Ingo Molnaraef745f2008-08-28 11:34:43 +02008472 debug_show_held_locks(current);
8473 if (irqs_disabled())
8474 print_irqtrace_events(current);
Vegard Nossumd1c6d142016-07-23 09:46:39 +02008475 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
8476 && !preempt_count_equals(preempt_offset)) {
Thomas Gleixner8f47b182014-02-07 20:58:39 +01008477 pr_err("Preemption disabled at:");
Vegard Nossumd1c6d142016-07-23 09:46:39 +02008478 print_ip_sym(preempt_disable_ip);
Thomas Gleixner8f47b182014-02-07 20:58:39 +01008479 pr_cont("\n");
8480 }
Syed Rameez Mustafadddcab72016-09-07 16:18:27 -07008481#ifdef CONFIG_PANIC_ON_SCHED_BUG
8482 BUG();
8483#endif
Ingo Molnaraef745f2008-08-28 11:34:43 +02008484 dump_stack();
Vegard Nossumf0b22e32016-07-22 21:46:02 +02008485 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
Linus Torvalds1da177e2005-04-16 15:20:36 -07008486}
Peter Zijlstra34274452014-09-24 10:18:56 +02008487EXPORT_SYMBOL(___might_sleep);
Linus Torvalds1da177e2005-04-16 15:20:36 -07008488#endif
8489
8490#ifdef CONFIG_MAGIC_SYSRQ
8491void normalize_rt_tasks(void)
8492{
8493 struct task_struct *g, *p;
Peter Zijlstradbc7f062015-06-11 14:46:38 +02008494 struct sched_attr attr = {
8495 .sched_policy = SCHED_NORMAL,
8496 };
Linus Torvalds1da177e2005-04-16 15:20:36 -07008497
Oleg Nesterov3472eaa2014-09-21 21:33:38 +02008498 read_lock(&tasklist_lock);
Oleg Nesterov5d07f422014-08-13 21:19:53 +02008499 for_each_process_thread(g, p) {
Ingo Molnar178be792007-10-15 17:00:18 +02008500 /*
8501 * Only normalize user tasks:
8502 */
Oleg Nesterov3472eaa2014-09-21 21:33:38 +02008503 if (p->flags & PF_KTHREAD)
Ingo Molnar178be792007-10-15 17:00:18 +02008504 continue;
8505
Josh Poimboeuf4fa8d2992016-06-17 12:43:26 -05008506 p->se.exec_start = 0;
8507 schedstat_set(p->se.statistics.wait_start, 0);
8508 schedstat_set(p->se.statistics.sleep_start, 0);
8509 schedstat_set(p->se.statistics.block_start, 0);
Ingo Molnardd41f592007-07-09 18:51:59 +02008510
Dario Faggioliaab03e02013-11-28 11:14:43 +01008511 if (!dl_task(p) && !rt_task(p)) {
Ingo Molnardd41f592007-07-09 18:51:59 +02008512 /*
8513 * Renice negative nice level userspace
8514 * tasks back to 0:
8515 */
Oleg Nesterov3472eaa2014-09-21 21:33:38 +02008516 if (task_nice(p) < 0)
Ingo Molnardd41f592007-07-09 18:51:59 +02008517 set_user_nice(p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07008518 continue;
Ingo Molnardd41f592007-07-09 18:51:59 +02008519 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07008520
Peter Zijlstradbc7f062015-06-11 14:46:38 +02008521 __sched_setscheduler(p, &attr, false, false);
Oleg Nesterov5d07f422014-08-13 21:19:53 +02008522 }
Oleg Nesterov3472eaa2014-09-21 21:33:38 +02008523 read_unlock(&tasklist_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07008524}
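/*
 * normalize_rt_tasks() is reachable at runtime through the SysRq 'n' key,
 * e.g.:
 *
 *	echo n > /proc/sysrq-trigger
 *
 * which demotes every user-space RT/deadline task back to SCHED_NORMAL and
 * renices negative-nice tasks to 0, as implemented above.
 */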
8525
8526#endif /* CONFIG_MAGIC_SYSRQ */
Linus Torvalds1df5c102005-09-12 07:59:21 -07008527
Jason Wessel67fc4e02010-05-20 21:04:21 -05008528#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
Linus Torvalds1df5c102005-09-12 07:59:21 -07008529/*
Jason Wessel67fc4e02010-05-20 21:04:21 -05008530 * These functions are only useful for the IA64 MCA handling, or kdb.
Linus Torvalds1df5c102005-09-12 07:59:21 -07008531 *
8532 * They can only be called when the whole system has been
8533 * stopped - every CPU needs to be quiescent, and no scheduling
8534 * activity can take place. Using them for anything else would
8535 * be a serious bug, and as a result, they aren't even visible
8536 * under any other configuration.
8537 */
8538
8539/**
8540 * curr_task - return the current task for a given cpu.
8541 * @cpu: the processor in question.
8542 *
8543 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
Yacine Belkadie69f6182013-07-12 20:45:47 +02008544 *
8545 * Return: The current task for @cpu.
Linus Torvalds1df5c102005-09-12 07:59:21 -07008546 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07008547struct task_struct *curr_task(int cpu)
Linus Torvalds1df5c102005-09-12 07:59:21 -07008548{
8549 return cpu_curr(cpu);
8550}
8551
Jason Wessel67fc4e02010-05-20 21:04:21 -05008552#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
8553
8554#ifdef CONFIG_IA64
Linus Torvalds1df5c102005-09-12 07:59:21 -07008555/**
8556 * set_curr_task - set the current task for a given cpu.
8557 * @cpu: the processor in question.
8558 * @p: the task pointer to set.
8559 *
8560 * Description: This function must only be used when non-maskable interrupts
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01008561 * are serviced on a separate stack. It allows the architecture to switch the
8562 * notion of the current task on a cpu in a non-blocking manner. This function
Linus Torvalds1df5c102005-09-12 07:59:21 -07008563	 * must be called with all CPUs synchronized, and interrupts disabled; the
8564	 * caller must save the original value of the current task (see
8565 * curr_task() above) and restore that value before reenabling interrupts and
8566 * re-starting the system.
8567 *
8568 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8569 */
Peter Zijlstraa458ae22016-09-20 20:29:40 +02008570void ia64_set_curr_task(int cpu, struct task_struct *p)
Linus Torvalds1df5c102005-09-12 07:59:21 -07008571{
8572 cpu_curr(cpu) = p;
8573}
8574
8575#endif
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008576
Dhaval Giani7c941432010-01-20 13:26:18 +01008577#ifdef CONFIG_CGROUP_SCHED
Peter Zijlstra029632f2011-10-25 10:00:11 +02008578/* task_group_lock serializes the addition/removal of task groups */
8579static DEFINE_SPINLOCK(task_group_lock);
8580
Peter Zijlstra2f5177f2016-03-16 16:22:45 +01008581static void sched_free_group(struct task_group *tg)
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008582{
8583 free_fair_sched_group(tg);
8584 free_rt_sched_group(tg);
Mike Galbraithe9aa1dd2011-01-05 11:11:25 +01008585 autogroup_free(tg);
Waiman Longb0367622015-12-02 13:41:49 -05008586 kmem_cache_free(task_group_cache, tg);
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008587}
8588
8589/* allocate runqueue etc for a new task group */
Dhaval Gianiec7dc8a2008-04-19 19:44:59 +02008590struct task_group *sched_create_group(struct task_group *parent)
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008591{
8592 struct task_group *tg;
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008593
Waiman Longb0367622015-12-02 13:41:49 -05008594 tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008595 if (!tg)
8596 return ERR_PTR(-ENOMEM);
8597
Dhaval Gianiec7dc8a2008-04-19 19:44:59 +02008598 if (!alloc_fair_sched_group(tg, parent))
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008599 goto err;
8600
Dhaval Gianiec7dc8a2008-04-19 19:44:59 +02008601 if (!alloc_rt_sched_group(tg, parent))
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008602 goto err;
8603
Li Zefanace783b2013-01-24 14:30:48 +08008604 return tg;
8605
8606err:
Peter Zijlstra2f5177f2016-03-16 16:22:45 +01008607 sched_free_group(tg);
Li Zefanace783b2013-01-24 14:30:48 +08008608 return ERR_PTR(-ENOMEM);
8609}
8610
8611void sched_online_group(struct task_group *tg, struct task_group *parent)
8612{
8613 unsigned long flags;
8614
Peter Zijlstra8ed36992008-02-13 15:45:39 +01008615 spin_lock_irqsave(&task_group_lock, flags);
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008616 list_add_rcu(&tg->list, &task_groups);
Peter Zijlstraf473aa52008-04-19 19:45:00 +02008617
8618 WARN_ON(!parent); /* root should already exist */
8619
8620 tg->parent = parent;
Peter Zijlstraf473aa52008-04-19 19:45:00 +02008621 INIT_LIST_HEAD(&tg->children);
Zhang, Yanmin09f27242008-08-14 15:56:40 +08008622	list_add_rcu(&tg->siblings, &parent->children);
Peter Zijlstra8ed36992008-02-13 15:45:39 +01008623 spin_unlock_irqrestore(&task_group_lock, flags);
Peter Zijlstra8663e242016-06-22 14:58:02 +02008624
8625 online_fair_sched_group(tg);
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008626}
8627
Srivatsa Vaddagiri9b5b7752007-10-15 17:00:09 +02008628/* rcu callback to free various structures associated with a task group */
Peter Zijlstra2f5177f2016-03-16 16:22:45 +01008629static void sched_free_group_rcu(struct rcu_head *rhp)
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008630{
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008631 /* now it should be safe to free those cfs_rqs */
Peter Zijlstra2f5177f2016-03-16 16:22:45 +01008632 sched_free_group(container_of(rhp, struct task_group, rcu));
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008633}
8634
Ingo Molnar4cf86d72007-10-15 17:00:14 +02008635void sched_destroy_group(struct task_group *tg)
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008636{
Li Zefanace783b2013-01-24 14:30:48 +08008637	/* wait for possible concurrent references to cfs_rqs to complete */
Peter Zijlstra2f5177f2016-03-16 16:22:45 +01008638 call_rcu(&tg->rcu, sched_free_group_rcu);
Li Zefanace783b2013-01-24 14:30:48 +08008639}
8640
8641void sched_offline_group(struct task_group *tg)
8642{
Peter Zijlstra8ed36992008-02-13 15:45:39 +01008643 unsigned long flags;
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008644
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -08008645 /* end participation in shares distribution */
Peter Zijlstra6fe1f342016-01-21 22:24:16 +01008646 unregister_fair_sched_group(tg);
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -08008647
8648 spin_lock_irqsave(&task_group_lock, flags);
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008649 list_del_rcu(&tg->list);
Peter Zijlstraf473aa52008-04-19 19:45:00 +02008650 list_del_rcu(&tg->siblings);
Peter Zijlstra8ed36992008-02-13 15:45:39 +01008651 spin_unlock_irqrestore(&task_group_lock, flags);
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008652}
8653
Vincent Guittotea86cb42016-06-17 13:38:55 +02008654static void sched_change_group(struct task_struct *tsk, int type)
8655{
8656 struct task_group *tg;
8657
8658 /*
8659 * All callers are synchronized by task_rq_lock(); we do not use RCU
8660 * which is pointless here. Thus, we pass "true" to task_css_check()
8661 * to prevent lockdep warnings.
8662 */
8663 tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
8664 struct task_group, css);
8665 tg = autogroup_task_group(tsk, tg);
8666 tsk->sched_task_group = tg;
8667
8668#ifdef CONFIG_FAIR_GROUP_SCHED
8669 if (tsk->sched_class->task_change_group)
8670 tsk->sched_class->task_change_group(tsk, type);
8671 else
8672#endif
8673 set_task_rq(tsk, task_cpu(tsk));
8674}
8675
8676/*
8677 * Change task's runqueue when it moves between groups.
8678 *
8679 * The caller of this function should have put the task in its new group by
8680 * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
8681 * its new group.
Srivatsa Vaddagiri9b5b7752007-10-15 17:00:09 +02008682 */
8683void sched_move_task(struct task_struct *tsk)
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008684{
Kirill Tkhaida0c1e62014-08-20 13:47:32 +04008685 int queued, running;
Peter Zijlstraeb580752015-07-31 21:28:18 +02008686 struct rq_flags rf;
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008687 struct rq *rq;
8688
Peter Zijlstraeb580752015-07-31 21:28:18 +02008689 rq = task_rq_lock(tsk, &rf);
Peter Zijlstra6da1c982017-01-23 16:05:55 +01008690 update_rq_clock(rq);
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008691
Dmitry Adamushko051a1d12007-12-18 15:21:13 +01008692 running = task_current(rq, tsk);
Kirill Tkhaida0c1e62014-08-20 13:47:32 +04008693 queued = task_on_rq_queued(tsk);
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008694
Kirill Tkhaida0c1e62014-08-20 13:47:32 +04008695 if (queued)
Peter Zijlstraff77e462016-01-18 15:27:07 +01008696 dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
Hiroshi Shimamoto0e1f3482008-03-10 11:01:20 -07008697 if (unlikely(running))
Kirill Tkhaif3cd1c42014-09-12 17:41:40 +04008698 put_prev_task(rq, tsk);
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008699
Vincent Guittotea86cb42016-06-17 13:38:55 +02008700 sched_change_group(tsk, TASK_MOVE_GROUP);
Peter Zijlstra810b3812008-02-29 15:21:01 -05008701
Kirill Tkhaida0c1e62014-08-20 13:47:32 +04008702 if (queued)
Peter Zijlstraff77e462016-01-18 15:27:07 +01008703 enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
Vincent Guittota399d232016-09-12 09:47:52 +02008704 if (unlikely(running))
Peter Zijlstrab2bf6c32016-09-20 22:00:38 +02008705 set_curr_task(rq, tsk);
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008706
Peter Zijlstraeb580752015-07-31 21:28:18 +02008707 task_rq_unlock(rq, tsk, &rf);
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008708}
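/*
 * The dequeue/sched_change_group/enqueue sequence above is the canonical
 * pattern for re-parenting a queued task: it is taken off the old group's
 * runqueue, its group and se.parent pointers are rewritten, and it is then
 * re-queued (and set_curr_task()'ed if it was running) so the new
 * hierarchy's load accounting picks it up.
 */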
Dhaval Giani7c941432010-01-20 13:26:18 +01008709#endif /* CONFIG_CGROUP_SCHED */
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008710
Paul Turnera790de92011-07-21 09:43:29 -07008711#ifdef CONFIG_RT_GROUP_SCHED
8712/*
8713 * Ensure that the real time constraints are schedulable.
8714 */
8715static DEFINE_MUTEX(rt_constraints_mutex);
Peter Zijlstra9f0c1e52008-02-13 15:45:39 +01008716
Dhaval Giani521f1a242008-02-28 15:21:56 +05308717/* Must be called with tasklist_lock held */
8718static inline int tg_has_rt_tasks(struct task_group *tg)
8719{
8720 struct task_struct *g, *p;
Peter Zijlstra9a7e0b12008-08-19 12:33:06 +02008721
Peter Zijlstra1fe89e12015-02-09 11:53:18 +01008722 /*
8723 * Autogroups do not have RT tasks; see autogroup_create().
8724 */
8725 if (task_group_is_autogroup(tg))
8726 return 0;
8727
Oleg Nesterov5d07f422014-08-13 21:19:53 +02008728 for_each_process_thread(g, p) {
Oleg Nesterov8651c652014-09-21 21:33:36 +02008729 if (rt_task(p) && task_group(p) == tg)
Dhaval Giani521f1a242008-02-28 15:21:56 +05308730 return 1;
Oleg Nesterov5d07f422014-08-13 21:19:53 +02008731 }
Peter Zijlstra9a7e0b12008-08-19 12:33:06 +02008732
Dhaval Giani521f1a242008-02-28 15:21:56 +05308733 return 0;
8734}
8735
Peter Zijlstra9a7e0b12008-08-19 12:33:06 +02008736struct rt_schedulable_data {
8737 struct task_group *tg;
8738 u64 rt_period;
8739 u64 rt_runtime;
8740};
8741
Paul Turnera790de92011-07-21 09:43:29 -07008742static int tg_rt_schedulable(struct task_group *tg, void *data)
Peter Zijlstra9a7e0b12008-08-19 12:33:06 +02008743{
8744 struct rt_schedulable_data *d = data;
8745 struct task_group *child;
8746 unsigned long total, sum = 0;
8747 u64 period, runtime;
8748
8749 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8750 runtime = tg->rt_bandwidth.rt_runtime;
8751
8752 if (tg == d->tg) {
8753 period = d->rt_period;
8754 runtime = d->rt_runtime;
8755 }
8756
Peter Zijlstra4653f802008-09-23 15:33:44 +02008757 /*
8758 * Cannot have more runtime than the period.
8759 */
8760 if (runtime > period && runtime != RUNTIME_INF)
8761 return -EINVAL;
8762
8763 /*
8764 * Ensure we don't starve existing RT tasks.
8765 */
Peter Zijlstra9a7e0b12008-08-19 12:33:06 +02008766 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
8767 return -EBUSY;
8768
8769 total = to_ratio(period, runtime);
8770
Peter Zijlstra4653f802008-09-23 15:33:44 +02008771 /*
8772 * Nobody can have more than the global setting allows.
8773 */
8774 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
8775 return -EINVAL;
8776
8777 /*
8778 * The sum of our children's runtime should not exceed our own.
8779 */
Peter Zijlstra9a7e0b12008-08-19 12:33:06 +02008780 list_for_each_entry_rcu(child, &tg->children, siblings) {
8781 period = ktime_to_ns(child->rt_bandwidth.rt_period);
8782 runtime = child->rt_bandwidth.rt_runtime;
8783
8784 if (child == d->tg) {
8785 period = d->rt_period;
8786 runtime = d->rt_runtime;
8787 }
8788
8789 sum += to_ratio(period, runtime);
8790 }
8791
8792 if (sum > total)
8793 return -EINVAL;
8794
8795 return 0;
8796}
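/*
 * Illustrative example (default limits assumed, values not taken from this
 * file): with the global defaults of a 1s rt_period and 950ms rt_runtime,
 * the root group's allowed ratio is to_ratio(1s, 0.95s), i.e. ~0.95 of a
 * CPU. Two child groups each requesting 600ms per 1s period would sum to
 * ~1.2 and be rejected with -EINVAL by the sum > total check above.
 */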
8797
8798static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8799{
Paul Turner82774342011-07-21 09:43:35 -07008800 int ret;
8801
Peter Zijlstra9a7e0b12008-08-19 12:33:06 +02008802 struct rt_schedulable_data data = {
8803 .tg = tg,
8804 .rt_period = period,
8805 .rt_runtime = runtime,
8806 };
8807
Paul Turner82774342011-07-21 09:43:35 -07008808 rcu_read_lock();
8809 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
8810 rcu_read_unlock();
8811
8812 return ret;
Peter Zijlstra9a7e0b12008-08-19 12:33:06 +02008813}
8814
Paul Turnerab84d312011-07-21 09:43:28 -07008815static int tg_set_rt_bandwidth(struct task_group *tg,
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008816 u64 rt_period, u64 rt_runtime)
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008817{
Peter Zijlstraac086bc2008-04-19 19:44:58 +02008818 int i, err = 0;
Peter Zijlstra9f0c1e52008-02-13 15:45:39 +01008819
Peter Zijlstra2636ed52015-02-09 12:23:20 +01008820 /*
8821	 * Disallowing the root group RT runtime is BAD; it would prevent the
8822	 * kernel from creating (and/or operating) RT threads.
8823 */
8824 if (tg == &root_task_group && rt_runtime == 0)
8825 return -EINVAL;
8826
8827 /* No period doesn't make any sense. */
8828 if (rt_period == 0)
8829 return -EINVAL;
8830
Peter Zijlstra9f0c1e52008-02-13 15:45:39 +01008831 mutex_lock(&rt_constraints_mutex);
Dhaval Giani521f1a242008-02-28 15:21:56 +05308832 read_lock(&tasklist_lock);
Peter Zijlstra9a7e0b12008-08-19 12:33:06 +02008833 err = __rt_schedulable(tg, rt_period, rt_runtime);
8834 if (err)
Dhaval Giani521f1a242008-02-28 15:21:56 +05308835 goto unlock;
Peter Zijlstraac086bc2008-04-19 19:44:58 +02008836
Thomas Gleixner0986b112009-11-17 15:32:06 +01008837 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008838 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
8839 tg->rt_bandwidth.rt_runtime = rt_runtime;
Peter Zijlstraac086bc2008-04-19 19:44:58 +02008840
8841 for_each_possible_cpu(i) {
8842 struct rt_rq *rt_rq = tg->rt_rq[i];
8843
Thomas Gleixner0986b112009-11-17 15:32:06 +01008844 raw_spin_lock(&rt_rq->rt_runtime_lock);
Peter Zijlstraac086bc2008-04-19 19:44:58 +02008845 rt_rq->rt_runtime = rt_runtime;
Thomas Gleixner0986b112009-11-17 15:32:06 +01008846 raw_spin_unlock(&rt_rq->rt_runtime_lock);
Peter Zijlstraac086bc2008-04-19 19:44:58 +02008847 }
Thomas Gleixner0986b112009-11-17 15:32:06 +01008848 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
Peter Zijlstra49246272010-10-17 21:46:10 +02008849unlock:
Dhaval Giani521f1a242008-02-28 15:21:56 +05308850 read_unlock(&tasklist_lock);
Peter Zijlstra9f0c1e52008-02-13 15:45:39 +01008851 mutex_unlock(&rt_constraints_mutex);
8852
8853 return err;
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008854}
8855
Li Zefan25cc7da2013-03-05 16:07:33 +08008856static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008857{
8858 u64 rt_runtime, rt_period;
8859
8860 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8861 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
8862 if (rt_runtime_us < 0)
8863 rt_runtime = RUNTIME_INF;
8864
Paul Turnerab84d312011-07-21 09:43:28 -07008865 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008866}
8867
Li Zefan25cc7da2013-03-05 16:07:33 +08008868static long sched_group_rt_runtime(struct task_group *tg)
Peter Zijlstra9f0c1e52008-02-13 15:45:39 +01008869{
8870 u64 rt_runtime_us;
8871
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008872 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
Peter Zijlstra9f0c1e52008-02-13 15:45:39 +01008873 return -1;
8874
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008875 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
Peter Zijlstra9f0c1e52008-02-13 15:45:39 +01008876 do_div(rt_runtime_us, NSEC_PER_USEC);
8877 return rt_runtime_us;
8878}
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008879
Nicholas Mc Guirece2f5fe2015-05-03 10:51:56 +02008880static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008881{
8882 u64 rt_runtime, rt_period;
8883
Nicholas Mc Guirece2f5fe2015-05-03 10:51:56 +02008884 rt_period = rt_period_us * NSEC_PER_USEC;
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008885 rt_runtime = tg->rt_bandwidth.rt_runtime;
8886
Paul Turnerab84d312011-07-21 09:43:28 -07008887 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008888}
8889
Li Zefan25cc7da2013-03-05 16:07:33 +08008890static long sched_group_rt_period(struct task_group *tg)
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008891{
8892 u64 rt_period_us;
8893
8894 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
8895 do_div(rt_period_us, NSEC_PER_USEC);
8896 return rt_period_us;
8897}
Dario Faggioli332ac172013-11-07 14:43:45 +01008898#endif /* CONFIG_RT_GROUP_SCHED */
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008899
Dario Faggioli332ac172013-11-07 14:43:45 +01008900#ifdef CONFIG_RT_GROUP_SCHED
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008901static int sched_rt_global_constraints(void)
8902{
8903 int ret = 0;
8904
8905 mutex_lock(&rt_constraints_mutex);
Peter Zijlstra9a7e0b12008-08-19 12:33:06 +02008906 read_lock(&tasklist_lock);
Peter Zijlstra4653f802008-09-23 15:33:44 +02008907 ret = __rt_schedulable(NULL, 0, 0);
Peter Zijlstra9a7e0b12008-08-19 12:33:06 +02008908 read_unlock(&tasklist_lock);
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008909 mutex_unlock(&rt_constraints_mutex);
8910
8911 return ret;
8912}
Dhaval Giani54e99122009-02-27 15:13:54 +05308913
Li Zefan25cc7da2013-03-05 16:07:33 +08008914static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
Dhaval Giani54e99122009-02-27 15:13:54 +05308915{
8916 /* Don't accept realtime tasks when there is no way for them to run */
8917 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
8918 return 0;
8919
8920 return 1;
8921}
8922
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02008923#else /* !CONFIG_RT_GROUP_SCHED */
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008924static int sched_rt_global_constraints(void)
8925{
Peter Zijlstraac086bc2008-04-19 19:44:58 +02008926 unsigned long flags;
Muhammad Falak R Wani8c5e9552016-05-05 15:21:19 +05308927 int i;
Hiroshi Shimamotoec5d4982008-09-10 17:00:19 -07008928
Thomas Gleixner0986b112009-11-17 15:32:06 +01008929 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
Peter Zijlstraac086bc2008-04-19 19:44:58 +02008930 for_each_possible_cpu(i) {
8931 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
8932
Thomas Gleixner0986b112009-11-17 15:32:06 +01008933 raw_spin_lock(&rt_rq->rt_runtime_lock);
Peter Zijlstraac086bc2008-04-19 19:44:58 +02008934 rt_rq->rt_runtime = global_rt_runtime();
Thomas Gleixner0986b112009-11-17 15:32:06 +01008935 raw_spin_unlock(&rt_rq->rt_runtime_lock);
Peter Zijlstraac086bc2008-04-19 19:44:58 +02008936 }
Thomas Gleixner0986b112009-11-17 15:32:06 +01008937 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
Peter Zijlstraac086bc2008-04-19 19:44:58 +02008938
Muhammad Falak R Wani8c5e9552016-05-05 15:21:19 +05308939 return 0;
Dario Faggioli332ac172013-11-07 14:43:45 +01008940}
8941#endif /* CONFIG_RT_GROUP_SCHED */
8942
Wanpeng Lia1963b82015-03-17 19:15:31 +08008943static int sched_dl_global_validate(void)
Dario Faggioli332ac172013-11-07 14:43:45 +01008944{
Peter Zijlstra17248132013-12-17 12:44:49 +01008945 u64 runtime = global_rt_runtime();
8946 u64 period = global_rt_period();
Dario Faggioli332ac172013-11-07 14:43:45 +01008947 u64 new_bw = to_ratio(period, runtime);
Kirill Tkhaif10e00f2014-09-30 12:23:37 +04008948 struct dl_bw *dl_b;
Peter Zijlstra17248132013-12-17 12:44:49 +01008949 int cpu, ret = 0;
Juri Lelli49516342014-02-11 09:24:27 +01008950 unsigned long flags;
Dario Faggioli332ac172013-11-07 14:43:45 +01008951
8952 /*
8953	 * Here we want to check that the bandwidth is not being set to a
8954	 * value smaller than the bandwidth currently allocated in any of
8955	 * the root_domains.
8956 *
8957	 * FIXME: Cycling over all the CPUs is overkill, but simpler than
8958 * cycling on root_domains... Discussion on different/better
8959 * solutions is welcome!
8960 */
Peter Zijlstra17248132013-12-17 12:44:49 +01008961 for_each_possible_cpu(cpu) {
Kirill Tkhaif10e00f2014-09-30 12:23:37 +04008962 rcu_read_lock_sched();
8963 dl_b = dl_bw_of(cpu);
Dario Faggioli332ac172013-11-07 14:43:45 +01008964
Juri Lelli49516342014-02-11 09:24:27 +01008965 raw_spin_lock_irqsave(&dl_b->lock, flags);
Peter Zijlstra17248132013-12-17 12:44:49 +01008966 if (new_bw < dl_b->total_bw)
8967 ret = -EBUSY;
Juri Lelli49516342014-02-11 09:24:27 +01008968 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
Peter Zijlstra17248132013-12-17 12:44:49 +01008969
Kirill Tkhaif10e00f2014-09-30 12:23:37 +04008970 rcu_read_unlock_sched();
8971
Peter Zijlstra17248132013-12-17 12:44:49 +01008972 if (ret)
8973 break;
Dario Faggioli332ac172013-11-07 14:43:45 +01008974 }
8975
Peter Zijlstra17248132013-12-17 12:44:49 +01008976 return ret;
8977}
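/*
 * dl_b->total_bw above is the deadline bandwidth already admitted in that
 * root domain; lowering the global limit below what has already been handed
 * out would invalidate earlier admission-control decisions, hence -EBUSY.
 */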
8978
8979static void sched_dl_do_global(void)
8980{
8981 u64 new_bw = -1;
Kirill Tkhaif10e00f2014-09-30 12:23:37 +04008982 struct dl_bw *dl_b;
Peter Zijlstra17248132013-12-17 12:44:49 +01008983 int cpu;
Juri Lelli49516342014-02-11 09:24:27 +01008984 unsigned long flags;
Peter Zijlstra17248132013-12-17 12:44:49 +01008985
8986 def_dl_bandwidth.dl_period = global_rt_period();
8987 def_dl_bandwidth.dl_runtime = global_rt_runtime();
8988
8989 if (global_rt_runtime() != RUNTIME_INF)
8990 new_bw = to_ratio(global_rt_period(), global_rt_runtime());
8991
8992 /*
8993 * FIXME: As above...
8994 */
8995 for_each_possible_cpu(cpu) {
Kirill Tkhaif10e00f2014-09-30 12:23:37 +04008996 rcu_read_lock_sched();
8997 dl_b = dl_bw_of(cpu);
Peter Zijlstra17248132013-12-17 12:44:49 +01008998
Juri Lelli49516342014-02-11 09:24:27 +01008999 raw_spin_lock_irqsave(&dl_b->lock, flags);
Peter Zijlstra17248132013-12-17 12:44:49 +01009000 dl_b->bw = new_bw;
Juri Lelli49516342014-02-11 09:24:27 +01009001 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
Kirill Tkhaif10e00f2014-09-30 12:23:37 +04009002
9003 rcu_read_unlock_sched();
Peter Zijlstra17248132013-12-17 12:44:49 +01009004 }
9005}
9006
9007static int sched_rt_global_validate(void)
9008{
9009 if (sysctl_sched_rt_period <= 0)
9010 return -EINVAL;
9011
Juri Lellie9e7cb32014-02-11 09:24:26 +01009012 if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
9013 (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
Peter Zijlstra17248132013-12-17 12:44:49 +01009014 return -EINVAL;
9015
Dario Faggioli332ac172013-11-07 14:43:45 +01009016 return 0;
9017}
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02009018
Peter Zijlstra17248132013-12-17 12:44:49 +01009019static void sched_rt_do_global(void)
9020{
9021 def_rt_bandwidth.rt_runtime = global_rt_runtime();
9022 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
9023}
9024
9025int sched_rt_handler(struct ctl_table *table, int write,
9026 void __user *buffer, size_t *lenp,
9027 loff_t *ppos)
9028{
9029 int old_period, old_runtime;
9030 static DEFINE_MUTEX(mutex);
9031 int ret;
9032
9033 mutex_lock(&mutex);
9034 old_period = sysctl_sched_rt_period;
9035 old_runtime = sysctl_sched_rt_runtime;
9036
9037 ret = proc_dointvec(table, write, buffer, lenp, ppos);
9038
9039 if (!ret && write) {
9040 ret = sched_rt_global_validate();
9041 if (ret)
9042 goto undo;
9043
Wanpeng Lia1963b82015-03-17 19:15:31 +08009044 ret = sched_dl_global_validate();
Peter Zijlstra17248132013-12-17 12:44:49 +01009045 if (ret)
9046 goto undo;
9047
Wanpeng Lia1963b82015-03-17 19:15:31 +08009048 ret = sched_rt_global_constraints();
Peter Zijlstra17248132013-12-17 12:44:49 +01009049 if (ret)
9050 goto undo;
9051
9052 sched_rt_do_global();
9053 sched_dl_do_global();
9054 }
9055 if (0) {
9056undo:
9057 sysctl_sched_rt_period = old_period;
9058 sysctl_sched_rt_runtime = old_runtime;
9059 }
9060 mutex_unlock(&mutex);
9061
9062 return ret;
9063}
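/*
 * Example usage of the handler above (illustrative shell commands):
 *
 *	echo 950000  > /proc/sys/kernel/sched_rt_runtime_us
 *	echo 1000000 > /proc/sys/kernel/sched_rt_period_us
 *
 * limits RT/deadline tasks to 950ms of every 1s period, while writing -1 to
 * sched_rt_runtime_us selects RUNTIME_INF and removes the cap. Invalid
 * combinations are rolled back through the undo: path above.
 */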
9064
Clark Williamsce0dbbb2013-02-07 09:47:04 -06009065int sched_rr_handler(struct ctl_table *table, int write,
9066 void __user *buffer, size_t *lenp,
9067 loff_t *ppos)
9068{
9069 int ret;
9070 static DEFINE_MUTEX(mutex);
9071
9072 mutex_lock(&mutex);
9073 ret = proc_dointvec(table, write, buffer, lenp, ppos);
9074 /* make sure that internally we keep jiffies */
9075 /* also, writing zero resets timeslice to default */
9076 if (!ret && write) {
9077 sched_rr_timeslice = sched_rr_timeslice <= 0 ?
9078 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
9079 }
9080 mutex_unlock(&mutex);
9081 return ret;
9082}
9083
Joonwoo Parkb02fc002017-06-16 11:58:58 -07009084#ifdef CONFIG_PROC_SYSCTL
9085int sched_updown_migrate_handler(struct ctl_table *table, int write,
9086 void __user *buffer, size_t *lenp,
9087 loff_t *ppos)
9088{
9089 int ret;
9090 unsigned int *data = (unsigned int *)table->data;
9091 unsigned int old_val;
9092 static DEFINE_MUTEX(mutex);
9093
9094 mutex_lock(&mutex);
9095 old_val = *data;
9096
9097 ret = proc_douintvec_capacity(table, write, buffer, lenp, ppos);
9098
9099 if (!ret && write &&
9100 sysctl_sched_capacity_margin > sysctl_sched_capacity_margin_down) {
9101 ret = -EINVAL;
9102 *data = old_val;
9103 }
9104 mutex_unlock(&mutex);
9105
9106 return ret;
9107}
9108#endif
9109
Suren Baghdasaryanc405bfb2019-02-17 15:07:38 -08009110void threadgroup_change_begin(struct task_struct *tsk)
9111{
9112 might_sleep();
9113 cgroup_threadgroup_change_begin(tsk);
9114}
9115
9116void threadgroup_change_end(struct task_struct *tsk)
9117{
9118 cgroup_threadgroup_change_end(tsk);
9119}
9120
Suren Baghdasaryancbbb29d2019-03-25 20:17:33 -07009121#ifdef CONFIG_CGROUP_SCHED
9122
jianzhou19d550f2019-05-23 14:26:39 +08009123inline struct task_group *css_tg(struct cgroup_subsys_state *css)
Suren Baghdasaryancbbb29d2019-03-25 20:17:33 -07009124{
9125 return css ? container_of(css, struct task_group, css) : NULL;
9126}
9127
Tejun Heoeb954192013-08-08 20:11:23 -04009128static struct cgroup_subsys_state *
9129cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009130{
Tejun Heoeb954192013-08-08 20:11:23 -04009131 struct task_group *parent = css_tg(parent_css);
9132 struct task_group *tg;
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009133
Tejun Heoeb954192013-08-08 20:11:23 -04009134 if (!parent) {
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009135 /* This is early initialization for the top cgroup */
Yong Zhang07e06b02011-01-07 15:17:36 +08009136 return &root_task_group.css;
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009137 }
9138
Dhaval Gianiec7dc8a2008-04-19 19:44:59 +02009139 tg = sched_create_group(parent);
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009140 if (IS_ERR(tg))
9141 return ERR_PTR(-ENOMEM);
9142
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009143 return &tg->css;
9144}
9145
Konstantin Khlebnikov62b57762017-02-08 14:27:27 +03009146/* Expose task group only after completing cgroup initialization */
9147static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
9148{
9149 struct task_group *tg = css_tg(css);
9150 struct task_group *parent = css_tg(css->parent);
9151
9152 if (parent)
9153 sched_online_group(tg, parent);
9154 return 0;
9155}
9156
Peter Zijlstra2f5177f2016-03-16 16:22:45 +01009157static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
Li Zefanace783b2013-01-24 14:30:48 +08009158{
Tejun Heoeb954192013-08-08 20:11:23 -04009159 struct task_group *tg = css_tg(css);
Li Zefanace783b2013-01-24 14:30:48 +08009160
Peter Zijlstra2f5177f2016-03-16 16:22:45 +01009161 sched_offline_group(tg);
Li Zefanace783b2013-01-24 14:30:48 +08009162}
9163
Tejun Heoeb954192013-08-08 20:11:23 -04009164static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009165{
Tejun Heoeb954192013-08-08 20:11:23 -04009166 struct task_group *tg = css_tg(css);
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009167
Peter Zijlstra2f5177f2016-03-16 16:22:45 +01009168 /*
9169 * Relies on the RCU grace period between css_released() and this.
9170 */
9171 sched_free_group(tg);
Li Zefanace783b2013-01-24 14:30:48 +08009172}
9173
Vincent Guittotea86cb42016-06-17 13:38:55 +02009174/*
9175 * This is called before wake_up_new_task(), therefore we really only
9176 * have to set its group bits, all the other stuff does not apply.
9177 */
Oleg Nesterovb53202e2015-12-03 10:24:08 -05009178static void cpu_cgroup_fork(struct task_struct *task)
Kirill Tkhaieeb61e52014-10-27 14:18:25 +04009179{
Vincent Guittotea86cb42016-06-17 13:38:55 +02009180 struct rq_flags rf;
9181 struct rq *rq;
9182
9183 rq = task_rq_lock(task, &rf);
9184
9185 sched_change_group(task, TASK_SET_GROUP);
9186
9187 task_rq_unlock(rq, task, &rf);
Kirill Tkhaieeb61e52014-10-27 14:18:25 +04009188}
9189
Tejun Heo1f7dd3e52015-12-03 10:18:21 -05009190static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009191{
Tejun Heobb9d97b2011-12-12 18:12:21 -08009192 struct task_struct *task;
Tejun Heo1f7dd3e52015-12-03 10:18:21 -05009193 struct cgroup_subsys_state *css;
Peter Zijlstra7dc603c2016-06-16 13:29:28 +02009194 int ret = 0;
Tejun Heobb9d97b2011-12-12 18:12:21 -08009195
Tejun Heo1f7dd3e52015-12-03 10:18:21 -05009196 cgroup_taskset_for_each(task, css, tset) {
Peter Zijlstrab68aa232008-02-13 15:45:40 +01009197#ifdef CONFIG_RT_GROUP_SCHED
Tejun Heoeb954192013-08-08 20:11:23 -04009198 if (!sched_rt_can_attach(css_tg(css), task))
Tejun Heobb9d97b2011-12-12 18:12:21 -08009199 return -EINVAL;
Peter Zijlstrab68aa232008-02-13 15:45:40 +01009200#endif
Peter Zijlstra7dc603c2016-06-16 13:29:28 +02009201 /*
9202	 * Serialize against wake_up_new_task() such that if it's
9203 * running, we're sure to observe its full state.
9204 */
9205 raw_spin_lock_irq(&task->pi_lock);
9206 /*
9207 * Avoid calling sched_move_task() before wake_up_new_task()
9208 * has happened. This would lead to problems with PELT, due to
9209 * move wanting to detach+attach while we're not attached yet.
9210 */
9211 if (task->state == TASK_NEW)
9212 ret = -EINVAL;
9213 raw_spin_unlock_irq(&task->pi_lock);
9214
9215 if (ret)
9216 break;
Tejun Heobb9d97b2011-12-12 18:12:21 -08009217 }
Peter Zijlstra7dc603c2016-06-16 13:29:28 +02009218 return ret;
Ben Blumbe367d02009-09-23 15:56:31 -07009219}
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009220
Tejun Heo1f7dd3e52015-12-03 10:18:21 -05009221static void cpu_cgroup_attach(struct cgroup_taskset *tset)
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009222{
Tejun Heobb9d97b2011-12-12 18:12:21 -08009223 struct task_struct *task;
Tejun Heo1f7dd3e52015-12-03 10:18:21 -05009224 struct cgroup_subsys_state *css;
Tejun Heobb9d97b2011-12-12 18:12:21 -08009225
Tejun Heo1f7dd3e52015-12-03 10:18:21 -05009226 cgroup_taskset_for_each(task, css, tset)
Tejun Heobb9d97b2011-12-12 18:12:21 -08009227 sched_move_task(task);
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009228}
9229
Peter Zijlstra052f1dc2008-02-13 15:45:40 +01009230#ifdef CONFIG_FAIR_GROUP_SCHED
Tejun Heo182446d2013-08-08 20:11:24 -04009231static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
9232 struct cftype *cftype, u64 shareval)
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009233{
Konstantin Khlebnikovc58f0e82019-02-27 11:10:18 +03009234 if (shareval > scale_load_down(ULONG_MAX))
9235 shareval = MAX_SHARES;
Tejun Heo182446d2013-08-08 20:11:24 -04009236 return sched_group_set_shares(css_tg(css), scale_load(shareval));
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009237}
9238
Tejun Heo182446d2013-08-08 20:11:24 -04009239static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
9240 struct cftype *cft)
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009241{
Tejun Heo182446d2013-08-08 20:11:24 -04009242 struct task_group *tg = css_tg(css);
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009243
Nikhil Raoc8b28112011-05-18 14:37:48 -07009244 return (u64) scale_load_down(tg->shares);
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009245}
Paul Turnerab84d312011-07-21 09:43:28 -07009246
9247#ifdef CONFIG_CFS_BANDWIDTH
Paul Turnera790de92011-07-21 09:43:29 -07009248static DEFINE_MUTEX(cfs_constraints_mutex);
9249
Paul Turnerab84d312011-07-21 09:43:28 -07009250const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
9251const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
9252
Paul Turnera790de92011-07-21 09:43:29 -07009253static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
9254
Paul Turnerab84d312011-07-21 09:43:28 -07009255static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9256{
Paul Turner56f570e2011-11-07 20:26:33 -08009257 int i, ret = 0, runtime_enabled, runtime_was_enabled;
Peter Zijlstra029632f2011-10-25 10:00:11 +02009258 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
Paul Turnerab84d312011-07-21 09:43:28 -07009259
9260 if (tg == &root_task_group)
9261 return -EINVAL;
9262
9263 /*
9264	 * Ensure we have at least some amount of bandwidth every period. This is
9265 * to prevent reaching a state of large arrears when throttled via
9266 * entity_tick() resulting in prolonged exit starvation.
9267 */
9268 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
9269 return -EINVAL;
9270
9271 /*
9272	 * Likewise, bound things on the other side by preventing insane quota
9273 * periods. This also allows us to normalize in computing quota
9274 * feasibility.
9275 */
9276 if (period > max_cfs_quota_period)
9277 return -EINVAL;
9278
Kirill Tkhai0e59bda2014-06-25 12:19:42 +04009279 /*
9280 * Prevent race between setting of cfs_rq->runtime_enabled and
9281 * unthrottle_offline_cfs_rqs().
9282 */
9283 get_online_cpus();
Paul Turnera790de92011-07-21 09:43:29 -07009284 mutex_lock(&cfs_constraints_mutex);
9285 ret = __cfs_schedulable(tg, period, quota);
9286 if (ret)
9287 goto out_unlock;
9288
Paul Turner58088ad2011-07-21 09:43:31 -07009289 runtime_enabled = quota != RUNTIME_INF;
Paul Turner56f570e2011-11-07 20:26:33 -08009290 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
Ben Segall1ee14e62013-10-16 11:16:12 -07009291 /*
9292 * If we need to toggle cfs_bandwidth_used, off->on must occur
9293 * before making related changes, and on->off must occur afterwards
9294 */
9295 if (runtime_enabled && !runtime_was_enabled)
9296 cfs_bandwidth_usage_inc();
Paul Turnerab84d312011-07-21 09:43:28 -07009297 raw_spin_lock_irq(&cfs_b->lock);
9298 cfs_b->period = ns_to_ktime(period);
9299 cfs_b->quota = quota;
Paul Turner58088ad2011-07-21 09:43:31 -07009300
Paul Turnera9cf55b2011-07-21 09:43:32 -07009301 __refill_cfs_bandwidth_runtime(cfs_b);
Paul Turner58088ad2011-07-21 09:43:31 -07009302 /* restart the period timer (if active) to handle new period expiry */
Peter Zijlstra77a4d1a2015-04-15 11:41:57 +02009303 if (runtime_enabled)
9304 start_cfs_bandwidth(cfs_b);
Paul Turnerab84d312011-07-21 09:43:28 -07009305 raw_spin_unlock_irq(&cfs_b->lock);
9306
Kirill Tkhai0e59bda2014-06-25 12:19:42 +04009307 for_each_online_cpu(i) {
Paul Turnerab84d312011-07-21 09:43:28 -07009308 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
Peter Zijlstra029632f2011-10-25 10:00:11 +02009309 struct rq *rq = cfs_rq->rq;
Paul Turnerab84d312011-07-21 09:43:28 -07009310
9311 raw_spin_lock_irq(&rq->lock);
Paul Turner58088ad2011-07-21 09:43:31 -07009312 cfs_rq->runtime_enabled = runtime_enabled;
Paul Turnerab84d312011-07-21 09:43:28 -07009313 cfs_rq->runtime_remaining = 0;
Paul Turner671fd9d2011-07-21 09:43:34 -07009314
Peter Zijlstra029632f2011-10-25 10:00:11 +02009315 if (cfs_rq->throttled)
Paul Turner671fd9d2011-07-21 09:43:34 -07009316 unthrottle_cfs_rq(cfs_rq);
Paul Turnerab84d312011-07-21 09:43:28 -07009317 raw_spin_unlock_irq(&rq->lock);
9318 }
Ben Segall1ee14e62013-10-16 11:16:12 -07009319 if (runtime_was_enabled && !runtime_enabled)
9320 cfs_bandwidth_usage_dec();
Paul Turnera790de92011-07-21 09:43:29 -07009321out_unlock:
9322 mutex_unlock(&cfs_constraints_mutex);
Kirill Tkhai0e59bda2014-06-25 12:19:42 +04009323 put_online_cpus();
Paul Turnerab84d312011-07-21 09:43:28 -07009324
Paul Turnera790de92011-07-21 09:43:29 -07009325 return ret;
Paul Turnerab84d312011-07-21 09:43:28 -07009326}
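/*
 * Illustrative cgroup usage (the mount point below is an assumption, not
 * taken from this file):
 *
 *	echo 100000 > /sys/fs/cgroup/cpu/grp/cpu.cfs_period_us
 *	echo  50000 > /sys/fs/cgroup/cpu/grp/cpu.cfs_quota_us
 *
 * caps 'grp' at roughly half a CPU per 100ms period. Both values must be at
 * least min_cfs_quota_period (1ms) and the period at most
 * max_cfs_quota_period (1s), as enforced above.
 */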
9327
9328int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
9329{
9330 u64 quota, period;
9331
Peter Zijlstra029632f2011-10-25 10:00:11 +02009332 period = ktime_to_ns(tg->cfs_bandwidth.period);
Paul Turnerab84d312011-07-21 09:43:28 -07009333 if (cfs_quota_us < 0)
9334 quota = RUNTIME_INF;
Konstantin Khlebnikov5e4ea982019-02-27 11:10:20 +03009335 else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
Paul Turnerab84d312011-07-21 09:43:28 -07009336 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
Konstantin Khlebnikov5e4ea982019-02-27 11:10:20 +03009337 else
9338 return -EINVAL;
Paul Turnerab84d312011-07-21 09:43:28 -07009339
9340 return tg_set_cfs_bandwidth(tg, period, quota);
9341}
9342
9343long tg_get_cfs_quota(struct task_group *tg)
9344{
9345 u64 quota_us;
9346
Peter Zijlstra029632f2011-10-25 10:00:11 +02009347 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
Paul Turnerab84d312011-07-21 09:43:28 -07009348 return -1;
9349
Peter Zijlstra029632f2011-10-25 10:00:11 +02009350 quota_us = tg->cfs_bandwidth.quota;
Paul Turnerab84d312011-07-21 09:43:28 -07009351 do_div(quota_us, NSEC_PER_USEC);
9352
9353 return quota_us;
9354}
9355
9356int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
9357{
9358 u64 quota, period;
9359
Konstantin Khlebnikov5e4ea982019-02-27 11:10:20 +03009360 if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
9361 return -EINVAL;
9362
Paul Turnerab84d312011-07-21 09:43:28 -07009363 period = (u64)cfs_period_us * NSEC_PER_USEC;
Peter Zijlstra029632f2011-10-25 10:00:11 +02009364 quota = tg->cfs_bandwidth.quota;
Paul Turnerab84d312011-07-21 09:43:28 -07009365
Paul Turnerab84d312011-07-21 09:43:28 -07009366 return tg_set_cfs_bandwidth(tg, period, quota);
9367}
9368
9369long tg_get_cfs_period(struct task_group *tg)
9370{
9371 u64 cfs_period_us;
9372
Peter Zijlstra029632f2011-10-25 10:00:11 +02009373 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
Paul Turnerab84d312011-07-21 09:43:28 -07009374 do_div(cfs_period_us, NSEC_PER_USEC);
9375
9376 return cfs_period_us;
9377}
9378
Tejun Heo182446d2013-08-08 20:11:24 -04009379static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
9380 struct cftype *cft)
Paul Turnerab84d312011-07-21 09:43:28 -07009381{
Tejun Heo182446d2013-08-08 20:11:24 -04009382 return tg_get_cfs_quota(css_tg(css));
Paul Turnerab84d312011-07-21 09:43:28 -07009383}
9384
Tejun Heo182446d2013-08-08 20:11:24 -04009385static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
9386 struct cftype *cftype, s64 cfs_quota_us)
Paul Turnerab84d312011-07-21 09:43:28 -07009387{
Tejun Heo182446d2013-08-08 20:11:24 -04009388 return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
Paul Turnerab84d312011-07-21 09:43:28 -07009389}
9390
Tejun Heo182446d2013-08-08 20:11:24 -04009391static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
9392 struct cftype *cft)
Paul Turnerab84d312011-07-21 09:43:28 -07009393{
Tejun Heo182446d2013-08-08 20:11:24 -04009394 return tg_get_cfs_period(css_tg(css));
Paul Turnerab84d312011-07-21 09:43:28 -07009395}
9396
Tejun Heo182446d2013-08-08 20:11:24 -04009397static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
9398 struct cftype *cftype, u64 cfs_period_us)
Paul Turnerab84d312011-07-21 09:43:28 -07009399{
Tejun Heo182446d2013-08-08 20:11:24 -04009400 return tg_set_cfs_period(css_tg(css), cfs_period_us);
Paul Turnerab84d312011-07-21 09:43:28 -07009401}
9402
Paul Turnera790de92011-07-21 09:43:29 -07009403struct cfs_schedulable_data {
9404 struct task_group *tg;
9405 u64 period, quota;
9406};
9407
9408/*
9409 * normalize group quota/period to be quota/max_period
9410 * note: units are usecs
9411 */
9412static u64 normalize_cfs_quota(struct task_group *tg,
9413 struct cfs_schedulable_data *d)
9414{
9415 u64 quota, period;
9416
9417 if (tg == d->tg) {
9418 period = d->period;
9419 quota = d->quota;
9420 } else {
9421 period = tg_get_cfs_period(tg);
9422 quota = tg_get_cfs_quota(tg);
9423 }
9424
9425 /* note: these should typically be equivalent */
9426 if (quota == RUNTIME_INF || quota == -1)
9427 return RUNTIME_INF;
9428
9429 return to_ratio(period, quota);
9430}
9431
9432static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
9433{
9434 struct cfs_schedulable_data *d = data;
Peter Zijlstra029632f2011-10-25 10:00:11 +02009435 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
Paul Turnera790de92011-07-21 09:43:29 -07009436 s64 quota = 0, parent_quota = -1;
9437
9438 if (!tg->parent) {
9439 quota = RUNTIME_INF;
9440 } else {
Peter Zijlstra029632f2011-10-25 10:00:11 +02009441 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
Paul Turnera790de92011-07-21 09:43:29 -07009442
9443 quota = normalize_cfs_quota(tg, d);
Zhihui Zhang9c58c792014-09-20 21:24:36 -04009444 parent_quota = parent_b->hierarchical_quota;
Paul Turnera790de92011-07-21 09:43:29 -07009445
9446 /*
9447 * ensure max(child_quota) <= parent_quota, inherit when no
9448 * limit is set
9449 */
9450 if (quota == RUNTIME_INF)
9451 quota = parent_quota;
9452 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
9453 return -EINVAL;
9454 }
Zhihui Zhang9c58c792014-09-20 21:24:36 -04009455 cfs_b->hierarchical_quota = quota;
Paul Turnera790de92011-07-21 09:43:29 -07009456
9457 return 0;
9458}
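/*
 * Example of the hierarchical check above (illustrative numbers): a parent
 * with quota=100ms/period=100ms normalizes to one CPU's worth of bandwidth;
 * a child asking for quota=200ms/period=100ms normalizes to two CPUs' worth
 * and is rejected with -EINVAL, whereas a child with no quota (RUNTIME_INF)
 * simply inherits parent_quota.
 */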
9459
9460static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
9461{
Paul Turner82774342011-07-21 09:43:35 -07009462 int ret;
Paul Turnera790de92011-07-21 09:43:29 -07009463 struct cfs_schedulable_data data = {
9464 .tg = tg,
9465 .period = period,
9466 .quota = quota,
9467 };
9468
9469 if (quota != RUNTIME_INF) {
9470 do_div(data.period, NSEC_PER_USEC);
9471 do_div(data.quota, NSEC_PER_USEC);
9472 }
9473
Paul Turner82774342011-07-21 09:43:35 -07009474 rcu_read_lock();
9475 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
9476 rcu_read_unlock();
9477
9478 return ret;
Paul Turnera790de92011-07-21 09:43:29 -07009479}
Nikhil Raoe8da1b12011-07-21 09:43:40 -07009480
Tejun Heo2da8ca82013-12-05 12:28:04 -05009481static int cpu_stats_show(struct seq_file *sf, void *v)
Nikhil Raoe8da1b12011-07-21 09:43:40 -07009482{
Tejun Heo2da8ca82013-12-05 12:28:04 -05009483 struct task_group *tg = css_tg(seq_css(sf));
Peter Zijlstra029632f2011-10-25 10:00:11 +02009484 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
Nikhil Raoe8da1b12011-07-21 09:43:40 -07009485
Tejun Heo44ffc752013-12-05 12:28:01 -05009486 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
9487 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
9488 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
Nikhil Raoe8da1b12011-07-21 09:43:40 -07009489
9490 return 0;
9491}
Paul Turnerab84d312011-07-21 09:43:28 -07009492#endif /* CONFIG_CFS_BANDWIDTH */
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02009493#endif /* CONFIG_FAIR_GROUP_SCHED */
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009494
Peter Zijlstra052f1dc2008-02-13 15:45:40 +01009495#ifdef CONFIG_RT_GROUP_SCHED
Tejun Heo182446d2013-08-08 20:11:24 -04009496static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
9497 struct cftype *cft, s64 val)
Peter Zijlstra6f505b12008-01-25 21:08:30 +01009498{
Tejun Heo182446d2013-08-08 20:11:24 -04009499 return sched_group_set_rt_runtime(css_tg(css), val);
Peter Zijlstra6f505b12008-01-25 21:08:30 +01009500}
9501
Tejun Heo182446d2013-08-08 20:11:24 -04009502static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
9503 struct cftype *cft)
Peter Zijlstra6f505b12008-01-25 21:08:30 +01009504{
Tejun Heo182446d2013-08-08 20:11:24 -04009505 return sched_group_rt_runtime(css_tg(css));
Peter Zijlstra6f505b12008-01-25 21:08:30 +01009506}
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02009507
Tejun Heo182446d2013-08-08 20:11:24 -04009508static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
9509 struct cftype *cftype, u64 rt_period_us)
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02009510{
Tejun Heo182446d2013-08-08 20:11:24 -04009511 return sched_group_set_rt_period(css_tg(css), rt_period_us);
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02009512}
9513
Tejun Heo182446d2013-08-08 20:11:24 -04009514static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
9515 struct cftype *cft)
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02009516{
Tejun Heo182446d2013-08-08 20:11:24 -04009517 return sched_group_rt_period(css_tg(css));
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02009518}
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02009519#endif /* CONFIG_RT_GROUP_SCHED */
Peter Zijlstra6f505b12008-01-25 21:08:30 +01009520
Paul Menagefe5c7cc2007-10-29 21:18:11 +01009521static struct cftype cpu_files[] = {
Peter Zijlstra052f1dc2008-02-13 15:45:40 +01009522#ifdef CONFIG_FAIR_GROUP_SCHED
Paul Menagefe5c7cc2007-10-29 21:18:11 +01009523 {
9524 .name = "shares",
Paul Menagef4c753b2008-04-29 00:59:56 -07009525 .read_u64 = cpu_shares_read_u64,
9526 .write_u64 = cpu_shares_write_u64,
Paul Menagefe5c7cc2007-10-29 21:18:11 +01009527 },
Peter Zijlstra052f1dc2008-02-13 15:45:40 +01009528#endif
Paul Turnerab84d312011-07-21 09:43:28 -07009529#ifdef CONFIG_CFS_BANDWIDTH
9530 {
9531 .name = "cfs_quota_us",
9532 .read_s64 = cpu_cfs_quota_read_s64,
9533 .write_s64 = cpu_cfs_quota_write_s64,
9534 },
9535 {
9536 .name = "cfs_period_us",
9537 .read_u64 = cpu_cfs_period_read_u64,
9538 .write_u64 = cpu_cfs_period_write_u64,
9539 },
Nikhil Raoe8da1b12011-07-21 09:43:40 -07009540 {
9541 .name = "stat",
Tejun Heo2da8ca82013-12-05 12:28:04 -05009542 .seq_show = cpu_stats_show,
Nikhil Raoe8da1b12011-07-21 09:43:40 -07009543 },
Paul Turnerab84d312011-07-21 09:43:28 -07009544#endif
Peter Zijlstra052f1dc2008-02-13 15:45:40 +01009545#ifdef CONFIG_RT_GROUP_SCHED
Peter Zijlstra6f505b12008-01-25 21:08:30 +01009546 {
Peter Zijlstra9f0c1e52008-02-13 15:45:39 +01009547 .name = "rt_runtime_us",
Paul Menage06ecb272008-04-29 01:00:06 -07009548 .read_s64 = cpu_rt_runtime_read,
9549 .write_s64 = cpu_rt_runtime_write,
Peter Zijlstra6f505b12008-01-25 21:08:30 +01009550 },
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02009551 {
9552 .name = "rt_period_us",
Paul Menagef4c753b2008-04-29 00:59:56 -07009553 .read_u64 = cpu_rt_period_read_uint,
9554 .write_u64 = cpu_rt_period_write_uint,
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02009555 },
Peter Zijlstra052f1dc2008-02-13 15:45:40 +01009556#endif
Tejun Heo4baf6e32012-04-01 12:09:55 -07009557 { } /* terminate */
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009558};
9559
Tejun Heo073219e2014-02-08 10:36:58 -05009560struct cgroup_subsys cpu_cgrp_subsys = {
Tejun Heo92fb9742012-11-19 08:13:38 -08009561 .css_alloc = cpu_cgroup_css_alloc,
Konstantin Khlebnikov62b57762017-02-08 14:27:27 +03009562 .css_online = cpu_cgroup_css_online,
Peter Zijlstra2f5177f2016-03-16 16:22:45 +01009563 .css_released = cpu_cgroup_css_released,
Tejun Heo92fb9742012-11-19 08:13:38 -08009564 .css_free = cpu_cgroup_css_free,
Kirill Tkhaieeb61e52014-10-27 14:18:25 +04009565 .fork = cpu_cgroup_fork,
Tejun Heobb9d97b2011-12-12 18:12:21 -08009566 .can_attach = cpu_cgroup_can_attach,
9567 .attach = cpu_cgroup_attach,
Rom Lemarchand6a97fd92015-05-26 17:00:44 -07009568 .allow_attach = subsys_cgroup_allow_attach,
Tejun Heo55779642014-07-15 11:05:09 -04009569 .legacy_cftypes = cpu_files,
Tejun Heob38e42e2016-02-23 10:00:50 -05009570 .early_init = true,
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009571};
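/*
 * For reference, the legacy cgroup files registered above map onto this
 * file as follows: cpu.shares -> sched_group_set_shares(), cpu.cfs_quota_us
 * and cpu.cfs_period_us -> tg_set_cfs_bandwidth(), cpu.stat ->
 * cpu_stats_show(), cpu.rt_runtime_us and cpu.rt_period_us ->
 * tg_set_rt_bandwidth().
 */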
9572
Peter Zijlstra052f1dc2008-02-13 15:45:40 +01009573#endif /* CONFIG_CGROUP_SCHED */
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009574
Paul E. McKenneyb637a322012-09-19 16:58:38 -07009575void dump_cpu_task(int cpu)
9576{
9577 pr_info("Task dump for CPU %d:\n", cpu);
9578 sched_show_task(cpu_curr(cpu));
9579}
Andi Kleened82b8a2015-11-29 20:59:43 -08009580
9581/*
9582 * Nice levels are multiplicative, with a gentle 10% change for every
9583 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
9584 * nice 1, it will get ~10% less CPU time than another CPU-bound task
9585 * that remained on nice 0.
9586 *
9587 * The "10% effect" is relative and cumulative: from _any_ nice level,
9588 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
9589 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
9590 * If a task goes up by ~10% and another task goes down by ~10% then
9591 * the relative distance between them is ~25%.)
9592 */
9593const int sched_prio_to_weight[40] = {
9594 /* -20 */ 88761, 71755, 56483, 46273, 36291,
9595 /* -15 */ 29154, 23254, 18705, 14949, 11916,
9596 /* -10 */ 9548, 7620, 6100, 4904, 3906,
9597 /* -5 */ 3121, 2501, 1991, 1586, 1277,
9598 /* 0 */ 1024, 820, 655, 526, 423,
9599 /* 5 */ 335, 272, 215, 172, 137,
9600 /* 10 */ 110, 87, 70, 56, 45,
9601 /* 15 */ 36, 29, 23, 18, 15,
9602};
9603
9604/*
9605 * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
9606 *
9607 * In cases where the weight does not change often, we can use the
9608	 * precalculated inverse to speed up arithmetic by turning divisions
9609 * into multiplications:
9610 */
9611const u32 sched_prio_to_wmult[40] = {
9612 /* -20 */ 48388, 59856, 76040, 92818, 118348,
9613 /* -15 */ 147320, 184698, 229616, 287308, 360437,
9614 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
9615 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
9616 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
9617 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
9618 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
9619 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
9620};
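/*
 * Worked example for the two tables above: sched_prio_to_weight[20] (nice 0)
 * is 1024 and sched_prio_to_weight[21] (nice +1) is 820, so 1024/820 ~= 1.25,
 * the ~25% relative step described above; the matching inverse
 * sched_prio_to_wmult[20] is 2^32 / 1024 = 4194304.
 */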
Vikram Mulukutlad056dbc2017-02-07 18:58:07 -08009621
Joonwoo Parkf7d6cd42017-01-17 15:19:43 -08009622#ifdef CONFIG_SCHED_WALT
Vikram Mulukutlad056dbc2017-02-07 18:58:07 -08009623/*
9624	 * sched_exit() - Set EXITING_TASK_MARKER in the task's ravg.sum_history
9625 *
9626 * Stop accounting (exiting) task's future cpu usage
9627 *
9628 * We need this so that reset_all_windows_stats() can function correctly.
9629 * reset_all_window_stats() depends on do_each_thread/for_each_thread task
9630 * iterators to reset *all* task's statistics. Exiting tasks however become
9631	 * invisible to those iterators. sched_exit() is called on an exiting task prior
9632 * to being removed from task_list, which will let reset_all_window_stats()
9633 * function correctly.
9634 */
9635void sched_exit(struct task_struct *p)
9636{
9637 struct rq_flags rf;
9638 struct rq *rq;
9639 u64 wallclock;
9640
9641 sched_set_group_id(p, 0);
9642
9643 rq = task_rq_lock(p, &rf);
9644
9645 /* rq->curr == p */
Pavankumar Kondetifaa04442018-06-25 16:13:39 +05309646 wallclock = sched_ktime_clock();
Vikram Mulukutlad056dbc2017-02-07 18:58:07 -08009647 update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
9648 dequeue_task(rq, p, 0);
Pavankumar Kondeti0cebff02017-07-21 16:28:12 +05309649 /*
9650 * task's contribution is already removed from the
9651 * cumulative window demand in dequeue. As the
9652 * task's stats are reset, the next enqueue does
9653 * not change the cumulative window demand.
9654 */
Pavankumar Kondetibc4cef72017-07-21 19:15:19 +05309655 reset_task_stats(p);
Vikram Mulukutlad056dbc2017-02-07 18:58:07 -08009656 p->ravg.mark_start = wallclock;
9657 p->ravg.sum_history[0] = EXITING_TASK_MARKER;
Vikram Mulukutlad056dbc2017-02-07 18:58:07 -08009658
9659 enqueue_task(rq, p, 0);
9660 clear_ed_task(p, rq);
9661 task_rq_unlock(rq, p, &rf);
Pavankumar Kondeti4d091222018-01-10 15:15:41 +05309662 free_task_load_ptrs(p);
Vikram Mulukutlad056dbc2017-02-07 18:58:07 -08009663}
Joonwoo Parkf7d6cd42017-01-17 15:19:43 -08009664#endif /* CONFIG_SCHED_WALT */
Syed Rameez Mustafae14a2332017-05-19 14:42:35 -07009665
Olav Hauganad0b1412017-09-20 11:56:05 -07009666__read_mostly bool sched_predl = 1;
Joonwoo Parka5e601e2017-09-20 16:13:03 -07009667
9668#ifdef CONFIG_SCHED_CORE_ROTATE
9669int
9670find_first_cpu_bit(struct task_struct *p, const cpumask_t *search_cpus,
9671 struct sched_group *sg_target, bool *avoid_prev_cpu,
9672 bool *do_rotate, struct find_first_cpu_bit_env *env)
9673{
9674 int i = -1;
9675 unsigned long mcc;
9676 int cpu = smp_processor_id();
9677
9678 mcc = cpu_rq(cpu)->rd->max_cpu_capacity.val;
9679
9680 /* do rotation only for big CPUs. */
9681 *do_rotate = (cpumask_first(search_cpus) < nr_cpu_ids &&
9682 capacity_orig_of(cpumask_first(search_cpus)) == mcc);
9683
9684 if (*do_rotate) {
9685 if (time_before_eq(jiffies, *env->avoid_prev_cpu_last +
9686 env->interval))
9687 return *env->rotate_cpu_start;
9688
9689 spin_lock(env->rotate_lock);
9690 if (time_after(jiffies, *env->avoid_prev_cpu_last +
9691 env->interval)) {
9692 cpumask_t tmpmask;
9693
9694 *env->avoid_prev_cpu_last = jiffies;
9695 *avoid_prev_cpu = true;
9696
9697 cpumask_copy(&tmpmask, sched_group_cpus(sg_target));
9698 cpumask_andnot(&tmpmask, &tmpmask, cpu_isolated_mask);
9699
9700 i = cpumask_next(*env->rotate_cpu_start, &tmpmask);
9701 if (i >= nr_cpu_ids)
9702 i = cpumask_first(&tmpmask) - 1;
9703 /* Change start CPU every interval. */
9704 *env->rotate_cpu_start = i;
9705 } else {
9706 i = *env->rotate_cpu_start;
9707 }
9708 spin_unlock(env->rotate_lock);
9709 }
9710
9711 return i;
9712}
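/*
 * Rough sketch of the rotation scheme above: when the search set starts on
 * the maximum-capacity (big) CPUs, the returned start CPU advances to the
 * next non-isolated CPU of the target group at most once per env->interval
 * jiffies, and avoid_prev_cpu is set on that rotation so successive
 * placements are spread across the cluster rather than packed on one CPU.
 */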
9713#endif