#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/pm.h>
#include <linux/tick.h>
#include <linux/random.h>
#include <linux/user-return-notifier.h>
#include <linux/dmi.h>
#include <linux/utsname.h>
#include <linux/stackprotector.h>
#include <linux/tick.h>
#include <linux/cpuidle.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <asm/syscalls.h>
#include <asm/idle.h>
#include <asm/uaccess.h>
#include <asm/mwait.h>
#include <asm/fpu/internal.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/vm86.h>
#include <asm/switch_to.h>
#include <asm/spec-ctrl.h>

#include "process.h"

/*
 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
 * no more per-task TSS's. The TSS size is kept cacheline-aligned
 * so they are allowed to end up in the .data..cacheline_aligned
 * section. Since TSS's are completely CPU-local, we want them
 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
 */
__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = {
        .x86_tss = {
                .sp0 = TOP_OF_INIT_STACK,
#ifdef CONFIG_X86_32
                .ss0 = __KERNEL_DS,
                .ss1 = __KERNEL_CS,
                .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
#endif
        },
#ifdef CONFIG_X86_32
        /*
         * Note that the .io_bitmap member must be extra-big. This is because
         * the CPU will access an additional byte beyond the end of the IO
         * permission bitmap. The extra byte must be all 1 bits, and must
         * be within the limit.
         */
        .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
#endif
#ifdef CONFIG_X86_32
        .SYSENTER_stack_canary = STACK_END_MAGIC,
#endif
};
EXPORT_PER_CPU_SYMBOL(cpu_tss);

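/*
 * On x86-64, interested code can register idle notifiers to get a callback
 * when a CPU enters (IDLE_START) or leaves (IDLE_END) the idle loop; see
 * enter_idle()/exit_idle() below.
 */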
#ifdef CONFIG_X86_64
static DEFINE_PER_CPU(unsigned char, is_idle);
static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
        atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
        atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
#endif

/*
 * This gets called so that we can store lazy state into memory and copy the
 * current task into the new thread.
 */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
        memcpy(dst, src, arch_task_struct_size);
#ifdef CONFIG_VM86
        dst->thread.vm86 = NULL;
#endif

        return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
}

/*
 * Free current thread data structures etc.
 */
void exit_thread(struct task_struct *tsk)
{
        struct thread_struct *t = &tsk->thread;
        unsigned long *bp = t->io_bitmap_ptr;
        struct fpu *fpu = &t->fpu;

        if (bp) {
                struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());

                t->io_bitmap_ptr = NULL;
                clear_thread_flag(TIF_IO_BITMAP);
                /*
                 * Careful, clear this in the TSS too:
                 */
                memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
                t->io_bitmap_max = 0;
                put_cpu();
                kfree(bp);
        }

        free_vm86(t);

        fpu__drop(fpu);
}

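/*
 * Reset the thread state when a task starts executing a new program
 * (called from the exec path): drop hardware breakpoints, clear the TLS
 * slots and reinitialize the FPU state.
 */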
void flush_thread(void)
{
        struct task_struct *tsk = current;

        flush_ptrace_hw_breakpoint(tsk);
        memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));

        fpu__clear(&tsk->thread.fpu);
}

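/*
 * TIF_NOTSC is backed by CR4.TSD: when the bit is set, RDTSC(P) is only
 * allowed at CPL 0, so a user-space read of the TSC faults and the task
 * gets SIGSEGV, matching PR_TSC_SIGSEGV below.
 */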
void disable_TSC(void)
{
        preempt_disable();
        if (!test_and_set_thread_flag(TIF_NOTSC))
                /*
                 * Must flip the CPU state synchronously with
                 * TIF_NOTSC in the current running context.
                 */
                cr4_set_bits(X86_CR4_TSD);
        preempt_enable();
}

static void enable_TSC(void)
{
        preempt_disable();
        if (test_and_clear_thread_flag(TIF_NOTSC))
                /*
                 * Must flip the CPU state synchronously with
                 * TIF_NOTSC in the current running context.
                 */
                cr4_clear_bits(X86_CR4_TSD);
        preempt_enable();
}

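/*
 * get_tsc_mode()/set_tsc_mode() are the back ends for the PR_GET_TSC and
 * PR_SET_TSC prctl() operations.
 */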
int get_tsc_mode(unsigned long adr)
{
        unsigned int val;

        if (test_thread_flag(TIF_NOTSC))
                val = PR_TSC_SIGSEGV;
        else
                val = PR_TSC_ENABLE;

        return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
        if (val == PR_TSC_SIGSEGV)
                disable_TSC();
        else if (val == PR_TSC_ENABLE)
                enable_TSC();
        else
                return -EINVAL;

        return 0;
}

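/*
 * Keep the I/O permission bitmap in the per-CPU TSS in sync with the
 * incoming task: copy the next task's bitmap if it uses one, otherwise
 * invalidate whatever the previous task left behind.
 */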
static inline void switch_to_bitmap(struct thread_struct *prev,
                                    struct thread_struct *next,
                                    unsigned long tifp, unsigned long tifn)
{
        struct tss_struct *tss = this_cpu_ptr(&cpu_tss);

        if (tifn & _TIF_IO_BITMAP) {
                /*
                 * Copy the relevant range of the IO bitmap.
                 * Normally this is 128 bytes or less:
                 */
                memcpy(tss->io_bitmap, next->io_bitmap_ptr,
                       max(prev->io_bitmap_max, next->io_bitmap_max));
        } else if (tifp & _TIF_IO_BITMAP) {
                /*
                 * Clear any possible leftover bits:
                 */
                memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
        }
}

#ifdef CONFIG_SMP

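/*
 * Per-CPU state used to coordinate the LS_CFG based SSBD control between
 * SMT siblings on AMD Zen, where the MSR is effectively core-scoped: the
 * SSBD disable is reference-counted so only the first enable and the last
 * disable on a core actually write the MSR (see amd_set_core_ssb_state()).
 */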
struct ssb_state {
        struct ssb_state        *shared_state;
        raw_spinlock_t          lock;
        unsigned int            disable_state;
        unsigned long           local_state;
};

#define LSTATE_SSB      0

static DEFINE_PER_CPU(struct ssb_state, ssb_state);

void speculative_store_bypass_ht_init(void)
{
        struct ssb_state *st = this_cpu_ptr(&ssb_state);
        unsigned int this_cpu = smp_processor_id();
        unsigned int cpu;

        st->local_state = 0;

        /*
         * Shared state setup happens once on the first bringup
         * of the CPU. It's not destroyed on CPU hotunplug.
         */
        if (st->shared_state)
                return;

        raw_spin_lock_init(&st->lock);

        /*
         * Go over HT siblings and check whether one of them has set up the
         * shared state pointer already.
         */
        for_each_cpu(cpu, topology_sibling_cpumask(this_cpu)) {
                if (cpu == this_cpu)
                        continue;

                if (!per_cpu(ssb_state, cpu).shared_state)
                        continue;

                /* Link it to the state of the sibling: */
                st->shared_state = per_cpu(ssb_state, cpu).shared_state;
                return;
        }

        /*
         * First HT sibling to come up on the core. Link shared state of
         * the first HT sibling to itself. The siblings on the same core
         * which come up later will see the shared state pointer and link
         * themselves to the state of this CPU.
         */
        st->shared_state = st;
}

/*
 * Logic is: the first HT sibling enables SSBD for both siblings in the core,
 * and the last sibling to disable it disables it for the whole core. This is
 * how MSR_SPEC_CTRL works in "hardware":
 *
 *  CORE_SPEC_CTRL = THREAD0_SPEC_CTRL | THREAD1_SPEC_CTRL
 */
static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
{
        struct ssb_state *st = this_cpu_ptr(&ssb_state);
        u64 msr = x86_amd_ls_cfg_base;

        if (!static_cpu_has(X86_FEATURE_ZEN)) {
                msr |= ssbd_tif_to_amd_ls_cfg(tifn);
                wrmsrl(MSR_AMD64_LS_CFG, msr);
                return;
        }

        if (tifn & _TIF_SSBD) {
                /*
                 * Since this can race with prctl(), block reentry on the
                 * same CPU.
                 */
                if (__test_and_set_bit(LSTATE_SSB, &st->local_state))
                        return;

                msr |= x86_amd_ls_cfg_ssbd_mask;

                raw_spin_lock(&st->shared_state->lock);
                /* First sibling enables SSBD: */
                if (!st->shared_state->disable_state)
                        wrmsrl(MSR_AMD64_LS_CFG, msr);
                st->shared_state->disable_state++;
                raw_spin_unlock(&st->shared_state->lock);
        } else {
                if (!__test_and_clear_bit(LSTATE_SSB, &st->local_state))
                        return;

                raw_spin_lock(&st->shared_state->lock);
                st->shared_state->disable_state--;
                if (!st->shared_state->disable_state)
                        wrmsrl(MSR_AMD64_LS_CFG, msr);
                raw_spin_unlock(&st->shared_state->lock);
        }
}
#else
static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
{
        u64 msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn);

        wrmsrl(MSR_AMD64_LS_CFG, msr);
}
#endif

static __always_inline void amd_set_ssb_virt_state(unsigned long tifn)
{
        /*
         * SSBD has the same definition in SPEC_CTRL and VIRT_SPEC_CTRL,
         * so ssbd_tif_to_spec_ctrl() just works.
         */
        wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn));
}

/*
 * Update the MSRs managing speculation control, during context switch.
 *
 * tifp: Previous task's thread flags
 * tifn: Next task's thread flags
 */
static __always_inline void __speculation_ctrl_update(unsigned long tifp,
                                                      unsigned long tifn)
{
        unsigned long tif_diff = tifp ^ tifn;
        u64 msr = x86_spec_ctrl_base;
        bool updmsr = false;

        /*
         * If TIF_SSBD is different, select the proper mitigation
         * method. Note that if SSBD mitigation is disabled or permanently
         * enabled this branch can't be taken because nothing can set
         * TIF_SSBD.
         */
        if (tif_diff & _TIF_SSBD) {
                if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) {
                        amd_set_ssb_virt_state(tifn);
                } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) {
                        amd_set_core_ssb_state(tifn);
                } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
                           static_cpu_has(X86_FEATURE_AMD_SSBD)) {
                        msr |= ssbd_tif_to_spec_ctrl(tifn);
                        updmsr = true;
                }
        }

        /*
         * Only evaluate TIF_SPEC_IB if conditional STIBP is enabled,
         * otherwise avoid the MSR write.
         */
        if (IS_ENABLED(CONFIG_SMP) &&
            static_branch_unlikely(&switch_to_cond_stibp)) {
                updmsr |= !!(tif_diff & _TIF_SPEC_IB);
                msr |= stibp_tif_to_spec_ctrl(tifn);
        }

        if (updmsr)
                wrmsrl(MSR_IA32_SPEC_CTRL, msr);
}

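/*
 * If a forced update is pending (TIF_SPEC_FORCE_UPDATE), re-sync TIF_SSBD
 * and TIF_SPEC_IB from the task's speculation prctl state, then return the
 * (possibly updated) thread flags.
 */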
static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk)
{
        if (test_and_clear_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE)) {
                if (task_spec_ssb_disable(tsk))
                        set_tsk_thread_flag(tsk, TIF_SSBD);
                else
                        clear_tsk_thread_flag(tsk, TIF_SSBD);

                if (task_spec_ib_disable(tsk))
                        set_tsk_thread_flag(tsk, TIF_SPEC_IB);
                else
                        clear_tsk_thread_flag(tsk, TIF_SPEC_IB);
        }
        /* Return the updated threadinfo flags */
        return task_thread_info(tsk)->flags;
}

void speculation_ctrl_update(unsigned long tif)
{
        /* Forced update. Make sure all relevant TIF flags are different */
        preempt_disable();
        __speculation_ctrl_update(~tif, tif);
        preempt_enable();
}

/* Called from seccomp/prctl update */
void speculation_ctrl_update_current(void)
{
        preempt_disable();
        speculation_ctrl_update(speculation_ctrl_update_tif(current));
        preempt_enable();
}

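/*
 * Handle the slow, TIF-dependent parts of a context switch: the I/O
 * permission bitmap, user-return notifiers, block stepping
 * (DEBUGCTLMSR_BTF), CR4.TSD and the speculation control MSRs.
 */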
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev, *next;
        unsigned long tifp, tifn;

        prev = &prev_p->thread;
        next = &next_p->thread;

        tifn = READ_ONCE(task_thread_info(next_p)->flags);
        tifp = READ_ONCE(task_thread_info(prev_p)->flags);
        switch_to_bitmap(prev, next, tifp, tifn);

        propagate_user_return_notify(prev_p, next_p);

        if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) &&
            arch_has_block_step()) {
                unsigned long debugctl, msk;

                rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
                debugctl &= ~DEBUGCTLMSR_BTF;
                msk = tifn & _TIF_BLOCKSTEP;
                debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT;
                wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
        }

        if ((tifp ^ tifn) & _TIF_NOTSC)
                cr4_toggle_bits(X86_CR4_TSD);

        if (likely(!((tifp | tifn) & _TIF_SPEC_FORCE_UPDATE))) {
                __speculation_ctrl_update(tifp, tifn);
        } else {
                speculation_ctrl_update_tif(prev_p);
                tifn = speculation_ctrl_update_tif(next_p);

                /* Enforce MSR update to ensure consistent state */
                __speculation_ctrl_update(~tifn, tifn);
        }
}

/*
 * Idle related variables and functions
 */
unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);

static void (*x86_idle)(void);

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
        BUG();
}
#endif

#ifdef CONFIG_X86_64
void enter_idle(void)
{
        this_cpu_write(is_idle, 1);
        atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
        if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
                return;
        atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
        /* idle loop has pid 0 */
        if (current->pid)
                return;
        __exit_idle();
}
#endif

void arch_cpu_idle_enter(void)
{
        local_touch_nmi();
        enter_idle();
}

void arch_cpu_idle_exit(void)
{
        __exit_idle();
}

void arch_cpu_idle_dead(void)
{
        play_dead();
}

/*
 * Called from the generic idle code.
 */
void arch_cpu_idle(void)
{
        x86_idle();
}

/*
 * We use this if we don't have any better idle routine.
 */
void __cpuidle default_idle(void)
{
        trace_cpu_idle_rcuidle(1, smp_processor_id());
        safe_halt();
        trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
}
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(default_idle);
#endif

#ifdef CONFIG_XEN
bool xen_set_default_idle(void)
{
        bool ret = !!x86_idle;

        x86_idle = default_idle;

        return ret;
}
#endif
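/*
 * Park this CPU on shutdown/reboot paths: mark it offline, mask its local
 * APIC, clear machine check state and halt forever.
 */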
void stop_this_cpu(void *dummy)
{
        local_irq_disable();
        /*
         * Remove this CPU:
         */
        set_cpu_online(smp_processor_id(), false);
        disable_local_APIC();
        mcheck_cpu_clear(this_cpu_ptr(&cpu_info));

        for (;;)
                halt();
}

bool amd_e400_c1e_detected;
EXPORT_SYMBOL(amd_e400_c1e_detected);

static cpumask_var_t amd_e400_c1e_mask;

void amd_e400_remove_cpu(int cpu)
{
        if (amd_e400_c1e_mask != NULL)
                cpumask_clear_cpu(cpu, amd_e400_c1e_mask);
}

/*
 * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt
 * pending message MSR. If we detect C1E, then we handle it the same
 * way as C3 power states (local apic timer and TSC stop)
 */
static void amd_e400_idle(void)
{
        if (!amd_e400_c1e_detected) {
                u32 lo, hi;

                rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);

                if (lo & K8_INTP_C1E_ACTIVE_MASK) {
                        amd_e400_c1e_detected = true;
                        if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
                                mark_tsc_unstable("TSC halt in AMD C1E");
                        pr_info("System has AMD C1E enabled\n");
                }
        }

        if (amd_e400_c1e_detected) {
                int cpu = smp_processor_id();

                if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
                        cpumask_set_cpu(cpu, amd_e400_c1e_mask);
                        /* Force broadcast so ACPI can not interfere. */
                        tick_broadcast_force();
                        pr_info("Switch to broadcast mode on CPU%d\n", cpu);
                }
                tick_broadcast_enter();

                default_idle();

                /*
                 * The switch back from broadcast mode needs to be
                 * called with interrupts disabled.
                 */
                local_irq_disable();
                tick_broadcast_exit();
                local_irq_enable();
        } else
                default_idle();
}

/*
 * Intel Core2 and older machines prefer MWAIT over HALT for C1.
 * We can't rely on cpuidle installing MWAIT, because it will not load
 * on systems that support only C1 -- so the boot default must be MWAIT.
 *
 * Some AMD machines are the opposite, they depend on using HALT.
 *
 * So for default C1, which is used during boot until cpuidle loads,
 * use MWAIT-C1 on Intel HW that has it, else use HALT.
 */
static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
{
        if (c->x86_vendor != X86_VENDOR_INTEL)
                return 0;

        if (!cpu_has(c, X86_FEATURE_MWAIT) || static_cpu_has_bug(X86_BUG_MONITOR))
                return 0;

        return 1;
}

/*
 * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT
 * with interrupts enabled and no flags, which is backwards compatible with the
 * original MWAIT implementation.
 */
static __cpuidle void mwait_idle(void)
{
        if (!current_set_polling_and_test()) {
                trace_cpu_idle_rcuidle(1, smp_processor_id());
                if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
                        mb(); /* quirk */
                        clflush((void *)&current_thread_info()->flags);
                        mb(); /* quirk */
                }

                __monitor((void *)&current_thread_info()->flags, 0, 0);
                if (!need_resched())
                        __sti_mwait(0, 0);
                else
                        local_irq_enable();
                trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
        } else {
                local_irq_enable();
        }
        __current_clr_polling();
}

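/*
 * Choose the default idle routine used until a cpuidle driver takes over:
 * the AMD E400 aware routine, MWAIT-based C1, or plain HALT.
 */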
void select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
        if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
                pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
#endif
        if (x86_idle || boot_option_idle_override == IDLE_POLL)
                return;

        if (boot_cpu_has_bug(X86_BUG_AMD_E400)) {
                pr_info("using AMD E400 aware idle routine\n");
                x86_idle = amd_e400_idle;
        } else if (prefer_mwait_c1_over_halt(c)) {
                pr_info("using mwait in idle threads\n");
                x86_idle = mwait_idle;
        } else
                x86_idle = default_idle;
}

void __init init_amd_e400_c1e_mask(void)
{
        /* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
        if (x86_idle == amd_e400_idle)
                zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
}

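/*
 * Handle the "idle=" kernel command-line parameter: "poll", "halt" or
 * "nomwait".
 */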
static int __init idle_setup(char *str)
{
        if (!str)
                return -EINVAL;

        if (!strcmp(str, "poll")) {
                pr_info("using polling idle threads\n");
                boot_option_idle_override = IDLE_POLL;
                cpu_idle_poll_ctrl(true);
        } else if (!strcmp(str, "halt")) {
                /*
                 * When the boot option of idle=halt is added, halt is
                 * forced to be used for CPU idle. In such case CPU C2/C3
                 * won't be used again.
                 * To continue to load the CPU idle driver, don't touch
                 * the boot_option_idle_override.
                 */
                x86_idle = default_idle;
                boot_option_idle_override = IDLE_HALT;
        } else if (!strcmp(str, "nomwait")) {
                /*
                 * If the boot option of "idle=nomwait" is added,
                 * it means that mwait will be disabled for CPU C2/C3
                 * states. In such case it won't touch the variable
                 * of boot_option_idle_override.
                 */
                boot_option_idle_override = IDLE_NOMWAIT;
        } else
                return -1;

        return 0;
}
early_param("idle", idle_setup);

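/*
 * Randomize the initial user stack pointer by up to 8 KiB while keeping it
 * 16-byte aligned, unless address space randomization is disabled.
 */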
unsigned long arch_align_stack(unsigned long sp)
{
        if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
                sp -= get_random_int() % 8192;
        return sp & ~0xf;
}

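/*
 * Randomize the heap start: pick a page-aligned address within 32 MiB
 * (0x02000000) above the current brk.
 */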
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
        return randomize_page(mm->brk, 0x02000000);
}

/*
 * Return saved PC of a blocked thread.
 * What is this good for? It will always be the scheduler or ret_from_fork.
 */
unsigned long thread_saved_pc(struct task_struct *tsk)
{
        struct inactive_task_frame *frame =
                (struct inactive_task_frame *) READ_ONCE(tsk->thread.sp);
        return READ_ONCE_NOCHECK(frame->ret_addr);
}

/*
 * Called from fs/proc with a reference on @p to find the function
 * which called into schedule(). This needs to be done carefully
 * because the task might wake up and we might look at a stack
 * changing under us.
 */
unsigned long get_wchan(struct task_struct *p)
{
        unsigned long start, bottom, top, sp, fp, ip, ret = 0;
        int count = 0;

        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;

        if (!try_get_task_stack(p))
                return 0;

        start = (unsigned long)task_stack_page(p);
        if (!start)
                goto out;

        /*
         * Layout of the stack page:
         *
         * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long)
         * PADDING
         * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING
         * stack
         * ----------- bottom = start
         *
         * The task's stack pointer points at the location where the
         * framepointer is stored. The data on the stack is:
         * ... IP FP ... IP FP
         *
         * We need to read FP and IP, so we need to adjust the upper
         * bound by another unsigned long.
         */
        top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
        top -= 2 * sizeof(unsigned long);
        bottom = start;

        sp = READ_ONCE(p->thread.sp);
        if (sp < bottom || sp > top)
                goto out;

        fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp);
        do {
                if (fp < bottom || fp > top)
                        goto out;
                ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long)));
                if (!in_sched_functions(ip)) {
                        ret = ip;
                        goto out;
                }
                fp = READ_ONCE_NOCHECK(*(unsigned long *)fp);
        } while (count++ < 16 && p->state != TASK_RUNNING);

out:
        put_task_stack(p);
        return ret;
}