#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>

#undef pr_fmt
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt

#include <asm/kaiser.h>
#include <asm/tlbflush.h>	/* to verify its kaiser declarations */
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/desc.h>
#include <asm/cmdline.h>

int kaiser_enabled __read_mostly = 1;
EXPORT_SYMBOL(kaiser_enabled);	/* for inlined TLB flush functions */

__visible
DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);

/*
 * These can have bit 63 set, so we cannot just use a plain "or"
 * instruction to get their value or'd into CR3.  It would take
 * another register.  So, we use a memory reference to these instead.
 *
 * This is also handy because systems that do not support PCIDs
 * just end up or'ing a 0 into their CR3, which does no harm.
 */
DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
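
/*
 * Rough, illustrative sketch only (the entry/exit code in asm/kaiser.h
 * is authoritative): the exit-to-user path builds the user CR3 value by
 * OR'ing the per-cpu word above into the current CR3 image, along the
 * lines of
 *
 *	movq	%cr3, %reg
 *	orq	PER_CPU_VAR(x86_cr3_pcid_user), %reg
 *	movq	%reg, %cr3
 *
 * A 64-bit constant with bit 63 set cannot be an immediate operand of
 * "or", which is why the value is kept in memory rather than encoded
 * into the instruction.
 */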

/*
 * At runtime, the only things we map are some things for CPU
 * hotplug, and stacks for new processes.  No two CPUs will ever
 * be populating the same addresses, so we only need to ensure
 * that we protect between two CPUs trying to allocate and
 * populate the same page table page.
 *
 * Only take this lock when doing a set_p[4um]d(), but it is not
 * needed for doing a set_pte().  We assume that only the *owner*
 * of a given allocation will be doing this for _their_
 * allocation.
 *
 * This ensures that once a system has been running for a while
 * and there have been stacks all over and these page tables
 * are fully populated, there will be no further acquisitions of
 * this lock.
 */
static DEFINE_SPINLOCK(shadow_table_allocation_lock);
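
/*
 * Sketch of how the lock is used by the allocation sites in
 * kaiser_pagetable_walk() below: a candidate page table page is
 * allocated outside the lock and only installed if the entry is
 * still empty, otherwise it is freed again.
 *
 *	new = __get_free_page(gfp);
 *	spin_lock(&shadow_table_allocation_lock);
 *	if (pud_none(*pud))
 *		set_pud(pud, __pud(_KERNPG_TABLE | __pa(new)));
 *	else
 *		free_page(new);
 *	spin_unlock(&shadow_table_allocation_lock);
 */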

/*
 * Returns -1 on error.
 */
static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset_k(vaddr);
	/*
	 * We made all the kernel PGDs present in kaiser_init().
	 * We expect them to stay that way.
	 */
	BUG_ON(pgd_none(*pgd));
	/*
	 * PGDs are either 512GB or 128TB on all x86_64
	 * configurations.  We don't handle these.
	 */
	BUG_ON(pgd_large(*pgd));

	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	if (pud_large(*pud))
		return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);

	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	if (pmd_large(*pmd))
		return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);

	pte = pte_offset_kernel(pmd, vaddr);
	if (pte_none(*pte)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
}
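
/*
 * For addresses in the kernel direct map the walk above resolves to the
 * same physical address that __pa() would, but it also works for
 * vmalloc'ed and per-cpu addresses, and it honours 1GB and 2MB mappings
 * by masking with the matching page size.
 */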

/*
 * This is a relatively normal page table walk, except that it
 * also tries to allocate page table pages along the way.
 *
 * Returns a pointer to a PTE on success, or NULL on failure.
 */
static pte_t *kaiser_pagetable_walk(unsigned long address)
{
	pmd_t *pmd;
	pud_t *pud;
	pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);

	if (pgd_none(*pgd)) {
		WARN_ONCE(1, "All shadow pgds should have been populated");
		return NULL;
	}
	BUILD_BUG_ON(pgd_large(*pgd) != 0);

	pud = pud_offset(pgd, address);
	/* The shadow page tables do not use large mappings: */
	if (pud_large(*pud)) {
		WARN_ON(1);
		return NULL;
	}
	if (pud_none(*pud)) {
		unsigned long new_pmd_page = __get_free_page(gfp);
		if (!new_pmd_page)
			return NULL;
		spin_lock(&shadow_table_allocation_lock);
		if (pud_none(*pud)) {
			set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
			__inc_zone_page_state(virt_to_page((void *)
						new_pmd_page), NR_KAISERTABLE);
		} else
			free_page(new_pmd_page);
		spin_unlock(&shadow_table_allocation_lock);
	}

	pmd = pmd_offset(pud, address);
	/* The shadow page tables do not use large mappings: */
	if (pmd_large(*pmd)) {
		WARN_ON(1);
		return NULL;
	}
	if (pmd_none(*pmd)) {
		unsigned long new_pte_page = __get_free_page(gfp);
		if (!new_pte_page)
			return NULL;
		spin_lock(&shadow_table_allocation_lock);
		if (pmd_none(*pmd)) {
			set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
			__inc_zone_page_state(virt_to_page((void *)
						new_pte_page), NR_KAISERTABLE);
		} else
			free_page(new_pte_page);
		spin_unlock(&shadow_table_allocation_lock);
	}

	return pte_offset_kernel(pmd, address);
}
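
/*
 * Typical use, as in kaiser_add_user_map() below:
 *
 *	pte = kaiser_pagetable_walk(address);
 *	if (pte && pte_none(*pte))
 *		set_pte(pte, __pte(flags | target_address));
 */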

static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
			       unsigned long flags)
{
	int ret = 0;
	pte_t *pte;
	unsigned long start_addr = (unsigned long)__start_addr;
	unsigned long address = start_addr & PAGE_MASK;
	unsigned long end_addr = PAGE_ALIGN(start_addr + size);
	unsigned long target_address;

	/*
	 * It is convenient for callers to pass in __PAGE_KERNEL etc,
	 * and there is no actual harm from setting _PAGE_GLOBAL, so
	 * long as CR4.PGE is not set.  But it is nonetheless troubling
	 * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
	 * requires that not to be #defined to 0): so mask it off here.
	 */
	flags &= ~_PAGE_GLOBAL;

	for (; address < end_addr; address += PAGE_SIZE) {
		target_address = get_pa_from_mapping(address);
		if (target_address == -1) {
			ret = -EIO;
			break;
		}
		pte = kaiser_pagetable_walk(address);
		if (!pte) {
			ret = -ENOMEM;
			break;
		}
		if (pte_none(*pte)) {
			set_pte(pte, __pte(flags | target_address));
		} else {
			pte_t tmp;
			set_pte(&tmp, __pte(flags | target_address));
			WARN_ON_ONCE(!pte_same(*pte, tmp));
		}
	}
	return ret;
}

static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
{
	unsigned long size = end - start;

	return kaiser_add_user_map(start, size, flags);
}

/*
 * Ensure that the top level of the (shadow) page tables is
 * entirely populated.  This ensures that all processes that get
 * forked have the same entries.  This way, we do not have to
 * ever go set up new entries in older processes.
 *
 * Note: we never free these, so there are no updates to them
 * after this.
 */
static void __init kaiser_init_all_pgds(void)
{
	pgd_t *pgd;
	int i = 0;

	pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long)0));
	for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
		pgd_t new_pgd;
		pud_t *pud = pud_alloc_one(&init_mm,
					   PAGE_OFFSET + i * PGDIR_SIZE);
		if (!pud) {
			WARN_ON(1);
			break;
		}
		inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
		new_pgd = __pgd(_KERNPG_TABLE | __pa(pud));
		/*
		 * Make sure not to stomp on some other pgd entry.
		 */
		if (!pgd_none(pgd[i])) {
			WARN_ON(1);
			continue;
		}
		set_pgd(pgd + i, new_pgd);
	}
}
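
/*
 * After kaiser_init_all_pgds() the kernel half of the shadow pgd
 * (indices PTRS_PER_PGD/2 .. PTRS_PER_PGD-1) points at freshly zeroed
 * pud pages, accounted as NR_KAISERTABLE.  Because all processes share
 * those top-level entries, shadow mappings added later through
 * kaiser_add_user_map() become visible to every process at once.
 */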

#define kaiser_add_user_map_early(start, size, flags) do {	\
	int __ret = kaiser_add_user_map(start, size, flags);	\
	WARN_ON(__ret);						\
} while (0)

#define kaiser_add_user_map_ptrs_early(start, end, flags) do {		\
	int __ret = kaiser_add_user_map_ptrs(start, end, flags);	\
	WARN_ON(__ret);							\
} while (0)

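/*
 * Boot-time policy, as implemented below:
 *
 *	pti=on		force KAISER on
 *	pti=off		disable (prints "disabled")
 *	pti=auto	or no option: vendor default - off on AMD CPUs,
 *			on everywhere else
 *	nopti		same as pti=off
 *
 * Xen PV guests are silently disabled before any option is considered.
 */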
void __init kaiser_check_boottime_disable(void)
{
	bool enable = true;
	char arg[5];
	int ret;

	if (boot_cpu_has(X86_FEATURE_XENPV))
		goto silent_disable;

	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
	if (ret > 0) {
		if (!strncmp(arg, "on", 2))
			goto enable;

		if (!strncmp(arg, "off", 3))
			goto disable;

		if (!strncmp(arg, "auto", 4))
			goto skip;
	}

	if (cmdline_find_option_bool(boot_command_line, "nopti"))
		goto disable;

skip:
	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
		goto disable;

enable:
	if (enable)
		setup_force_cpu_cap(X86_FEATURE_KAISER);

	return;

disable:
	pr_info("disabled\n");

silent_disable:
	kaiser_enabled = 0;
	setup_clear_cpu_cap(X86_FEATURE_KAISER);
}

/*
 * If anything in here fails, we will likely die on one of the
 * first kernel->user transitions and init will die.  But, we
 * will have most of the kernel up by then and should be able to
 * get a clean warning out of it.  If we BUG_ON() here, we run
 * the risk of crashing before we have good console output.
 */
void __init kaiser_init(void)
{
	int cpu;

	if (!kaiser_enabled)
		return;

	kaiser_init_all_pgds();

	for_each_possible_cpu(cpu) {
		void *percpu_vaddr = __per_cpu_user_mapped_start +
				     per_cpu_offset(cpu);
		unsigned long percpu_sz = __per_cpu_user_mapped_end -
					  __per_cpu_user_mapped_start;
		kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
					  __PAGE_KERNEL);
	}

	/*
	 * Map the entry/exit text section, which is needed at
	 * switches from user to and from kernel.
	 */
	kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
				       __PAGE_KERNEL_RX);

#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
	kaiser_add_user_map_ptrs_early(__irqentry_text_start,
				       __irqentry_text_end,
				       __PAGE_KERNEL_RX);
#endif
	kaiser_add_user_map_early((void *)idt_descr.address,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL_RO);
#ifdef CONFIG_TRACING
	kaiser_add_user_map_early(&trace_idt_descr,
				  sizeof(trace_idt_descr),
				  __PAGE_KERNEL);
	kaiser_add_user_map_early(&trace_idt_table,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL);
#endif
	kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
				  __PAGE_KERNEL);
	kaiser_add_user_map_early(&debug_idt_table,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL);

	pr_info("enabled\n");
}

/* Add a mapping to the shadow mapping, and synchronize the mappings */
int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
{
	if (!kaiser_enabled)
		return 0;
	return kaiser_add_user_map((const void *)addr, size, flags);
}
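
/*
 * Hypothetical usage sketch (buf and size are illustrative, not from
 * this file): a caller that has vmalloc'ed something the entry code
 * must be able to reach would pair the two interfaces like this:
 *
 *	if (kaiser_add_mapping((unsigned long)buf, size, __PAGE_KERNEL))
 *		goto out_free;
 *	...
 *	kaiser_remove_mapping((unsigned long)buf, size);
 *
 * Callers elsewhere in this series (the LDT allocation path, for
 * example) are expected to use the interface in roughly this way.
 */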

void kaiser_remove_mapping(unsigned long start, unsigned long size)
{
	extern void unmap_pud_range_nofree(pgd_t *pgd,
				unsigned long start, unsigned long end);
	unsigned long end = start + size;
	unsigned long addr, next;
	pgd_t *pgd;

	if (!kaiser_enabled)
		return;
	pgd = native_get_shadow_pgd(pgd_offset_k(start));
	for (addr = start; addr < end; pgd++, addr = next) {
		next = pgd_addr_end(addr, end);
		unmap_pud_range_nofree(pgd, addr, next);
	}
}

/*
 * Page table pages are page-aligned.  The lower half of the top
 * level is used for userspace and the top half for the kernel.
 * This returns true for user pages that need to get copied into
 * both the user and kernel copies of the page tables, and false
 * for kernel pages that should only be in the kernel copy.
 */
static inline bool is_userspace_pgd(pgd_t *pgdp)
{
	return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
}
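
/*
 * With 4-level paging PTRS_PER_PGD is 512, so this amounts to "is the
 * entry at index 0..255", i.e. does it cover the lower, userspace half
 * of the virtual address space.
 */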

pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
{
	if (!kaiser_enabled)
		return pgd;
	/*
	 * Do we need to also populate the shadow pgd?  Check _PAGE_USER to
	 * skip cases like kexec and EFI which make temporary low mappings.
	 */
	if (pgd.pgd & _PAGE_USER) {
		if (is_userspace_pgd(pgdp)) {
			native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
			/*
			 * Even if the entry is *mapping* userspace, ensure
			 * that userspace can not use it.  This way, if we
			 * get out to userspace running on the kernel CR3,
			 * userspace will crash instead of running.
			 */
			if (__supported_pte_mask & _PAGE_NX)
				pgd.pgd |= _PAGE_NX;
		}
	} else if (!pgd.pgd) {
		/*
		 * pgd_clear() cannot check _PAGE_USER, and is even used to
		 * clear corrupted pgd entries: so just rely on cases like
		 * kexec and EFI never to be using pgd_clear().
		 */
		if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
		    is_userspace_pgd(pgdp))
			native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
	}
	return pgd;
}
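
/*
 * In this series the pgd write path is expected to funnel through this
 * helper (with the return value written back into *pgdp), so that
 * user-half entries are mirrored into the shadow pgd while the kernel
 * copy keeps them NX-poisoned.
 */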

void kaiser_setup_pcid(void)
{
	unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;

	if (this_cpu_has(X86_FEATURE_PCID))
		user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
	/*
	 * These variables are used by the entry/exit
	 * code to change PCID and pgd and TLB flushing.
	 */
	this_cpu_write(x86_cr3_pcid_user, user_cr3);
}
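
/*
 * When PCID is available, x86_cr3_pcid_user normally carries the CR3
 * "no flush" bit (bit 63), so the CR3 write on return to userspace
 * preserves the user TLB entries.  kaiser_flush_tlb_on_return_to_user()
 * below clears that bit for one transition, forcing the next user CR3
 * load to flush; the exit code is expected to switch the per-cpu value
 * back to the NOFLUSH variant afterwards.
 */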

/*
 * Make a note that this cpu will need to flush USER tlb on return to user.
 * If cpu does not have PCID, then the NOFLUSH bit will never have been set.
 */
void kaiser_flush_tlb_on_return_to_user(void)
{
	if (this_cpu_has(X86_FEATURE_PCID))
		this_cpu_write(x86_cr3_pcid_user,
			X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
}
EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);