#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <linux/cpu.h>

#undef pr_fmt
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt

#include <asm/kaiser.h>
#include <asm/tlbflush.h>	/* to verify its kaiser declarations */
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/desc.h>
#include <asm/cmdline.h>
#include <asm/vsyscall.h>
#include <asm/sections.h>

int kaiser_enabled __read_mostly = 1;
EXPORT_SYMBOL(kaiser_enabled);	/* for inlined TLB flush functions */

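/*
 * Judging by its name and its place in the user-mapped per-cpu area,
 * this is the scratch slot the entry assembly uses to back up a
 * general-purpose register across the CR3 switch, at entry points
 * where the kernel stack is not yet safe to use.
 */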
__visible
DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);

/*
 * These can have bit 63 set, so we cannot just use a plain "or"
 * instruction to get their value or'd into CR3: that would take
 * another register.  Instead, we use a memory reference to them.
 *
 * This is also handy because systems that do not support PCIDs
 * just end up or'ing a 0 into their CR3, which does no harm.
 */
DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);

/*
 * At runtime, the only things we map are some things for CPU
 * hotplug, and stacks for new processes.  No two CPUs will ever
 * be populating the same addresses, so we only need to ensure
 * that we protect between two CPUs trying to allocate and
 * populate the same page table page.
 *
 * We only take this lock when doing a set_p[4um]d(); it is not
 * needed for set_pte().  We assume that only the *owner* of a
 * given allocation will be doing this for _their_ allocation.
 *
 * This ensures that once a system has been running for a while
 * and there have been stacks all over and these page tables
 * are fully populated, there will be no further acquisitions of
 * this lock.
 */
static DEFINE_SPINLOCK(shadow_table_allocation_lock);

/*
 * Returns -1 on error.
 */
static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset_k(vaddr);
	/*
	 * We made all the kernel PGDs present in kaiser_init().
	 * We expect them to stay that way.
	 */
	BUG_ON(pgd_none(*pgd));
	/*
	 * PGDs are either 512GB or 128TB on all x86_64
	 * configurations.  We don't handle large mappings
	 * at this level.
	 */
	BUG_ON(pgd_large(*pgd));

	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		WARN_ON_ONCE(1);
		return -1;
	}

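	/*
	 * A 1GB page at the PUD level: the physical address is the
	 * page-frame base plus the offset of vaddr within the 1GB page.
	 * The PMD-level case below is the same for a 2MB page.
	 */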
	if (pud_large(*pud))
		return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);

	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	if (pmd_large(*pmd))
		return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);

	pte = pte_offset_kernel(pmd, vaddr);
	if (pte_none(*pte)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
}

/*
 * This is a relatively normal page table walk, except that it
 * also tries to allocate page table pages along the way.
 *
 * Returns a pointer to a PTE on success, or NULL on failure.
 */
static pte_t *kaiser_pagetable_walk(unsigned long address, bool user)
{
	pmd_t *pmd;
	pud_t *pud;
	pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
	unsigned long prot = _KERNPG_TABLE;

	if (pgd_none(*pgd)) {
		WARN_ONCE(1, "All shadow pgds should have been populated");
		return NULL;
	}
	BUILD_BUG_ON(pgd_large(*pgd) != 0);

	if (user) {
		/*
		 * The vsyscall page is the only page that will have
		 * _PAGE_USER set.  Catch everything else.
		 */
		BUG_ON(address != VSYSCALL_ADDR);

		set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
		prot = _PAGE_TABLE;
	}

	pud = pud_offset(pgd, address);
	/* The shadow page tables do not use large mappings: */
	if (pud_large(*pud)) {
		WARN_ON(1);
		return NULL;
	}
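	/*
	 * Allocation pattern used at both the PUD and PMD levels below:
	 * allocate the new table page outside the lock, re-check the
	 * entry under the lock, and free our page if another CPU won
	 * the race and already installed its own.
	 */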
	if (pud_none(*pud)) {
		unsigned long new_pmd_page = __get_free_page(gfp);
		if (!new_pmd_page)
			return NULL;
		spin_lock(&shadow_table_allocation_lock);
		if (pud_none(*pud)) {
			set_pud(pud, __pud(prot | __pa(new_pmd_page)));
			__inc_zone_page_state(virt_to_page((void *)
					new_pmd_page), NR_KAISERTABLE);
		} else
			free_page(new_pmd_page);
		spin_unlock(&shadow_table_allocation_lock);
	}

	pmd = pmd_offset(pud, address);
	/* The shadow page tables do not use large mappings: */
	if (pmd_large(*pmd)) {
		WARN_ON(1);
		return NULL;
	}
	if (pmd_none(*pmd)) {
		unsigned long new_pte_page = __get_free_page(gfp);
		if (!new_pte_page)
			return NULL;
		spin_lock(&shadow_table_allocation_lock);
		if (pmd_none(*pmd)) {
			set_pmd(pmd, __pmd(prot | __pa(new_pte_page)));
			__inc_zone_page_state(virt_to_page((void *)
					new_pte_page), NR_KAISERTABLE);
		} else
			free_page(new_pte_page);
		spin_unlock(&shadow_table_allocation_lock);
	}

	return pte_offset_kernel(pmd, address);
}

static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
			       unsigned long flags)
{
	int ret = 0;
	pte_t *pte;
	unsigned long start_addr = (unsigned long)__start_addr;
	unsigned long address = start_addr & PAGE_MASK;
	unsigned long end_addr = PAGE_ALIGN(start_addr + size);
	unsigned long target_address;

	/*
	 * It is convenient for callers to pass in __PAGE_KERNEL etc,
	 * and there is no actual harm from setting _PAGE_GLOBAL, so
	 * long as CR4.PGE is not set.  But it is nonetheless troubling
	 * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
	 * requires that not to be #defined to 0): so mask it off here.
	 */
	flags &= ~_PAGE_GLOBAL;
	if (!(__supported_pte_mask & _PAGE_NX))
		flags &= ~_PAGE_NX;

	for (; address < end_addr; address += PAGE_SIZE) {
		target_address = get_pa_from_mapping(address);
		if (target_address == -1) {
			ret = -EIO;
			break;
		}
		pte = kaiser_pagetable_walk(address, flags & _PAGE_USER);
		if (!pte) {
			ret = -ENOMEM;
			break;
		}
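		/*
		 * Mapping the same page twice is tolerated only if the
		 * new PTE would be identical to the one already installed.
		 */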
		if (pte_none(*pte)) {
			set_pte(pte, __pte(flags | target_address));
		} else {
			pte_t tmp;
			set_pte(&tmp, __pte(flags | target_address));
			WARN_ON_ONCE(!pte_same(*pte, tmp));
		}
	}
	return ret;
}

static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
{
	unsigned long size = end - start;

	return kaiser_add_user_map(start, size, flags);
}

/*
 * Ensure that the top level of the (shadow) page tables is
 * entirely populated.  This ensures that all processes that get
 * forked have the same entries.  This way, we do not have to
 * ever go set up new entries in older processes.
 *
 * Note: we never free these, so there are no updates to them
 * after this.
 */
static void __init kaiser_init_all_pgds(void)
{
	pgd_t *pgd;
	int i = 0;

	pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long)0));
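	/* The kernel half of the shadow PGD: entries PTRS_PER_PGD/2 and up. */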
	for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
		pgd_t new_pgd;
		pud_t *pud = pud_alloc_one(&init_mm,
					   PAGE_OFFSET + i * PGDIR_SIZE);
		if (!pud) {
			WARN_ON(1);
			break;
		}
		inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
		new_pgd = __pgd(_KERNPG_TABLE | __pa(pud));
		/*
		 * Make sure not to stomp on some other pgd entry.
		 */
		if (!pgd_none(pgd[i])) {
			WARN_ON(1);
			continue;
		}
		set_pgd(pgd + i, new_pgd);
	}
}

#define kaiser_add_user_map_early(start, size, flags) do {	\
	int __ret = kaiser_add_user_map(start, size, flags);	\
	WARN_ON(__ret);						\
} while (0)

#define kaiser_add_user_map_ptrs_early(start, end, flags) do {		\
	int __ret = kaiser_add_user_map_ptrs(start, end, flags);	\
	WARN_ON(__ret);							\
} while (0)

void __init kaiser_check_boottime_disable(void)
{
	bool enable = true;
	char arg[5];
	int ret;

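	/*
	 * Xen PV guests do not run the kernel on its own page tables,
	 * so KAISER's CR3 switching cannot be used there: disable it
	 * silently, without the "disabled" pr_info below.
	 */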
	if (boot_cpu_has(X86_FEATURE_XENPV))
		goto silent_disable;

	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
	if (ret > 0) {
		if (!strncmp(arg, "on", 2))
			goto enable;

		if (!strncmp(arg, "off", 3))
			goto disable;

		if (!strncmp(arg, "auto", 4))
			goto skip;
	}

	if (cmdline_find_option_bool(boot_command_line, "nopti") ||
	    cpu_mitigations_off())
		goto disable;

skip:
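	/* "auto": AMD CPUs are not considered vulnerable to Meltdown, so leave it off. */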
	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
		goto disable;

enable:
	if (enable)
		setup_force_cpu_cap(X86_FEATURE_KAISER);

	return;

disable:
	pr_info("disabled\n");

silent_disable:
	kaiser_enabled = 0;
	setup_clear_cpu_cap(X86_FEATURE_KAISER);
}

/*
 * If anything in here fails, we will likely die on one of the
 * first kernel->user transitions and init will die.  But, we
 * will have most of the kernel up by then and should be able to
 * get a clean warning out of it.  If we BUG_ON() here, we run
 * the risk of doing so before we have good console output.
 */
void __init kaiser_init(void)
{
	int cpu;

	if (!kaiser_enabled)
		return;

	kaiser_init_all_pgds();

	/*
	 * Note that this sets _PAGE_USER and it needs to happen when the
	 * pagetable hierarchy gets created, i.e., early.  Otherwise
	 * kaiser_pagetable_walk() will encounter initialized PTEs in the
	 * hierarchy and not set the proper permissions, leading to
	 * page faults with page-protection violations when, for example,
	 * trying to read the vsyscall page.
	 */
	if (vsyscall_enabled())
		kaiser_add_user_map_early((void *)VSYSCALL_ADDR,
					  PAGE_SIZE,
					  vsyscall_pgprot);

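	/*
	 * Map each CPU's user-mapped per-cpu area: everything placed in
	 * the __per_cpu_user_mapped section (data the entry code touches,
	 * such as unsafe_stack_register_backup above).
	 */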
	for_each_possible_cpu(cpu) {
		void *percpu_vaddr = __per_cpu_user_mapped_start +
				     per_cpu_offset(cpu);
		unsigned long percpu_sz = __per_cpu_user_mapped_end -
					  __per_cpu_user_mapped_start;
		kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
					  __PAGE_KERNEL);
	}

	/*
	 * Map the entry/exit text section, which is needed for
	 * switches into and out of the kernel.
	 */
	kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
				       __PAGE_KERNEL_RX);

#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
	kaiser_add_user_map_ptrs_early(__irqentry_text_start,
				       __irqentry_text_end,
				       __PAGE_KERNEL_RX);
#endif
	kaiser_add_user_map_early((void *)idt_descr.address,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL_RO);
#ifdef CONFIG_TRACING
	kaiser_add_user_map_early(&trace_idt_descr,
				  sizeof(trace_idt_descr),
				  __PAGE_KERNEL);
	kaiser_add_user_map_early(&trace_idt_table,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL);
#endif
	kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
				  __PAGE_KERNEL);
	kaiser_add_user_map_early(&debug_idt_table,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL);

	pr_info("enabled\n");
}

/* Add a mapping to the shadow mapping, and synchronize the mappings */
int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
{
	if (!kaiser_enabled)
		return 0;
	return kaiser_add_user_map((const void *)addr, size, flags);
}

void kaiser_remove_mapping(unsigned long start, unsigned long size)
{
	extern void unmap_pud_range_nofree(pgd_t *pgd,
				unsigned long start, unsigned long end);
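	/*
	 * The "nofree" unmap variant clears the shadow entries without
	 * freeing the page-table pages themselves; shadow table pages
	 * are never given back.
	 */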
	unsigned long end = start + size;
	unsigned long addr, next;
	pgd_t *pgd;

	if (!kaiser_enabled)
		return;
	pgd = native_get_shadow_pgd(pgd_offset_k(start));
	for (addr = start; addr < end; pgd++, addr = next) {
		next = pgd_addr_end(addr, end);
		unmap_pud_range_nofree(pgd, addr, next);
	}
}

/*
 * Page table pages are page-aligned.  The lower half of the top
 * level is used for userspace and the top half for the kernel.
 * This returns true for user pages that need to get copied into
 * both the user and kernel copies of the page tables, and false
 * for kernel pages that should only be in the kernel copy.
 */
static inline bool is_userspace_pgd(pgd_t *pgdp)
{
	return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
}

pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
{
	if (!kaiser_enabled)
		return pgd;
	/*
	 * Do we need to also populate the shadow pgd?  Check _PAGE_USER to
	 * skip cases like kexec and EFI which make temporary low mappings.
	 */
	if (pgd.pgd & _PAGE_USER) {
		if (is_userspace_pgd(pgdp)) {
			native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
			/*
			 * Even if the entry is *mapping* userspace, ensure
			 * that userspace cannot use it.  This way, if we
			 * get out to userspace running on the kernel CR3,
			 * userspace will crash instead of running.
			 */
			if (__supported_pte_mask & _PAGE_NX)
				pgd.pgd |= _PAGE_NX;
		}
	} else if (!pgd.pgd) {
		/*
		 * pgd_clear() cannot check _PAGE_USER, and is even used to
		 * clear corrupted pgd entries: so just rely on cases like
		 * kexec and EFI never to be using pgd_clear().
		 */
		if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
		    is_userspace_pgd(pgdp))
			native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
	}
	return pgd;
}

void kaiser_setup_pcid(void)
{
	unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;

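	/*
	 * CR3 bit 63 is the NOFLUSH bit: with PCIDs enabled, setting it
	 * on a CR3 write skips the TLB flush for that PCID.  This is the
	 * bit-63 reference in the comment above x86_cr3_pcid_user.
	 */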
	if (this_cpu_has(X86_FEATURE_PCID))
		user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
	/*
	 * This variable is used by the entry/exit code to change
	 * PCID and pgd and to control TLB flushing.
	 */
	this_cpu_write(x86_cr3_pcid_user, user_cr3);
}

/*
 * Make a note that this cpu will need to flush the USER tlb on return to
 * user.  If the cpu does not have PCID, the NOFLUSH bit will never have
 * been set.
 */
void kaiser_flush_tlb_on_return_to_user(void)
{
	if (this_cpu_has(X86_FEATURE_PCID))
		this_cpu_write(x86_cr3_pcid_user,
			X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
}
EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);