#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <linux/cpu.h>

#undef pr_fmt
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt

#include <asm/kaiser.h>
#include <asm/tlbflush.h>	/* to verify its kaiser declarations */
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/desc.h>
#include <asm/cmdline.h>
#include <asm/vsyscall.h>
#include <asm/sections.h>

int kaiser_enabled __read_mostly = 1;
EXPORT_SYMBOL(kaiser_enabled);	/* for inlined TLB flush functions */

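/*
 * Judging by its name and its place in the user-mapped per-cpu area,
 * this is the scratch slot the entry assembly uses to back up a
 * general-purpose register across the CR3 switch, at entry points
 * where the kernel stack is not yet safe to use.
 */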
__visible
DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);

/*
 * These can have bit 63 set, so we cannot just use a plain "or"
 * instruction to get their value or'd into CR3: that would take
 * another register.  Instead, we use a memory reference to them.
 *
 * This is also handy because systems that do not support PCIDs
 * just end up or'ing a 0 into their CR3, which does no harm.
 */
DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);

/*
 * At runtime, the only things we map are some things for CPU
 * hotplug, and stacks for new processes.  No two CPUs will ever
 * be populating the same addresses, so we only need to ensure
 * that we protect between two CPUs trying to allocate and
 * populate the same page table page.
 *
 * We only take this lock when doing a set_p[4um]d(); it is not
 * needed for set_pte().  We assume that only the *owner* of a
 * given allocation will be doing this for _their_ allocation.
 *
 * This ensures that once a system has been running for a while
 * and there have been stacks all over and these page tables
 * are fully populated, there will be no further acquisitions of
 * this lock.
 */
static DEFINE_SPINLOCK(shadow_table_allocation_lock);

/*
 * Returns -1 on error.
 */
static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset_k(vaddr);
	/*
	 * We made all the kernel PGDs present in kaiser_init().
	 * We expect them to stay that way.
	 */
	BUG_ON(pgd_none(*pgd));
	/*
	 * PGDs are either 512GB or 128TB on all x86_64
	 * configurations.  We don't handle large mappings
	 * at this level.
	 */
	BUG_ON(pgd_large(*pgd));

	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		WARN_ON_ONCE(1);
		return -1;
	}

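	/*
	 * A 1GB page at the PUD level: the physical address is the
	 * page-frame base plus the offset of vaddr within the 1GB page.
	 * The PMD-level case below is the same for a 2MB page.
	 */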
	if (pud_large(*pud))
		return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);

	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	if (pmd_large(*pmd))
		return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);

	pte = pte_offset_kernel(pmd, vaddr);
	if (pte_none(*pte)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
}

/*
 * This is a relatively normal page table walk, except that it
 * also tries to allocate page table pages along the way.
 *
 * Returns a pointer to a PTE on success, or NULL on failure.
 */
static pte_t *kaiser_pagetable_walk(unsigned long address, bool user)
{
	pmd_t *pmd;
	pud_t *pud;
	pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
	unsigned long prot = _KERNPG_TABLE;

	if (pgd_none(*pgd)) {
		WARN_ONCE(1, "All shadow pgds should have been populated");
		return NULL;
	}
	BUILD_BUG_ON(pgd_large(*pgd) != 0);

	if (user) {
		/*
		 * The vsyscall page is the only page that will have
		 * _PAGE_USER set.  Catch everything else.
		 */
		BUG_ON(address != VSYSCALL_ADDR);

		set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
		prot = _PAGE_TABLE;
	}

	pud = pud_offset(pgd, address);
	/* The shadow page tables do not use large mappings: */
	if (pud_large(*pud)) {
		WARN_ON(1);
		return NULL;
	}
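	/*
	 * Allocation pattern used at both the PUD and PMD levels below:
	 * allocate the new table page outside the lock, re-check the
	 * entry under the lock, and free our page if another CPU won
	 * the race and already installed its own.
	 */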
	if (pud_none(*pud)) {
		unsigned long new_pmd_page = __get_free_page(gfp);
		if (!new_pmd_page)
			return NULL;
		spin_lock(&shadow_table_allocation_lock);
		if (pud_none(*pud)) {
			set_pud(pud, __pud(prot | __pa(new_pmd_page)));
			__inc_zone_page_state(virt_to_page((void *)
					new_pmd_page), NR_KAISERTABLE);
		} else
			free_page(new_pmd_page);
		spin_unlock(&shadow_table_allocation_lock);
	}

	pmd = pmd_offset(pud, address);
	/* The shadow page tables do not use large mappings: */
	if (pmd_large(*pmd)) {
		WARN_ON(1);
		return NULL;
	}
	if (pmd_none(*pmd)) {
		unsigned long new_pte_page = __get_free_page(gfp);
		if (!new_pte_page)
			return NULL;
		spin_lock(&shadow_table_allocation_lock);
		if (pmd_none(*pmd)) {
			set_pmd(pmd, __pmd(prot | __pa(new_pte_page)));
			__inc_zone_page_state(virt_to_page((void *)
					new_pte_page), NR_KAISERTABLE);
		} else
			free_page(new_pte_page);
		spin_unlock(&shadow_table_allocation_lock);
	}

	return pte_offset_kernel(pmd, address);
}

static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
			       unsigned long flags)
{
	int ret = 0;
	pte_t *pte;
	unsigned long start_addr = (unsigned long)__start_addr;
	unsigned long address = start_addr & PAGE_MASK;
	unsigned long end_addr = PAGE_ALIGN(start_addr + size);
	unsigned long target_address;

	/*
	 * It is convenient for callers to pass in __PAGE_KERNEL etc,
	 * and there is no actual harm from setting _PAGE_GLOBAL, so
	 * long as CR4.PGE is not set.  But it is nonetheless troubling
	 * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
	 * requires that not to be #defined to 0): so mask it off here.
	 */
	flags &= ~_PAGE_GLOBAL;
	if (!(__supported_pte_mask & _PAGE_NX))
		flags &= ~_PAGE_NX;

	for (; address < end_addr; address += PAGE_SIZE) {
		target_address = get_pa_from_mapping(address);
		if (target_address == -1) {
			ret = -EIO;
			break;
		}
		pte = kaiser_pagetable_walk(address, flags & _PAGE_USER);
		if (!pte) {
			ret = -ENOMEM;
			break;
		}
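		/*
		 * Mapping the same page twice is tolerated only if the
		 * new PTE would be identical to the one already installed.
		 */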
		if (pte_none(*pte)) {
			set_pte(pte, __pte(flags | target_address));
		} else {
			pte_t tmp;
			set_pte(&tmp, __pte(flags | target_address));
			WARN_ON_ONCE(!pte_same(*pte, tmp));
		}
	}
	return ret;
}

static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
{
	unsigned long size = end - start;

	return kaiser_add_user_map(start, size, flags);
}

/*
 * Ensure that the top level of the (shadow) page tables is
 * entirely populated.  This ensures that all processes that get
 * forked have the same entries.  This way, we do not have to
 * ever go set up new entries in older processes.
 *
 * Note: we never free these, so there are no updates to them
 * after this.
 */
static void __init kaiser_init_all_pgds(void)
{
	pgd_t *pgd;
	int i = 0;

	pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long)0));
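	/* The kernel half of the shadow PGD: entries PTRS_PER_PGD/2 and up. */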
	for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
		pgd_t new_pgd;
		pud_t *pud = pud_alloc_one(&init_mm,
					   PAGE_OFFSET + i * PGDIR_SIZE);
		if (!pud) {
			WARN_ON(1);
			break;
		}
		inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
		new_pgd = __pgd(_KERNPG_TABLE | __pa(pud));
		/*
		 * Make sure not to stomp on some other pgd entry.
		 */
		if (!pgd_none(pgd[i])) {
			WARN_ON(1);
			continue;
		}
		set_pgd(pgd + i, new_pgd);
	}
}

#define kaiser_add_user_map_early(start, size, flags) do {	\
	int __ret = kaiser_add_user_map(start, size, flags);	\
	WARN_ON(__ret);						\
} while (0)

#define kaiser_add_user_map_ptrs_early(start, end, flags) do {		\
	int __ret = kaiser_add_user_map_ptrs(start, end, flags);	\
	WARN_ON(__ret);							\
} while (0)

void __init kaiser_check_boottime_disable(void)
{
	bool enable = true;
	char arg[5];
	int ret;

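	/*
	 * Xen PV guests do not run the kernel on its own page tables,
	 * so KAISER's CR3 switching cannot be used there: disable it
	 * silently, without the "disabled" pr_info below.
	 */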
	if (boot_cpu_has(X86_FEATURE_XENPV))
		goto silent_disable;

	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
	if (ret > 0) {
		if (!strncmp(arg, "on", 2))
			goto enable;

		if (!strncmp(arg, "off", 3))
			goto disable;

		if (!strncmp(arg, "auto", 4))
			goto skip;
	}

	if (cmdline_find_option_bool(boot_command_line, "nopti") ||
	    cpu_mitigations_off())
		goto disable;

skip:
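	/* "auto": AMD CPUs are not considered vulnerable to Meltdown, so leave it off. */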
	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
		goto disable;

enable:
	if (enable)
		setup_force_cpu_cap(X86_FEATURE_KAISER);

	return;

disable:
	pr_info("disabled\n");

silent_disable:
	kaiser_enabled = 0;
	setup_clear_cpu_cap(X86_FEATURE_KAISER);
}

/*
 * If anything in here fails, we will likely die on one of the
 * first kernel->user transitions and init will die.  But, we
 * will have most of the kernel up by then and should be able to
 * get a clean warning out of it.  If we BUG_ON() here, we run
 * the risk of doing so before we have good console output.
 */
void __init kaiser_init(void)
{
	int cpu;

	if (!kaiser_enabled)
		return;

	kaiser_init_all_pgds();

	/*
	 * Note that this sets _PAGE_USER and it needs to happen when the
	 * pagetable hierarchy gets created, i.e., early.  Otherwise
	 * kaiser_pagetable_walk() will encounter initialized PTEs in the
	 * hierarchy and not set the proper permissions, leading to
	 * page faults with page-protection violations when, for example,
	 * trying to read the vsyscall page.
	 */
	if (vsyscall_enabled())
		kaiser_add_user_map_early((void *)VSYSCALL_ADDR,
					  PAGE_SIZE,
					  vsyscall_pgprot);

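	/*
	 * Map each CPU's user-mapped per-cpu area: everything placed in
	 * the __per_cpu_user_mapped section (data the entry code touches,
	 * such as unsafe_stack_register_backup above).
	 */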
	for_each_possible_cpu(cpu) {
		void *percpu_vaddr = __per_cpu_user_mapped_start +
				     per_cpu_offset(cpu);
		unsigned long percpu_sz = __per_cpu_user_mapped_end -
					  __per_cpu_user_mapped_start;
		kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
					  __PAGE_KERNEL);
	}

	/*
	 * Map the entry/exit text section, which is needed for
	 * switches into and out of the kernel.
	 */
	kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
				       __PAGE_KERNEL_RX);

#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
	kaiser_add_user_map_ptrs_early(__irqentry_text_start,
				       __irqentry_text_end,
				       __PAGE_KERNEL_RX);
#endif
	kaiser_add_user_map_early((void *)idt_descr.address,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL_RO);
#ifdef CONFIG_TRACING
	kaiser_add_user_map_early(&trace_idt_descr,
				  sizeof(trace_idt_descr),
				  __PAGE_KERNEL);
	kaiser_add_user_map_early(&trace_idt_table,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL);
#endif
	kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
				  __PAGE_KERNEL);
	kaiser_add_user_map_early(&debug_idt_table,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL);

	pr_info("enabled\n");
}

/* Add a mapping to the shadow mapping, and synchronize the mappings */
int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
{
	if (!kaiser_enabled)
		return 0;
	return kaiser_add_user_map((const void *)addr, size, flags);
}

void kaiser_remove_mapping(unsigned long start, unsigned long size)
{
	extern void unmap_pud_range_nofree(pgd_t *pgd,
				unsigned long start, unsigned long end);
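	/*
	 * The "nofree" unmap variant clears the shadow entries without
	 * freeing the page-table pages themselves; shadow table pages
	 * are never given back.
	 */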
	unsigned long end = start + size;
	unsigned long addr, next;
	pgd_t *pgd;

	if (!kaiser_enabled)
		return;
	pgd = native_get_shadow_pgd(pgd_offset_k(start));
	for (addr = start; addr < end; pgd++, addr = next) {
		next = pgd_addr_end(addr, end);
		unmap_pud_range_nofree(pgd, addr, next);
	}
}

/*
 * Page table pages are page-aligned.  The lower half of the top
 * level is used for userspace and the top half for the kernel.
 * This returns true for user pages that need to get copied into
 * both the user and kernel copies of the page tables, and false
 * for kernel pages that should only be in the kernel copy.
 */
static inline bool is_userspace_pgd(pgd_t *pgdp)
{
	return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
}

pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
{
	if (!kaiser_enabled)
		return pgd;
	/*
	 * Do we need to also populate the shadow pgd?  Check _PAGE_USER to
	 * skip cases like kexec and EFI which make temporary low mappings.
	 */
	if (pgd.pgd & _PAGE_USER) {
		if (is_userspace_pgd(pgdp)) {
			native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
			/*
			 * Even if the entry is *mapping* userspace, ensure
			 * that userspace cannot use it.  This way, if we
			 * get out to userspace running on the kernel CR3,
			 * userspace will crash instead of running.
			 */
			if (__supported_pte_mask & _PAGE_NX)
				pgd.pgd |= _PAGE_NX;
		}
	} else if (!pgd.pgd) {
		/*
		 * pgd_clear() cannot check _PAGE_USER, and is even used to
		 * clear corrupted pgd entries: so just rely on cases like
		 * kexec and EFI never to be using pgd_clear().
		 */
		if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
		    is_userspace_pgd(pgdp))
			native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
	}
	return pgd;
}

void kaiser_setup_pcid(void)
{
	unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;

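	/*
	 * CR3 bit 63 is the NOFLUSH bit: with PCIDs enabled, setting it
	 * on a CR3 write skips the TLB flush for that PCID.  This is the
	 * bit-63 reference in the comment above x86_cr3_pcid_user.
	 */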
	if (this_cpu_has(X86_FEATURE_PCID))
		user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
	/*
	 * This variable is used by the entry/exit code to change
	 * PCID and pgd and to control TLB flushing.
	 */
	this_cpu_write(x86_cr3_pcid_user, user_cr3);
}

/*
 * Make a note that this cpu will need to flush the USER tlb on return to
 * user.  If the cpu does not have PCID, the NOFLUSH bit will never have
 * been set.
 */
void kaiser_flush_tlb_on_return_to_user(void)
{
	if (this_cpu_has(X86_FEATURE_PCID))
		this_cpu_write(x86_cr3_pcid_user,
			X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
}
EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);