#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>

#undef pr_fmt
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt

#include <asm/kaiser.h>
#include <asm/tlbflush.h>	/* to verify its kaiser declarations */
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/desc.h>
#include <asm/cmdline.h>

int kaiser_enabled __read_mostly = 1;
EXPORT_SYMBOL(kaiser_enabled);	/* for inlined TLB flush functions */

__visible
DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);

/*
 * These can have bit 63 set, so we cannot just use a plain "or"
 * instruction to get their value or'd into CR3.  It would take
 * another register.  So, we use a memory reference to these instead.
 *
 * This is also handy because systems that do not support PCIDs
 * just end up or'ing a 0 into their CR3, which does no harm.
 */
DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
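
/*
 * Rough, illustrative sketch only (the entry/exit code in asm/kaiser.h
 * is authoritative): the exit-to-user path builds the user CR3 value by
 * OR'ing the per-cpu word above into the current CR3 image, along the
 * lines of
 *
 *	movq	%cr3, %reg
 *	orq	PER_CPU_VAR(x86_cr3_pcid_user), %reg
 *	movq	%reg, %cr3
 *
 * A 64-bit constant with bit 63 set cannot be an immediate operand of
 * "or", which is why the value is kept in memory rather than encoded
 * into the instruction.
 */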

/*
 * At runtime, the only things we map are some things for CPU
 * hotplug, and stacks for new processes.  No two CPUs will ever
 * be populating the same addresses, so we only need to ensure
 * that we protect between two CPUs trying to allocate and
 * populate the same page table page.
 *
 * Only take this lock when doing a set_p[4um]d(), but it is not
 * needed for doing a set_pte().  We assume that only the *owner*
 * of a given allocation will be doing this for _their_
 * allocation.
 *
 * This ensures that once a system has been running for a while
 * and there have been stacks all over and these page tables
 * are fully populated, there will be no further acquisitions of
 * this lock.
 */
static DEFINE_SPINLOCK(shadow_table_allocation_lock);
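
/*
 * Sketch of how the lock is used by the allocation sites in
 * kaiser_pagetable_walk() below: a candidate page table page is
 * allocated outside the lock and only installed if the entry is
 * still empty, otherwise it is freed again.
 *
 *	new = __get_free_page(gfp);
 *	spin_lock(&shadow_table_allocation_lock);
 *	if (pud_none(*pud))
 *		set_pud(pud, __pud(_KERNPG_TABLE | __pa(new)));
 *	else
 *		free_page(new);
 *	spin_unlock(&shadow_table_allocation_lock);
 */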

/*
 * Returns -1 on error.
 */
static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset_k(vaddr);
	/*
	 * We made all the kernel PGDs present in kaiser_init().
	 * We expect them to stay that way.
	 */
	BUG_ON(pgd_none(*pgd));
	/*
	 * PGDs are either 512GB or 128TB on all x86_64
	 * configurations.  We don't handle these.
	 */
	BUG_ON(pgd_large(*pgd));

	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	if (pud_large(*pud))
		return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);

	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	if (pmd_large(*pmd))
		return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);

	pte = pte_offset_kernel(pmd, vaddr);
	if (pte_none(*pte)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
}
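
/*
 * For addresses in the kernel direct map the walk above resolves to the
 * same physical address that __pa() would, but it also works for
 * vmalloc'ed and per-cpu addresses, and it honours 1GB and 2MB mappings
 * by masking with the matching page size.
 */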

/*
 * This is a relatively normal page table walk, except that it
 * also tries to allocate page table pages along the way.
 *
 * Returns a pointer to a PTE on success, or NULL on failure.
 */
static pte_t *kaiser_pagetable_walk(unsigned long address)
{
	pmd_t *pmd;
	pud_t *pud;
	pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);

	if (pgd_none(*pgd)) {
		WARN_ONCE(1, "All shadow pgds should have been populated");
		return NULL;
	}
	BUILD_BUG_ON(pgd_large(*pgd) != 0);

	pud = pud_offset(pgd, address);
	/* The shadow page tables do not use large mappings: */
	if (pud_large(*pud)) {
		WARN_ON(1);
		return NULL;
	}
	if (pud_none(*pud)) {
		unsigned long new_pmd_page = __get_free_page(gfp);
		if (!new_pmd_page)
			return NULL;
		spin_lock(&shadow_table_allocation_lock);
		if (pud_none(*pud)) {
			set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
			__inc_zone_page_state(virt_to_page((void *)
						new_pmd_page), NR_KAISERTABLE);
		} else
			free_page(new_pmd_page);
		spin_unlock(&shadow_table_allocation_lock);
	}

	pmd = pmd_offset(pud, address);
	/* The shadow page tables do not use large mappings: */
	if (pmd_large(*pmd)) {
		WARN_ON(1);
		return NULL;
	}
	if (pmd_none(*pmd)) {
		unsigned long new_pte_page = __get_free_page(gfp);
		if (!new_pte_page)
			return NULL;
		spin_lock(&shadow_table_allocation_lock);
		if (pmd_none(*pmd)) {
			set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
			__inc_zone_page_state(virt_to_page((void *)
						new_pte_page), NR_KAISERTABLE);
		} else
			free_page(new_pte_page);
		spin_unlock(&shadow_table_allocation_lock);
	}

	return pte_offset_kernel(pmd, address);
}
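
/*
 * Typical use, as in kaiser_add_user_map() below:
 *
 *	pte = kaiser_pagetable_walk(address);
 *	if (pte && pte_none(*pte))
 *		set_pte(pte, __pte(flags | target_address));
 */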

static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
			       unsigned long flags)
{
	int ret = 0;
	pte_t *pte;
	unsigned long start_addr = (unsigned long)__start_addr;
	unsigned long address = start_addr & PAGE_MASK;
	unsigned long end_addr = PAGE_ALIGN(start_addr + size);
	unsigned long target_address;

	/*
	 * It is convenient for callers to pass in __PAGE_KERNEL etc,
	 * and there is no actual harm from setting _PAGE_GLOBAL, so
	 * long as CR4.PGE is not set.  But it is nonetheless troubling
	 * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
	 * requires that not to be #defined to 0): so mask it off here.
	 */
	flags &= ~_PAGE_GLOBAL;

	for (; address < end_addr; address += PAGE_SIZE) {
		target_address = get_pa_from_mapping(address);
		if (target_address == -1) {
			ret = -EIO;
			break;
		}
		pte = kaiser_pagetable_walk(address);
		if (!pte) {
			ret = -ENOMEM;
			break;
		}
		if (pte_none(*pte)) {
			set_pte(pte, __pte(flags | target_address));
		} else {
			pte_t tmp;
			set_pte(&tmp, __pte(flags | target_address));
			WARN_ON_ONCE(!pte_same(*pte, tmp));
		}
	}
	return ret;
}

static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
{
	unsigned long size = end - start;

	return kaiser_add_user_map(start, size, flags);
}

/*
 * Ensure that the top level of the (shadow) page tables is
 * entirely populated.  This ensures that all processes that get
 * forked have the same entries.  This way, we do not have to
 * ever go set up new entries in older processes.
 *
 * Note: we never free these, so there are no updates to them
 * after this.
 */
static void __init kaiser_init_all_pgds(void)
{
	pgd_t *pgd;
	int i = 0;

	pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long)0));
	for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
		pgd_t new_pgd;
		pud_t *pud = pud_alloc_one(&init_mm,
					   PAGE_OFFSET + i * PGDIR_SIZE);
		if (!pud) {
			WARN_ON(1);
			break;
		}
		inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
		new_pgd = __pgd(_KERNPG_TABLE | __pa(pud));
		/*
		 * Make sure not to stomp on some other pgd entry.
		 */
		if (!pgd_none(pgd[i])) {
			WARN_ON(1);
			continue;
		}
		set_pgd(pgd + i, new_pgd);
	}
}
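
/*
 * After kaiser_init_all_pgds() the kernel half of the shadow pgd
 * (indices PTRS_PER_PGD/2 .. PTRS_PER_PGD-1) points at freshly zeroed
 * pud pages, accounted as NR_KAISERTABLE.  Because all processes share
 * those top-level entries, shadow mappings added later through
 * kaiser_add_user_map() become visible to every process at once.
 */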

#define kaiser_add_user_map_early(start, size, flags) do {	\
	int __ret = kaiser_add_user_map(start, size, flags);	\
	WARN_ON(__ret);						\
} while (0)

#define kaiser_add_user_map_ptrs_early(start, end, flags) do {		\
	int __ret = kaiser_add_user_map_ptrs(start, end, flags);	\
	WARN_ON(__ret);							\
} while (0)

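/*
 * Boot-time policy, as implemented below:
 *
 *	pti=on		force KAISER on
 *	pti=off		disable (prints "disabled")
 *	pti=auto	or no option: vendor default - off on AMD CPUs,
 *			on everywhere else
 *	nopti		same as pti=off
 *
 * Xen PV guests are silently disabled before any option is considered.
 */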
void __init kaiser_check_boottime_disable(void)
{
	bool enable = true;
	char arg[5];
	int ret;

	if (boot_cpu_has(X86_FEATURE_XENPV))
		goto silent_disable;

	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
	if (ret > 0) {
		if (!strncmp(arg, "on", 2))
			goto enable;

		if (!strncmp(arg, "off", 3))
			goto disable;

		if (!strncmp(arg, "auto", 4))
			goto skip;
	}

	if (cmdline_find_option_bool(boot_command_line, "nopti"))
		goto disable;

skip:
	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
		goto disable;

enable:
	if (enable)
		setup_force_cpu_cap(X86_FEATURE_KAISER);

	return;

disable:
	pr_info("disabled\n");

silent_disable:
	kaiser_enabled = 0;
	setup_clear_cpu_cap(X86_FEATURE_KAISER);
}

/*
 * If anything in here fails, we will likely die on one of the
 * first kernel->user transitions and init will die.  But, we
 * will have most of the kernel up by then and should be able to
 * get a clean warning out of it.  If we BUG_ON() here, we run
 * the risk of crashing before we have good console output.
 */
void __init kaiser_init(void)
{
	int cpu;

	if (!kaiser_enabled)
		return;

	kaiser_init_all_pgds();

	for_each_possible_cpu(cpu) {
		void *percpu_vaddr = __per_cpu_user_mapped_start +
				     per_cpu_offset(cpu);
		unsigned long percpu_sz = __per_cpu_user_mapped_end -
					  __per_cpu_user_mapped_start;
		kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
					  __PAGE_KERNEL);
	}

	/*
	 * Map the entry/exit text section, which is needed at
	 * switches from user to and from kernel.
	 */
	kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
				       __PAGE_KERNEL_RX);

#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
	kaiser_add_user_map_ptrs_early(__irqentry_text_start,
				       __irqentry_text_end,
				       __PAGE_KERNEL_RX);
#endif
	kaiser_add_user_map_early((void *)idt_descr.address,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL_RO);
#ifdef CONFIG_TRACING
	kaiser_add_user_map_early(&trace_idt_descr,
				  sizeof(trace_idt_descr),
				  __PAGE_KERNEL);
	kaiser_add_user_map_early(&trace_idt_table,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL);
#endif
	kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
				  __PAGE_KERNEL);
	kaiser_add_user_map_early(&debug_idt_table,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL);

	pr_info("enabled\n");
}

/* Add a mapping to the shadow mapping, and synchronize the mappings */
int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
{
	if (!kaiser_enabled)
		return 0;
	return kaiser_add_user_map((const void *)addr, size, flags);
}
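
/*
 * Hypothetical usage sketch (buf and size are illustrative, not from
 * this file): a caller that has vmalloc'ed something the entry code
 * must be able to reach would pair the two interfaces like this:
 *
 *	if (kaiser_add_mapping((unsigned long)buf, size, __PAGE_KERNEL))
 *		goto out_free;
 *	...
 *	kaiser_remove_mapping((unsigned long)buf, size);
 *
 * Callers elsewhere in this series (the LDT allocation path, for
 * example) are expected to use the interface in roughly this way.
 */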

void kaiser_remove_mapping(unsigned long start, unsigned long size)
{
	extern void unmap_pud_range_nofree(pgd_t *pgd,
				unsigned long start, unsigned long end);
	unsigned long end = start + size;
	unsigned long addr, next;
	pgd_t *pgd;

	if (!kaiser_enabled)
		return;
	pgd = native_get_shadow_pgd(pgd_offset_k(start));
	for (addr = start; addr < end; pgd++, addr = next) {
		next = pgd_addr_end(addr, end);
		unmap_pud_range_nofree(pgd, addr, next);
	}
}

/*
 * Page table pages are page-aligned.  The lower half of the top
 * level is used for userspace and the top half for the kernel.
 * This returns true for user pages that need to get copied into
 * both the user and kernel copies of the page tables, and false
 * for kernel pages that should only be in the kernel copy.
 */
static inline bool is_userspace_pgd(pgd_t *pgdp)
{
	return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
}
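
/*
 * With 4-level paging PTRS_PER_PGD is 512, so this amounts to "is the
 * entry at index 0..255", i.e. does it cover the lower, userspace half
 * of the virtual address space.
 */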

pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
{
	if (!kaiser_enabled)
		return pgd;
	/*
	 * Do we need to also populate the shadow pgd?  Check _PAGE_USER to
	 * skip cases like kexec and EFI which make temporary low mappings.
	 */
	if (pgd.pgd & _PAGE_USER) {
		if (is_userspace_pgd(pgdp)) {
			native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
			/*
			 * Even if the entry is *mapping* userspace, ensure
			 * that userspace can not use it.  This way, if we
			 * get out to userspace running on the kernel CR3,
			 * userspace will crash instead of running.
			 */
			if (__supported_pte_mask & _PAGE_NX)
				pgd.pgd |= _PAGE_NX;
		}
	} else if (!pgd.pgd) {
		/*
		 * pgd_clear() cannot check _PAGE_USER, and is even used to
		 * clear corrupted pgd entries: so just rely on cases like
		 * kexec and EFI never to be using pgd_clear().
		 */
		if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
		    is_userspace_pgd(pgdp))
			native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
	}
	return pgd;
}
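
/*
 * In this series the pgd write path is expected to funnel through this
 * helper (with the return value written back into *pgdp), so that
 * user-half entries are mirrored into the shadow pgd while the kernel
 * copy keeps them NX-poisoned.
 */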

void kaiser_setup_pcid(void)
{
	unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;

	if (this_cpu_has(X86_FEATURE_PCID))
		user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
	/*
	 * These variables are used by the entry/exit
	 * code to change PCID and pgd and TLB flushing.
	 */
	this_cpu_write(x86_cr3_pcid_user, user_cr3);
}
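
/*
 * When PCID is available, x86_cr3_pcid_user normally carries the CR3
 * "no flush" bit (bit 63), so the CR3 write on return to userspace
 * preserves the user TLB entries.  kaiser_flush_tlb_on_return_to_user()
 * below clears that bit for one transition, forcing the next user CR3
 * load to flush; the exit code is expected to switch the per-cpu value
 * back to the NOFLUSH variant afterwards.
 */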

/*
 * Make a note that this cpu will need to flush USER tlb on return to user.
 * If cpu does not have PCID, then the NOFLUSH bit will never have been set.
 */
void kaiser_flush_tlb_on_return_to_user(void)
{
	if (this_cpu_has(X86_FEATURE_PCID))
		this_cpu_write(x86_cr3_pcid_user,
			X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
}
EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);