Blame - drivers/lguest/page_tables.c - kernel/msm-5.4

blob: f9ca50d80466cfa45b3596e2e8174526d2a16ab8 [file] [log] [blame]

Rusty Russell	f938d2c	2007-07-26 10:41:02 -0700	[diff] [blame^]	1	/*P:700 The pagetable code, on the other hand, still shows the scars of
				2	* previous encounters. It's functional, and as neat as it can be in the
				3	* circumstances, but be wary, for these things are subtle and break easily.
				4	* The Guest provides a virtual to physical mapping, but we can neither trust
				5	* it nor use it: we verify and convert it here to point the hardware to the
				6	* actual Guest pages when running the Guest. :*/
				7
				8	/* Copyright (C) Rusty Russell IBM Corporation 2006.
Rusty Russell	d7e28ff	2007-07-19 01:49:23 -0700	[diff] [blame]	9	* GPL v2 and any later version */
				10	#include <linux/mm.h>
				11	#include <linux/types.h>
				12	#include <linux/spinlock.h>
				13	#include <linux/random.h>
				14	#include <linux/percpu.h>
				15	#include <asm/tlbflush.h>
				16	#include "lg.h"
				17
				18	#define PTES_PER_PAGE_SHIFT 10
				19	#define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT)
				20	#define SWITCHER_PGD_INDEX (PTES_PER_PAGE - 1)
				21
				22	static DEFINE_PER_CPU(spte_t *, switcher_pte_pages);
				23	#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
				24
				25	static unsigned vaddr_to_pgd_index(unsigned long vaddr)
				26	{
				27	return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
				28	}
				29
				30	/* These access the shadow versions (ie. the ones used by the CPU). */
				31	static spgd_t spgd_addr(struct lguest lg, u32 i, unsigned long vaddr)
				32	{
				33	unsigned int index = vaddr_to_pgd_index(vaddr);
				34
				35	if (index >= SWITCHER_PGD_INDEX) {
				36	kill_guest(lg, "attempt to access switcher pages");
				37	index = 0;
				38	}
				39	return &lg->pgdirs[i].pgdir[index];
				40	}
				41
				42	static spte_t spte_addr(struct lguest lg, spgd_t spgd, unsigned long vaddr)
				43	{
				44	spte_t *page = __va(spgd.pfn << PAGE_SHIFT);
				45	BUG_ON(!(spgd.flags & _PAGE_PRESENT));
				46	return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE];
				47	}
				48
				49	/* These access the guest versions. */
				50	static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr)
				51	{
				52	unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
				53	return lg->pgdirs[lg->pgdidx].cr3 + index * sizeof(gpgd_t);
				54	}
				55
				56	static unsigned long gpte_addr(struct lguest *lg,
				57	gpgd_t gpgd, unsigned long vaddr)
				58	{
				59	unsigned long gpage = gpgd.pfn << PAGE_SHIFT;
				60	BUG_ON(!(gpgd.flags & _PAGE_PRESENT));
				61	return gpage + ((vaddr>>PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(gpte_t);
				62	}
				63
				64	/* Do a virtual -> physical mapping on a user page. */
				65	static unsigned long get_pfn(unsigned long virtpfn, int write)
				66	{
				67	struct page *page;
				68	unsigned long ret = -1UL;
				69
				70	down_read(&current->mm->mmap_sem);
				71	if (get_user_pages(current, current->mm, virtpfn << PAGE_SHIFT,
				72	1, write, 1, &page, NULL) == 1)
				73	ret = page_to_pfn(page);
				74	up_read(&current->mm->mmap_sem);
				75	return ret;
				76	}
				77
				78	static spte_t gpte_to_spte(struct lguest *lg, gpte_t gpte, int write)
				79	{
				80	spte_t spte;
				81	unsigned long pfn;
				82
				83	/* We ignore the global flag. */
				84	spte.flags = (gpte.flags & ~_PAGE_GLOBAL);
				85	pfn = get_pfn(gpte.pfn, write);
				86	if (pfn == -1UL) {
				87	kill_guest(lg, "failed to get page %u", gpte.pfn);
				88	/* Must not put_page() bogus page on cleanup. */
				89	spte.flags = 0;
				90	}
				91	spte.pfn = pfn;
				92	return spte;
				93	}
				94
				95	static void release_pte(spte_t pte)
				96	{
				97	if (pte.flags & _PAGE_PRESENT)
				98	put_page(pfn_to_page(pte.pfn));
				99	}
				100
				101	static void check_gpte(struct lguest *lg, gpte_t gpte)
				102	{
				103	if ((gpte.flags & (_PAGE_PWT\|_PAGE_PSE)) \|\| gpte.pfn >= lg->pfn_limit)
				104	kill_guest(lg, "bad page table entry");
				105	}
				106
				107	static void check_gpgd(struct lguest *lg, gpgd_t gpgd)
				108	{
				109	if ((gpgd.flags & ~_PAGE_TABLE) \|\| gpgd.pfn >= lg->pfn_limit)
				110	kill_guest(lg, "bad page directory entry");
				111	}
				112
				113	/* FIXME: We hold reference to pages, which prevents them from being
				114	swapped. It'd be nice to have a callback when Linux wants to swap out. */
				115
				116	/* We fault pages in, which allows us to update accessed/dirty bits.
				117	* Return true if we got page. */
				118	int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
				119	{
				120	gpgd_t gpgd;
				121	spgd_t *spgd;
				122	unsigned long gpte_ptr;
				123	gpte_t gpte;
				124	spte_t *spte;
				125
				126	gpgd = mkgpgd(lgread_u32(lg, gpgd_addr(lg, vaddr)));
				127	if (!(gpgd.flags & _PAGE_PRESENT))
				128	return 0;
				129
				130	spgd = spgd_addr(lg, lg->pgdidx, vaddr);
				131	if (!(spgd->flags & _PAGE_PRESENT)) {
				132	/* Get a page of PTEs for them. */
				133	unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
				134	/* FIXME: Steal from self in this case? */
				135	if (!ptepage) {
				136	kill_guest(lg, "out of memory allocating pte page");
				137	return 0;
				138	}
				139	check_gpgd(lg, gpgd);
				140	spgd->raw.val = (__pa(ptepage) \| gpgd.flags);
				141	}
				142
				143	gpte_ptr = gpte_addr(lg, gpgd, vaddr);
				144	gpte = mkgpte(lgread_u32(lg, gpte_ptr));
				145
				146	/* No page? */
				147	if (!(gpte.flags & _PAGE_PRESENT))
				148	return 0;
				149
				150	/* Write to read-only page? */
				151	if ((errcode & 2) && !(gpte.flags & _PAGE_RW))
				152	return 0;
				153
				154	/* User access to a non-user page? */
				155	if ((errcode & 4) && !(gpte.flags & _PAGE_USER))
				156	return 0;
				157
				158	check_gpte(lg, gpte);
				159	gpte.flags \|= _PAGE_ACCESSED;
				160	if (errcode & 2)
				161	gpte.flags \|= _PAGE_DIRTY;
				162
				163	/* We're done with the old pte. */
				164	spte = spte_addr(lg, *spgd, vaddr);
				165	release_pte(*spte);
				166
				167	/* We don't make it writable if this isn't a write: later
				168	* write will fault so we can set dirty bit in guest. */
				169	if (gpte.flags & _PAGE_DIRTY)
				170	*spte = gpte_to_spte(lg, gpte, 1);
				171	else {
				172	gpte_t ro_gpte = gpte;
				173	ro_gpte.flags &= ~_PAGE_RW;
				174	*spte = gpte_to_spte(lg, ro_gpte, 0);
				175	}
				176
				177	/* Now we update dirty/accessed on guest. */
				178	lgwrite_u32(lg, gpte_ptr, gpte.raw.val);
				179	return 1;
				180	}
				181
				182	/* This is much faster than the full demand_page logic. */
				183	static int page_writable(struct lguest *lg, unsigned long vaddr)
				184	{
				185	spgd_t *spgd;
				186	unsigned long flags;
				187
				188	spgd = spgd_addr(lg, lg->pgdidx, vaddr);
				189	if (!(spgd->flags & _PAGE_PRESENT))
				190	return 0;
				191
				192	flags = spte_addr(lg, *spgd, vaddr)->flags;
				193	return (flags & (_PAGE_PRESENT\|_PAGE_RW)) == (_PAGE_PRESENT\|_PAGE_RW);
				194	}
				195
				196	void pin_page(struct lguest *lg, unsigned long vaddr)
				197	{
				198	if (!page_writable(lg, vaddr) && !demand_page(lg, vaddr, 2))
				199	kill_guest(lg, "bad stack page %#lx", vaddr);
				200	}
				201
				202	static void release_pgd(struct lguest lg, spgd_t spgd)
				203	{
				204	if (spgd->flags & _PAGE_PRESENT) {
				205	unsigned int i;
				206	spte_t *ptepage = __va(spgd->pfn << PAGE_SHIFT);
				207	for (i = 0; i < PTES_PER_PAGE; i++)
				208	release_pte(ptepage[i]);
				209	free_page((long)ptepage);
				210	spgd->raw.val = 0;
				211	}
				212	}
				213
				214	static void flush_user_mappings(struct lguest *lg, int idx)
				215	{
				216	unsigned int i;
				217	for (i = 0; i < vaddr_to_pgd_index(lg->page_offset); i++)
				218	release_pgd(lg, lg->pgdirs[idx].pgdir + i);
				219	}
				220
				221	void guest_pagetable_flush_user(struct lguest *lg)
				222	{
				223	flush_user_mappings(lg, lg->pgdidx);
				224	}
				225
				226	static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
				227	{
				228	unsigned int i;
				229	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
				230	if (lg->pgdirs[i].cr3 == pgtable)
				231	break;
				232	return i;
				233	}
				234
				235	static unsigned int new_pgdir(struct lguest *lg,
				236	unsigned long cr3,
				237	int *blank_pgdir)
				238	{
				239	unsigned int next;
				240
				241	next = random32() % ARRAY_SIZE(lg->pgdirs);
				242	if (!lg->pgdirs[next].pgdir) {
				243	lg->pgdirs[next].pgdir = (spgd_t *)get_zeroed_page(GFP_KERNEL);
				244	if (!lg->pgdirs[next].pgdir)
				245	next = lg->pgdidx;
				246	else
				247	/* There are no mappings: you'll need to re-pin */
				248	*blank_pgdir = 1;
				249	}
				250	lg->pgdirs[next].cr3 = cr3;
				251	/* Release all the non-kernel mappings. */
				252	flush_user_mappings(lg, next);
				253
				254	return next;
				255	}
				256
				257	void guest_new_pagetable(struct lguest *lg, unsigned long pgtable)
				258	{
				259	int newpgdir, repin = 0;
				260
				261	newpgdir = find_pgdir(lg, pgtable);
				262	if (newpgdir == ARRAY_SIZE(lg->pgdirs))
				263	newpgdir = new_pgdir(lg, pgtable, &repin);
				264	lg->pgdidx = newpgdir;
				265	if (repin)
				266	pin_stack_pages(lg);
				267	}
				268
				269	static void release_all_pagetables(struct lguest *lg)
				270	{
				271	unsigned int i, j;
				272
				273	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
				274	if (lg->pgdirs[i].pgdir)
				275	for (j = 0; j < SWITCHER_PGD_INDEX; j++)
				276	release_pgd(lg, lg->pgdirs[i].pgdir + j);
				277	}
				278
				279	void guest_pagetable_clear_all(struct lguest *lg)
				280	{
				281	release_all_pagetables(lg);
				282	pin_stack_pages(lg);
				283	}
				284
				285	static void do_set_pte(struct lguest *lg, int idx,
				286	unsigned long vaddr, gpte_t gpte)
				287	{
				288	spgd_t *spgd = spgd_addr(lg, idx, vaddr);
				289	if (spgd->flags & _PAGE_PRESENT) {
				290	spte_t spte = spte_addr(lg, spgd, vaddr);
				291	release_pte(*spte);
				292	if (gpte.flags & (_PAGE_DIRTY \| _PAGE_ACCESSED)) {
				293	check_gpte(lg, gpte);
				294	*spte = gpte_to_spte(lg, gpte, gpte.flags&_PAGE_DIRTY);
				295	} else
				296	spte->raw.val = 0;
				297	}
				298	}
				299
				300	void guest_set_pte(struct lguest *lg,
				301	unsigned long cr3, unsigned long vaddr, gpte_t gpte)
				302	{
				303	/* Kernel mappings must be changed on all top levels. */
				304	if (vaddr >= lg->page_offset) {
				305	unsigned int i;
				306	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
				307	if (lg->pgdirs[i].pgdir)
				308	do_set_pte(lg, i, vaddr, gpte);
				309	} else {
				310	int pgdir = find_pgdir(lg, cr3);
				311	if (pgdir != ARRAY_SIZE(lg->pgdirs))
				312	do_set_pte(lg, pgdir, vaddr, gpte);
				313	}
				314	}
				315
				316	void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx)
				317	{
				318	int pgdir;
				319
				320	if (idx >= SWITCHER_PGD_INDEX)
				321	return;
				322
				323	pgdir = find_pgdir(lg, cr3);
				324	if (pgdir < ARRAY_SIZE(lg->pgdirs))
				325	release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx);
				326	}
				327
				328	int init_guest_pagetable(struct lguest *lg, unsigned long pgtable)
				329	{
				330	/* We assume this in flush_user_mappings, so check now */
				331	if (vaddr_to_pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX)
				332	return -EINVAL;
				333	lg->pgdidx = 0;
				334	lg->pgdirs[lg->pgdidx].cr3 = pgtable;
				335	lg->pgdirs[lg->pgdidx].pgdir = (spgd_t*)get_zeroed_page(GFP_KERNEL);
				336	if (!lg->pgdirs[lg->pgdidx].pgdir)
				337	return -ENOMEM;
				338	return 0;
				339	}
				340
				341	void free_guest_pagetable(struct lguest *lg)
				342	{
				343	unsigned int i;
				344
				345	release_all_pagetables(lg);
				346	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
				347	free_page((long)lg->pgdirs[i].pgdir);
				348	}
				349
				350	/* Caller must be preempt-safe */
				351	void map_switcher_in_guest(struct lguest lg, struct lguest_pages pages)
				352	{
				353	spte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
				354	spgd_t switcher_pgd;
				355	spte_t regs_pte;
				356
				357	/* Since switcher less that 4MB, we simply mug top pte page. */
				358	switcher_pgd.pfn = __pa(switcher_pte_page) >> PAGE_SHIFT;
				359	switcher_pgd.flags = _PAGE_KERNEL;
				360	lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
				361
				362	/* Map our regs page over stack page. */
				363	regs_pte.pfn = __pa(lg->regs_page) >> PAGE_SHIFT;
				364	regs_pte.flags = _PAGE_KERNEL;
				365	switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTES_PER_PAGE]
				366	= regs_pte;
				367	}
				368
				369	static void free_switcher_pte_pages(void)
				370	{
				371	unsigned int i;
				372
				373	for_each_possible_cpu(i)
				374	free_page((long)switcher_pte_page(i));
				375	}
				376
				377	static __init void populate_switcher_pte_page(unsigned int cpu,
				378	struct page *switcher_page[],
				379	unsigned int pages)
				380	{
				381	unsigned int i;
				382	spte_t *pte = switcher_pte_page(cpu);
				383
				384	for (i = 0; i < pages; i++) {
				385	pte[i].pfn = page_to_pfn(switcher_page[i]);
				386	pte[i].flags = _PAGE_PRESENT\|_PAGE_ACCESSED;
				387	}
				388
				389	/* We only map this CPU's pages, so guest can't see others. */
				390	i = pages + cpu*2;
				391
				392	/* First page (regs) is rw, second (state) is ro. */
				393	pte[i].pfn = page_to_pfn(switcher_page[i]);
				394	pte[i].flags = _PAGE_PRESENT\|_PAGE_ACCESSED\|_PAGE_RW;
				395	pte[i+1].pfn = page_to_pfn(switcher_page[i+1]);
				396	pte[i+1].flags = _PAGE_PRESENT\|_PAGE_ACCESSED;
				397	}
				398
				399	__init int init_pagetables(struct page **switcher_page, unsigned int pages)
				400	{
				401	unsigned int i;
				402
				403	for_each_possible_cpu(i) {
				404	switcher_pte_page(i) = (spte_t *)get_zeroed_page(GFP_KERNEL);
				405	if (!switcher_pte_page(i)) {
				406	free_switcher_pte_pages();
				407	return -ENOMEM;
				408	}
				409	populate_switcher_pte_page(i, switcher_page, pages);
				410	}
				411	return 0;
				412	}
				413
				414	void free_pagetables(void)
				415	{
				416	free_switcher_pte_pages();
				417	}