/*
 * Lockless get_user_pages_fast for x86
 *
 * Copyright (C) 2008 Nick Piggin
 * Copyright (C) 2008 Novell Inc.
 */
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/highmem.h>

#include <asm/pgtable.h>

static inline pte_t gup_get_pte(pte_t *ptep)
{
#ifndef CONFIG_X86_PAE
        return *ptep;
#else
        /*
         * With get_user_pages_fast, we walk down the pagetables without taking
         * any locks.  For this we would like to load the pointers atomically,
         * but that is not possible (without expensive cmpxchg8b) on PAE.  What
         * we do have is the guarantee that a pte will only either go from not
         * present to present, or present to not present, or both -- it will not
         * switch to a completely different present page without a TLB flush in
         * between; something that we are blocking by holding interrupts off.
         *
         * Setting ptes from not present to present goes:
         * ptep->pte_high = h;
         * smp_wmb();
         * ptep->pte_low = l;
         *
         * And present to not present goes:
         * ptep->pte_low = 0;
         * smp_wmb();
         * ptep->pte_high = 0;
         *
         * We must ensure here that the load of pte_low sees l iff pte_high
         * sees h.  We load pte_high *after* loading pte_low, which ensures we
         * don't see an older value of pte_high.  *Then* we recheck pte_low,
         * which ensures that we haven't picked up a changed pte high.  We might
         * have got rubbish values from pte_low and pte_high, but we are
         * guaranteed that pte_low will not have the present bit set *unless*
         * it is 'l'.  And get_user_pages_fast only operates on present ptes, so
         * we're safe.
         *
         * gup_get_pte should not be used or copied outside gup.c without being
         * very careful -- it does not atomically load the pte or anything that
         * is likely to be useful for you.
         */
        pte_t pte;

retry:
        pte.pte_low = ptep->pte_low;
        smp_rmb();
        pte.pte_high = ptep->pte_high;
        smp_rmb();
        if (unlikely(pte.pte_low != ptep->pte_low))
                goto retry;

        return pte;
#endif
}

/*
 * The performance critical leaf functions are made noinline, otherwise gcc
 * inlines everything into a single function, which results in too much
 * register pressure.
 */
static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
{
        unsigned long mask;
        pte_t *ptep;

        mask = _PAGE_PRESENT|_PAGE_USER;
        if (write)
                mask |= _PAGE_RW;

        ptep = pte_offset_map(&pmd, addr);
        do {
                pte_t pte = gup_get_pte(ptep);
                struct page *page;

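                /*
                 * The pte must be present and user-accessible (writable too,
                 * if a write was requested), and must not be special:
                 * pte_special() mappings have no struct page to take a
                 * reference on.
                 */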
                if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
                        pte_unmap(ptep);
                        return 0;
                }
                VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
                page = pte_page(pte);
                get_page(page);
                pages[*nr] = page;
                (*nr)++;

        } while (ptep++, addr += PAGE_SIZE, addr != end);
        pte_unmap(ptep - 1);

        return 1;
}

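/*
 * Take @nr extra references on a compound page's head page in one go.
 * The page is known to be live here (it is mapped by a present entry),
 * so its refcount cannot be zero and a plain atomic_add() is enough.
 */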
static inline void get_head_page_multiple(struct page *page, int nr)
{
        VM_BUG_ON(page != compound_head(page));
        VM_BUG_ON(page_count(page) == 0);
        atomic_add(nr, &page->_count);
}

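/*
 * A large pmd maps a whole 2MB (4MB without PAE) region with a single entry:
 * record the covered 4k subpages in @pages and take all the references on
 * the head page in one batch.
 */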
static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
{
        unsigned long mask;
        pte_t pte = *(pte_t *)&pmd;
        struct page *head, *page;
        int refs;

        mask = _PAGE_PRESENT|_PAGE_USER;
        if (write)
                mask |= _PAGE_RW;
        if ((pte_flags(pte) & mask) != mask)
                return 0;
        /* hugepages are never "special" */
        VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
        VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

        refs = 0;
        head = pte_page(pte);
        page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
        do {
                VM_BUG_ON(compound_head(page) != head);
                pages[*nr] = page;
                (*nr)++;
                page++;
                refs++;
        } while (addr += PAGE_SIZE, addr != end);
        get_head_page_multiple(head, refs);

        return 1;
}

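/*
 * Walk the pmd entries covering [addr, end).  A missing pmd or a failing
 * leaf function returns 0, which sends the caller to the slow path.
 */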
static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                int write, struct page **pages, int *nr)
{
        unsigned long next;
        pmd_t *pmdp;

        pmdp = pmd_offset(&pud, addr);
        do {
                pmd_t pmd = *pmdp;

                next = pmd_addr_end(addr, end);
                if (pmd_none(pmd))
                        return 0;
                if (unlikely(pmd_large(pmd))) {
                        if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
                                return 0;
                } else {
                        if (!gup_pte_range(pmd, addr, next, write, pages, nr))
                                return 0;
                }
        } while (pmdp++, addr = next, addr != end);

        return 1;
}

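/*
 * Same as gup_huge_pmd(), but for a 1GB pud mapping.
 */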
static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
{
        unsigned long mask;
        pte_t pte = *(pte_t *)&pud;
        struct page *head, *page;
        int refs;

        mask = _PAGE_PRESENT|_PAGE_USER;
        if (write)
                mask |= _PAGE_RW;
        if ((pte_flags(pte) & mask) != mask)
                return 0;
        /* hugepages are never "special" */
        VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
        VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

        refs = 0;
        head = pte_page(pte);
        page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
        do {
                VM_BUG_ON(compound_head(page) != head);
                pages[*nr] = page;
                (*nr)++;
                page++;
                refs++;
        } while (addr += PAGE_SIZE, addr != end);
        get_head_page_multiple(head, refs);

        return 1;
}

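/*
 * Walk the pud entries covering [addr, end), dispatching to the huge-pud
 * handler or to the pmd-level walker.
 */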
static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
                int write, struct page **pages, int *nr)
{
        unsigned long next;
        pud_t *pudp;

        pudp = pud_offset(&pgd, addr);
        do {
                pud_t pud = *pudp;

                next = pud_addr_end(addr, end);
                if (pud_none(pud))
                        return 0;
                if (unlikely(pud_large(pud))) {
                        if (!gup_huge_pud(pud, addr, next, write, pages, nr))
                                return 0;
                } else {
                        if (!gup_pmd_range(pud, addr, next, write, pages, nr))
                                return 0;
                }
        } while (pudp++, addr = next, addr != end);

        return 1;
}

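/*
 * get_user_pages_fast() - pin user pages in memory
 * @start:    starting user virtual address
 * @nr_pages: number of pages from start to pin
 * @write:    whether the pages will be written to
 * @pages:    array that receives pointers to the pinned pages;
 *            must have room for at least nr_pages entries
 *
 * Walks the page tables with interrupts disabled, without taking mmap_sem,
 * and takes a reference on every present page in the range.  Anything it
 * cannot handle is passed to the regular, sleeping get_user_pages().
 * Returns the number of pages pinned, which may be fewer than requested,
 * or a negative errno if nothing could be pinned.
 */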
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                        struct page **pages)
{
        struct mm_struct *mm = current->mm;
        unsigned long addr, len, end;
        unsigned long next;
        pgd_t *pgdp;
        int nr = 0;

        start &= PAGE_MASK;
        addr = start;
        len = (unsigned long) nr_pages << PAGE_SHIFT;
        end = start + len;
        if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
                                        start, len)))
                goto slow_irqon;

        /*
         * XXX: batch / limit 'nr', to avoid large irq off latency;
         * needs some instrumenting to determine the common sizes used by
         * important workloads (eg. DB2), and whether limiting the batch size
         * will decrease performance.
         *
         * It seems like we're in the clear for the moment.  Direct-IO is
         * the main guy that batches up lots of get_user_pages, and even
         * that is limited to 64-at-a-time, which is not so many.
         */
        /*
         * This doesn't prevent pagetable teardown, but does prevent
         * the pagetables and pages from being freed on x86.
         *
         * So long as we atomically load page table pointers versus teardown
         * (which we do on x86, with the above PAE exception), we can follow the
         * address down to the page and take a ref on it.
         */
        local_irq_disable();
        pgdp = pgd_offset(mm, addr);
        do {
                pgd_t pgd = *pgdp;

                next = pgd_addr_end(addr, end);
                if (pgd_none(pgd))
                        goto slow;
                if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
                        goto slow;
        } while (pgdp++, addr = next, addr != end);
        local_irq_enable();

        VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
        return nr;

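        /*
         * The braces below merely scope 'ret' for the fallback path; it is
         * entered only via the slow/slow_irqon labels, never by falling
         * through (the fast path has already returned above).
         */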
        {
                int ret;

slow:
                local_irq_enable();
slow_irqon:
                /* Try to get the remaining pages with get_user_pages */
                start += nr << PAGE_SHIFT;
                pages += nr;

                down_read(&mm->mmap_sem);
                ret = get_user_pages(current, mm, start,
                        (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
                up_read(&mm->mmap_sem);

                /* Have to be a bit careful with return values */
                if (nr > 0) {
                        if (ret < 0)
                                ret = nr;
                        else
                                ret += nr;
                }

                return ret;
        }
}
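
/*
 * Illustrative usage (not part of the original file): a minimal sketch of a
 * hypothetical driver helper that pins a user buffer with
 * get_user_pages_fast() before DMAing to it.  The name my_dev_pin_user_buf
 * is made up for the example and assumes size > 0 and a caller-allocated
 * pages[] array; only get_user_pages_fast() and put_page() are real kernel
 * interfaces here.
 *
 *	static int my_dev_pin_user_buf(unsigned long uaddr, size_t size,
 *				       int write, struct page **pages)
 *	{
 *		unsigned long first = uaddr >> PAGE_SHIFT;
 *		unsigned long last = (uaddr + size - 1) >> PAGE_SHIFT;
 *		int nr_pages = last - first + 1;
 *		int pinned, i;
 *
 *		pinned = get_user_pages_fast(uaddr, nr_pages, write, pages);
 *		if (pinned == nr_pages)
 *			return nr_pages;
 *
 *		// Partial or failed pin: drop whatever was pinned and bail.
 *		for (i = 0; i < pinned; i++)
 *			put_page(pages[i]);
 *		return pinned < 0 ? pinned : -EFAULT;
 *	}
 *
 * Every successfully pinned page must eventually be released with put_page()
 * once the I/O has completed.
 */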