Blame - arch/x86/mm/gup.c - kernel/msm-4.9

blob: dbe34b9313743f1cae72ac28aee5c99e5d2c9369 [file] [log] [blame]

Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	1	/*
				2	* Lockless get_user_pages_fast for x86
				3	*
				4	* Copyright (C) 2008 Nick Piggin
				5	* Copyright (C) 2008 Novell Inc.
				6	*/
				7	#include <linux/sched.h>
				8	#include <linux/mm.h>
				9	#include <linux/vmstat.h>
				10	#include <linux/highmem.h>
Andrea Arcangeli	8ee5382	2011-01-13 15:47:10 -0800	[diff] [blame^]	11	#include <linux/swap.h>
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	12
				13	#include <asm/pgtable.h>
				14
				15	static inline pte_t gup_get_pte(pte_t *ptep)
				16	{
				17	#ifndef CONFIG_X86_PAE
Ingo Molnar	0c87197	2009-06-15 11:35:01 +0200	[diff] [blame]	18	return ACCESS_ONCE(*ptep);
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	19	#else
				20	/*
				21	* With get_user_pages_fast, we walk down the pagetables without taking
Andy Shevchenko	ab09809	2010-02-02 14:38:12 -0800	[diff] [blame]	22	* any locks. For this we would like to load the pointers atomically,
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	23	* but that is not possible (without expensive cmpxchg8b) on PAE. What
				24	* we do have is the guarantee that a pte will only either go from not
				25	* present to present, or present to not present or both -- it will not
				26	* switch to a completely different present page without a TLB flush in
				27	* between; something that we are blocking by holding interrupts off.
				28	*
				29	* Setting ptes from not present to present goes:
				30	* ptep->pte_high = h;
				31	* smp_wmb();
				32	* ptep->pte_low = l;
				33	*
				34	* And present to not present goes:
				35	* ptep->pte_low = 0;
				36	* smp_wmb();
				37	* ptep->pte_high = 0;
				38	*
				39	* We must ensure here that the load of pte_low sees l iff pte_high
				40	* sees h. We load pte_high after loading pte_low, which ensures we
				41	* don't see an older value of pte_high. Then we recheck pte_low,
				42	* which ensures that we haven't picked up a changed pte high. We might
				43	* have got rubbish values from pte_low and pte_high, but we are
				44	* guaranteed that pte_low will not have the present bit set unless
				45	* it is 'l'. And get_user_pages_fast only operates on present ptes, so
				46	* we're safe.
				47	*
				48	* gup_get_pte should not be used or copied outside gup.c without being
				49	* very careful -- it does not atomically load the pte or anything that
				50	* is likely to be useful for you.
				51	*/
				52	pte_t pte;
				53
				54	retry:
				55	pte.pte_low = ptep->pte_low;
				56	smp_rmb();
				57	pte.pte_high = ptep->pte_high;
				58	smp_rmb();
				59	if (unlikely(pte.pte_low != ptep->pte_low))
				60	goto retry;
				61
				62	return pte;
				63	#endif
				64	}
				65
				66	/*
				67	* The performance critical leaf functions are made noinline otherwise gcc
				68	* inlines everything into a single function which results in too much
				69	* register pressure.
				70	*/
				71	static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
				72	unsigned long end, int write, struct page *pages, int nr)
				73	{
				74	unsigned long mask;
				75	pte_t *ptep;
				76
				77	mask = _PAGE_PRESENT\|_PAGE_USER;
				78	if (write)
				79	mask \|= _PAGE_RW;
				80
				81	ptep = pte_offset_map(&pmd, addr);
				82	do {
				83	pte_t pte = gup_get_pte(ptep);
				84	struct page *page;
				85
Jan Beulich	606ee44	2008-09-17 16:48:17 +0100	[diff] [blame]	86	if ((pte_flags(pte) & (mask \| _PAGE_SPECIAL)) != mask) {
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	87	pte_unmap(ptep);
				88	return 0;
				89	}
				90	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
				91	page = pte_page(pte);
				92	get_page(page);
Andrea Arcangeli	8ee5382	2011-01-13 15:47:10 -0800	[diff] [blame^]	93	SetPageReferenced(page);
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	94	pages[*nr] = page;
				95	(*nr)++;
				96
				97	} while (ptep++, addr += PAGE_SIZE, addr != end);
				98	pte_unmap(ptep - 1);
				99
				100	return 1;
				101	}
				102
				103	static inline void get_head_page_multiple(struct page *page, int nr)
				104	{
				105	VM_BUG_ON(page != compound_head(page));
				106	VM_BUG_ON(page_count(page) == 0);
				107	atomic_add(nr, &page->_count);
Andrea Arcangeli	8ee5382	2011-01-13 15:47:10 -0800	[diff] [blame^]	108	SetPageReferenced(page);
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	109	}
				110
Andrea Arcangeli	9180706	2011-01-13 15:46:32 -0800	[diff] [blame]	111	static inline void get_huge_page_tail(struct page *page)
				112	{
				113	/*
				114	* __split_huge_page_refcount() cannot run
				115	* from under us.
				116	*/
				117	VM_BUG_ON(atomic_read(&page->_count) < 0);
				118	atomic_inc(&page->_count);
				119	}
				120
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	121	static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
				122	unsigned long end, int write, struct page *pages, int nr)
				123	{
				124	unsigned long mask;
				125	pte_t pte = (pte_t )&pmd;
				126	struct page head, page;
				127	int refs;
				128
				129	mask = _PAGE_PRESENT\|_PAGE_USER;
				130	if (write)
				131	mask \|= _PAGE_RW;
Jan Beulich	606ee44	2008-09-17 16:48:17 +0100	[diff] [blame]	132	if ((pte_flags(pte) & mask) != mask)
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	133	return 0;
				134	/* hugepages are never "special" */
Jan Beulich	606ee44	2008-09-17 16:48:17 +0100	[diff] [blame]	135	VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	136	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
				137
				138	refs = 0;
				139	head = pte_page(pte);
Nick Piggin	652ea69	2008-07-25 19:45:27 -0700	[diff] [blame]	140	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	141	do {
				142	VM_BUG_ON(compound_head(page) != head);
				143	pages[*nr] = page;
Andrea Arcangeli	9180706	2011-01-13 15:46:32 -0800	[diff] [blame]	144	if (PageTail(page))
				145	get_huge_page_tail(page);
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	146	(*nr)++;
				147	page++;
				148	refs++;
				149	} while (addr += PAGE_SIZE, addr != end);
				150	get_head_page_multiple(head, refs);
				151
				152	return 1;
				153	}
				154
				155	static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
				156	int write, struct page *pages, int nr)
				157	{
				158	unsigned long next;
				159	pmd_t *pmdp;
				160
				161	pmdp = pmd_offset(&pud, addr);
				162	do {
				163	pmd_t pmd = *pmdp;
				164
				165	next = pmd_addr_end(addr, end);
Andrea Arcangeli	64cc6ae	2011-01-13 15:46:42 -0800	[diff] [blame]	166	/*
				167	* The pmd_trans_splitting() check below explains why
				168	* pmdp_splitting_flush has to flush the tlb, to stop
				169	* this gup-fast code from running while we set the
				170	* splitting bit in the pmd. Returning zero will take
				171	* the slow path that will call wait_split_huge_page()
				172	* if the pmd is still in splitting state. gup-fast
				173	* can't because it has irq disabled and
				174	* wait_split_huge_page() would never return as the
				175	* tlb flush IPI wouldn't run.
				176	*/
				177	if (pmd_none(pmd) \|\| pmd_trans_splitting(pmd))
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	178	return 0;
				179	if (unlikely(pmd_large(pmd))) {
				180	if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
				181	return 0;
				182	} else {
				183	if (!gup_pte_range(pmd, addr, next, write, pages, nr))
				184	return 0;
				185	}
				186	} while (pmdp++, addr = next, addr != end);
				187
				188	return 1;
				189	}
				190
Nick Piggin	652ea69	2008-07-25 19:45:27 -0700	[diff] [blame]	191	static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
				192	unsigned long end, int write, struct page *pages, int nr)
				193	{
				194	unsigned long mask;
				195	pte_t pte = (pte_t )&pud;
				196	struct page head, page;
				197	int refs;
				198
				199	mask = _PAGE_PRESENT\|_PAGE_USER;
				200	if (write)
				201	mask \|= _PAGE_RW;
Jan Beulich	606ee44	2008-09-17 16:48:17 +0100	[diff] [blame]	202	if ((pte_flags(pte) & mask) != mask)
Nick Piggin	652ea69	2008-07-25 19:45:27 -0700	[diff] [blame]	203	return 0;
				204	/* hugepages are never "special" */
Jan Beulich	606ee44	2008-09-17 16:48:17 +0100	[diff] [blame]	205	VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
Nick Piggin	652ea69	2008-07-25 19:45:27 -0700	[diff] [blame]	206	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
				207
				208	refs = 0;
				209	head = pte_page(pte);
				210	page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
				211	do {
				212	VM_BUG_ON(compound_head(page) != head);
				213	pages[*nr] = page;
				214	(*nr)++;
				215	page++;
				216	refs++;
				217	} while (addr += PAGE_SIZE, addr != end);
				218	get_head_page_multiple(head, refs);
				219
				220	return 1;
				221	}
				222
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	223	static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
				224	int write, struct page *pages, int nr)
				225	{
				226	unsigned long next;
				227	pud_t *pudp;
				228
				229	pudp = pud_offset(&pgd, addr);
				230	do {
				231	pud_t pud = *pudp;
				232
				233	next = pud_addr_end(addr, end);
				234	if (pud_none(pud))
				235	return 0;
Nick Piggin	652ea69	2008-07-25 19:45:27 -0700	[diff] [blame]	236	if (unlikely(pud_large(pud))) {
				237	if (!gup_huge_pud(pud, addr, next, write, pages, nr))
				238	return 0;
				239	} else {
				240	if (!gup_pmd_range(pud, addr, next, write, pages, nr))
				241	return 0;
				242	}
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	243	} while (pudp++, addr = next, addr != end);
				244
				245	return 1;
				246	}
				247
Peter Zijlstra	465a454	2009-06-15 12:31:37 +0200	[diff] [blame]	248	/*
				249	* Like get_user_pages_fast() except its IRQ-safe in that it won't fall
				250	* back to the regular GUP.
				251	*/
				252	int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
				253	struct page **pages)
				254	{
				255	struct mm_struct *mm = current->mm;
				256	unsigned long addr, len, end;
				257	unsigned long next;
				258	unsigned long flags;
				259	pgd_t *pgdp;
				260	int nr = 0;
				261
				262	start &= PAGE_MASK;
				263	addr = start;
				264	len = (unsigned long) nr_pages << PAGE_SHIFT;
				265	end = start + len;
				266	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
				267	(void __user *)start, len)))
				268	return 0;
				269
				270	/*
				271	* XXX: batch / limit 'nr', to avoid large irq off latency
				272	* needs some instrumenting to determine the common sizes used by
				273	* important workloads (eg. DB2), and whether limiting the batch size
				274	* will decrease performance.
				275	*
				276	* It seems like we're in the clear for the moment. Direct-IO is
				277	* the main guy that batches up lots of get_user_pages, and even
				278	* they are limited to 64-at-a-time which is not so many.
				279	*/
				280	/*
				281	* This doesn't prevent pagetable teardown, but does prevent
				282	* the pagetables and pages from being freed on x86.
				283	*
				284	* So long as we atomically load page table pointers versus teardown
				285	* (which we do on x86, with the above PAE exception), we can follow the
				286	* address down to the the page and take a ref on it.
				287	*/
				288	local_irq_save(flags);
				289	pgdp = pgd_offset(mm, addr);
				290	do {
				291	pgd_t pgd = *pgdp;
				292
				293	next = pgd_addr_end(addr, end);
				294	if (pgd_none(pgd))
				295	break;
				296	if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
				297	break;
				298	} while (pgdp++, addr = next, addr != end);
				299	local_irq_restore(flags);
				300
				301	return nr;
				302	}
				303
Andy Grover	a0d22f4	2009-04-09 16:45:29 -0700	[diff] [blame]	304	/**
				305	* get_user_pages_fast() - pin user pages in memory
				306	* @start: starting user address
				307	* @nr_pages: number of pages from start to pin
				308	* @write: whether pages will be written to
				309	* @pages: array that receives pointers to the pages pinned.
				310	* Should be at least nr_pages long.
				311	*
				312	* Attempt to pin user pages in memory without taking mm->mmap_sem.
				313	* If not successful, it will fall back to taking the lock and
				314	* calling get_user_pages().
				315	*
				316	* Returns number of pages pinned. This may be fewer than the number
				317	* requested. If nr_pages is 0 or negative, returns 0. If no pages
				318	* were pinned, returns -errno.
				319	*/
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	320	int get_user_pages_fast(unsigned long start, int nr_pages, int write,
				321	struct page **pages)
				322	{
				323	struct mm_struct *mm = current->mm;
Linus Torvalds	9b79022	2008-07-28 17:54:21 -0700	[diff] [blame]	324	unsigned long addr, len, end;
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	325	unsigned long next;
				326	pgd_t *pgdp;
				327	int nr = 0;
				328
Linus Torvalds	9b79022	2008-07-28 17:54:21 -0700	[diff] [blame]	329	start &= PAGE_MASK;
				330	addr = start;
				331	len = (unsigned long) nr_pages << PAGE_SHIFT;
Linus Torvalds	7f81890	2009-06-20 09:52:27 -0700	[diff] [blame]	332
Linus Torvalds	9b79022	2008-07-28 17:54:21 -0700	[diff] [blame]	333	end = start + len;
Linus Torvalds	7f81890	2009-06-20 09:52:27 -0700	[diff] [blame]	334	if (end < start)
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	335	goto slow_irqon;
				336
Linus Torvalds	7f81890	2009-06-20 09:52:27 -0700	[diff] [blame]	337	#ifdef CONFIG_X86_64
				338	if (end >> __VIRTUAL_MASK_SHIFT)
				339	goto slow_irqon;
				340	#endif
				341
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	342	/*
				343	* XXX: batch / limit 'nr', to avoid large irq off latency
				344	* needs some instrumenting to determine the common sizes used by
				345	* important workloads (eg. DB2), and whether limiting the batch size
				346	* will decrease performance.
				347	*
				348	* It seems like we're in the clear for the moment. Direct-IO is
				349	* the main guy that batches up lots of get_user_pages, and even
				350	* they are limited to 64-at-a-time which is not so many.
				351	*/
				352	/*
				353	* This doesn't prevent pagetable teardown, but does prevent
				354	* the pagetables and pages from being freed on x86.
				355	*
				356	* So long as we atomically load page table pointers versus teardown
				357	* (which we do on x86, with the above PAE exception), we can follow the
				358	* address down to the the page and take a ref on it.
				359	*/
				360	local_irq_disable();
				361	pgdp = pgd_offset(mm, addr);
				362	do {
				363	pgd_t pgd = *pgdp;
				364
				365	next = pgd_addr_end(addr, end);
				366	if (pgd_none(pgd))
				367	goto slow;
				368	if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
				369	goto slow;
				370	} while (pgdp++, addr = next, addr != end);
				371	local_irq_enable();
				372
				373	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
				374	return nr;
				375
				376	{
				377	int ret;
				378
				379	slow:
				380	local_irq_enable();
				381	slow_irqon:
				382	/* Try to get the remaining pages with get_user_pages */
				383	start += nr << PAGE_SHIFT;
				384	pages += nr;
				385
				386	down_read(&mm->mmap_sem);
				387	ret = get_user_pages(current, mm, start,
				388	(end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
				389	up_read(&mm->mmap_sem);
				390
				391	/* Have to be a bit careful with return values */
				392	if (nr > 0) {
				393	if (ret < 0)
				394	ret = nr;
				395	else
				396	ret += nr;
				397	}
				398
				399	return ret;
				400	}
				401	}