Blame - arch/x86/mm/gup.c - kernel/msm-4.9

blob: 6340cef6798af12c994ae6d23ac3f5df73e43d52 [file] [log] [blame]

Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	1	/*
				2	* Lockless get_user_pages_fast for x86
				3	*
				4	* Copyright (C) 2008 Nick Piggin
				5	* Copyright (C) 2008 Novell Inc.
				6	*/
				7	#include <linux/sched.h>
				8	#include <linux/mm.h>
				9	#include <linux/vmstat.h>
				10	#include <linux/highmem.h>
				11
				12	#include <asm/pgtable.h>
				13
				14	static inline pte_t gup_get_pte(pte_t *ptep)
				15	{
				16	#ifndef CONFIG_X86_PAE
				17	return *ptep;
				18	#else
				19	/*
				20	* With get_user_pages_fast, we walk down the pagetables without taking
				21	* any locks. For this we would like to load the pointers atoimcally,
				22	* but that is not possible (without expensive cmpxchg8b) on PAE. What
				23	* we do have is the guarantee that a pte will only either go from not
				24	* present to present, or present to not present or both -- it will not
				25	* switch to a completely different present page without a TLB flush in
				26	* between; something that we are blocking by holding interrupts off.
				27	*
				28	* Setting ptes from not present to present goes:
				29	* ptep->pte_high = h;
				30	* smp_wmb();
				31	* ptep->pte_low = l;
				32	*
				33	* And present to not present goes:
				34	* ptep->pte_low = 0;
				35	* smp_wmb();
				36	* ptep->pte_high = 0;
				37	*
				38	* We must ensure here that the load of pte_low sees l iff pte_high
				39	* sees h. We load pte_high after loading pte_low, which ensures we
				40	* don't see an older value of pte_high. Then we recheck pte_low,
				41	* which ensures that we haven't picked up a changed pte high. We might
				42	* have got rubbish values from pte_low and pte_high, but we are
				43	* guaranteed that pte_low will not have the present bit set unless
				44	* it is 'l'. And get_user_pages_fast only operates on present ptes, so
				45	* we're safe.
				46	*
				47	* gup_get_pte should not be used or copied outside gup.c without being
				48	* very careful -- it does not atomically load the pte or anything that
				49	* is likely to be useful for you.
				50	*/
				51	pte_t pte;
				52
				53	retry:
				54	pte.pte_low = ptep->pte_low;
				55	smp_rmb();
				56	pte.pte_high = ptep->pte_high;
				57	smp_rmb();
				58	if (unlikely(pte.pte_low != ptep->pte_low))
				59	goto retry;
				60
				61	return pte;
				62	#endif
				63	}
				64
				65	/*
				66	* The performance critical leaf functions are made noinline otherwise gcc
				67	* inlines everything into a single function which results in too much
				68	* register pressure.
				69	*/
				70	static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
				71	unsigned long end, int write, struct page *pages, int nr)
				72	{
				73	unsigned long mask;
				74	pte_t *ptep;
				75
				76	mask = _PAGE_PRESENT\|_PAGE_USER;
				77	if (write)
				78	mask \|= _PAGE_RW;
				79
				80	ptep = pte_offset_map(&pmd, addr);
				81	do {
				82	pte_t pte = gup_get_pte(ptep);
				83	struct page *page;
				84
Jan Beulich	606ee44	2008-09-17 16:48:17 +0100	[diff] [blame]	85	if ((pte_flags(pte) & (mask \| _PAGE_SPECIAL)) != mask) {
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	86	pte_unmap(ptep);
				87	return 0;
				88	}
				89	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
				90	page = pte_page(pte);
				91	get_page(page);
				92	pages[*nr] = page;
				93	(*nr)++;
				94
				95	} while (ptep++, addr += PAGE_SIZE, addr != end);
				96	pte_unmap(ptep - 1);
				97
				98	return 1;
				99	}
				100
				101	static inline void get_head_page_multiple(struct page *page, int nr)
				102	{
				103	VM_BUG_ON(page != compound_head(page));
				104	VM_BUG_ON(page_count(page) == 0);
				105	atomic_add(nr, &page->_count);
				106	}
				107
				108	static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
				109	unsigned long end, int write, struct page *pages, int nr)
				110	{
				111	unsigned long mask;
				112	pte_t pte = (pte_t )&pmd;
				113	struct page head, page;
				114	int refs;
				115
				116	mask = _PAGE_PRESENT\|_PAGE_USER;
				117	if (write)
				118	mask \|= _PAGE_RW;
Jan Beulich	606ee44	2008-09-17 16:48:17 +0100	[diff] [blame]	119	if ((pte_flags(pte) & mask) != mask)
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	120	return 0;
				121	/* hugepages are never "special" */
Jan Beulich	606ee44	2008-09-17 16:48:17 +0100	[diff] [blame]	122	VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	123	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
				124
				125	refs = 0;
				126	head = pte_page(pte);
Nick Piggin	652ea69	2008-07-25 19:45:27 -0700	[diff] [blame]	127	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	128	do {
				129	VM_BUG_ON(compound_head(page) != head);
				130	pages[*nr] = page;
				131	(*nr)++;
				132	page++;
				133	refs++;
				134	} while (addr += PAGE_SIZE, addr != end);
				135	get_head_page_multiple(head, refs);
				136
				137	return 1;
				138	}
				139
				140	static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
				141	int write, struct page *pages, int nr)
				142	{
				143	unsigned long next;
				144	pmd_t *pmdp;
				145
				146	pmdp = pmd_offset(&pud, addr);
				147	do {
				148	pmd_t pmd = *pmdp;
				149
				150	next = pmd_addr_end(addr, end);
				151	if (pmd_none(pmd))
				152	return 0;
				153	if (unlikely(pmd_large(pmd))) {
				154	if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
				155	return 0;
				156	} else {
				157	if (!gup_pte_range(pmd, addr, next, write, pages, nr))
				158	return 0;
				159	}
				160	} while (pmdp++, addr = next, addr != end);
				161
				162	return 1;
				163	}
				164
Nick Piggin	652ea69	2008-07-25 19:45:27 -0700	[diff] [blame]	165	static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
				166	unsigned long end, int write, struct page *pages, int nr)
				167	{
				168	unsigned long mask;
				169	pte_t pte = (pte_t )&pud;
				170	struct page head, page;
				171	int refs;
				172
				173	mask = _PAGE_PRESENT\|_PAGE_USER;
				174	if (write)
				175	mask \|= _PAGE_RW;
Jan Beulich	606ee44	2008-09-17 16:48:17 +0100	[diff] [blame]	176	if ((pte_flags(pte) & mask) != mask)
Nick Piggin	652ea69	2008-07-25 19:45:27 -0700	[diff] [blame]	177	return 0;
				178	/* hugepages are never "special" */
Jan Beulich	606ee44	2008-09-17 16:48:17 +0100	[diff] [blame]	179	VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
Nick Piggin	652ea69	2008-07-25 19:45:27 -0700	[diff] [blame]	180	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
				181
				182	refs = 0;
				183	head = pte_page(pte);
				184	page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
				185	do {
				186	VM_BUG_ON(compound_head(page) != head);
				187	pages[*nr] = page;
				188	(*nr)++;
				189	page++;
				190	refs++;
				191	} while (addr += PAGE_SIZE, addr != end);
				192	get_head_page_multiple(head, refs);
				193
				194	return 1;
				195	}
				196
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	197	static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
				198	int write, struct page *pages, int nr)
				199	{
				200	unsigned long next;
				201	pud_t *pudp;
				202
				203	pudp = pud_offset(&pgd, addr);
				204	do {
				205	pud_t pud = *pudp;
				206
				207	next = pud_addr_end(addr, end);
				208	if (pud_none(pud))
				209	return 0;
Nick Piggin	652ea69	2008-07-25 19:45:27 -0700	[diff] [blame]	210	if (unlikely(pud_large(pud))) {
				211	if (!gup_huge_pud(pud, addr, next, write, pages, nr))
				212	return 0;
				213	} else {
				214	if (!gup_pmd_range(pud, addr, next, write, pages, nr))
				215	return 0;
				216	}
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	217	} while (pudp++, addr = next, addr != end);
				218
				219	return 1;
				220	}
				221
Andy Grover	a0d22f4	2009-04-09 16:45:29 -0700	[diff] [blame^]	222	/**
				223	* get_user_pages_fast() - pin user pages in memory
				224	* @start: starting user address
				225	* @nr_pages: number of pages from start to pin
				226	* @write: whether pages will be written to
				227	* @pages: array that receives pointers to the pages pinned.
				228	* Should be at least nr_pages long.
				229	*
				230	* Attempt to pin user pages in memory without taking mm->mmap_sem.
				231	* If not successful, it will fall back to taking the lock and
				232	* calling get_user_pages().
				233	*
				234	* Returns number of pages pinned. This may be fewer than the number
				235	* requested. If nr_pages is 0 or negative, returns 0. If no pages
				236	* were pinned, returns -errno.
				237	*/
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	238	int get_user_pages_fast(unsigned long start, int nr_pages, int write,
				239	struct page **pages)
				240	{
				241	struct mm_struct *mm = current->mm;
Linus Torvalds	9b79022	2008-07-28 17:54:21 -0700	[diff] [blame]	242	unsigned long addr, len, end;
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	243	unsigned long next;
				244	pgd_t *pgdp;
				245	int nr = 0;
				246
Linus Torvalds	9b79022	2008-07-28 17:54:21 -0700	[diff] [blame]	247	start &= PAGE_MASK;
				248	addr = start;
				249	len = (unsigned long) nr_pages << PAGE_SHIFT;
				250	end = start + len;
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	251	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
Harvey Harrison	9352f56	2008-10-28 23:05:22 -0700	[diff] [blame]	252	(void __user *)start, len)))
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	253	goto slow_irqon;
				254
				255	/*
				256	* XXX: batch / limit 'nr', to avoid large irq off latency
				257	* needs some instrumenting to determine the common sizes used by
				258	* important workloads (eg. DB2), and whether limiting the batch size
				259	* will decrease performance.
				260	*
				261	* It seems like we're in the clear for the moment. Direct-IO is
				262	* the main guy that batches up lots of get_user_pages, and even
				263	* they are limited to 64-at-a-time which is not so many.
				264	*/
				265	/*
				266	* This doesn't prevent pagetable teardown, but does prevent
				267	* the pagetables and pages from being freed on x86.
				268	*
				269	* So long as we atomically load page table pointers versus teardown
				270	* (which we do on x86, with the above PAE exception), we can follow the
				271	* address down to the the page and take a ref on it.
				272	*/
				273	local_irq_disable();
				274	pgdp = pgd_offset(mm, addr);
				275	do {
				276	pgd_t pgd = *pgdp;
				277
				278	next = pgd_addr_end(addr, end);
				279	if (pgd_none(pgd))
				280	goto slow;
				281	if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
				282	goto slow;
				283	} while (pgdp++, addr = next, addr != end);
				284	local_irq_enable();
				285
				286	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
				287	return nr;
				288
				289	{
				290	int ret;
				291
				292	slow:
				293	local_irq_enable();
				294	slow_irqon:
				295	/* Try to get the remaining pages with get_user_pages */
				296	start += nr << PAGE_SHIFT;
				297	pages += nr;
				298
				299	down_read(&mm->mmap_sem);
				300	ret = get_user_pages(current, mm, start,
				301	(end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
				302	up_read(&mm->mmap_sem);
				303
				304	/* Have to be a bit careful with return values */
				305	if (nr > 0) {
				306	if (ret < 0)
				307	ret = nr;
				308	else
				309	ret += nr;
				310	}
				311
				312	return ret;
				313	}
				314	}