Blame - arch/x86/mm/gup.c - kernel/msm-4.9

blob: 2f0a32945cda32a4c21ce0c13c9c597bfd9ce035 [file] [log] [blame]

Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	1	/*
				2	* Lockless get_user_pages_fast for x86
				3	*
				4	* Copyright (C) 2008 Nick Piggin
				5	* Copyright (C) 2008 Novell Inc.
				6	*/
				7	#include <linux/sched.h>
				8	#include <linux/mm.h>
				9	#include <linux/vmstat.h>
				10	#include <linux/highmem.h>
Andrea Arcangeli	8ee5382	2011-01-13 15:47:10 -0800	[diff] [blame]	11	#include <linux/swap.h>
Dan Williams	3565fce	2016-01-15 16:56:55 -0800	[diff] [blame]	12	#include <linux/memremap.h>
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	13
				14	#include <asm/pgtable.h>
				15
				16	static inline pte_t gup_get_pte(pte_t *ptep)
				17	{
				18	#ifndef CONFIG_X86_PAE
Christian Borntraeger	14cf3d9	2014-11-21 16:29:40 +0100	[diff] [blame]	19	return READ_ONCE(*ptep);
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	20	#else
				21	/*
				22	* With get_user_pages_fast, we walk down the pagetables without taking
Andy Shevchenko	ab09809	2010-02-02 14:38:12 -0800	[diff] [blame]	23	* any locks. For this we would like to load the pointers atomically,
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	24	* but that is not possible (without expensive cmpxchg8b) on PAE. What
				25	* we do have is the guarantee that a pte will only either go from not
				26	* present to present, or present to not present or both -- it will not
				27	* switch to a completely different present page without a TLB flush in
				28	* between; something that we are blocking by holding interrupts off.
				29	*
				30	* Setting ptes from not present to present goes:
				31	* ptep->pte_high = h;
				32	* smp_wmb();
				33	* ptep->pte_low = l;
				34	*
				35	* And present to not present goes:
				36	* ptep->pte_low = 0;
				37	* smp_wmb();
				38	* ptep->pte_high = 0;
				39	*
				40	* We must ensure here that the load of pte_low sees l iff pte_high
				41	* sees h. We load pte_high after loading pte_low, which ensures we
				42	* don't see an older value of pte_high. Then we recheck pte_low,
				43	* which ensures that we haven't picked up a changed pte high. We might
				44	* have got rubbish values from pte_low and pte_high, but we are
				45	* guaranteed that pte_low will not have the present bit set unless
				46	* it is 'l'. And get_user_pages_fast only operates on present ptes, so
				47	* we're safe.
				48	*
				49	* gup_get_pte should not be used or copied outside gup.c without being
				50	* very careful -- it does not atomically load the pte or anything that
				51	* is likely to be useful for you.
				52	*/
				53	pte_t pte;
				54
				55	retry:
				56	pte.pte_low = ptep->pte_low;
				57	smp_rmb();
				58	pte.pte_high = ptep->pte_high;
				59	smp_rmb();
				60	if (unlikely(pte.pte_low != ptep->pte_low))
				61	goto retry;
				62
				63	return pte;
				64	#endif
				65	}
				66
/*
 * Roll back the pins recorded in @pages since the counter was @nr_start.
 *
 * @nr:       in/out count of entries in @pages; decremented back down to
 *            @nr_start.
 * @nr_start: value *@nr had before the pins being undone were taken.
 * @pages:    array of pinned pages, indexed by the running count.
 *
 * Entries are released newest-first: each one has its referenced flag
 * cleared and its reference dropped.
 */
static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
{
	while ((*nr) - nr_start) {
		struct page *page = pages[--(*nr)];

		ClearPageReferenced(page);
		put_page(page);
	}
}
				76
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	77	/*
Dave Hansen	1874f68	2016-02-12 13:02:18 -0800	[diff] [blame^]	78	* 'pteval' can come from a pte, pmd or pud. We only check
				79	* _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
				80	* same value on all 3 types.
				81	*/
				82	static inline int pte_allows_gup(unsigned long pteval, int write)
				83	{
				84	unsigned long need_pte_bits = _PAGE_PRESENT\|_PAGE_USER;
				85
				86	if (write)
				87	need_pte_bits \|= _PAGE_RW;
				88
				89	if ((pteval & need_pte_bits) != need_pte_bits)
				90	return 0;
				91
				92	return 1;
				93	}
				94
				95	/*
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	96	* The performance critical leaf functions are made noinline otherwise gcc
				97	* inlines everything into a single function which results in too much
				98	* register pressure.
				99	*/
				100	static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
				101	unsigned long end, int write, struct page *pages, int nr)
				102	{
Dan Williams	3565fce	2016-01-15 16:56:55 -0800	[diff] [blame]	103	struct dev_pagemap *pgmap = NULL;
Dan Williams	3565fce	2016-01-15 16:56:55 -0800	[diff] [blame]	104	int nr_start = *nr;
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	105	pte_t *ptep;
				106
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	107	ptep = pte_offset_map(&pmd, addr);
				108	do {
				109	pte_t pte = gup_get_pte(ptep);
				110	struct page *page;
				111
Mel Gorman	2b4847e	2013-12-18 17:08:32 -0800	[diff] [blame]	112	/* Similar to the PMD case, NUMA hinting must take slow path */
Mel Gorman	8a0516e	2015-02-12 14:58:22 -0800	[diff] [blame]	113	if (pte_protnone(pte)) {
Mel Gorman	2b4847e	2013-12-18 17:08:32 -0800	[diff] [blame]	114	pte_unmap(ptep);
				115	return 0;
				116	}
				117
Dan Williams	3565fce	2016-01-15 16:56:55 -0800	[diff] [blame]	118	page = pte_page(pte);
				119	if (pte_devmap(pte)) {
				120	pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
				121	if (unlikely(!pgmap)) {
				122	undo_dev_pagemap(nr, nr_start, pages);
				123	pte_unmap(ptep);
				124	return 0;
				125	}
Dave Hansen	1874f68	2016-02-12 13:02:18 -0800	[diff] [blame^]	126	} else if (!pte_allows_gup(pte_val(pte), write) \|\|
				127	pte_special(pte)) {
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	128	pte_unmap(ptep);
				129	return 0;
				130	}
				131	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	132	get_page(page);
Dan Williams	3565fce	2016-01-15 16:56:55 -0800	[diff] [blame]	133	put_dev_pagemap(pgmap);
Andrea Arcangeli	8ee5382	2011-01-13 15:47:10 -0800	[diff] [blame]	134	SetPageReferenced(page);
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	135	pages[*nr] = page;
				136	(*nr)++;
				137
				138	} while (ptep++, addr += PAGE_SIZE, addr != end);
				139	pte_unmap(ptep - 1);
				140
				141	return 1;
				142	}
				143
				144	static inline void get_head_page_multiple(struct page *page, int nr)
				145	{
Sasha Levin	309381fea	2014-01-23 15:52:54 -0800	[diff] [blame]	146	VM_BUG_ON_PAGE(page != compound_head(page), page);
				147	VM_BUG_ON_PAGE(page_count(page) == 0, page);
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	148	atomic_add(nr, &page->_count);
Andrea Arcangeli	8ee5382	2011-01-13 15:47:10 -0800	[diff] [blame]	149	SetPageReferenced(page);
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	150	}
				151
Dan Williams	3565fce	2016-01-15 16:56:55 -0800	[diff] [blame]	152	static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
				153	unsigned long end, struct page *pages, int nr)
				154	{
				155	int nr_start = *nr;
				156	unsigned long pfn = pmd_pfn(pmd);
				157	struct dev_pagemap *pgmap = NULL;
				158
				159	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
				160	do {
				161	struct page *page = pfn_to_page(pfn);
				162
				163	pgmap = get_dev_pagemap(pfn, pgmap);
				164	if (unlikely(!pgmap)) {
				165	undo_dev_pagemap(nr, nr_start, pages);
				166	return 0;
				167	}
				168	SetPageReferenced(page);
				169	pages[*nr] = page;
				170	get_page(page);
				171	put_dev_pagemap(pgmap);
				172	(*nr)++;
				173	pfn++;
				174	} while (addr += PAGE_SIZE, addr != end);
				175	return 1;
				176	}
				177
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	178	static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
				179	unsigned long end, int write, struct page *pages, int nr)
				180	{
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	181	struct page head, page;
				182	int refs;
				183
Dave Hansen	1874f68	2016-02-12 13:02:18 -0800	[diff] [blame^]	184	if (!pte_allows_gup(pmd_val(pmd), write))
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	185	return 0;
Dan Williams	3565fce	2016-01-15 16:56:55 -0800	[diff] [blame]	186
				187	VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
				188	if (pmd_devmap(pmd))
				189	return __gup_device_huge_pmd(pmd, addr, end, pages, nr);
				190
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	191	/* hugepages are never "special" */
Toshi Kani	daf3e35	2015-09-17 12:24:21 -0600	[diff] [blame]	192	VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL);
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	193
				194	refs = 0;
Toshi Kani	daf3e35	2015-09-17 12:24:21 -0600	[diff] [blame]	195	head = pmd_page(pmd);
Nick Piggin	652ea69	2008-07-25 19:45:27 -0700	[diff] [blame]	196	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	197	do {
Sasha Levin	309381fea	2014-01-23 15:52:54 -0800	[diff] [blame]	198	VM_BUG_ON_PAGE(compound_head(page) != head, page);
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	199	pages[*nr] = page;
				200	(*nr)++;
				201	page++;
				202	refs++;
				203	} while (addr += PAGE_SIZE, addr != end);
				204	get_head_page_multiple(head, refs);
				205
				206	return 1;
				207	}
				208
				209	static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
				210	int write, struct page *pages, int nr)
				211	{
				212	unsigned long next;
				213	pmd_t *pmdp;
				214
				215	pmdp = pmd_offset(&pud, addr);
				216	do {
				217	pmd_t pmd = *pmdp;
				218
				219	next = pmd_addr_end(addr, end);
Kirill A. Shutemov	1f19617	2016-01-15 16:53:35 -0800	[diff] [blame]	220	if (pmd_none(pmd))
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	221	return 0;
Naoya Horiguchi	cbef847	2015-02-11 15:25:19 -0800	[diff] [blame]	222	if (unlikely(pmd_large(pmd) \|\| !pmd_present(pmd))) {
Mel Gorman	2b4847e	2013-12-18 17:08:32 -0800	[diff] [blame]	223	/*
				224	* NUMA hinting faults need to be handled in the GUP
				225	* slowpath for accounting purposes and so that they
				226	* can be serialised against THP migration.
				227	*/
Mel Gorman	8a0516e	2015-02-12 14:58:22 -0800	[diff] [blame]	228	if (pmd_protnone(pmd))
Mel Gorman	2b4847e	2013-12-18 17:08:32 -0800	[diff] [blame]	229	return 0;
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	230	if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
				231	return 0;
				232	} else {
				233	if (!gup_pte_range(pmd, addr, next, write, pages, nr))
				234	return 0;
				235	}
				236	} while (pmdp++, addr = next, addr != end);
				237
				238	return 1;
				239	}
				240
Nick Piggin	652ea69	2008-07-25 19:45:27 -0700	[diff] [blame]	241	static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
				242	unsigned long end, int write, struct page *pages, int nr)
				243	{
Nick Piggin	652ea69	2008-07-25 19:45:27 -0700	[diff] [blame]	244	struct page head, page;
				245	int refs;
				246
Dave Hansen	1874f68	2016-02-12 13:02:18 -0800	[diff] [blame^]	247	if (!pte_allows_gup(pud_val(pud), write))
Nick Piggin	652ea69	2008-07-25 19:45:27 -0700	[diff] [blame]	248	return 0;
				249	/* hugepages are never "special" */
Toshi Kani	daf3e35	2015-09-17 12:24:21 -0600	[diff] [blame]	250	VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL);
				251	VM_BUG_ON(!pfn_valid(pud_pfn(pud)));
Nick Piggin	652ea69	2008-07-25 19:45:27 -0700	[diff] [blame]	252
				253	refs = 0;
Toshi Kani	daf3e35	2015-09-17 12:24:21 -0600	[diff] [blame]	254	head = pud_page(pud);
Nick Piggin	652ea69	2008-07-25 19:45:27 -0700	[diff] [blame]	255	page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
				256	do {
Sasha Levin	309381fea	2014-01-23 15:52:54 -0800	[diff] [blame]	257	VM_BUG_ON_PAGE(compound_head(page) != head, page);
Nick Piggin	652ea69	2008-07-25 19:45:27 -0700	[diff] [blame]	258	pages[*nr] = page;
				259	(*nr)++;
				260	page++;
				261	refs++;
				262	} while (addr += PAGE_SIZE, addr != end);
				263	get_head_page_multiple(head, refs);
				264
				265	return 1;
				266	}
				267
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	268	static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
				269	int write, struct page *pages, int nr)
				270	{
				271	unsigned long next;
				272	pud_t *pudp;
				273
				274	pudp = pud_offset(&pgd, addr);
				275	do {
				276	pud_t pud = *pudp;
				277
				278	next = pud_addr_end(addr, end);
				279	if (pud_none(pud))
				280	return 0;
Nick Piggin	652ea69	2008-07-25 19:45:27 -0700	[diff] [blame]	281	if (unlikely(pud_large(pud))) {
				282	if (!gup_huge_pud(pud, addr, next, write, pages, nr))
				283	return 0;
				284	} else {
				285	if (!gup_pmd_range(pud, addr, next, write, pages, nr))
				286	return 0;
				287	}
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	288	} while (pudp++, addr = next, addr != end);
				289
				290	return 1;
				291	}
				292
Peter Zijlstra	465a454	2009-06-15 12:31:37 +0200	[diff] [blame]	293	/*
				294	* Like get_user_pages_fast() except its IRQ-safe in that it won't fall
				295	* back to the regular GUP.
				296	*/
				297	int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
				298	struct page **pages)
				299	{
				300	struct mm_struct *mm = current->mm;
				301	unsigned long addr, len, end;
				302	unsigned long next;
				303	unsigned long flags;
				304	pgd_t *pgdp;
				305	int nr = 0;
				306
				307	start &= PAGE_MASK;
				308	addr = start;
				309	len = (unsigned long) nr_pages << PAGE_SHIFT;
				310	end = start + len;
				311	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
				312	(void __user *)start, len)))
				313	return 0;
				314
				315	/*
				316	* XXX: batch / limit 'nr', to avoid large irq off latency
				317	* needs some instrumenting to determine the common sizes used by
				318	* important workloads (eg. DB2), and whether limiting the batch size
				319	* will decrease performance.
				320	*
				321	* It seems like we're in the clear for the moment. Direct-IO is
				322	* the main guy that batches up lots of get_user_pages, and even
				323	* they are limited to 64-at-a-time which is not so many.
				324	*/
				325	/*
				326	* This doesn't prevent pagetable teardown, but does prevent
				327	* the pagetables and pages from being freed on x86.
				328	*
				329	* So long as we atomically load page table pointers versus teardown
				330	* (which we do on x86, with the above PAE exception), we can follow the
				331	* address down to the the page and take a ref on it.
				332	*/
				333	local_irq_save(flags);
				334	pgdp = pgd_offset(mm, addr);
				335	do {
				336	pgd_t pgd = *pgdp;
				337
				338	next = pgd_addr_end(addr, end);
				339	if (pgd_none(pgd))
				340	break;
				341	if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
				342	break;
				343	} while (pgdp++, addr = next, addr != end);
				344	local_irq_restore(flags);
				345
				346	return nr;
				347	}
				348
Andy Grover	a0d22f4	2009-04-09 16:45:29 -0700	[diff] [blame]	349	/**
				350	* get_user_pages_fast() - pin user pages in memory
				351	* @start: starting user address
				352	* @nr_pages: number of pages from start to pin
				353	* @write: whether pages will be written to
				354	* @pages: array that receives pointers to the pages pinned.
				355	* Should be at least nr_pages long.
				356	*
				357	* Attempt to pin user pages in memory without taking mm->mmap_sem.
				358	* If not successful, it will fall back to taking the lock and
				359	* calling get_user_pages().
				360	*
				361	* Returns number of pages pinned. This may be fewer than the number
				362	* requested. If nr_pages is 0 or negative, returns 0. If no pages
				363	* were pinned, returns -errno.
				364	*/
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	365	int get_user_pages_fast(unsigned long start, int nr_pages, int write,
				366	struct page **pages)
				367	{
				368	struct mm_struct *mm = current->mm;
Linus Torvalds	9b79022	2008-07-28 17:54:21 -0700	[diff] [blame]	369	unsigned long addr, len, end;
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	370	unsigned long next;
				371	pgd_t *pgdp;
				372	int nr = 0;
				373
Linus Torvalds	9b79022	2008-07-28 17:54:21 -0700	[diff] [blame]	374	start &= PAGE_MASK;
				375	addr = start;
				376	len = (unsigned long) nr_pages << PAGE_SHIFT;
Linus Torvalds	7f81890	2009-06-20 09:52:27 -0700	[diff] [blame]	377
Linus Torvalds	9b79022	2008-07-28 17:54:21 -0700	[diff] [blame]	378	end = start + len;
Linus Torvalds	7f81890	2009-06-20 09:52:27 -0700	[diff] [blame]	379	if (end < start)
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	380	goto slow_irqon;
				381
Linus Torvalds	7f81890	2009-06-20 09:52:27 -0700	[diff] [blame]	382	#ifdef CONFIG_X86_64
				383	if (end >> __VIRTUAL_MASK_SHIFT)
				384	goto slow_irqon;
				385	#endif
				386
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	387	/*
				388	* XXX: batch / limit 'nr', to avoid large irq off latency
				389	* needs some instrumenting to determine the common sizes used by
				390	* important workloads (eg. DB2), and whether limiting the batch size
				391	* will decrease performance.
				392	*
				393	* It seems like we're in the clear for the moment. Direct-IO is
				394	* the main guy that batches up lots of get_user_pages, and even
				395	* they are limited to 64-at-a-time which is not so many.
				396	*/
				397	/*
				398	* This doesn't prevent pagetable teardown, but does prevent
				399	* the pagetables and pages from being freed on x86.
				400	*
				401	* So long as we atomically load page table pointers versus teardown
				402	* (which we do on x86, with the above PAE exception), we can follow the
				403	* address down to the the page and take a ref on it.
				404	*/
				405	local_irq_disable();
				406	pgdp = pgd_offset(mm, addr);
				407	do {
				408	pgd_t pgd = *pgdp;
				409
				410	next = pgd_addr_end(addr, end);
				411	if (pgd_none(pgd))
				412	goto slow;
				413	if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
				414	goto slow;
				415	} while (pgdp++, addr = next, addr != end);
				416	local_irq_enable();
				417
				418	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
				419	return nr;
				420
				421	{
				422	int ret;
				423
				424	slow:
				425	local_irq_enable();
				426	slow_irqon:
				427	/* Try to get the remaining pages with get_user_pages */
				428	start += nr << PAGE_SHIFT;
				429	pages += nr;
				430
Dave Hansen	d4edcf0	2016-02-12 13:01:56 -0800	[diff] [blame]	431	ret = get_user_pages_unlocked(start,
Andrea Arcangeli	a7b7807	2015-02-11 15:27:23 -0800	[diff] [blame]	432	(end - start) >> PAGE_SHIFT,
				433	write, 0, pages);
Nick Piggin	8174c43	2008-07-25 19:45:24 -0700	[diff] [blame]	434
				435	/* Have to be a bit careful with return values */
				436	if (nr > 0) {
				437	if (ret < 0)
				438	ret = nr;
				439	else
				440	ret += nr;
				441	}
				442
				443	return ret;
				444	}
				445	}