Blame - mm/gup.c - kernel/msm-4.9

blob: 91d044b1600dd6b216decb62ca9f1bd7ef152c52 [file] [log] [blame]

Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	1	#include <linux/kernel.h>
				2	#include <linux/errno.h>
				3	#include <linux/err.h>
				4	#include <linux/spinlock.h>
				5
				6	#include <linux/hugetlb.h>
				7	#include <linux/mm.h>
				8	#include <linux/pagemap.h>
				9	#include <linux/rmap.h>
				10	#include <linux/swap.h>
				11	#include <linux/swapops.h>
				12
				13	#include "internal.h"
				14
Kirill A. Shutemov	69e68b4	2014-06-04 16:08:11 -0700	[diff] [blame]	15	static struct page no_page_table(struct vm_area_struct vma,
				16	unsigned int flags)
Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	17	{
Kirill A. Shutemov	69e68b4	2014-06-04 16:08:11 -0700	[diff] [blame]	18	/*
				19	* When core dumping an enormous anonymous area that nobody
				20	* has touched so far, we don't want to allocate unnecessary pages or
				21	* page tables. Return error instead of NULL to skip handle_mm_fault,
				22	* then get_dump_page() will return NULL to leave a hole in the dump.
				23	* But we can only make this optimization where a hole would surely
				24	* be zero-filled if handle_mm_fault() actually did handle it.
				25	*/
				26	if ((flags & FOLL_DUMP) && (!vma->vm_ops \|\| !vma->vm_ops->fault))
				27	return ERR_PTR(-EFAULT);
				28	return NULL;
				29	}
				30
				31	static struct page follow_page_pte(struct vm_area_struct vma,
				32	unsigned long address, pmd_t *pmd, unsigned int flags)
				33	{
Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	34	struct mm_struct *mm = vma->vm_mm;
Kirill A. Shutemov	69e68b4	2014-06-04 16:08:11 -0700	[diff] [blame]	35	struct page *page;
				36	spinlock_t *ptl;
				37	pte_t *ptep, pte;
Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	38
Kirill A. Shutemov	69e68b4	2014-06-04 16:08:11 -0700	[diff] [blame]	39	retry:
Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	40	if (unlikely(pmd_bad(*pmd)))
Kirill A. Shutemov	69e68b4	2014-06-04 16:08:11 -0700	[diff] [blame]	41	return no_page_table(vma, flags);
Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	42
				43	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	44	pte = *ptep;
				45	if (!pte_present(pte)) {
				46	swp_entry_t entry;
				47	/*
				48	* KSM's break_ksm() relies upon recognizing a ksm page
				49	* even while it is being migrated, so for that case we
				50	* need migration_entry_wait().
				51	*/
				52	if (likely(!(flags & FOLL_MIGRATION)))
				53	goto no_page;
				54	if (pte_none(pte) \|\| pte_file(pte))
				55	goto no_page;
				56	entry = pte_to_swp_entry(pte);
				57	if (!is_migration_entry(entry))
				58	goto no_page;
				59	pte_unmap_unlock(ptep, ptl);
				60	migration_entry_wait(mm, pmd, address);
Kirill A. Shutemov	69e68b4	2014-06-04 16:08:11 -0700	[diff] [blame]	61	goto retry;
Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	62	}
				63	if ((flags & FOLL_NUMA) && pte_numa(pte))
				64	goto no_page;
Kirill A. Shutemov	69e68b4	2014-06-04 16:08:11 -0700	[diff] [blame]	65	if ((flags & FOLL_WRITE) && !pte_write(pte)) {
				66	pte_unmap_unlock(ptep, ptl);
				67	return NULL;
				68	}
Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	69
				70	page = vm_normal_page(vma, address, pte);
				71	if (unlikely(!page)) {
				72	if ((flags & FOLL_DUMP) \|\|
				73	!is_zero_pfn(pte_pfn(pte)))
				74	goto bad_page;
				75	page = pte_page(pte);
				76	}
				77
				78	if (flags & FOLL_GET)
				79	get_page_foll(page);
				80	if (flags & FOLL_TOUCH) {
				81	if ((flags & FOLL_WRITE) &&
				82	!pte_dirty(pte) && !PageDirty(page))
				83	set_page_dirty(page);
				84	/*
				85	* pte_mkyoung() would be more correct here, but atomic care
				86	* is needed to avoid losing the dirty bit: it is easier to use
				87	* mark_page_accessed().
				88	*/
				89	mark_page_accessed(page);
				90	}
				91	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
				92	/*
				93	* The preliminary mapping check is mainly to avoid the
				94	* pointless overhead of lock_page on the ZERO_PAGE
				95	* which might bounce very badly if there is contention.
				96	*
				97	* If the page is already locked, we don't need to
				98	* handle it now - vmscan will handle it later if and
				99	* when it attempts to reclaim the page.
				100	*/
				101	if (page->mapping && trylock_page(page)) {
				102	lru_add_drain(); /* push cached pages to LRU */
				103	/*
				104	* Because we lock page here, and migration is
				105	* blocked by the pte's page reference, and we
				106	* know the page is still mapped, we don't even
				107	* need to check for file-cache page truncation.
				108	*/
				109	mlock_vma_page(page);
				110	unlock_page(page);
				111	}
				112	}
Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	113	pte_unmap_unlock(ptep, ptl);
Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	114	return page;
Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	115	bad_page:
				116	pte_unmap_unlock(ptep, ptl);
				117	return ERR_PTR(-EFAULT);
				118
				119	no_page:
				120	pte_unmap_unlock(ptep, ptl);
				121	if (!pte_none(pte))
Kirill A. Shutemov	69e68b4	2014-06-04 16:08:11 -0700	[diff] [blame]	122	return NULL;
				123	return no_page_table(vma, flags);
				124	}
Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	125
Kirill A. Shutemov	69e68b4	2014-06-04 16:08:11 -0700	[diff] [blame]	126	/**
				127	* follow_page_mask - look up a page descriptor from a user-virtual address
				128	* @vma: vm_area_struct mapping @address
				129	* @address: virtual address to look up
				130	* @flags: flags modifying lookup behaviour
				131	* @page_mask: on output, *page_mask is set according to the size of the page
				132	*
				133	* @flags can have FOLL_ flags set, defined in <linux/mm.h>
				134	*
				135	* Returns the mapped (struct page *), %NULL if no mapping exists, or
				136	* an error pointer if there is a mapping to something not represented
				137	* by a page descriptor (see also vm_normal_page()).
				138	*/
				139	struct page follow_page_mask(struct vm_area_struct vma,
				140	unsigned long address, unsigned int flags,
				141	unsigned int *page_mask)
				142	{
				143	pgd_t *pgd;
				144	pud_t *pud;
				145	pmd_t *pmd;
				146	spinlock_t *ptl;
				147	struct page *page;
				148	struct mm_struct *mm = vma->vm_mm;
				149
				150	*page_mask = 0;
				151
				152	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
				153	if (!IS_ERR(page)) {
				154	BUG_ON(flags & FOLL_GET);
				155	return page;
				156	}
				157
				158	pgd = pgd_offset(mm, address);
				159	if (pgd_none(pgd) \|\| unlikely(pgd_bad(pgd)))
				160	return no_page_table(vma, flags);
				161
				162	pud = pud_offset(pgd, address);
				163	if (pud_none(*pud))
				164	return no_page_table(vma, flags);
				165	if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
				166	if (flags & FOLL_GET)
				167	return NULL;
				168	page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
				169	return page;
				170	}
				171	if (unlikely(pud_bad(*pud)))
				172	return no_page_table(vma, flags);
				173
				174	pmd = pmd_offset(pud, address);
				175	if (pmd_none(*pmd))
				176	return no_page_table(vma, flags);
				177	if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
				178	page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
				179	if (flags & FOLL_GET) {
				180	/*
				181	* Refcount on tail pages are not well-defined and
				182	* shouldn't be taken. The caller should handle a NULL
				183	* return when trying to follow tail pages.
				184	*/
				185	if (PageHead(page))
				186	get_page(page);
				187	else
				188	page = NULL;
				189	}
				190	return page;
				191	}
				192	if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
				193	return no_page_table(vma, flags);
				194	if (pmd_trans_huge(*pmd)) {
				195	if (flags & FOLL_SPLIT) {
				196	split_huge_page_pmd(vma, address, pmd);
				197	return follow_page_pte(vma, address, pmd, flags);
				198	}
				199	ptl = pmd_lock(mm, pmd);
				200	if (likely(pmd_trans_huge(*pmd))) {
				201	if (unlikely(pmd_trans_splitting(*pmd))) {
				202	spin_unlock(ptl);
				203	wait_split_huge_page(vma->anon_vma, pmd);
				204	} else {
				205	page = follow_trans_huge_pmd(vma, address,
				206	pmd, flags);
				207	spin_unlock(ptl);
				208	*page_mask = HPAGE_PMD_NR - 1;
				209	return page;
				210	}
				211	} else
				212	spin_unlock(ptl);
				213	}
				214	return follow_page_pte(vma, address, pmd, flags);
Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	215	}
				216
Kirill A. Shutemov	f2b495c	2014-06-04 16:08:11 -0700	[diff] [blame]	217	static int get_gate_page(struct mm_struct *mm, unsigned long address,
				218	unsigned int gup_flags, struct vm_area_struct **vma,
				219	struct page **page)
				220	{
				221	pgd_t *pgd;
				222	pud_t *pud;
				223	pmd_t *pmd;
				224	pte_t *pte;
				225	int ret = -EFAULT;
				226
				227	/* user gate pages are read-only */
				228	if (gup_flags & FOLL_WRITE)
				229	return -EFAULT;
				230	if (address > TASK_SIZE)
				231	pgd = pgd_offset_k(address);
				232	else
				233	pgd = pgd_offset_gate(mm, address);
				234	BUG_ON(pgd_none(*pgd));
				235	pud = pud_offset(pgd, address);
				236	BUG_ON(pud_none(*pud));
				237	pmd = pmd_offset(pud, address);
				238	if (pmd_none(*pmd))
				239	return -EFAULT;
				240	VM_BUG_ON(pmd_trans_huge(*pmd));
				241	pte = pte_offset_map(pmd, address);
				242	if (pte_none(*pte))
				243	goto unmap;
				244	*vma = get_gate_vma(mm);
				245	if (!page)
				246	goto out;
				247	page = vm_normal_page(vma, address, *pte);
				248	if (!*page) {
				249	if ((gup_flags & FOLL_DUMP) \|\| !is_zero_pfn(pte_pfn(*pte)))
				250	goto unmap;
				251	page = pte_page(pte);
				252	}
				253	get_page(*page);
				254	out:
				255	ret = 0;
				256	unmap:
				257	pte_unmap(pte);
				258	return ret;
				259	}
				260
Paul Cassella	9a95f3c	2014-08-06 16:07:24 -0700	[diff] [blame]	261	/*
				262	* mmap_sem must be held on entry. If @nonblocking != NULL and
				263	* *@flags does not include FOLL_NOWAIT, the mmap_sem may be released.
				264	* If it is, *@nonblocking will be set to 0 and -EBUSY returned.
				265	*/
Kirill A. Shutemov	1674448	2014-06-04 16:08:12 -0700	[diff] [blame]	266	static int faultin_page(struct task_struct tsk, struct vm_area_struct vma,
				267	unsigned long address, unsigned int flags, int nonblocking)
				268	{
				269	struct mm_struct *mm = vma->vm_mm;
				270	unsigned int fault_flags = 0;
				271	int ret;
				272
				273	/* For mlock, just skip the stack guard page. */
				274	if ((*flags & FOLL_MLOCK) &&
				275	(stack_guard_page_start(vma, address) \|\|
				276	stack_guard_page_end(vma, address + PAGE_SIZE)))
				277	return -ENOENT;
				278	if (*flags & FOLL_WRITE)
				279	fault_flags \|= FAULT_FLAG_WRITE;
				280	if (nonblocking)
				281	fault_flags \|= FAULT_FLAG_ALLOW_RETRY;
				282	if (*flags & FOLL_NOWAIT)
				283	fault_flags \|= FAULT_FLAG_ALLOW_RETRY \| FAULT_FLAG_RETRY_NOWAIT;
				284
				285	ret = handle_mm_fault(mm, vma, address, fault_flags);
				286	if (ret & VM_FAULT_ERROR) {
				287	if (ret & VM_FAULT_OOM)
				288	return -ENOMEM;
				289	if (ret & (VM_FAULT_HWPOISON \| VM_FAULT_HWPOISON_LARGE))
				290	return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT;
				291	if (ret & VM_FAULT_SIGBUS)
				292	return -EFAULT;
				293	BUG();
				294	}
				295
				296	if (tsk) {
				297	if (ret & VM_FAULT_MAJOR)
				298	tsk->maj_flt++;
				299	else
				300	tsk->min_flt++;
				301	}
				302
				303	if (ret & VM_FAULT_RETRY) {
				304	if (nonblocking)
				305	*nonblocking = 0;
				306	return -EBUSY;
				307	}
				308
				309	/*
				310	* The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
				311	* necessary, even if maybe_mkwrite decided not to set pte_write. We
				312	* can thus safely do subsequent page lookups as if they were reads.
				313	* But only do so when looping for pte_write is futile: in some cases
				314	* userspace may also be wanting to write to the gotten user page,
				315	* which a read fault here might prevent (a readonly page might get
				316	* reCOWed by userspace write).
				317	*/
				318	if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
				319	*flags &= ~FOLL_WRITE;
				320	return 0;
				321	}
				322
Kirill A. Shutemov	fa5bb20	2014-06-04 16:08:13 -0700	[diff] [blame]	323	static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
				324	{
				325	vm_flags_t vm_flags = vma->vm_flags;
				326
				327	if (vm_flags & (VM_IO \| VM_PFNMAP))
				328	return -EFAULT;
				329
				330	if (gup_flags & FOLL_WRITE) {
				331	if (!(vm_flags & VM_WRITE)) {
				332	if (!(gup_flags & FOLL_FORCE))
				333	return -EFAULT;
				334	/*
				335	* We used to let the write,force case do COW in a
				336	* VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
				337	* set a breakpoint in a read-only mapping of an
				338	* executable, without corrupting the file (yet only
				339	* when that file had been opened for writing!).
				340	* Anon pages in shared mappings are surprising: now
				341	* just reject it.
				342	*/
				343	if (!is_cow_mapping(vm_flags)) {
				344	WARN_ON_ONCE(vm_flags & VM_MAYWRITE);
				345	return -EFAULT;
				346	}
				347	}
				348	} else if (!(vm_flags & VM_READ)) {
				349	if (!(gup_flags & FOLL_FORCE))
				350	return -EFAULT;
				351	/*
				352	* Is there actually any vma we can reach here which does not
				353	* have VM_MAYREAD set?
				354	*/
				355	if (!(vm_flags & VM_MAYREAD))
				356	return -EFAULT;
				357	}
				358	return 0;
				359	}
				360
Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	361	/**
				362	* __get_user_pages() - pin user pages in memory
				363	* @tsk: task_struct of target task
				364	* @mm: mm_struct of target mm
				365	* @start: starting user address
				366	* @nr_pages: number of pages from start to pin
				367	* @gup_flags: flags modifying pin behaviour
				368	* @pages: array that receives pointers to the pages pinned.
				369	* Should be at least nr_pages long. Or NULL, if caller
				370	* only intends to ensure the pages are faulted in.
				371	* @vmas: array of pointers to vmas corresponding to each page.
				372	* Or NULL if the caller does not require them.
				373	* @nonblocking: whether waiting for disk IO or mmap_sem contention
				374	*
				375	* Returns number of pages pinned. This may be fewer than the number
				376	* requested. If nr_pages is 0 or negative, returns 0. If no pages
				377	* were pinned, returns -errno. Each page returned must be released
				378	* with a put_page() call when it is finished with. vmas will only
				379	* remain valid while mmap_sem is held.
				380	*
Paul Cassella	9a95f3c	2014-08-06 16:07:24 -0700	[diff] [blame]	381	* Must be called with mmap_sem held. It may be released. See below.
Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	382	*
				383	* __get_user_pages walks a process's page tables and takes a reference to
				384	* each struct page that each user address corresponds to at a given
				385	* instant. That is, it takes the page that would be accessed if a user
				386	* thread accesses the given user virtual address at that instant.
				387	*
				388	* This does not guarantee that the page exists in the user mappings when
				389	* __get_user_pages returns, and there may even be a completely different
				390	* page there in some cases (eg. if mmapped pagecache has been invalidated
				391	* and subsequently re faulted). However it does guarantee that the page
				392	* won't be freed completely. And mostly callers simply care that the page
				393	* contains data that was valid at some point in time. Typically, an IO
				394	* or similar operation cannot guarantee anything stronger anyway because
				395	* locks can't be held over the syscall boundary.
				396	*
				397	* If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
				398	* the page is written to, set_page_dirty (or set_page_dirty_lock, as
				399	* appropriate) must be called after the page is finished with, and
				400	* before put_page is called.
				401	*
				402	* If @nonblocking != NULL, __get_user_pages will not wait for disk IO
				403	* or mmap_sem contention, and if waiting is needed to pin all pages,
Paul Cassella	9a95f3c	2014-08-06 16:07:24 -0700	[diff] [blame]	404	* *@nonblocking will be set to 0. Further, if @gup_flags does not
				405	* include FOLL_NOWAIT, the mmap_sem will be released via up_read() in
				406	* this case.
				407	*
				408	* A caller using such a combination of @nonblocking and @gup_flags
				409	* must therefore hold the mmap_sem for reading only, and recognize
				410	* when it's been released. Otherwise, it must be held for either
				411	* reading or writing and will not be released.
Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	412	*
				413	* In most cases, get_user_pages or get_user_pages_fast should be used
				414	* instead of __get_user_pages. __get_user_pages should be used only if
				415	* you need some special @gup_flags.
				416	*/
				417	long __get_user_pages(struct task_struct tsk, struct mm_struct mm,
				418	unsigned long start, unsigned long nr_pages,
				419	unsigned int gup_flags, struct page **pages,
				420	struct vm_area_struct *vmas, int nonblocking)
				421	{
Kirill A. Shutemov	fa5bb20	2014-06-04 16:08:13 -0700	[diff] [blame]	422	long i = 0;
Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	423	unsigned int page_mask;
Kirill A. Shutemov	fa5bb20	2014-06-04 16:08:13 -0700	[diff] [blame]	424	struct vm_area_struct *vma = NULL;
Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	425
				426	if (!nr_pages)
				427	return 0;
				428
				429	VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
				430
				431	/*
				432	* If FOLL_FORCE is set then do not force a full fault as the hinting
				433	* fault information is unrelated to the reference behaviour of a task
				434	* using the address space
				435	*/
				436	if (!(gup_flags & FOLL_FORCE))
				437	gup_flags \|= FOLL_NUMA;
				438
Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	439	do {
Kirill A. Shutemov	fa5bb20	2014-06-04 16:08:13 -0700	[diff] [blame]	440	struct page *page;
				441	unsigned int foll_flags = gup_flags;
				442	unsigned int page_increm;
Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	443
Kirill A. Shutemov	fa5bb20	2014-06-04 16:08:13 -0700	[diff] [blame]	444	/* first iteration or cross vma bound */
				445	if (!vma \|\| start >= vma->vm_end) {
				446	vma = find_extend_vma(mm, start);
				447	if (!vma && in_gate_area(mm, start)) {
Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	448	int ret;
Kirill A. Shutemov	fa5bb20	2014-06-04 16:08:13 -0700	[diff] [blame]	449	ret = get_gate_page(mm, start & PAGE_MASK,
				450	gup_flags, &vma,
				451	pages ? &pages[i] : NULL);
				452	if (ret)
				453	return i ? : ret;
				454	page_mask = 0;
				455	goto next_page;
Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	456	}
Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	457
Kirill A. Shutemov	fa5bb20	2014-06-04 16:08:13 -0700	[diff] [blame]	458	if (!vma \|\| check_vma_flags(vma, gup_flags))
				459	return i ? : -EFAULT;
				460	if (is_vm_hugetlb_page(vma)) {
				461	i = follow_hugetlb_page(mm, vma, pages, vmas,
				462	&start, &nr_pages, i,
				463	gup_flags);
				464	continue;
Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	465	}
Kirill A. Shutemov	fa5bb20	2014-06-04 16:08:13 -0700	[diff] [blame]	466	}
				467	retry:
				468	/*
				469	* If we have a pending SIGKILL, don't keep faulting pages and
				470	* potentially allocating memory.
				471	*/
				472	if (unlikely(fatal_signal_pending(current)))
				473	return i ? i : -ERESTARTSYS;
				474	cond_resched();
				475	page = follow_page_mask(vma, start, foll_flags, &page_mask);
				476	if (!page) {
				477	int ret;
				478	ret = faultin_page(tsk, vma, start, &foll_flags,
				479	nonblocking);
				480	switch (ret) {
				481	case 0:
				482	goto retry;
				483	case -EFAULT:
				484	case -ENOMEM:
				485	case -EHWPOISON:
				486	return i ? i : ret;
				487	case -EBUSY:
				488	return i;
				489	case -ENOENT:
				490	goto next_page;
				491	}
				492	BUG();
				493	}
				494	if (IS_ERR(page))
				495	return i ? i : PTR_ERR(page);
				496	if (pages) {
				497	pages[i] = page;
				498	flush_anon_page(vma, page, start);
				499	flush_dcache_page(page);
				500	page_mask = 0;
				501	}
Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	502	next_page:
Kirill A. Shutemov	fa5bb20	2014-06-04 16:08:13 -0700	[diff] [blame]	503	if (vmas) {
				504	vmas[i] = vma;
				505	page_mask = 0;
				506	}
				507	page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
				508	if (page_increm > nr_pages)
				509	page_increm = nr_pages;
				510	i += page_increm;
				511	start += page_increm * PAGE_SIZE;
				512	nr_pages -= page_increm;
Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	513	} while (nr_pages);
				514	return i;
Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	515	}
				516	EXPORT_SYMBOL(__get_user_pages);
				517
				518	/*
				519	* fixup_user_fault() - manually resolve a user page fault
				520	* @tsk: the task_struct to use for page fault accounting, or
				521	* NULL if faults are not to be recorded.
				522	* @mm: mm_struct of target mm
				523	* @address: user address
				524	* @fault_flags:flags to pass down to handle_mm_fault()
				525	*
				526	* This is meant to be called in the specific scenario where for locking reasons
				527	* we try to access user memory in atomic context (within a pagefault_disable()
				528	* section), this returns -EFAULT, and we want to resolve the user fault before
				529	* trying again.
				530	*
				531	* Typically this is meant to be used by the futex code.
				532	*
				533	* The main difference with get_user_pages() is that this function will
				534	* unconditionally call handle_mm_fault() which will in turn perform all the
				535	* necessary SW fixup of the dirty and young bits in the PTE, while
				536	* handle_mm_fault() only guarantees to update these in the struct page.
				537	*
				538	* This is important for some architectures where those bits also gate the
				539	* access permission to the page because they are maintained in software. On
				540	* such architectures, gup() will not be enough to make a subsequent access
				541	* succeed.
				542	*
Paul Cassella	9a95f3c	2014-08-06 16:07:24 -0700	[diff] [blame]	543	* This has the same semantics wrt the @mm->mmap_sem as does filemap_fault().
Kirill A. Shutemov	4bbd4c7	2014-06-04 16:08:10 -0700	[diff] [blame]	544	*/
				545	int fixup_user_fault(struct task_struct tsk, struct mm_struct mm,
				546	unsigned long address, unsigned int fault_flags)
				547	{
				548	struct vm_area_struct *vma;
				549	vm_flags_t vm_flags;
				550	int ret;
				551
				552	vma = find_extend_vma(mm, address);
				553	if (!vma \|\| address < vma->vm_start)
				554	return -EFAULT;
				555
				556	vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ;
				557	if (!(vm_flags & vma->vm_flags))
				558	return -EFAULT;
				559
				560	ret = handle_mm_fault(mm, vma, address, fault_flags);
				561	if (ret & VM_FAULT_ERROR) {
				562	if (ret & VM_FAULT_OOM)
				563	return -ENOMEM;
				564	if (ret & (VM_FAULT_HWPOISON \| VM_FAULT_HWPOISON_LARGE))
				565	return -EHWPOISON;
				566	if (ret & VM_FAULT_SIGBUS)
				567	return -EFAULT;
				568	BUG();
				569	}
				570	if (tsk) {
				571	if (ret & VM_FAULT_MAJOR)
				572	tsk->maj_flt++;
				573	else
				574	tsk->min_flt++;
				575	}
				576	return 0;
				577	}
				578
				579	/*
				580	* get_user_pages() - pin user pages in memory
				581	* @tsk: the task_struct to use for page fault accounting, or
				582	* NULL if faults are not to be recorded.
				583	* @mm: mm_struct of target mm
				584	* @start: starting user address
				585	* @nr_pages: number of pages from start to pin
				586	* @write: whether pages will be written to by the caller
				587	* @force: whether to force access even when user mapping is currently
				588	* protected (but never forces write access to shared mapping).
				589	* @pages: array that receives pointers to the pages pinned.
				590	* Should be at least nr_pages long. Or NULL, if caller
				591	* only intends to ensure the pages are faulted in.
				592	* @vmas: array of pointers to vmas corresponding to each page.
				593	* Or NULL if the caller does not require them.
				594	*
				595	* Returns number of pages pinned. This may be fewer than the number
				596	* requested. If nr_pages is 0 or negative, returns 0. If no pages
				597	* were pinned, returns -errno. Each page returned must be released
				598	* with a put_page() call when it is finished with. vmas will only
				599	* remain valid while mmap_sem is held.
				600	*
				601	* Must be called with mmap_sem held for read or write.
				602	*
				603	* get_user_pages walks a process's page tables and takes a reference to
				604	* each struct page that each user address corresponds to at a given
				605	* instant. That is, it takes the page that would be accessed if a user
				606	* thread accesses the given user virtual address at that instant.
				607	*
				608	* This does not guarantee that the page exists in the user mappings when
				609	* get_user_pages returns, and there may even be a completely different
				610	* page there in some cases (eg. if mmapped pagecache has been invalidated
				611	* and subsequently re faulted). However it does guarantee that the page
				612	* won't be freed completely. And mostly callers simply care that the page
				613	* contains data that was valid at some point in time. Typically, an IO
				614	* or similar operation cannot guarantee anything stronger anyway because
				615	* locks can't be held over the syscall boundary.
				616	*
				617	* If write=0, the page must not be written to. If the page is written to,
				618	* set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
				619	* after the page is finished with, and before put_page is called.
				620	*
				621	* get_user_pages is typically used for fewer-copy IO operations, to get a
				622	* handle on the memory by some means other than accesses via the user virtual
				623	* addresses. The pages may be submitted for DMA to devices or accessed via
				624	* their kernel linear mapping (via the kmap APIs). Care should be taken to
				625	* use the correct cache flushing APIs.
				626	*
				627	* See also get_user_pages_fast, for performance critical applications.
				628	*/
				629	long get_user_pages(struct task_struct tsk, struct mm_struct mm,
				630	unsigned long start, unsigned long nr_pages, int write,
				631	int force, struct page pages, struct vm_area_struct vmas)
				632	{
				633	int flags = FOLL_TOUCH;
				634
				635	if (pages)
				636	flags \|= FOLL_GET;
				637	if (write)
				638	flags \|= FOLL_WRITE;
				639	if (force)
				640	flags \|= FOLL_FORCE;
				641
				642	return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
				643	NULL);
				644	}
				645	EXPORT_SYMBOL(get_user_pages);
				646
				647	/**
				648	* get_dump_page() - pin user page in memory while writing it to core dump
				649	* @addr: user address
				650	*
				651	* Returns struct page pointer of user page pinned for dump,
				652	* to be freed afterwards by page_cache_release() or put_page().
				653	*
				654	* Returns NULL on any kind of failure - a hole must then be inserted into
				655	* the corefile, to preserve alignment with its headers; and also returns
				656	* NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
				657	* allowing a hole to be left in the corefile to save diskspace.
				658	*
				659	* Called without mmap_sem, but after all other threads have been killed.
				660	*/
				661	#ifdef CONFIG_ELF_CORE
				662	struct page *get_dump_page(unsigned long addr)
				663	{
				664	struct vm_area_struct *vma;
				665	struct page *page;
				666
				667	if (__get_user_pages(current, current->mm, addr, 1,
				668	FOLL_FORCE \| FOLL_DUMP \| FOLL_GET, &page, &vma,
				669	NULL) < 1)
				670	return NULL;
				671	flush_cache_page(vma, addr, page_to_pfn(page));
				672	return page;
				673	}
				674	#endif /* CONFIG_ELF_CORE */