x86: mm: consolidate VM_FAULT_RETRY handling

The VM_FAULT_RETRY handling was confusing and incorrect for the case of
returning to kernel mode.  We need to handle the exception table fixup
if we return to kernel mode due to a fatal signal - to the in-kernel
user mode access it will basically look like the access failed because
the VM went away from under it.  Which is correct - the process is
dying - and it avoids the whole "repeat endless kernel page faults"
case.
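
Concretely: with that fixup in place, an in-kernel user access done on
behalf of the dying process just takes its normal failure path.  A
caller pattern like the following (a generic sketch - "struct foo",
"karg" and "uarg" are made-up names, not code from this patch) then
sees the copy fail and returns -EFAULT instead of re-taking the same
page fault over and over:

	struct foo karg;

	/*
	 * A fault on the user pointer here, while the process is being
	 * killed, hits the exception table fixup: the copy reports
	 * failure and the caller bails out normally.
	 */
	if (copy_from_user(&karg, uarg, sizeof(karg)))
		return -EFAULT;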

Handling VM_FAULT_RETRY early and in just one place also simplifies
the mmap_sem handling, since once we've taken care of VM_FAULT_RETRY we
know that we can just drop the lock.  The remaining accounting and
possible error handling is thread-local and does not need the mmap_sem.
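
With that, the tail of __do_page_fault() after handle_mm_fault() ends
up reading roughly like this (a simplified sketch of the end state of
the hunk below, with the retry-flag bookkeeping folded into the
condition and the error/accounting details elided):

	fault = handle_mm_fault(mm, vma, address, flags);
	major |= fault & VM_FAULT_MAJOR;

	if (unlikely(fault & VM_FAULT_RETRY)) {
		/* mmap_sem has already been dropped for us */
		if ((flags & FAULT_FLAG_ALLOW_RETRY) &&
		    !fatal_signal_pending(tsk)) {
			flags &= ~FAULT_FLAG_ALLOW_RETRY;
			flags |= FAULT_FLAG_TRIED;
			goto retry;
		}
		if (flags & FAULT_FLAG_USER)
			return;
		no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
		return;
	}

	up_read(&mm->mmap_sem);
	/* ... VM_FAULT_ERROR handling and maj/min fault accounting ... */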

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index d7de5b8..b74a7e1 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1055,7 +1055,7 @@
 	struct vm_area_struct *vma;
 	struct task_struct *tsk;
 	struct mm_struct *mm;
-	int fault;
+	int fault, major = 0;
 	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
 
 	tsk = current;
@@ -1230,48 +1230,50 @@
 	 * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
 	 */
 	fault = handle_mm_fault(mm, vma, address, flags);
+	major |= fault & VM_FAULT_MAJOR;
 
 	/*
-	 * If we need to retry but a fatal signal is pending, handle the
-	 * signal first. We do not need to release the mmap_sem because it
-	 * would already be released in __lock_page_or_retry in mm/filemap.c.
+	 * If we need to retry the mmap_sem has already been released,
+	 * and if there is a fatal signal pending there is no guarantee
+	 * that we made any progress. Handle this case first.
 	 */
-	if (unlikely((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)))
-		return;
+	if (unlikely(fault & VM_FAULT_RETRY)) {
+		/* Retry at most once */
+		if (flags & FAULT_FLAG_ALLOW_RETRY) {
+			flags &= ~FAULT_FLAG_ALLOW_RETRY;
+			flags |= FAULT_FLAG_TRIED;
+			if (!fatal_signal_pending(tsk))
+				goto retry;
+		}
 
+		/* User mode? Just return to handle the fatal exception */
+		if (flags & FAULT_FLAG_USER)
+			return;
+
+		/* Not returning to user mode? Handle exceptions or die: */
+		no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
+		return;
+	}
+
+	up_read(&mm->mmap_sem);
 	if (unlikely(fault & VM_FAULT_ERROR)) {
-		up_read(&mm->mmap_sem);
 		mm_fault_error(regs, error_code, address, fault);
 		return;
 	}
 
 	/*
-	 * Major/minor page fault accounting is only done on the
-	 * initial attempt. If we go through a retry, it is extremely
-	 * likely that the page will be found in page cache at that point.
+	 * Major/minor page fault accounting. If any of the events
+	 * returned VM_FAULT_MAJOR, we account it as a major fault.
 	 */
-	if (flags & FAULT_FLAG_ALLOW_RETRY) {
-		if (fault & VM_FAULT_MAJOR) {
-			tsk->maj_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
-				      regs, address);
-		} else {
-			tsk->min_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
-				      regs, address);
-		}
-		if (fault & VM_FAULT_RETRY) {
-			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
-			 * of starvation. */
-			flags &= ~FAULT_FLAG_ALLOW_RETRY;
-			flags |= FAULT_FLAG_TRIED;
-			goto retry;
-		}
+	if (major) {
+		tsk->maj_flt++;
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
+	} else {
+		tsk->min_flt++;
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
 	}
 
 	check_v8086_mode(regs, address, tsk);
-
-	up_read(&mm->mmap_sem);
 }
 NOKPROBE_SYMBOL(__do_page_fault);