Blame - arch/x86_64/mm/fault.c - kernel/msm-5.4

blob: e0330921676413d100b6a1f2cf2c6f579133e3d8 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* linux/arch/x86-64/mm/fault.c
				3	*
				4	* Copyright (C) 1995 Linus Torvalds
				5	* Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
				6	*/
				7
				8	#include <linux/config.h>
				9	#include <linux/signal.h>
				10	#include <linux/sched.h>
				11	#include <linux/kernel.h>
				12	#include <linux/errno.h>
				13	#include <linux/string.h>
				14	#include <linux/types.h>
				15	#include <linux/ptrace.h>
				16	#include <linux/mman.h>
				17	#include <linux/mm.h>
				18	#include <linux/smp.h>
				19	#include <linux/smp_lock.h>
				20	#include <linux/interrupt.h>
				21	#include <linux/init.h>
				22	#include <linux/tty.h>
				23	#include <linux/vt_kern.h> /* For unblank_screen() */
				24	#include <linux/compiler.h>
				25	#include <linux/module.h>
				26	#include <linux/kprobes.h>
				27
				28	#include <asm/system.h>
				29	#include <asm/uaccess.h>
				30	#include <asm/pgalloc.h>
				31	#include <asm/smp.h>
				32	#include <asm/tlbflush.h>
				33	#include <asm/proto.h>
				34	#include <asm/kdebug.h>
				35	#include <asm-generic/sections.h>
				36	#include <asm/kdebug.h>
				37
				38	void bust_spinlocks(int yes)
				39	{
				40	int loglevel_save = console_loglevel;
				41	if (yes) {
				42	oops_in_progress = 1;
				43	} else {
				44	#ifdef CONFIG_VT
				45	unblank_screen();
				46	#endif
				47	oops_in_progress = 0;
				48	/*
				49	* OK, the message is on the console. Now we call printk()
				50	* without oops_in_progress set so that printk will give klogd
				51	* a poke. Hold onto your hats...
				52	*/
				53	console_loglevel = 15; /* NMI oopser may have shut the console up */
				54	printk(" ");
				55	console_loglevel = loglevel_save;
				56	}
				57	}
				58
				59	/* Sometimes the CPU reports invalid exceptions on prefetch.
				60	Check that here and ignore.
				61	Opcode checker based on code by Richard Brunner */
				62	static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
				63	unsigned long error_code)
				64	{
Andi Kleen	f1290ec	2005-04-16 15:24:59 -0700	[diff] [blame^]	65	unsigned char *instr;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	66	int scan_more = 1;
				67	int prefetch = 0;
Andi Kleen	f1290ec	2005-04-16 15:24:59 -0700	[diff] [blame^]	68	unsigned char *max_instr;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	69
				70	/* If it was a exec fault ignore */
				71	if (error_code & (1<<4))
				72	return 0;
				73
Andi Kleen	f1290ec	2005-04-16 15:24:59 -0700	[diff] [blame^]	74	instr = (unsigned char *)convert_rip_to_linear(current, regs);
				75	max_instr = instr + 15;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	76
Andi Kleen	f1290ec	2005-04-16 15:24:59 -0700	[diff] [blame^]	77	if ((regs->cs & 3) != 0 && instr >= (unsigned char *)TASK_SIZE)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	78	return 0;
				79
				80	while (scan_more && instr < max_instr) {
				81	unsigned char opcode;
				82	unsigned char instr_hi;
				83	unsigned char instr_lo;
				84
				85	if (__get_user(opcode, instr))
				86	break;
				87
				88	instr_hi = opcode & 0xf0;
				89	instr_lo = opcode & 0x0f;
				90	instr++;
				91
				92	switch (instr_hi) {
				93	case 0x20:
				94	case 0x30:
				95	/* Values 0x26,0x2E,0x36,0x3E are valid x86
				96	prefixes. In long mode, the CPU will signal
				97	invalid opcode if some of these prefixes are
				98	present so we will never get here anyway */
				99	scan_more = ((instr_lo & 7) == 0x6);
				100	break;
				101
				102	case 0x40:
				103	/* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
				104	Need to figure out under what instruction mode the
				105	instruction was issued ... */
				106	/* Could check the LDT for lm, but for now it's good
				107	enough to assume that long mode only uses well known
				108	segments or kernel. */
				109	scan_more = ((regs->cs & 3) == 0) \|\| (regs->cs == __USER_CS);
				110	break;
				111
				112	case 0x60:
				113	/* 0x64 thru 0x67 are valid prefixes in all modes. */
				114	scan_more = (instr_lo & 0xC) == 0x4;
				115	break;
				116	case 0xF0:
				117	/* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
				118	scan_more = !instr_lo \|\| (instr_lo>>1) == 1;
				119	break;
				120	case 0x00:
				121	/* Prefetch instruction is 0x0F0D or 0x0F18 */
				122	scan_more = 0;
				123	if (__get_user(opcode, instr))
				124	break;
				125	prefetch = (instr_lo == 0xF) &&
				126	(opcode == 0x0D \|\| opcode == 0x18);
				127	break;
				128	default:
				129	scan_more = 0;
				130	break;
				131	}
				132	}
				133	return prefetch;
				134	}
				135
				136	static int bad_address(void *p)
				137	{
				138	unsigned long dummy;
				139	return __get_user(dummy, (unsigned long *)p);
				140	}
				141
				142	void dump_pagetable(unsigned long address)
				143	{
				144	pgd_t *pgd;
				145	pud_t *pud;
				146	pmd_t *pmd;
				147	pte_t *pte;
				148
				149	asm("movq %%cr3,%0" : "=r" (pgd));
				150
				151	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
				152	pgd += pgd_index(address);
				153	printk("PGD %lx ", pgd_val(*pgd));
				154	if (bad_address(pgd)) goto bad;
				155	if (!pgd_present(*pgd)) goto ret;
				156
				157	pud = __pud_offset_k((pud_t )pgd_page(pgd), address);
				158	if (bad_address(pud)) goto bad;
				159	printk("PUD %lx ", pud_val(*pud));
				160	if (!pud_present(*pud)) goto ret;
				161
				162	pmd = pmd_offset(pud, address);
				163	if (bad_address(pmd)) goto bad;
				164	printk("PMD %lx ", pmd_val(*pmd));
				165	if (!pmd_present(*pmd)) goto ret;
				166
				167	pte = pte_offset_kernel(pmd, address);
				168	if (bad_address(pte)) goto bad;
				169	printk("PTE %lx", pte_val(*pte));
				170	ret:
				171	printk("\n");
				172	return;
				173	bad:
				174	printk("BAD\n");
				175	}
				176
				177	static const char errata93_warning[] =
				178	KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
				179	KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
				180	KERN_ERR "******* Please consider a BIOS update.\n"
				181	KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
				182
				183	/* Workaround for K8 erratum #93 & buggy BIOS.
				184	BIOS SMM functions are required to use a specific workaround
				185	to avoid corruption of the 64bit RIP register on C stepping K8.
				186	A lot of BIOS that didn't get tested properly miss this.
				187	The OS sees this as a page fault with the upper 32bits of RIP cleared.
				188	Try to work around it here.
				189	Note we only handle faults in kernel here. */
				190
				191	static int is_errata93(struct pt_regs *regs, unsigned long address)
				192	{
				193	static int warned;
				194	if (address != regs->rip)
				195	return 0;
				196	if ((address >> 32) != 0)
				197	return 0;
				198	address \|= 0xffffffffUL << 32;
				199	if ((address >= (u64)_stext && address <= (u64)_etext) \|\|
				200	(address >= MODULES_VADDR && address <= MODULES_END)) {
				201	if (!warned) {
				202	printk(errata93_warning);
				203	warned = 1;
				204	}
				205	regs->rip = address;
				206	return 1;
				207	}
				208	return 0;
				209	}
				210
				211	int unhandled_signal(struct task_struct *tsk, int sig)
				212	{
				213	if (tsk->pid == 1)
				214	return 1;
				215	/* Warn for strace, but not for gdb */
				216	if (!test_ti_thread_flag(tsk->thread_info, TIF_SYSCALL_TRACE) &&
				217	(tsk->ptrace & PT_PTRACED))
				218	return 0;
				219	return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) \|\|
				220	(tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
				221	}
				222
				223	static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				224	unsigned long error_code)
				225	{
				226	oops_begin();
				227	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
				228	current->comm, address);
				229	dump_pagetable(address);
				230	__die("Bad pagetable", regs, error_code);
				231	oops_end();
				232	do_exit(SIGKILL);
				233	}
				234
				235	/*
				236	* Handle a fault on the vmalloc or module mapping area
				237	*/
				238	static int vmalloc_fault(unsigned long address)
				239	{
				240	pgd_t pgd, pgd_ref;
				241	pud_t pud, pud_ref;
				242	pmd_t pmd, pmd_ref;
				243	pte_t pte, pte_ref;
				244
				245	/* Copy kernel mappings over when needed. This can also
				246	happen within a race in page table update. In the later
				247	case just flush. */
				248
				249	pgd = pgd_offset(current->mm ?: &init_mm, address);
				250	pgd_ref = pgd_offset_k(address);
				251	if (pgd_none(*pgd_ref))
				252	return -1;
				253	if (pgd_none(*pgd))
				254	set_pgd(pgd, *pgd_ref);
				255
				256	/* Below here mismatches are bugs because these lower tables
				257	are shared */
				258
				259	pud = pud_offset(pgd, address);
				260	pud_ref = pud_offset(pgd_ref, address);
				261	if (pud_none(*pud_ref))
				262	return -1;
				263	if (pud_none(pud) \|\| pud_page(pud) != pud_page(*pud_ref))
				264	BUG();
				265	pmd = pmd_offset(pud, address);
				266	pmd_ref = pmd_offset(pud_ref, address);
				267	if (pmd_none(*pmd_ref))
				268	return -1;
				269	if (pmd_none(pmd) \|\| pmd_page(pmd) != pmd_page(*pmd_ref))
				270	BUG();
				271	pte_ref = pte_offset_kernel(pmd_ref, address);
				272	if (!pte_present(*pte_ref))
				273	return -1;
				274	pte = pte_offset_kernel(pmd, address);
				275	if (!pte_present(pte) \|\| pte_page(pte) != pte_page(*pte_ref))
				276	BUG();
				277	__flush_tlb_all();
				278	return 0;
				279	}
				280
				281	int page_fault_trace = 0;
				282	int exception_trace = 1;
				283
				284	/*
				285	* This routine handles page faults. It determines the address,
				286	* and the problem, and then passes it off to one of the appropriate
				287	* routines.
				288	*
				289	* error_code:
				290	* bit 0 == 0 means no page found, 1 means protection fault
				291	* bit 1 == 0 means read, 1 means write
				292	* bit 2 == 0 means kernel, 1 means user-mode
				293	* bit 3 == 1 means fault was an instruction fetch
				294	*/
				295	asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
				296	{
				297	struct task_struct *tsk;
				298	struct mm_struct *mm;
				299	struct vm_area_struct * vma;
				300	unsigned long address;
				301	const struct exception_table_entry *fixup;
				302	int write;
				303	siginfo_t info;
				304
				305	#ifdef CONFIG_CHECKING
				306	{
				307	unsigned long gs;
				308	struct x8664_pda *pda = cpu_pda + stack_smp_processor_id();
				309	rdmsrl(MSR_GS_BASE, gs);
				310	if (gs != (unsigned long)pda) {
				311	wrmsrl(MSR_GS_BASE, pda);
				312	printk("page_fault: wrong gs %lx expected %p\n", gs, pda);
				313	}
				314	}
				315	#endif
				316
				317	/* get the address */
				318	__asm__("movq %%cr2,%0":"=r" (address));
				319	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
				320	SIGSEGV) == NOTIFY_STOP)
				321	return;
				322
				323	if (likely(regs->eflags & X86_EFLAGS_IF))
				324	local_irq_enable();
				325
				326	if (unlikely(page_fault_trace))
				327	printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
				328	regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
				329
				330	tsk = current;
				331	mm = tsk->mm;
				332	info.si_code = SEGV_MAPERR;
				333
				334
				335	/*
				336	* We fault-in kernel-space virtual memory on-demand. The
				337	* 'reference' page table is init_mm.pgd.
				338	*
				339	* NOTE! We MUST NOT take any locks for this case. We may
				340	* be in an interrupt or a critical region, and should
				341	* only copy the information from the master page table,
				342	* nothing more.
				343	*
				344	* This verifies that the fault happens in kernel space
				345	* (error_code & 4) == 0, and that the fault was not a
				346	* protection error (error_code & 1) == 0.
				347	*/
				348	if (unlikely(address >= TASK_SIZE)) {
				349	if (!(error_code & 5)) {
				350	if (vmalloc_fault(address) < 0)
				351	goto bad_area_nosemaphore;
				352	return;
				353	}
				354	/*
				355	* Don't take the mm semaphore here. If we fixup a prefetch
				356	* fault we could otherwise deadlock.
				357	*/
				358	goto bad_area_nosemaphore;
				359	}
				360
				361	if (unlikely(error_code & (1 << 3)))
				362	pgtable_bad(address, regs, error_code);
				363
				364	/*
				365	* If we're in an interrupt or have no user
				366	* context, we must not take the fault..
				367	*/
				368	if (unlikely(in_atomic() \|\| !mm))
				369	goto bad_area_nosemaphore;
				370
				371	again:
				372	/* When running in the kernel we expect faults to occur only to
				373	* addresses in user space. All other faults represent errors in the
				374	* kernel and should generate an OOPS. Unfortunatly, in the case of an
				375	* erroneous fault occuring in a code path which already holds mmap_sem
				376	* we will deadlock attempting to validate the fault against the
				377	* address space. Luckily the kernel only validly references user
				378	* space from well defined areas of code, which are listed in the
				379	* exceptions table.
				380	*
				381	* As the vast majority of faults will be valid we will only perform
				382	* the source reference check when there is a possibilty of a deadlock.
				383	* Attempt to lock the address space, if we cannot we then validate the
				384	* source. If this is invalid we can skip the address space check,
				385	* thus avoiding the deadlock.
				386	*/
				387	if (!down_read_trylock(&mm->mmap_sem)) {
				388	if ((error_code & 4) == 0 &&
				389	!search_exception_tables(regs->rip))
				390	goto bad_area_nosemaphore;
				391	down_read(&mm->mmap_sem);
				392	}
				393
				394	vma = find_vma(mm, address);
				395	if (!vma)
				396	goto bad_area;
				397	if (likely(vma->vm_start <= address))
				398	goto good_area;
				399	if (!(vma->vm_flags & VM_GROWSDOWN))
				400	goto bad_area;
				401	if (error_code & 4) {
				402	// XXX: align red zone size with ABI
				403	if (address + 128 < regs->rsp)
				404	goto bad_area;
				405	}
				406	if (expand_stack(vma, address))
				407	goto bad_area;
				408	/*
				409	* Ok, we have a good vm_area for this memory access, so
				410	* we can handle it..
				411	*/
				412	good_area:
				413	info.si_code = SEGV_ACCERR;
				414	write = 0;
				415	switch (error_code & 3) {
				416	default: /* 3: write, present */
				417	/* fall through */
				418	case 2: /* write, not present */
				419	if (!(vma->vm_flags & VM_WRITE))
				420	goto bad_area;
				421	write++;
				422	break;
				423	case 1: /* read, present */
				424	goto bad_area;
				425	case 0: /* read, not present */
				426	if (!(vma->vm_flags & (VM_READ \| VM_EXEC)))
				427	goto bad_area;
				428	}
				429
				430	/*
				431	* If for any reason at all we couldn't handle the fault,
				432	* make sure we exit gracefully rather than endlessly redo
				433	* the fault.
				434	*/
				435	switch (handle_mm_fault(mm, vma, address, write)) {
				436	case 1:
				437	tsk->min_flt++;
				438	break;
				439	case 2:
				440	tsk->maj_flt++;
				441	break;
				442	case 0:
				443	goto do_sigbus;
				444	default:
				445	goto out_of_memory;
				446	}
				447
				448	up_read(&mm->mmap_sem);
				449	return;
				450
				451	/*
				452	* Something tried to access memory that isn't in our memory map..
				453	* Fix it, but check if it's kernel or user first..
				454	*/
				455	bad_area:
				456	up_read(&mm->mmap_sem);
				457
				458	bad_area_nosemaphore:
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	459	/* User mode accesses just cause a SIGSEGV */
				460	if (error_code & 4) {
				461	if (is_prefetch(regs, address, error_code))
				462	return;
				463
				464	/* Work around K8 erratum #100 K8 in compat mode
				465	occasionally jumps to illegal addresses >4GB. We
				466	catch this here in the page fault handler because
				467	these addresses are not reachable. Just detect this
				468	case and return. Any code segment in LDT is
				469	compatibility mode. */
				470	if ((regs->cs == __USER32_CS \|\| (regs->cs & (1<<2))) &&
				471	(address >> 32))
				472	return;
				473
				474	if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
				475	printk(
				476	"%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
				477	tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
				478	tsk->comm, tsk->pid, address, regs->rip,
				479	regs->rsp, error_code);
				480	}
				481
				482	tsk->thread.cr2 = address;
				483	/* Kernel addresses are always protection faults */
				484	tsk->thread.error_code = error_code \| (address >= TASK_SIZE);
				485	tsk->thread.trap_no = 14;
				486	info.si_signo = SIGSEGV;
				487	info.si_errno = 0;
				488	/* info.si_code has been set above */
				489	info.si_addr = (void __user *)address;
				490	force_sig_info(SIGSEGV, &info, tsk);
				491	return;
				492	}
				493
				494	no_context:
				495
				496	/* Are we prepared to handle this kernel fault? */
				497	fixup = search_exception_tables(regs->rip);
				498	if (fixup) {
				499	regs->rip = fixup->fixup;
				500	return;
				501	}
				502
				503	/*
				504	* Hall of shame of CPU/BIOS bugs.
				505	*/
				506
				507	if (is_prefetch(regs, address, error_code))
				508	return;
				509
				510	if (is_errata93(regs, address))
				511	return;
				512
				513	/*
				514	* Oops. The kernel tried to access some bad page. We'll have to
				515	* terminate things with extreme prejudice.
				516	*/
				517
				518	oops_begin();
				519
				520	if (address < PAGE_SIZE)
				521	printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
				522	else
				523	printk(KERN_ALERT "Unable to handle kernel paging request");
				524	printk(" at %016lx RIP: \n" KERN_ALERT,address);
				525	printk_address(regs->rip);
				526	printk("\n");
				527	dump_pagetable(address);
				528	__die("Oops", regs, error_code);
				529	/* Executive summary in case the body of the oops scrolled away */
				530	printk(KERN_EMERG "CR2: %016lx\n", address);
				531	oops_end();
				532	do_exit(SIGKILL);
				533
				534	/*
				535	* We ran out of memory, or some other thing happened to us that made
				536	* us unable to handle the page fault gracefully.
				537	*/
				538	out_of_memory:
				539	up_read(&mm->mmap_sem);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	540	if (current->pid == 1) {
				541	yield();
				542	goto again;
				543	}
				544	printk("VM: killing process %s\n", tsk->comm);
				545	if (error_code & 4)
				546	do_exit(SIGKILL);
				547	goto no_context;
				548
				549	do_sigbus:
				550	up_read(&mm->mmap_sem);
				551
				552	/* Kernel mode? Handle exceptions or die */
				553	if (!(error_code & 4))
				554	goto no_context;
				555
				556	tsk->thread.cr2 = address;
				557	tsk->thread.error_code = error_code;
				558	tsk->thread.trap_no = 14;
				559	info.si_signo = SIGBUS;
				560	info.si_errno = 0;
				561	info.si_code = BUS_ADRERR;
				562	info.si_addr = (void __user *)address;
				563	force_sig_info(SIGBUS, &info, tsk);
				564	return;
				565	}