/*
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>

/*
 * Page fault error code bits
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)

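/*
 * Give kprobes a chance to handle the fault first; returns non-zero if a
 * registered kprobe fault handler claimed it, zero otherwise.
 */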
static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (!user_mode(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
#else
	return 0;
#endif
}

/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner
 */
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
		       unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

#ifdef CONFIG_X86_32
	if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
		     boot_cpu_data.x86 >= 6)) {
		/* Catch an obscure case of prefetch inside an NX page. */
		if (nx_enabled && (error_code & PF_INSTR))
			return 0;
	} else {
		return 0;
	}
#else
	/* If it was an exec fault, ignore it. */
	if (error_code & PF_INSTR)
		return 0;
#endif

	instr = (unsigned char *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present so
			 * X86_64 will never get here anyway
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
#ifdef CONFIG_X86_64
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
#endif
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;

			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}

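/*
 * Fill in a siginfo describing the faulting address and deliver the
 * signal to the given task.
 */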
static void force_sig_info_fault(int si_signo, int si_code,
				 unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}

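/*
 * Returns non-zero if the page-table entry at p cannot be read safely,
 * i.e. the pointer itself is bad.
 */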
static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}

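/*
 * Walk the page tables for 'address' starting from CR3 and print each
 * level (PGD/PUD/PMD/PTE) that is present.
 */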
void dump_pagetable(unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}

#ifdef CONFIG_X86_64
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";

/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   Many BIOSes that were not tested properly miss this.
   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel here. */

static int is_errata93(struct pt_regs *regs, unsigned long address)
{
	static int warned;
	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->ip = address;
		return 1;
	}
	return 0;
}
#endif

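/*
 * A fault with the reserved-bit error code set means a corrupted page
 * table entry: dump the page table and oops.
 */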
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}

/*
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the latter
	   case just flush. */

	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
}

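/*
 * When set, unhandled user-space SIGSEGVs are reported with a
 * ratelimited printk (see bad_area_nosemaphore below).
 */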
int show_unhandled_signals = 1;

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
					unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	int write, fault;
	unsigned long flags;
	int si_code;

	/*
	 * We can fault from pretty much anywhere, with unknown IRQ state.
	 */
	trace_hardirqs_fixup();

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	si_code = SEGV_MAPERR;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Don't check for the module range here: its PML4
		 * is always initialized because it's shared with the main
		 * kernel text. Only vmalloc may need PML4 syncups.
		 */
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    ((address >= VMALLOC_START && address < VMALLOC_END))) {
			if (vmalloc_fault(address) >= 0)
				return;
		}
		if (notify_page_fault(regs))
			return;
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}

	if (notify_page_fault(regs))
		return;

	if (likely(regs->flags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

	/*
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs))
		error_code |= PF_USER;

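	/*
	 * Restart point: out_of_memory below jumps back here after
	 * yielding when the faulting task is init.
	 */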
again:
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space; if we cannot, we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/* Allow userspace just enough access below the stack pointer
		 * to let the 'enter' instruction work.
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:		/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:		/* read, present */
		goto bad_area;
	case 0:			/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;
	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {

		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		if (is_prefetch(regs, address, error_code))
			return;

		/* Work around K8 erratum #100: K8 in compat mode
		   occasionally jumps to illegal addresses >4GB.  We
		   catch this here in the page fault handler because
		   these addresses are not reachable. Just detect this
		   case and return.  Any code segment in LDT is
		   compatibility mode. */
		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
		    (address >> 32))
			return;

		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk("%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx\n",
			       tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
			       tsk->comm, tsk->pid, address, regs->ip,
			       regs->sp, error_code);
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;

		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}

no_context:
	/* Are we prepared to handle this kernel fault?  */
	if (fixup_exception(regs))
		return;

	/*
	 * Hall of shame of CPU/BIOS bugs.
	 */

	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */

	flags = oops_begin();

	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at %016lx RIP: \n" KERN_ALERT, address);
	printk_address(regs->ip, regs->bp);
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Oops", regs, error_code))
		regs = NULL;
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags, regs, SIGKILL);

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_global_init(current)) {
		yield();
		goto again;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_group_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
	return;
}

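/*
 * pgd_lock protects pgd_list, which vmalloc_sync_all() below walks to
 * propagate kernel PGD entries into every pgd page on the list.
 */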
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

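/*
 * Propagate the kernel's vmalloc-area PGD entries into every pgd on
 * pgd_list, one PGDIR-sized region at a time.
 */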
void vmalloc_sync_all(void)
{
	/* Note that races in the updates of insync and start aren't
	   problematic:
	   insync can only get set bits added, and updates to start are only
	   improving performance (without affecting correctness if undone). */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			list_for_each_entry(page, &pgd_list, lru) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
		       (__START_KERNEL & PGDIR_MASK)));
}