blob: 08dc696f54ee4543fe4aff4cf551f1f7253e2da3 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * linux/arch/x86-64/mm/fault.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
6 */
7
8#include <linux/config.h>
9#include <linux/signal.h>
10#include <linux/sched.h>
11#include <linux/kernel.h>
12#include <linux/errno.h>
13#include <linux/string.h>
14#include <linux/types.h>
15#include <linux/ptrace.h>
16#include <linux/mman.h>
17#include <linux/mm.h>
18#include <linux/smp.h>
19#include <linux/smp_lock.h>
20#include <linux/interrupt.h>
21#include <linux/init.h>
22#include <linux/tty.h>
23#include <linux/vt_kern.h> /* For unblank_screen() */
24#include <linux/compiler.h>
25#include <linux/module.h>
Prasanna S Panchamukhi0f2fbdc2005-09-06 15:19:28 -070026#include <linux/kprobes.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070027
28#include <asm/system.h>
29#include <asm/uaccess.h>
30#include <asm/pgalloc.h>
31#include <asm/smp.h>
32#include <asm/tlbflush.h>
33#include <asm/proto.h>
34#include <asm/kdebug.h>
35#include <asm-generic/sections.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070036
/* Page fault error code bits, as pushed by the CPU in the #PF error code */
#define PF_PROT	(1<<0)		/* set: protection fault; clear: no page found */
#define PF_WRITE	(1<<1)	/* set: write access; clear: read access */
#define PF_USER	(1<<2)		/* set: fault happened in user mode */
#define PF_RSVD	(1<<3)		/* set: reserved bit set in a page table entry */
#define PF_INSTR	(1<<4)	/* set: fault was an instruction fetch */

#ifdef CONFIG_KPROBES
/* Callback chain run on every page fault (used by kprobes). */
ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);

/* Hook to register for page fault notifications */
int register_page_fault_notifier(struct notifier_block *nb)
{
	/* Sync vmalloc mappings into all page tables first, so the
	   notifier callback itself cannot take a recursive vmalloc fault. */
	vmalloc_sync_all();
	return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
}

/* Remove a notifier previously added with register_page_fault_notifier(). */
int unregister_page_fault_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
}

/* Run the chain; NOTIFY_STOP means a callback (e.g. a kprobe)
   handled the fault and normal processing must be skipped. */
static inline int notify_page_fault(enum die_val val, const char *str,
			struct pt_regs *regs, long err, int trap, int sig)
{
	struct die_args args = {
		.regs = regs,
		.str = str,
		.err = err,
		.trapnr = trap,
		.signr = sig
	};
	return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
}
#else
/* Without kprobes there are no consumers; compile to a no-op. */
static inline int notify_page_fault(enum die_val val, const char *str,
			struct pt_regs *regs, long err, int trap, int sig)
{
	return NOTIFY_DONE;
}
#endif
78
Linus Torvalds1da177e2005-04-16 15:20:36 -070079void bust_spinlocks(int yes)
80{
81 int loglevel_save = console_loglevel;
82 if (yes) {
83 oops_in_progress = 1;
84 } else {
85#ifdef CONFIG_VT
86 unblank_screen();
87#endif
88 oops_in_progress = 0;
89 /*
90 * OK, the message is on the console. Now we call printk()
91 * without oops_in_progress set so that printk will give klogd
92 * a poke. Hold onto your hats...
93 */
94 console_loglevel = 15; /* NMI oopser may have shut the console up */
95 printk(" ");
96 console_loglevel = loglevel_save;
97 }
98}
99
/* Sometimes the CPU reports invalid exceptions on prefetch.
   Check that here and ignore.
   Opcode checker based on code by Richard Brunner */
static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
				unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;	/* keep scanning while we only see prefix bytes */
	int prefetch = 0;
	unsigned char *max_instr;

	/* If it was a exec fault ignore */
	if (error_code & PF_INSTR)
		return 0;

	/* Linear address of the faulting instruction (accounts for the
	   code segment base, e.g. in compat mode). */
	instr = (unsigned char *)convert_rip_to_linear(current, regs);
	max_instr = instr + 15;	/* x86 instructions are at most 15 bytes */

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	/* Walk prefix bytes until the opcode is reached or a byte that
	   cannot precede a prefetch is found. */
	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (__get_user(opcode, instr))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/* Values 0x26,0x2E,0x36,0x3E are valid x86
			   prefixes.  In long mode, the CPU will signal
			   invalid opcode if some of these prefixes are
			   present so we will never get here anyway */
			scan_more = ((instr_lo & 7) == 0x6);
			break;

		case 0x40:
			/* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
			   Need to figure out under what instruction mode the
			   instruction was issued ... */
			/* Could check the LDT for lm, but for now it's good
			   enough to assume that long mode only uses well known
			   segments or kernel. */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;

		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;
			if (__get_user(opcode, instr))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			/* Anything else is a real opcode: stop scanning. */
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}
176
/* Probe whether a kernel-virtual address is safely readable.
   Returns nonzero (the __get_user error) if the read would fault. */
static int bad_address(void *p)
{
	unsigned long scratch;
	int rc;

	rc = __get_user(scratch, (unsigned long *)p);
	return rc;
}
182
/*
 * Print the PGD/PUD/PMD/PTE entries for 'address'.  Called from the
 * oops path, so every table pointer is probed with bad_address()
 * before being dereferenced -- the tables themselves may be corrupt.
 */
void dump_pagetable(unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/* Read the page table root straight from CR3; current->mm may
	   be NULL or untrustworthy at this point. */
	asm("movq %%cr3,%0" : "=r" (pgd));

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	/* The entry itself was unreadable. */
	printk("BAD\n");
}
217
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";

/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   A lot of BIOS that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel here. */

static int is_errata93(struct pt_regs *regs, unsigned long address)
{
	static int warned;	/* print the BIOS warning only once */
	/* Only an instruction-fetch fault at RIP itself can be erratum 93. */
	if (address != regs->rip)
		return 0;
	/* The erratum clears the upper 32 bits, so they must be zero here. */
	if ((address >> 32) != 0)
		return 0;
	/* Reconstruct the full 64-bit address by restoring the upper half. */
	address |= 0xffffffffUL << 32;
	/* Plausible only if the result lands in kernel text or modules. */
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		/* Resume execution at the repaired RIP; fault handled. */
		regs->rip = address;
		return 1;
	}
	return 0;
}
251
252int unhandled_signal(struct task_struct *tsk, int sig)
253{
254 if (tsk->pid == 1)
255 return 1;
Andi Kleen5e5ec102005-08-19 06:56:04 +0200256 if (tsk->ptrace & PT_PTRACED)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700257 return 0;
258 return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
259 (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
260}
261
/*
 * A reserved bit was set in a page table entry (PF_RSVD): the page
 * tables are corrupt.  Oops with a page table dump and kill the
 * current task.  Does not return.
 */
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	/* Record the fault details for debuggers / core dumps. */
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;	/* #PF vector */
	tsk->thread.error_code = error_code;
	__die("Bad pagetable", regs, error_code);
	oops_end(flags);
	do_exit(SIGKILL);
}
279
/*
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 *
 * Returns 0 if the fault was resolved by copying the missing PGD
 * entry from the reference (init_mm) tables, -1 if the address is
 * genuinely unmapped and normal fault handling should proceed.
 */
static int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the later
	   case just flush. */

	/* Kernel threads have no mm; fall back to init_mm then. */
	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
}
331
int page_fault_trace = 0;	/* set via "pagefaulttrace" boot option: log every fault */
int exception_trace = 1;	/* when set, log unhandled user-space segfaults */
334
335/*
336 * This routine handles page faults. It determines the address,
337 * and the problem, and then passes it off to one of the appropriate
338 * routines.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700339 */
Prasanna S Panchamukhi0f2fbdc2005-09-06 15:19:28 -0700340asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
341 unsigned long error_code)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700342{
343 struct task_struct *tsk;
344 struct mm_struct *mm;
345 struct vm_area_struct * vma;
346 unsigned long address;
347 const struct exception_table_entry *fixup;
348 int write;
Jan Beulich12091402005-09-12 18:49:24 +0200349 unsigned long flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700350 siginfo_t info;
351
Arjan van de Vena9ba9a32006-03-25 16:30:10 +0100352 tsk = current;
353 mm = tsk->mm;
354 prefetchw(&mm->mmap_sem);
355
Linus Torvalds1da177e2005-04-16 15:20:36 -0700356 /* get the address */
357 __asm__("movq %%cr2,%0":"=r" (address));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700358
Linus Torvalds1da177e2005-04-16 15:20:36 -0700359 info.si_code = SEGV_MAPERR;
360
361
362 /*
363 * We fault-in kernel-space virtual memory on-demand. The
364 * 'reference' page table is init_mm.pgd.
365 *
366 * NOTE! We MUST NOT take any locks for this case. We may
367 * be in an interrupt or a critical region, and should
368 * only copy the information from the master page table,
369 * nothing more.
370 *
371 * This verifies that the fault happens in kernel space
372 * (error_code & 4) == 0, and that the fault was not a
Jan Beulich8b1bde92006-01-11 22:42:23 +0100373 * protection error (error_code & 9) == 0.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700374 */
Suresh Siddha84929802005-06-21 17:14:32 -0700375 if (unlikely(address >= TASK_SIZE64)) {
Andi Kleenf95190b2006-01-11 22:44:00 +0100376 /*
377 * Don't check for the module range here: its PML4
378 * is always initialized because it's shared with the main
379 * kernel text. Only vmalloc may need PML4 syncups.
380 */
Andi Kleen66c58152006-01-11 22:44:09 +0100381 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
Andi Kleenf95190b2006-01-11 22:44:00 +0100382 ((address >= VMALLOC_START && address < VMALLOC_END))) {
Jan Beulich8c914cb2006-03-25 16:29:40 +0100383 if (vmalloc_fault(address) >= 0)
384 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700385 }
Anil S Keshavamurthy1bd858a2006-06-26 00:25:25 -0700386 if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
Jan Beulich8c914cb2006-03-25 16:29:40 +0100387 SIGSEGV) == NOTIFY_STOP)
388 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700389 /*
390 * Don't take the mm semaphore here. If we fixup a prefetch
391 * fault we could otherwise deadlock.
392 */
393 goto bad_area_nosemaphore;
394 }
395
Anil S Keshavamurthy1bd858a2006-06-26 00:25:25 -0700396 if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
Jan Beulich8c914cb2006-03-25 16:29:40 +0100397 SIGSEGV) == NOTIFY_STOP)
398 return;
399
400 if (likely(regs->eflags & X86_EFLAGS_IF))
401 local_irq_enable();
402
403 if (unlikely(page_fault_trace))
404 printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
405 regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
406
Andi Kleen66c58152006-01-11 22:44:09 +0100407 if (unlikely(error_code & PF_RSVD))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700408 pgtable_bad(address, regs, error_code);
409
410 /*
411 * If we're in an interrupt or have no user
412 * context, we must not take the fault..
413 */
414 if (unlikely(in_atomic() || !mm))
415 goto bad_area_nosemaphore;
416
417 again:
418 /* When running in the kernel we expect faults to occur only to
419 * addresses in user space. All other faults represent errors in the
420 * kernel and should generate an OOPS. Unfortunatly, in the case of an
421 * erroneous fault occuring in a code path which already holds mmap_sem
422 * we will deadlock attempting to validate the fault against the
423 * address space. Luckily the kernel only validly references user
424 * space from well defined areas of code, which are listed in the
425 * exceptions table.
426 *
427 * As the vast majority of faults will be valid we will only perform
428 * the source reference check when there is a possibilty of a deadlock.
429 * Attempt to lock the address space, if we cannot we then validate the
430 * source. If this is invalid we can skip the address space check,
431 * thus avoiding the deadlock.
432 */
433 if (!down_read_trylock(&mm->mmap_sem)) {
Andi Kleen66c58152006-01-11 22:44:09 +0100434 if ((error_code & PF_USER) == 0 &&
Linus Torvalds1da177e2005-04-16 15:20:36 -0700435 !search_exception_tables(regs->rip))
436 goto bad_area_nosemaphore;
437 down_read(&mm->mmap_sem);
438 }
439
440 vma = find_vma(mm, address);
441 if (!vma)
442 goto bad_area;
443 if (likely(vma->vm_start <= address))
444 goto good_area;
445 if (!(vma->vm_flags & VM_GROWSDOWN))
446 goto bad_area;
447 if (error_code & 4) {
Chuck Ebbert03fdc2c2006-06-26 13:59:50 +0200448 /* Allow userspace just enough access below the stack pointer
449 * to let the 'enter' instruction work.
450 */
451 if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700452 goto bad_area;
453 }
454 if (expand_stack(vma, address))
455 goto bad_area;
456/*
457 * Ok, we have a good vm_area for this memory access, so
458 * we can handle it..
459 */
460good_area:
461 info.si_code = SEGV_ACCERR;
462 write = 0;
Andi Kleen66c58152006-01-11 22:44:09 +0100463 switch (error_code & (PF_PROT|PF_WRITE)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700464 default: /* 3: write, present */
465 /* fall through */
Andi Kleen66c58152006-01-11 22:44:09 +0100466 case PF_WRITE: /* write, not present */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700467 if (!(vma->vm_flags & VM_WRITE))
468 goto bad_area;
469 write++;
470 break;
Andi Kleen66c58152006-01-11 22:44:09 +0100471 case PF_PROT: /* read, present */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700472 goto bad_area;
Andi Kleen66c58152006-01-11 22:44:09 +0100473 case 0: /* read, not present */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700474 if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
475 goto bad_area;
476 }
477
478 /*
479 * If for any reason at all we couldn't handle the fault,
480 * make sure we exit gracefully rather than endlessly redo
481 * the fault.
482 */
483 switch (handle_mm_fault(mm, vma, address, write)) {
Alexander Nyberg96800212005-08-04 16:14:57 +0200484 case VM_FAULT_MINOR:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700485 tsk->min_flt++;
486 break;
Alexander Nyberg96800212005-08-04 16:14:57 +0200487 case VM_FAULT_MAJOR:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700488 tsk->maj_flt++;
489 break;
Alexander Nyberg96800212005-08-04 16:14:57 +0200490 case VM_FAULT_SIGBUS:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700491 goto do_sigbus;
492 default:
493 goto out_of_memory;
494 }
495
496 up_read(&mm->mmap_sem);
497 return;
498
499/*
500 * Something tried to access memory that isn't in our memory map..
501 * Fix it, but check if it's kernel or user first..
502 */
503bad_area:
504 up_read(&mm->mmap_sem);
505
506bad_area_nosemaphore:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700507 /* User mode accesses just cause a SIGSEGV */
Andi Kleen66c58152006-01-11 22:44:09 +0100508 if (error_code & PF_USER) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700509 if (is_prefetch(regs, address, error_code))
510 return;
511
512 /* Work around K8 erratum #100 K8 in compat mode
513 occasionally jumps to illegal addresses >4GB. We
514 catch this here in the page fault handler because
515 these addresses are not reachable. Just detect this
516 case and return. Any code segment in LDT is
517 compatibility mode. */
518 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
519 (address >> 32))
520 return;
521
522 if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
523 printk(
524 "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
525 tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
526 tsk->comm, tsk->pid, address, regs->rip,
527 regs->rsp, error_code);
528 }
529
530 tsk->thread.cr2 = address;
531 /* Kernel addresses are always protection faults */
532 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
533 tsk->thread.trap_no = 14;
534 info.si_signo = SIGSEGV;
535 info.si_errno = 0;
536 /* info.si_code has been set above */
537 info.si_addr = (void __user *)address;
538 force_sig_info(SIGSEGV, &info, tsk);
539 return;
540 }
541
542no_context:
543
544 /* Are we prepared to handle this kernel fault? */
545 fixup = search_exception_tables(regs->rip);
546 if (fixup) {
547 regs->rip = fixup->fixup;
548 return;
549 }
550
551 /*
552 * Hall of shame of CPU/BIOS bugs.
553 */
554
555 if (is_prefetch(regs, address, error_code))
556 return;
557
558 if (is_errata93(regs, address))
559 return;
560
561/*
562 * Oops. The kernel tried to access some bad page. We'll have to
563 * terminate things with extreme prejudice.
564 */
565
Jan Beulich12091402005-09-12 18:49:24 +0200566 flags = oops_begin();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700567
568 if (address < PAGE_SIZE)
569 printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
570 else
571 printk(KERN_ALERT "Unable to handle kernel paging request");
572 printk(" at %016lx RIP: \n" KERN_ALERT,address);
573 printk_address(regs->rip);
574 printk("\n");
575 dump_pagetable(address);
Jan Beulich6e3f3612006-01-11 22:42:14 +0100576 tsk->thread.cr2 = address;
577 tsk->thread.trap_no = 14;
578 tsk->thread.error_code = error_code;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700579 __die("Oops", regs, error_code);
580 /* Executive summary in case the body of the oops scrolled away */
581 printk(KERN_EMERG "CR2: %016lx\n", address);
Jan Beulich12091402005-09-12 18:49:24 +0200582 oops_end(flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700583 do_exit(SIGKILL);
584
585/*
586 * We ran out of memory, or some other thing happened to us that made
587 * us unable to handle the page fault gracefully.
588 */
589out_of_memory:
590 up_read(&mm->mmap_sem);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700591 if (current->pid == 1) {
592 yield();
593 goto again;
594 }
595 printk("VM: killing process %s\n", tsk->comm);
596 if (error_code & 4)
597 do_exit(SIGKILL);
598 goto no_context;
599
600do_sigbus:
601 up_read(&mm->mmap_sem);
602
603 /* Kernel mode? Handle exceptions or die */
Andi Kleen66c58152006-01-11 22:44:09 +0100604 if (!(error_code & PF_USER))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700605 goto no_context;
606
607 tsk->thread.cr2 = address;
608 tsk->thread.error_code = error_code;
609 tsk->thread.trap_no = 14;
610 info.si_signo = SIGBUS;
611 info.si_errno = 0;
612 info.si_code = BUS_ADRERR;
613 info.si_addr = (void __user *)address;
614 force_sig_info(SIGBUS, &info, tsk);
615 return;
616}
Andi Kleen9e43e1b2005-11-05 17:25:54 +0100617
/* Protects pgd_list (the set of all process page table roots). */
DEFINE_SPINLOCK(pgd_lock);
/* Singly-linked list of all process PGD pages, chained via page->index. */
struct page *pgd_list;

/*
 * Copy vmalloc-area PGD entries from the reference tables (init_mm)
 * into every process page table, so contexts that must not fault
 * (e.g. notifier callbacks) never hit an unsynced vmalloc mapping.
 */
void vmalloc_sync_all(void)
{
	/* Note that races in the updates of insync and start aren't
	   problematic:
	   insync can only get set bits added, and updates to start are only
	   improving performance (without affecting correctness if undone). */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			struct page *page;

			if (pgd_ref_none := 0, pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			/* Propagate the reference entry into each process PGD. */
			for (page = pgd_list; page;
			     page = (struct page *)page->index) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
				(__START_KERNEL & PGDIR_MASK)));
}
659
/* Handler for the "pagefaulttrace" boot option: turn on per-fault logging. */
static int __init enable_pagefaulttrace(char *str)
{
	page_fault_trace = 1;
	return 1;	/* 1 tells __setup() the option was consumed */
}
__setup("pagefaulttrace", enable_pagefaulttrace);