/*
 *  linux/arch/x86-64/mm/fault.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */
7
8#include <linux/config.h>
9#include <linux/signal.h>
10#include <linux/sched.h>
11#include <linux/kernel.h>
12#include <linux/errno.h>
13#include <linux/string.h>
14#include <linux/types.h>
15#include <linux/ptrace.h>
16#include <linux/mman.h>
17#include <linux/mm.h>
18#include <linux/smp.h>
19#include <linux/smp_lock.h>
20#include <linux/interrupt.h>
21#include <linux/init.h>
22#include <linux/tty.h>
23#include <linux/vt_kern.h> /* For unblank_screen() */
24#include <linux/compiler.h>
25#include <linux/module.h>
Prasanna S Panchamukhi0f2fbdc2005-09-06 15:19:28 -070026#include <linux/kprobes.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070027
28#include <asm/system.h>
29#include <asm/uaccess.h>
30#include <asm/pgalloc.h>
31#include <asm/smp.h>
32#include <asm/tlbflush.h>
33#include <asm/proto.h>
34#include <asm/kdebug.h>
35#include <asm-generic/sections.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070036
/*
 * Page fault error code bits, as tested by the handlers in this file.
 */
#define PF_PROT		(1<<0)	/* set: protection fault on a present page;
				   clear: no page found */
#define PF_WRITE	(1<<1)	/* set: write access; clear: read access */
#define PF_USER		(1<<2)	/* set: fault came from user mode */
#define PF_RSVD		(1<<3)	/* set: treated as a corrupted page table */
#define PF_INSTR	(1<<4)	/* set: fault was an instruction fetch */
43
Anil S Keshavamurthy1bd858a2006-06-26 00:25:25 -070044#ifdef CONFIG_KPROBES
45ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
46
47/* Hook to register for page fault notifications */
48int register_page_fault_notifier(struct notifier_block *nb)
49{
50 vmalloc_sync_all();
51 return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
52}
53
54int unregister_page_fault_notifier(struct notifier_block *nb)
55{
56 return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
57}
58
59static inline int notify_page_fault(enum die_val val, const char *str,
60 struct pt_regs *regs, long err, int trap, int sig)
61{
62 struct die_args args = {
63 .regs = regs,
64 .str = str,
65 .err = err,
66 .trapnr = trap,
67 .signr = sig
68 };
69 return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
70}
71#else
72static inline int notify_page_fault(enum die_val val, const char *str,
73 struct pt_regs *regs, long err, int trap, int sig)
74{
75 return NOTIFY_DONE;
76}
77#endif
78
Linus Torvalds1da177e2005-04-16 15:20:36 -070079void bust_spinlocks(int yes)
80{
81 int loglevel_save = console_loglevel;
82 if (yes) {
83 oops_in_progress = 1;
84 } else {
85#ifdef CONFIG_VT
86 unblank_screen();
87#endif
88 oops_in_progress = 0;
89 /*
90 * OK, the message is on the console. Now we call printk()
91 * without oops_in_progress set so that printk will give klogd
92 * a poke. Hold onto your hats...
93 */
94 console_loglevel = 15; /* NMI oopser may have shut the console up */
95 printk(" ");
96 console_loglevel = loglevel_save;
97 }
98}
99
100/* Sometimes the CPU reports invalid exceptions on prefetch.
101 Check that here and ignore.
102 Opcode checker based on code by Richard Brunner */
103static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
104 unsigned long error_code)
105{
Andi Kleenf1290ec2005-04-16 15:24:59 -0700106 unsigned char *instr;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700107 int scan_more = 1;
108 int prefetch = 0;
Andi Kleenf1290ec2005-04-16 15:24:59 -0700109 unsigned char *max_instr;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700110
111 /* If it was a exec fault ignore */
Andi Kleen66c58152006-01-11 22:44:09 +0100112 if (error_code & PF_INSTR)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700113 return 0;
114
Andi Kleenf1290ec2005-04-16 15:24:59 -0700115 instr = (unsigned char *)convert_rip_to_linear(current, regs);
116 max_instr = instr + 15;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700117
Vincent Hanquez76381fe2005-06-23 00:08:46 -0700118 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700119 return 0;
120
121 while (scan_more && instr < max_instr) {
122 unsigned char opcode;
123 unsigned char instr_hi;
124 unsigned char instr_lo;
125
126 if (__get_user(opcode, instr))
127 break;
128
129 instr_hi = opcode & 0xf0;
130 instr_lo = opcode & 0x0f;
131 instr++;
132
133 switch (instr_hi) {
134 case 0x20:
135 case 0x30:
136 /* Values 0x26,0x2E,0x36,0x3E are valid x86
137 prefixes. In long mode, the CPU will signal
138 invalid opcode if some of these prefixes are
139 present so we will never get here anyway */
140 scan_more = ((instr_lo & 7) == 0x6);
141 break;
142
143 case 0x40:
144 /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
145 Need to figure out under what instruction mode the
146 instruction was issued ... */
147 /* Could check the LDT for lm, but for now it's good
148 enough to assume that long mode only uses well known
149 segments or kernel. */
Vincent Hanquez76381fe2005-06-23 00:08:46 -0700150 scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700151 break;
152
153 case 0x60:
154 /* 0x64 thru 0x67 are valid prefixes in all modes. */
155 scan_more = (instr_lo & 0xC) == 0x4;
156 break;
157 case 0xF0:
158 /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
159 scan_more = !instr_lo || (instr_lo>>1) == 1;
160 break;
161 case 0x00:
162 /* Prefetch instruction is 0x0F0D or 0x0F18 */
163 scan_more = 0;
164 if (__get_user(opcode, instr))
165 break;
166 prefetch = (instr_lo == 0xF) &&
167 (opcode == 0x0D || opcode == 0x18);
168 break;
169 default:
170 scan_more = 0;
171 break;
172 }
173 }
174 return prefetch;
175}
176
/*
 * Probe whether an unsigned long can be read at @p.
 * Returns the result of __get_user(): non-zero means the read failed.
 */
static int bad_address(void *p)
{
	unsigned long dummy;
	return __get_user(dummy, (unsigned long *)p);
}
182
183void dump_pagetable(unsigned long address)
184{
185 pgd_t *pgd;
186 pud_t *pud;
187 pmd_t *pmd;
188 pte_t *pte;
189
190 asm("movq %%cr3,%0" : "=r" (pgd));
191
192 pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
193 pgd += pgd_index(address);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700194 if (bad_address(pgd)) goto bad;
Jan Beulichd646bce2006-02-03 21:51:47 +0100195 printk("PGD %lx ", pgd_val(*pgd));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700196 if (!pgd_present(*pgd)) goto ret;
197
198 pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address);
199 if (bad_address(pud)) goto bad;
200 printk("PUD %lx ", pud_val(*pud));
201 if (!pud_present(*pud)) goto ret;
202
203 pmd = pmd_offset(pud, address);
204 if (bad_address(pmd)) goto bad;
205 printk("PMD %lx ", pmd_val(*pmd));
206 if (!pmd_present(*pmd)) goto ret;
207
208 pte = pte_offset_kernel(pmd, address);
209 if (bad_address(pte)) goto bad;
210 printk("PTE %lx", pte_val(*pte));
211ret:
212 printk("\n");
213 return;
214bad:
215 printk("BAD\n");
216}
217
218static const char errata93_warning[] =
219KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
220KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
221KERN_ERR "******* Please consider a BIOS update.\n"
222KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
223
224/* Workaround for K8 erratum #93 & buggy BIOS.
225 BIOS SMM functions are required to use a specific workaround
226 to avoid corruption of the 64bit RIP register on C stepping K8.
227 A lot of BIOS that didn't get tested properly miss this.
228 The OS sees this as a page fault with the upper 32bits of RIP cleared.
229 Try to work around it here.
230 Note we only handle faults in kernel here. */
231
232static int is_errata93(struct pt_regs *regs, unsigned long address)
233{
234 static int warned;
235 if (address != regs->rip)
236 return 0;
237 if ((address >> 32) != 0)
238 return 0;
239 address |= 0xffffffffUL << 32;
240 if ((address >= (u64)_stext && address <= (u64)_etext) ||
241 (address >= MODULES_VADDR && address <= MODULES_END)) {
242 if (!warned) {
243 printk(errata93_warning);
244 warned = 1;
245 }
246 regs->rip = address;
247 return 1;
248 }
249 return 0;
250}
251
252int unhandled_signal(struct task_struct *tsk, int sig)
253{
254 if (tsk->pid == 1)
255 return 1;
Andi Kleen5e5ec102005-08-19 06:56:04 +0200256 if (tsk->ptrace & PT_PTRACED)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700257 return 0;
258 return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
259 (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
260}
261
262static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
263 unsigned long error_code)
264{
Jan Beulich12091402005-09-12 18:49:24 +0200265 unsigned long flags = oops_begin();
Jan Beulich6e3f3612006-01-11 22:42:14 +0100266 struct task_struct *tsk;
Jan Beulich12091402005-09-12 18:49:24 +0200267
Linus Torvalds1da177e2005-04-16 15:20:36 -0700268 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
269 current->comm, address);
270 dump_pagetable(address);
Jan Beulich6e3f3612006-01-11 22:42:14 +0100271 tsk = current;
272 tsk->thread.cr2 = address;
273 tsk->thread.trap_no = 14;
274 tsk->thread.error_code = error_code;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700275 __die("Bad pagetable", regs, error_code);
Jan Beulich12091402005-09-12 18:49:24 +0200276 oops_end(flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700277 do_exit(SIGKILL);
278}
279
280/*
Andi Kleenf95190b2006-01-11 22:44:00 +0100281 * Handle a fault on the vmalloc area
Andi Kleen3b9ba4d2005-05-16 21:53:31 -0700282 *
283 * This assumes no large pages in there.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700284 */
285static int vmalloc_fault(unsigned long address)
286{
287 pgd_t *pgd, *pgd_ref;
288 pud_t *pud, *pud_ref;
289 pmd_t *pmd, *pmd_ref;
290 pte_t *pte, *pte_ref;
291
292 /* Copy kernel mappings over when needed. This can also
293 happen within a race in page table update. In the later
294 case just flush. */
295
296 pgd = pgd_offset(current->mm ?: &init_mm, address);
297 pgd_ref = pgd_offset_k(address);
298 if (pgd_none(*pgd_ref))
299 return -1;
300 if (pgd_none(*pgd))
301 set_pgd(pgd, *pgd_ref);
Jan Beulich8c914cb2006-03-25 16:29:40 +0100302 else
303 BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700304
305 /* Below here mismatches are bugs because these lower tables
306 are shared */
307
308 pud = pud_offset(pgd, address);
309 pud_ref = pud_offset(pgd_ref, address);
310 if (pud_none(*pud_ref))
311 return -1;
312 if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
313 BUG();
314 pmd = pmd_offset(pud, address);
315 pmd_ref = pmd_offset(pud_ref, address);
316 if (pmd_none(*pmd_ref))
317 return -1;
318 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
319 BUG();
320 pte_ref = pte_offset_kernel(pmd_ref, address);
321 if (!pte_present(*pte_ref))
322 return -1;
323 pte = pte_offset_kernel(pmd, address);
Andi Kleen3b9ba4d2005-05-16 21:53:31 -0700324 /* Don't use pte_page here, because the mappings can point
325 outside mem_map, and the NUMA hash lookup cannot handle
326 that. */
327 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700328 BUG();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700329 return 0;
330}
331
/* Non-zero: do_page_fault() prints a trace line for each fault it handles. */
int page_fault_trace = 0;
/* Non-zero: do_page_fault() logs user segfaults that would go unhandled. */
int exception_trace = 1;
334
335/*
336 * This routine handles page faults. It determines the address,
337 * and the problem, and then passes it off to one of the appropriate
338 * routines.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700339 */
Prasanna S Panchamukhi0f2fbdc2005-09-06 15:19:28 -0700340asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
341 unsigned long error_code)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700342{
343 struct task_struct *tsk;
344 struct mm_struct *mm;
345 struct vm_area_struct * vma;
346 unsigned long address;
347 const struct exception_table_entry *fixup;
348 int write;
Jan Beulich12091402005-09-12 18:49:24 +0200349 unsigned long flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700350 siginfo_t info;
351
Arjan van de Vena9ba9a32006-03-25 16:30:10 +0100352 tsk = current;
353 mm = tsk->mm;
354 prefetchw(&mm->mmap_sem);
355
Linus Torvalds1da177e2005-04-16 15:20:36 -0700356 /* get the address */
357 __asm__("movq %%cr2,%0":"=r" (address));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700358
Linus Torvalds1da177e2005-04-16 15:20:36 -0700359 info.si_code = SEGV_MAPERR;
360
361
362 /*
363 * We fault-in kernel-space virtual memory on-demand. The
364 * 'reference' page table is init_mm.pgd.
365 *
366 * NOTE! We MUST NOT take any locks for this case. We may
367 * be in an interrupt or a critical region, and should
368 * only copy the information from the master page table,
369 * nothing more.
370 *
371 * This verifies that the fault happens in kernel space
372 * (error_code & 4) == 0, and that the fault was not a
Jan Beulich8b1bde92006-01-11 22:42:23 +0100373 * protection error (error_code & 9) == 0.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700374 */
Suresh Siddha84929802005-06-21 17:14:32 -0700375 if (unlikely(address >= TASK_SIZE64)) {
Andi Kleenf95190b2006-01-11 22:44:00 +0100376 /*
377 * Don't check for the module range here: its PML4
378 * is always initialized because it's shared with the main
379 * kernel text. Only vmalloc may need PML4 syncups.
380 */
Andi Kleen66c58152006-01-11 22:44:09 +0100381 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
Andi Kleenf95190b2006-01-11 22:44:00 +0100382 ((address >= VMALLOC_START && address < VMALLOC_END))) {
Jan Beulich8c914cb2006-03-25 16:29:40 +0100383 if (vmalloc_fault(address) >= 0)
384 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700385 }
Anil S Keshavamurthy1bd858a2006-06-26 00:25:25 -0700386 if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
Jan Beulich8c914cb2006-03-25 16:29:40 +0100387 SIGSEGV) == NOTIFY_STOP)
388 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700389 /*
390 * Don't take the mm semaphore here. If we fixup a prefetch
391 * fault we could otherwise deadlock.
392 */
393 goto bad_area_nosemaphore;
394 }
395
Anil S Keshavamurthy1bd858a2006-06-26 00:25:25 -0700396 if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
Jan Beulich8c914cb2006-03-25 16:29:40 +0100397 SIGSEGV) == NOTIFY_STOP)
398 return;
399
400 if (likely(regs->eflags & X86_EFLAGS_IF))
401 local_irq_enable();
402
403 if (unlikely(page_fault_trace))
404 printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
405 regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
406
Andi Kleen66c58152006-01-11 22:44:09 +0100407 if (unlikely(error_code & PF_RSVD))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700408 pgtable_bad(address, regs, error_code);
409
410 /*
411 * If we're in an interrupt or have no user
412 * context, we must not take the fault..
413 */
414 if (unlikely(in_atomic() || !mm))
415 goto bad_area_nosemaphore;
416
417 again:
418 /* When running in the kernel we expect faults to occur only to
419 * addresses in user space. All other faults represent errors in the
420 * kernel and should generate an OOPS. Unfortunatly, in the case of an
421 * erroneous fault occuring in a code path which already holds mmap_sem
422 * we will deadlock attempting to validate the fault against the
423 * address space. Luckily the kernel only validly references user
424 * space from well defined areas of code, which are listed in the
425 * exceptions table.
426 *
427 * As the vast majority of faults will be valid we will only perform
428 * the source reference check when there is a possibilty of a deadlock.
429 * Attempt to lock the address space, if we cannot we then validate the
430 * source. If this is invalid we can skip the address space check,
431 * thus avoiding the deadlock.
432 */
433 if (!down_read_trylock(&mm->mmap_sem)) {
Andi Kleen66c58152006-01-11 22:44:09 +0100434 if ((error_code & PF_USER) == 0 &&
Linus Torvalds1da177e2005-04-16 15:20:36 -0700435 !search_exception_tables(regs->rip))
436 goto bad_area_nosemaphore;
437 down_read(&mm->mmap_sem);
438 }
439
440 vma = find_vma(mm, address);
441 if (!vma)
442 goto bad_area;
443 if (likely(vma->vm_start <= address))
444 goto good_area;
445 if (!(vma->vm_flags & VM_GROWSDOWN))
446 goto bad_area;
447 if (error_code & 4) {
448 // XXX: align red zone size with ABI
449 if (address + 128 < regs->rsp)
450 goto bad_area;
451 }
452 if (expand_stack(vma, address))
453 goto bad_area;
454/*
455 * Ok, we have a good vm_area for this memory access, so
456 * we can handle it..
457 */
458good_area:
459 info.si_code = SEGV_ACCERR;
460 write = 0;
Andi Kleen66c58152006-01-11 22:44:09 +0100461 switch (error_code & (PF_PROT|PF_WRITE)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700462 default: /* 3: write, present */
463 /* fall through */
Andi Kleen66c58152006-01-11 22:44:09 +0100464 case PF_WRITE: /* write, not present */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700465 if (!(vma->vm_flags & VM_WRITE))
466 goto bad_area;
467 write++;
468 break;
Andi Kleen66c58152006-01-11 22:44:09 +0100469 case PF_PROT: /* read, present */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700470 goto bad_area;
Andi Kleen66c58152006-01-11 22:44:09 +0100471 case 0: /* read, not present */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700472 if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
473 goto bad_area;
474 }
475
476 /*
477 * If for any reason at all we couldn't handle the fault,
478 * make sure we exit gracefully rather than endlessly redo
479 * the fault.
480 */
481 switch (handle_mm_fault(mm, vma, address, write)) {
Alexander Nyberg96800212005-08-04 16:14:57 +0200482 case VM_FAULT_MINOR:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700483 tsk->min_flt++;
484 break;
Alexander Nyberg96800212005-08-04 16:14:57 +0200485 case VM_FAULT_MAJOR:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700486 tsk->maj_flt++;
487 break;
Alexander Nyberg96800212005-08-04 16:14:57 +0200488 case VM_FAULT_SIGBUS:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700489 goto do_sigbus;
490 default:
491 goto out_of_memory;
492 }
493
494 up_read(&mm->mmap_sem);
495 return;
496
497/*
498 * Something tried to access memory that isn't in our memory map..
499 * Fix it, but check if it's kernel or user first..
500 */
501bad_area:
502 up_read(&mm->mmap_sem);
503
504bad_area_nosemaphore:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700505 /* User mode accesses just cause a SIGSEGV */
Andi Kleen66c58152006-01-11 22:44:09 +0100506 if (error_code & PF_USER) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700507 if (is_prefetch(regs, address, error_code))
508 return;
509
510 /* Work around K8 erratum #100 K8 in compat mode
511 occasionally jumps to illegal addresses >4GB. We
512 catch this here in the page fault handler because
513 these addresses are not reachable. Just detect this
514 case and return. Any code segment in LDT is
515 compatibility mode. */
516 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
517 (address >> 32))
518 return;
519
520 if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
521 printk(
522 "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
523 tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
524 tsk->comm, tsk->pid, address, regs->rip,
525 regs->rsp, error_code);
526 }
527
528 tsk->thread.cr2 = address;
529 /* Kernel addresses are always protection faults */
530 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
531 tsk->thread.trap_no = 14;
532 info.si_signo = SIGSEGV;
533 info.si_errno = 0;
534 /* info.si_code has been set above */
535 info.si_addr = (void __user *)address;
536 force_sig_info(SIGSEGV, &info, tsk);
537 return;
538 }
539
540no_context:
541
542 /* Are we prepared to handle this kernel fault? */
543 fixup = search_exception_tables(regs->rip);
544 if (fixup) {
545 regs->rip = fixup->fixup;
546 return;
547 }
548
549 /*
550 * Hall of shame of CPU/BIOS bugs.
551 */
552
553 if (is_prefetch(regs, address, error_code))
554 return;
555
556 if (is_errata93(regs, address))
557 return;
558
559/*
560 * Oops. The kernel tried to access some bad page. We'll have to
561 * terminate things with extreme prejudice.
562 */
563
Jan Beulich12091402005-09-12 18:49:24 +0200564 flags = oops_begin();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700565
566 if (address < PAGE_SIZE)
567 printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
568 else
569 printk(KERN_ALERT "Unable to handle kernel paging request");
570 printk(" at %016lx RIP: \n" KERN_ALERT,address);
571 printk_address(regs->rip);
572 printk("\n");
573 dump_pagetable(address);
Jan Beulich6e3f3612006-01-11 22:42:14 +0100574 tsk->thread.cr2 = address;
575 tsk->thread.trap_no = 14;
576 tsk->thread.error_code = error_code;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700577 __die("Oops", regs, error_code);
578 /* Executive summary in case the body of the oops scrolled away */
579 printk(KERN_EMERG "CR2: %016lx\n", address);
Jan Beulich12091402005-09-12 18:49:24 +0200580 oops_end(flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700581 do_exit(SIGKILL);
582
583/*
584 * We ran out of memory, or some other thing happened to us that made
585 * us unable to handle the page fault gracefully.
586 */
587out_of_memory:
588 up_read(&mm->mmap_sem);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700589 if (current->pid == 1) {
590 yield();
591 goto again;
592 }
593 printk("VM: killing process %s\n", tsk->comm);
594 if (error_code & 4)
595 do_exit(SIGKILL);
596 goto no_context;
597
598do_sigbus:
599 up_read(&mm->mmap_sem);
600
601 /* Kernel mode? Handle exceptions or die */
Andi Kleen66c58152006-01-11 22:44:09 +0100602 if (!(error_code & PF_USER))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700603 goto no_context;
604
605 tsk->thread.cr2 = address;
606 tsk->thread.error_code = error_code;
607 tsk->thread.trap_no = 14;
608 info.si_signo = SIGBUS;
609 info.si_errno = 0;
610 info.si_code = BUS_ADRERR;
611 info.si_addr = (void __user *)address;
612 force_sig_info(SIGBUS, &info, tsk);
613 return;
614}
Andi Kleen9e43e1b2005-11-05 17:25:54 +0100615
/* pgd_lock is held by vmalloc_sync_all() while it walks pgd_list. */
DEFINE_SPINLOCK(pgd_lock);
/* Head of the list of pgd pages, linked through page->index. */
struct page *pgd_list;
618
619void vmalloc_sync_all(void)
620{
621 /* Note that races in the updates of insync and start aren't
622 problematic:
623 insync can only get set bits added, and updates to start are only
624 improving performance (without affecting correctness if undone). */
625 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
626 static unsigned long start = VMALLOC_START & PGDIR_MASK;
627 unsigned long address;
628
629 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
630 if (!test_bit(pgd_index(address), insync)) {
631 const pgd_t *pgd_ref = pgd_offset_k(address);
632 struct page *page;
633
634 if (pgd_none(*pgd_ref))
635 continue;
636 spin_lock(&pgd_lock);
637 for (page = pgd_list; page;
638 page = (struct page *)page->index) {
639 pgd_t *pgd;
640 pgd = (pgd_t *)page_address(page) + pgd_index(address);
641 if (pgd_none(*pgd))
642 set_pgd(pgd, *pgd_ref);
643 else
644 BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
645 }
646 spin_unlock(&pgd_lock);
647 set_bit(pgd_index(address), insync);
648 }
649 if (address == start)
650 start = address + PGDIR_SIZE;
651 }
652 /* Check that there is no need to do the same for the modules area. */
653 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
654 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
655 (__START_KERNEL & PGDIR_MASK)));
656}
657
Andi Kleen9e43e1b2005-11-05 17:25:54 +0100658static int __init enable_pagefaulttrace(char *str)
659{
660 page_fault_trace = 1;
OGAWA Hirofumi9b410462006-03-31 02:30:33 -0800661 return 1;
Andi Kleen9e43e1b2005-11-05 17:25:54 +0100662}
663__setup("pagefaulttrace", enable_pagefaulttrace);