blob: 3751b4788e288748112c05df0121a1186aac5fab [file] [log] [blame]
/*
 *  linux/arch/x86-64/mm/fault.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */
7
Linus Torvalds1da177e2005-04-16 15:20:36 -07008#include <linux/signal.h>
9#include <linux/sched.h>
10#include <linux/kernel.h>
11#include <linux/errno.h>
12#include <linux/string.h>
13#include <linux/types.h>
14#include <linux/ptrace.h>
15#include <linux/mman.h>
16#include <linux/mm.h>
17#include <linux/smp.h>
18#include <linux/smp_lock.h>
19#include <linux/interrupt.h>
20#include <linux/init.h>
21#include <linux/tty.h>
22#include <linux/vt_kern.h> /* For unblank_screen() */
23#include <linux/compiler.h>
24#include <linux/module.h>
Prasanna S Panchamukhi0f2fbdc2005-09-06 15:19:28 -070025#include <linux/kprobes.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070026
27#include <asm/system.h>
28#include <asm/uaccess.h>
29#include <asm/pgalloc.h>
30#include <asm/smp.h>
31#include <asm/tlbflush.h>
32#include <asm/proto.h>
33#include <asm/kdebug.h>
34#include <asm-generic/sections.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070035
/*
 * Bits of the page fault error code, as tested against the error_code
 * argument handed to do_page_fault().
 */
#define PF_PROT		(1<<0)		/* or no page found */
#define PF_WRITE	(1<<1)		/* the faulting access was a write */
#define PF_USER		(1<<2)		/* the access came from user mode */
#define PF_RSVD		(1<<3)		/* treated as a corrupted page table */
#define PF_INSTR	(1<<4)		/* the fault was an exec fault */

/* Notifier blocks that get called through notify_page_fault(). */
static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
Anil S Keshavamurthy1bd858a2006-06-26 00:25:25 -070044
45/* Hook to register for page fault notifications */
46int register_page_fault_notifier(struct notifier_block *nb)
47{
48 vmalloc_sync_all();
49 return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
50}
Andi Kleen273819a22006-09-26 10:52:35 +020051EXPORT_SYMBOL_GPL(register_page_fault_notifier);
Anil S Keshavamurthy1bd858a2006-06-26 00:25:25 -070052
53int unregister_page_fault_notifier(struct notifier_block *nb)
54{
55 return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
56}
Andi Kleen273819a22006-09-26 10:52:35 +020057EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
Anil S Keshavamurthy1bd858a2006-06-26 00:25:25 -070058
59static inline int notify_page_fault(enum die_val val, const char *str,
60 struct pt_regs *regs, long err, int trap, int sig)
61{
62 struct die_args args = {
63 .regs = regs,
64 .str = str,
65 .err = err,
66 .trapnr = trap,
67 .signr = sig
68 };
69 return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
70}
Anil S Keshavamurthy1bd858a2006-06-26 00:25:25 -070071
Linus Torvalds1da177e2005-04-16 15:20:36 -070072void bust_spinlocks(int yes)
73{
74 int loglevel_save = console_loglevel;
75 if (yes) {
76 oops_in_progress = 1;
77 } else {
78#ifdef CONFIG_VT
79 unblank_screen();
80#endif
81 oops_in_progress = 0;
82 /*
83 * OK, the message is on the console. Now we call printk()
84 * without oops_in_progress set so that printk will give klogd
85 * a poke. Hold onto your hats...
86 */
87 console_loglevel = 15; /* NMI oopser may have shut the console up */
88 printk(" ");
89 console_loglevel = loglevel_save;
90 }
91}
92
93/* Sometimes the CPU reports invalid exceptions on prefetch.
94 Check that here and ignore.
95 Opcode checker based on code by Richard Brunner */
96static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
97 unsigned long error_code)
98{
Andi Kleendd2994f2006-09-26 10:52:33 +020099 unsigned char __user *instr;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700100 int scan_more = 1;
101 int prefetch = 0;
Andi Kleenf1290ec2005-04-16 15:24:59 -0700102 unsigned char *max_instr;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700103
104 /* If it was a exec fault ignore */
Andi Kleen66c58152006-01-11 22:44:09 +0100105 if (error_code & PF_INSTR)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700106 return 0;
107
Andi Kleendd2994f2006-09-26 10:52:33 +0200108 instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
Andi Kleenf1290ec2005-04-16 15:24:59 -0700109 max_instr = instr + 15;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700110
Vincent Hanquez76381fe2005-06-23 00:08:46 -0700111 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700112 return 0;
113
114 while (scan_more && instr < max_instr) {
115 unsigned char opcode;
116 unsigned char instr_hi;
117 unsigned char instr_lo;
118
Andi Kleendd2994f2006-09-26 10:52:33 +0200119 if (__get_user(opcode, (char __user *)instr))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700120 break;
121
122 instr_hi = opcode & 0xf0;
123 instr_lo = opcode & 0x0f;
124 instr++;
125
126 switch (instr_hi) {
127 case 0x20:
128 case 0x30:
129 /* Values 0x26,0x2E,0x36,0x3E are valid x86
130 prefixes. In long mode, the CPU will signal
131 invalid opcode if some of these prefixes are
132 present so we will never get here anyway */
133 scan_more = ((instr_lo & 7) == 0x6);
134 break;
135
136 case 0x40:
137 /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes
138 Need to figure out under what instruction mode the
139 instruction was issued ... */
140 /* Could check the LDT for lm, but for now it's good
141 enough to assume that long mode only uses well known
142 segments or kernel. */
Vincent Hanquez76381fe2005-06-23 00:08:46 -0700143 scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700144 break;
145
146 case 0x60:
147 /* 0x64 thru 0x67 are valid prefixes in all modes. */
148 scan_more = (instr_lo & 0xC) == 0x4;
149 break;
150 case 0xF0:
151 /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
152 scan_more = !instr_lo || (instr_lo>>1) == 1;
153 break;
154 case 0x00:
155 /* Prefetch instruction is 0x0F0D or 0x0F18 */
156 scan_more = 0;
Andi Kleendd2994f2006-09-26 10:52:33 +0200157 if (__get_user(opcode, (char __user *)instr))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700158 break;
159 prefetch = (instr_lo == 0xF) &&
160 (opcode == 0x0D || opcode == 0x18);
161 break;
162 default:
163 scan_more = 0;
164 break;
165 }
166 }
167 return prefetch;
168}
169
170static int bad_address(void *p)
171{
172 unsigned long dummy;
Andi Kleendd2994f2006-09-26 10:52:33 +0200173 return __get_user(dummy, (unsigned long __user *)p);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700174}
175
176void dump_pagetable(unsigned long address)
177{
178 pgd_t *pgd;
179 pud_t *pud;
180 pmd_t *pmd;
181 pte_t *pte;
182
183 asm("movq %%cr3,%0" : "=r" (pgd));
184
185 pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
186 pgd += pgd_index(address);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700187 if (bad_address(pgd)) goto bad;
Jan Beulichd646bce2006-02-03 21:51:47 +0100188 printk("PGD %lx ", pgd_val(*pgd));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700189 if (!pgd_present(*pgd)) goto ret;
190
Andi Kleend2ae5b52006-06-26 13:57:56 +0200191 pud = pud_offset(pgd, address);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700192 if (bad_address(pud)) goto bad;
193 printk("PUD %lx ", pud_val(*pud));
194 if (!pud_present(*pud)) goto ret;
195
196 pmd = pmd_offset(pud, address);
197 if (bad_address(pmd)) goto bad;
198 printk("PMD %lx ", pmd_val(*pmd));
199 if (!pmd_present(*pmd)) goto ret;
200
201 pte = pte_offset_kernel(pmd, address);
202 if (bad_address(pte)) goto bad;
203 printk("PTE %lx", pte_val(*pte));
204ret:
205 printk("\n");
206 return;
207bad:
208 printk("BAD\n");
209}
210
211static const char errata93_warning[] =
212KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
213KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
214KERN_ERR "******* Please consider a BIOS update.\n"
215KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
216
217/* Workaround for K8 erratum #93 & buggy BIOS.
218 BIOS SMM functions are required to use a specific workaround
219 to avoid corruption of the 64bit RIP register on C stepping K8.
220 A lot of BIOS that didn't get tested properly miss this.
221 The OS sees this as a page fault with the upper 32bits of RIP cleared.
222 Try to work around it here.
223 Note we only handle faults in kernel here. */
224
225static int is_errata93(struct pt_regs *regs, unsigned long address)
226{
227 static int warned;
228 if (address != regs->rip)
229 return 0;
230 if ((address >> 32) != 0)
231 return 0;
232 address |= 0xffffffffUL << 32;
233 if ((address >= (u64)_stext && address <= (u64)_etext) ||
234 (address >= MODULES_VADDR && address <= MODULES_END)) {
235 if (!warned) {
236 printk(errata93_warning);
237 warned = 1;
238 }
239 regs->rip = address;
240 return 1;
241 }
242 return 0;
243}
244
245int unhandled_signal(struct task_struct *tsk, int sig)
246{
Sukadev Bhattiproluf400e192006-09-29 02:00:07 -0700247 if (is_init(tsk))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700248 return 1;
Andi Kleen5e5ec102005-08-19 06:56:04 +0200249 if (tsk->ptrace & PT_PTRACED)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700250 return 0;
251 return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
252 (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
253}
254
255static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
256 unsigned long error_code)
257{
Jan Beulich12091402005-09-12 18:49:24 +0200258 unsigned long flags = oops_begin();
Jan Beulich6e3f3612006-01-11 22:42:14 +0100259 struct task_struct *tsk;
Jan Beulich12091402005-09-12 18:49:24 +0200260
Linus Torvalds1da177e2005-04-16 15:20:36 -0700261 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
262 current->comm, address);
263 dump_pagetable(address);
Jan Beulich6e3f3612006-01-11 22:42:14 +0100264 tsk = current;
265 tsk->thread.cr2 = address;
266 tsk->thread.trap_no = 14;
267 tsk->thread.error_code = error_code;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700268 __die("Bad pagetable", regs, error_code);
Jan Beulich12091402005-09-12 18:49:24 +0200269 oops_end(flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700270 do_exit(SIGKILL);
271}
272
273/*
Andi Kleenf95190b2006-01-11 22:44:00 +0100274 * Handle a fault on the vmalloc area
Andi Kleen3b9ba4d2005-05-16 21:53:31 -0700275 *
276 * This assumes no large pages in there.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700277 */
278static int vmalloc_fault(unsigned long address)
279{
280 pgd_t *pgd, *pgd_ref;
281 pud_t *pud, *pud_ref;
282 pmd_t *pmd, *pmd_ref;
283 pte_t *pte, *pte_ref;
284
285 /* Copy kernel mappings over when needed. This can also
286 happen within a race in page table update. In the later
287 case just flush. */
288
289 pgd = pgd_offset(current->mm ?: &init_mm, address);
290 pgd_ref = pgd_offset_k(address);
291 if (pgd_none(*pgd_ref))
292 return -1;
293 if (pgd_none(*pgd))
294 set_pgd(pgd, *pgd_ref);
Jan Beulich8c914cb2006-03-25 16:29:40 +0100295 else
Dave McCracken46a82b22006-09-25 23:31:48 -0700296 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700297
298 /* Below here mismatches are bugs because these lower tables
299 are shared */
300
301 pud = pud_offset(pgd, address);
302 pud_ref = pud_offset(pgd_ref, address);
303 if (pud_none(*pud_ref))
304 return -1;
Dave McCracken46a82b22006-09-25 23:31:48 -0700305 if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700306 BUG();
307 pmd = pmd_offset(pud, address);
308 pmd_ref = pmd_offset(pud_ref, address);
309 if (pmd_none(*pmd_ref))
310 return -1;
311 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
312 BUG();
313 pte_ref = pte_offset_kernel(pmd_ref, address);
314 if (!pte_present(*pte_ref))
315 return -1;
316 pte = pte_offset_kernel(pmd, address);
Andi Kleen3b9ba4d2005-05-16 21:53:31 -0700317 /* Don't use pte_page here, because the mappings can point
318 outside mem_map, and the NUMA hash lookup cannot handle
319 that. */
320 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700321 BUG();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700322 return 0;
323}
324
325int page_fault_trace = 0;
326int exception_trace = 1;
327
328/*
329 * This routine handles page faults. It determines the address,
330 * and the problem, and then passes it off to one of the appropriate
331 * routines.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700332 */
Prasanna S Panchamukhi0f2fbdc2005-09-06 15:19:28 -0700333asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
334 unsigned long error_code)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700335{
336 struct task_struct *tsk;
337 struct mm_struct *mm;
338 struct vm_area_struct * vma;
339 unsigned long address;
340 const struct exception_table_entry *fixup;
341 int write;
Jan Beulich12091402005-09-12 18:49:24 +0200342 unsigned long flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700343 siginfo_t info;
344
Arjan van de Vena9ba9a32006-03-25 16:30:10 +0100345 tsk = current;
346 mm = tsk->mm;
347 prefetchw(&mm->mmap_sem);
348
Linus Torvalds1da177e2005-04-16 15:20:36 -0700349 /* get the address */
350 __asm__("movq %%cr2,%0":"=r" (address));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700351
Linus Torvalds1da177e2005-04-16 15:20:36 -0700352 info.si_code = SEGV_MAPERR;
353
354
355 /*
356 * We fault-in kernel-space virtual memory on-demand. The
357 * 'reference' page table is init_mm.pgd.
358 *
359 * NOTE! We MUST NOT take any locks for this case. We may
360 * be in an interrupt or a critical region, and should
361 * only copy the information from the master page table,
362 * nothing more.
363 *
364 * This verifies that the fault happens in kernel space
365 * (error_code & 4) == 0, and that the fault was not a
Jan Beulich8b1bde92006-01-11 22:42:23 +0100366 * protection error (error_code & 9) == 0.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700367 */
Suresh Siddha84929802005-06-21 17:14:32 -0700368 if (unlikely(address >= TASK_SIZE64)) {
Andi Kleenf95190b2006-01-11 22:44:00 +0100369 /*
370 * Don't check for the module range here: its PML4
371 * is always initialized because it's shared with the main
372 * kernel text. Only vmalloc may need PML4 syncups.
373 */
Andi Kleen66c58152006-01-11 22:44:09 +0100374 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
Andi Kleenf95190b2006-01-11 22:44:00 +0100375 ((address >= VMALLOC_START && address < VMALLOC_END))) {
Jan Beulich8c914cb2006-03-25 16:29:40 +0100376 if (vmalloc_fault(address) >= 0)
377 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700378 }
Anil S Keshavamurthy1bd858a2006-06-26 00:25:25 -0700379 if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
Jan Beulich8c914cb2006-03-25 16:29:40 +0100380 SIGSEGV) == NOTIFY_STOP)
381 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700382 /*
383 * Don't take the mm semaphore here. If we fixup a prefetch
384 * fault we could otherwise deadlock.
385 */
386 goto bad_area_nosemaphore;
387 }
388
Anil S Keshavamurthy1bd858a2006-06-26 00:25:25 -0700389 if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
Jan Beulich8c914cb2006-03-25 16:29:40 +0100390 SIGSEGV) == NOTIFY_STOP)
391 return;
392
393 if (likely(regs->eflags & X86_EFLAGS_IF))
394 local_irq_enable();
395
396 if (unlikely(page_fault_trace))
397 printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
398 regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
399
Andi Kleen66c58152006-01-11 22:44:09 +0100400 if (unlikely(error_code & PF_RSVD))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700401 pgtable_bad(address, regs, error_code);
402
403 /*
404 * If we're in an interrupt or have no user
405 * context, we must not take the fault..
406 */
407 if (unlikely(in_atomic() || !mm))
408 goto bad_area_nosemaphore;
409
410 again:
411 /* When running in the kernel we expect faults to occur only to
412 * addresses in user space. All other faults represent errors in the
413 * kernel and should generate an OOPS. Unfortunatly, in the case of an
Adrian Bunk80f72282006-06-30 18:27:16 +0200414 * erroneous fault occurring in a code path which already holds mmap_sem
Linus Torvalds1da177e2005-04-16 15:20:36 -0700415 * we will deadlock attempting to validate the fault against the
416 * address space. Luckily the kernel only validly references user
417 * space from well defined areas of code, which are listed in the
418 * exceptions table.
419 *
420 * As the vast majority of faults will be valid we will only perform
421 * the source reference check when there is a possibilty of a deadlock.
422 * Attempt to lock the address space, if we cannot we then validate the
423 * source. If this is invalid we can skip the address space check,
424 * thus avoiding the deadlock.
425 */
426 if (!down_read_trylock(&mm->mmap_sem)) {
Andi Kleen66c58152006-01-11 22:44:09 +0100427 if ((error_code & PF_USER) == 0 &&
Linus Torvalds1da177e2005-04-16 15:20:36 -0700428 !search_exception_tables(regs->rip))
429 goto bad_area_nosemaphore;
430 down_read(&mm->mmap_sem);
431 }
432
433 vma = find_vma(mm, address);
434 if (!vma)
435 goto bad_area;
436 if (likely(vma->vm_start <= address))
437 goto good_area;
438 if (!(vma->vm_flags & VM_GROWSDOWN))
439 goto bad_area;
440 if (error_code & 4) {
Chuck Ebbert03fdc2c2006-06-26 13:59:50 +0200441 /* Allow userspace just enough access below the stack pointer
442 * to let the 'enter' instruction work.
443 */
444 if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700445 goto bad_area;
446 }
447 if (expand_stack(vma, address))
448 goto bad_area;
449/*
450 * Ok, we have a good vm_area for this memory access, so
451 * we can handle it..
452 */
453good_area:
454 info.si_code = SEGV_ACCERR;
455 write = 0;
Andi Kleen66c58152006-01-11 22:44:09 +0100456 switch (error_code & (PF_PROT|PF_WRITE)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700457 default: /* 3: write, present */
458 /* fall through */
Andi Kleen66c58152006-01-11 22:44:09 +0100459 case PF_WRITE: /* write, not present */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700460 if (!(vma->vm_flags & VM_WRITE))
461 goto bad_area;
462 write++;
463 break;
Andi Kleen66c58152006-01-11 22:44:09 +0100464 case PF_PROT: /* read, present */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700465 goto bad_area;
Andi Kleen66c58152006-01-11 22:44:09 +0100466 case 0: /* read, not present */
Jason Barondf67b3d2006-09-29 01:58:58 -0700467 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700468 goto bad_area;
469 }
470
471 /*
472 * If for any reason at all we couldn't handle the fault,
473 * make sure we exit gracefully rather than endlessly redo
474 * the fault.
475 */
476 switch (handle_mm_fault(mm, vma, address, write)) {
Alexander Nyberg96800212005-08-04 16:14:57 +0200477 case VM_FAULT_MINOR:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700478 tsk->min_flt++;
479 break;
Alexander Nyberg96800212005-08-04 16:14:57 +0200480 case VM_FAULT_MAJOR:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700481 tsk->maj_flt++;
482 break;
Alexander Nyberg96800212005-08-04 16:14:57 +0200483 case VM_FAULT_SIGBUS:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700484 goto do_sigbus;
485 default:
486 goto out_of_memory;
487 }
488
489 up_read(&mm->mmap_sem);
490 return;
491
492/*
493 * Something tried to access memory that isn't in our memory map..
494 * Fix it, but check if it's kernel or user first..
495 */
496bad_area:
497 up_read(&mm->mmap_sem);
498
499bad_area_nosemaphore:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700500 /* User mode accesses just cause a SIGSEGV */
Andi Kleen66c58152006-01-11 22:44:09 +0100501 if (error_code & PF_USER) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700502 if (is_prefetch(regs, address, error_code))
503 return;
504
505 /* Work around K8 erratum #100 K8 in compat mode
506 occasionally jumps to illegal addresses >4GB. We
507 catch this here in the page fault handler because
508 these addresses are not reachable. Just detect this
509 case and return. Any code segment in LDT is
510 compatibility mode. */
511 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
512 (address >> 32))
513 return;
514
515 if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
516 printk(
517 "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
518 tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
519 tsk->comm, tsk->pid, address, regs->rip,
520 regs->rsp, error_code);
521 }
522
523 tsk->thread.cr2 = address;
524 /* Kernel addresses are always protection faults */
525 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
526 tsk->thread.trap_no = 14;
527 info.si_signo = SIGSEGV;
528 info.si_errno = 0;
529 /* info.si_code has been set above */
530 info.si_addr = (void __user *)address;
531 force_sig_info(SIGSEGV, &info, tsk);
532 return;
533 }
534
535no_context:
536
537 /* Are we prepared to handle this kernel fault? */
538 fixup = search_exception_tables(regs->rip);
539 if (fixup) {
540 regs->rip = fixup->fixup;
541 return;
542 }
543
544 /*
545 * Hall of shame of CPU/BIOS bugs.
546 */
547
548 if (is_prefetch(regs, address, error_code))
549 return;
550
551 if (is_errata93(regs, address))
552 return;
553
554/*
555 * Oops. The kernel tried to access some bad page. We'll have to
556 * terminate things with extreme prejudice.
557 */
558
Jan Beulich12091402005-09-12 18:49:24 +0200559 flags = oops_begin();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700560
561 if (address < PAGE_SIZE)
562 printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
563 else
564 printk(KERN_ALERT "Unable to handle kernel paging request");
565 printk(" at %016lx RIP: \n" KERN_ALERT,address);
566 printk_address(regs->rip);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700567 dump_pagetable(address);
Jan Beulich6e3f3612006-01-11 22:42:14 +0100568 tsk->thread.cr2 = address;
569 tsk->thread.trap_no = 14;
570 tsk->thread.error_code = error_code;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700571 __die("Oops", regs, error_code);
572 /* Executive summary in case the body of the oops scrolled away */
573 printk(KERN_EMERG "CR2: %016lx\n", address);
Jan Beulich12091402005-09-12 18:49:24 +0200574 oops_end(flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700575 do_exit(SIGKILL);
576
577/*
578 * We ran out of memory, or some other thing happened to us that made
579 * us unable to handle the page fault gracefully.
580 */
581out_of_memory:
582 up_read(&mm->mmap_sem);
Sukadev Bhattiproluf400e192006-09-29 02:00:07 -0700583 if (is_init(current)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700584 yield();
585 goto again;
586 }
587 printk("VM: killing process %s\n", tsk->comm);
588 if (error_code & 4)
589 do_exit(SIGKILL);
590 goto no_context;
591
592do_sigbus:
593 up_read(&mm->mmap_sem);
594
595 /* Kernel mode? Handle exceptions or die */
Andi Kleen66c58152006-01-11 22:44:09 +0100596 if (!(error_code & PF_USER))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700597 goto no_context;
598
599 tsk->thread.cr2 = address;
600 tsk->thread.error_code = error_code;
601 tsk->thread.trap_no = 14;
602 info.si_signo = SIGBUS;
603 info.si_errno = 0;
604 info.si_code = BUS_ADRERR;
605 info.si_addr = (void __user *)address;
606 force_sig_info(SIGBUS, &info, tsk);
607 return;
608}
Andi Kleen9e43e1b2005-11-05 17:25:54 +0100609
Jan Beulich8c914cb2006-03-25 16:29:40 +0100610DEFINE_SPINLOCK(pgd_lock);
611struct page *pgd_list;
612
613void vmalloc_sync_all(void)
614{
615 /* Note that races in the updates of insync and start aren't
616 problematic:
617 insync can only get set bits added, and updates to start are only
618 improving performance (without affecting correctness if undone). */
619 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
620 static unsigned long start = VMALLOC_START & PGDIR_MASK;
621 unsigned long address;
622
623 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
624 if (!test_bit(pgd_index(address), insync)) {
625 const pgd_t *pgd_ref = pgd_offset_k(address);
626 struct page *page;
627
628 if (pgd_none(*pgd_ref))
629 continue;
630 spin_lock(&pgd_lock);
631 for (page = pgd_list; page;
632 page = (struct page *)page->index) {
633 pgd_t *pgd;
634 pgd = (pgd_t *)page_address(page) + pgd_index(address);
635 if (pgd_none(*pgd))
636 set_pgd(pgd, *pgd_ref);
637 else
Dave McCracken46a82b22006-09-25 23:31:48 -0700638 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
Jan Beulich8c914cb2006-03-25 16:29:40 +0100639 }
640 spin_unlock(&pgd_lock);
641 set_bit(pgd_index(address), insync);
642 }
643 if (address == start)
644 start = address + PGDIR_SIZE;
645 }
646 /* Check that there is no need to do the same for the modules area. */
647 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
648 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
649 (__START_KERNEL & PGDIR_MASK)));
650}
651
Andi Kleen9e43e1b2005-11-05 17:25:54 +0100652static int __init enable_pagefaulttrace(char *str)
653{
654 page_fault_trace = 1;
OGAWA Hirofumi9b410462006-03-31 02:30:33 -0800655 return 1;
Andi Kleen9e43e1b2005-11-05 17:25:54 +0100656}
657__setup("pagefaulttrace", enable_pagefaulttrace);