blob: 0d3d5979ce2c7ef0a99935574ab1a4ef19fc92d4 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07002 * Copyright (C) 1995 Linus Torvalds
3 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
4 */
5
Linus Torvalds1da177e2005-04-16 15:20:36 -07006#include <linux/signal.h>
7#include <linux/sched.h>
8#include <linux/kernel.h>
9#include <linux/errno.h>
10#include <linux/string.h>
11#include <linux/types.h>
12#include <linux/ptrace.h>
13#include <linux/mman.h>
14#include <linux/mm.h>
15#include <linux/smp.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070016#include <linux/interrupt.h>
17#include <linux/init.h>
18#include <linux/tty.h>
19#include <linux/vt_kern.h> /* For unblank_screen() */
20#include <linux/compiler.h>
Christoph Hellwig1eeb66a2007-05-08 00:27:03 -070021#include <linux/vmalloc.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070022#include <linux/module.h>
Prasanna S Panchamukhi0f2fbdc2005-09-06 15:19:28 -070023#include <linux/kprobes.h>
Andi Kleenab2bf0c2006-12-07 02:14:06 +010024#include <linux/uaccess.h>
Christoph Hellwig1eeb66a2007-05-08 00:27:03 -070025#include <linux/kdebug.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070026
27#include <asm/system.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070028#include <asm/pgalloc.h>
29#include <asm/smp.h>
30#include <asm/tlbflush.h>
31#include <asm/proto.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070032#include <asm-generic/sections.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070033
Harvey Harrison33cb5242008-01-30 13:32:19 +010034/*
35 * Page fault error code bits
36 * bit 0 == 0 means no page found, 1 means protection fault
37 * bit 1 == 0 means read, 1 means write
38 * bit 2 == 0 means kernel, 1 means user-mode
39 * bit 3 == 1 means use of reserved bit detected
40 * bit 4 == 1 means fault was an instruction fetch
41 */
42#define PF_PROT (1<<0)
Andi Kleen66c58152006-01-11 22:44:09 +010043#define PF_WRITE (1<<1)
44#define PF_USER (1<<2)
45#define PF_RSVD (1<<3)
46#define PF_INSTR (1<<4)
47
/*
 * Offer a kernel-mode page fault to a running kprobe.
 *
 * Returns 1 when a kprobe is active on this CPU and its fault handler
 * claimed the fault (trap number 14), 0 otherwise.  Always 0 when
 * CONFIG_KPROBES is not set or the fault came from user mode.
 */
static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int handled;

	if (user_mode(regs))
		return 0;

	/* kprobe_running() needs smp_processor_id(): keep preemption off. */
	preempt_disable();
	handled = kprobe_running() && kprobe_fault_handler(regs, 14);
	preempt_enable();

	return handled;
#else
	return 0;
#endif
}
Anil S Keshavamurthy1bd858a2006-06-26 00:25:25 -070066
Harvey Harrison1dc85be2008-01-30 13:32:35 +010067#ifdef CONFIG_X86_32
68/*
69 * Return EIP plus the CS segment base. The segment limit is also
70 * adjusted, clamped to the kernel/user address space (whichever is
71 * appropriate), and returned in *eip_limit.
72 *
73 * The segment is checked, because it might have been changed by another
74 * task between the original faulting instruction and here.
75 *
76 * If CS is no longer a valid code segment, or if EIP is beyond the
77 * limit, or if it is a kernel address when CS is not a kernel segment,
78 * then the returned value will be greater than *eip_limit.
79 *
80 * This is slow, but is very rarely executed.
81 */
82static inline unsigned long get_segment_eip(struct pt_regs *regs,
83 unsigned long *eip_limit)
84{
85 unsigned long ip = regs->ip;
86 unsigned seg = regs->cs & 0xffff;
87 u32 seg_ar, seg_limit, base, *desc;
88
89 /* Unlikely, but must come before segment checks. */
90 if (unlikely(regs->flags & VM_MASK)) {
91 base = seg << 4;
92 *eip_limit = base + 0xffff;
93 return base + (ip & 0xffff);
94 }
95
96 /* The standard kernel/user address space limit. */
97 *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
98
99 /* By far the most common cases. */
100 if (likely(SEGMENT_IS_FLAT_CODE(seg)))
101 return ip;
102
103 /* Check the segment exists, is within the current LDT/GDT size,
104 that kernel/user (ring 0..3) has the appropriate privilege,
105 that it's a code segment, and get the limit. */
106 __asm__("larl %3,%0; lsll %3,%1"
107 : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
108 if ((~seg_ar & 0x9800) || ip > seg_limit) {
109 *eip_limit = 0;
110 return 1; /* So that returned ip > *eip_limit. */
111 }
112
113 /* Get the GDT/LDT descriptor base.
114 When you look for races in this code remember that
115 LDT and other horrors are only used in user space. */
116 if (seg & (1<<2)) {
117 /* Must lock the LDT while reading it. */
118 mutex_lock(&current->mm->context.lock);
119 desc = current->mm->context.ldt;
120 desc = (void *)desc + (seg & ~7);
121 } else {
122 /* Must disable preemption while reading the GDT. */
123 desc = (u32 *)get_cpu_gdt_table(get_cpu());
124 desc = (void *)desc + (seg & ~7);
125 }
126
127 /* Decode the code segment base from the descriptor */
128 base = get_desc_base((struct desc_struct *)desc);
129
130 if (seg & (1<<2))
131 mutex_unlock(&current->mm->context.lock);
132 else
133 put_cpu();
134
135 /* Adjust EIP and segment limit, and clamp at the kernel limit.
136 It's legitimate for segments to wrap at 0xffffffff. */
137 seg_limit += base;
138 if (seg_limit < *eip_limit && seg_limit >= base)
139 *eip_limit = seg_limit;
140 return ip + base;
141}
142#endif
143
144/*
145 * X86_32
146 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
147 * Check that here and ignore it.
148 *
149 * X86_64
150 * Sometimes the CPU reports invalid exceptions on prefetch.
151 * Check that here and ignore it.
152 *
153 * Opcode checker based on code by Richard Brunner
154 */
155static int is_prefetch(struct pt_regs *regs, unsigned long addr,
156 unsigned long error_code)
Harvey Harrison33cb5242008-01-30 13:32:19 +0100157{
Andi Kleenab2bf0c2006-12-07 02:14:06 +0100158 unsigned char *instr;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700159 int scan_more = 1;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100160 int prefetch = 0;
Andi Kleenf1290ec2005-04-16 15:24:59 -0700161 unsigned char *max_instr;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700162
Harvey Harrison1dc85be2008-01-30 13:32:35 +0100163#ifdef CONFIG_X86_32
164 unsigned long limit;
165 if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
166 boot_cpu_data.x86 >= 6)) {
167 /* Catch an obscure case of prefetch inside an NX page. */
168 if (nx_enabled && (error_code & PF_INSTR))
169 return 0;
170 } else {
171 return 0;
172 }
173 instr = (unsigned char *)get_segment_eip(regs, &limit);
174#else
Linus Torvalds1da177e2005-04-16 15:20:36 -0700175 /* If it was a exec fault ignore */
Andi Kleen66c58152006-01-11 22:44:09 +0100176 if (error_code & PF_INSTR)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700177 return 0;
Andi Kleendd2994f2006-09-26 10:52:33 +0200178 instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
Harvey Harrison1dc85be2008-01-30 13:32:35 +0100179#endif
180
Andi Kleenf1290ec2005-04-16 15:24:59 -0700181 max_instr = instr + 15;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700182
Harvey Harrison1dc85be2008-01-30 13:32:35 +0100183#ifdef CONFIG_X86_64
Vincent Hanquez76381fe2005-06-23 00:08:46 -0700184 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700185 return 0;
Harvey Harrison1dc85be2008-01-30 13:32:35 +0100186#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700187
Harvey Harrison33cb5242008-01-30 13:32:19 +0100188 while (scan_more && instr < max_instr) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700189 unsigned char opcode;
190 unsigned char instr_hi;
191 unsigned char instr_lo;
192
Harvey Harrison1dc85be2008-01-30 13:32:35 +0100193#ifdef CONFIG_X86_32
194 if (instr > (unsigned char *)limit)
195 break;
196#endif
Andi Kleenab2bf0c2006-12-07 02:14:06 +0100197 if (probe_kernel_address(instr, opcode))
Harvey Harrison33cb5242008-01-30 13:32:19 +0100198 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700199
Harvey Harrison33cb5242008-01-30 13:32:19 +0100200 instr_hi = opcode & 0xf0;
201 instr_lo = opcode & 0x0f;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700202 instr++;
203
Harvey Harrison33cb5242008-01-30 13:32:19 +0100204 switch (instr_hi) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700205 case 0x20:
206 case 0x30:
Harvey Harrison33cb5242008-01-30 13:32:19 +0100207 /*
208 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
209 * In X86_64 long mode, the CPU will signal invalid
210 * opcode if some of these prefixes are present so
211 * X86_64 will never get here anyway
212 */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700213 scan_more = ((instr_lo & 7) == 0x6);
214 break;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100215#ifdef CONFIG_X86_64
Linus Torvalds1da177e2005-04-16 15:20:36 -0700216 case 0x40:
Harvey Harrison33cb5242008-01-30 13:32:19 +0100217 /*
218 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
219 * Need to figure out under what instruction mode the
220 * instruction was issued. Could check the LDT for lm,
221 * but for now it's good enough to assume that long
222 * mode only uses well known segments or kernel.
223 */
Vincent Hanquez76381fe2005-06-23 00:08:46 -0700224 scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700225 break;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100226#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700227 case 0x60:
228 /* 0x64 thru 0x67 are valid prefixes in all modes. */
229 scan_more = (instr_lo & 0xC) == 0x4;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100230 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700231 case 0xF0:
Harvey Harrison1dc85be2008-01-30 13:32:35 +0100232 /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700233 scan_more = !instr_lo || (instr_lo>>1) == 1;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100234 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700235 case 0x00:
236 /* Prefetch instruction is 0x0F0D or 0x0F18 */
237 scan_more = 0;
Harvey Harrison1dc85be2008-01-30 13:32:35 +0100238#ifdef CONFIG_X86_32
239 if (instr > (unsigned char *)limit)
240 break;
241#endif
Andi Kleenab2bf0c2006-12-07 02:14:06 +0100242 if (probe_kernel_address(instr, opcode))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700243 break;
244 prefetch = (instr_lo == 0xF) &&
245 (opcode == 0x0D || opcode == 0x18);
Harvey Harrison33cb5242008-01-30 13:32:19 +0100246 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700247 default:
248 scan_more = 0;
249 break;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100250 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700251 }
252 return prefetch;
253}
254
/*
 * Probe one unsigned long at @p with probe_kernel_address() and hand back
 * its result; callers treat a non-zero return as "not readable".
 */
static int bad_address(void *p)
{
	unsigned long scratch;

	return probe_kernel_address((unsigned long *)p, scratch);
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700260
261void dump_pagetable(unsigned long address)
262{
263 pgd_t *pgd;
264 pud_t *pud;
265 pmd_t *pmd;
266 pte_t *pte;
267
Glauber de Oliveira Costaf51c9452007-07-22 11:12:29 +0200268 pgd = (pgd_t *)read_cr3();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700269
Harvey Harrison33cb5242008-01-30 13:32:19 +0100270 pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700271 pgd += pgd_index(address);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700272 if (bad_address(pgd)) goto bad;
Jan Beulichd646bce2006-02-03 21:51:47 +0100273 printk("PGD %lx ", pgd_val(*pgd));
Harvey Harrison33cb5242008-01-30 13:32:19 +0100274 if (!pgd_present(*pgd)) goto ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700275
Andi Kleend2ae5b52006-06-26 13:57:56 +0200276 pud = pud_offset(pgd, address);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700277 if (bad_address(pud)) goto bad;
278 printk("PUD %lx ", pud_val(*pud));
279 if (!pud_present(*pud)) goto ret;
280
281 pmd = pmd_offset(pud, address);
282 if (bad_address(pmd)) goto bad;
283 printk("PMD %lx ", pmd_val(*pmd));
Jan Beulichb1992df2007-10-19 20:35:03 +0200284 if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700285
286 pte = pte_offset_kernel(pmd, address);
287 if (bad_address(pte)) goto bad;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100288 printk("PTE %lx", pte_val(*pte));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700289ret:
290 printk("\n");
291 return;
292bad:
293 printk("BAD\n");
294}
295
Harvey Harrison1dc85be2008-01-30 13:32:35 +0100296#ifdef CONFIG_X86_64
Harvey Harrison33cb5242008-01-30 13:32:19 +0100297static const char errata93_warning[] =
Linus Torvalds1da177e2005-04-16 15:20:36 -0700298KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
299KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
300KERN_ERR "******* Please consider a BIOS update.\n"
301KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
302
303/* Workaround for K8 erratum #93 & buggy BIOS.
304 BIOS SMM functions are required to use a specific workaround
Harvey Harrison33cb5242008-01-30 13:32:19 +0100305 to avoid corruption of the 64bit RIP register on C stepping K8.
306 A lot of BIOS that didn't get tested properly miss this.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700307 The OS sees this as a page fault with the upper 32bits of RIP cleared.
308 Try to work around it here.
309 Note we only handle faults in kernel here. */
310
Harvey Harrison33cb5242008-01-30 13:32:19 +0100311static int is_errata93(struct pt_regs *regs, unsigned long address)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700312{
313 static int warned;
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100314 if (address != regs->ip)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700315 return 0;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100316 if ((address >> 32) != 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700317 return 0;
318 address |= 0xffffffffUL << 32;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100319 if ((address >= (u64)_stext && address <= (u64)_etext) ||
320 (address >= MODULES_VADDR && address <= MODULES_END)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700321 if (!warned) {
Harvey Harrison33cb5242008-01-30 13:32:19 +0100322 printk(errata93_warning);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700323 warned = 1;
324 }
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100325 regs->ip = address;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700326 return 1;
327 }
328 return 0;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100329}
Harvey Harrison1dc85be2008-01-30 13:32:35 +0100330#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700331
Linus Torvalds1da177e2005-04-16 15:20:36 -0700332static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
333 unsigned long error_code)
334{
Jan Beulich12091402005-09-12 18:49:24 +0200335 unsigned long flags = oops_begin();
Jan Beulich6e3f3612006-01-11 22:42:14 +0100336 struct task_struct *tsk;
Jan Beulich12091402005-09-12 18:49:24 +0200337
Linus Torvalds1da177e2005-04-16 15:20:36 -0700338 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
339 current->comm, address);
340 dump_pagetable(address);
Jan Beulich6e3f3612006-01-11 22:42:14 +0100341 tsk = current;
342 tsk->thread.cr2 = address;
343 tsk->thread.trap_no = 14;
344 tsk->thread.error_code = error_code;
Jan Beulich22f59912008-01-30 13:31:23 +0100345 if (__die("Bad pagetable", regs, error_code))
346 regs = NULL;
347 oops_end(flags, regs, SIGKILL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700348}
349
350/*
Andi Kleenf95190b2006-01-11 22:44:00 +0100351 * Handle a fault on the vmalloc area
Andi Kleen3b9ba4d2005-05-16 21:53:31 -0700352 *
353 * This assumes no large pages in there.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700354 */
355static int vmalloc_fault(unsigned long address)
356{
357 pgd_t *pgd, *pgd_ref;
358 pud_t *pud, *pud_ref;
359 pmd_t *pmd, *pmd_ref;
360 pte_t *pte, *pte_ref;
361
362 /* Copy kernel mappings over when needed. This can also
363 happen within a race in page table update. In the later
364 case just flush. */
365
366 pgd = pgd_offset(current->mm ?: &init_mm, address);
367 pgd_ref = pgd_offset_k(address);
368 if (pgd_none(*pgd_ref))
369 return -1;
370 if (pgd_none(*pgd))
371 set_pgd(pgd, *pgd_ref);
Jan Beulich8c914cb2006-03-25 16:29:40 +0100372 else
Dave McCracken46a82b22006-09-25 23:31:48 -0700373 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700374
375 /* Below here mismatches are bugs because these lower tables
376 are shared */
377
378 pud = pud_offset(pgd, address);
379 pud_ref = pud_offset(pgd_ref, address);
380 if (pud_none(*pud_ref))
381 return -1;
Dave McCracken46a82b22006-09-25 23:31:48 -0700382 if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700383 BUG();
384 pmd = pmd_offset(pud, address);
385 pmd_ref = pmd_offset(pud_ref, address);
386 if (pmd_none(*pmd_ref))
387 return -1;
388 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
389 BUG();
390 pte_ref = pte_offset_kernel(pmd_ref, address);
391 if (!pte_present(*pte_ref))
392 return -1;
393 pte = pte_offset_kernel(pmd, address);
Andi Kleen3b9ba4d2005-05-16 21:53:31 -0700394 /* Don't use pte_page here, because the mappings can point
395 outside mem_map, and the NUMA hash lookup cannot handle
396 that. */
397 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700398 BUG();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700399 return 0;
400}
401
/*
 * When non-zero, do_page_fault() may print its rate-limited
 * "segfault at ..." line before forcing SIGSEGV on a user-mode fault.
 */
int show_unhandled_signals = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700403
404/*
405 * This routine handles page faults. It determines the address,
406 * and the problem, and then passes it off to one of the appropriate
407 * routines.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700408 */
Prasanna S Panchamukhi0f2fbdc2005-09-06 15:19:28 -0700409asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
410 unsigned long error_code)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700411{
412 struct task_struct *tsk;
413 struct mm_struct *mm;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100414 struct vm_area_struct *vma;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700415 unsigned long address;
Nick Piggin83c54072007-07-19 01:47:05 -0700416 int write, fault;
Jan Beulich12091402005-09-12 18:49:24 +0200417 unsigned long flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700418 siginfo_t info;
419
Peter Zijlstra143a5d32007-10-25 14:01:10 +0200420 /*
421 * We can fault from pretty much anywhere, with unknown IRQ state.
422 */
423 trace_hardirqs_fixup();
424
Arjan van de Vena9ba9a32006-03-25 16:30:10 +0100425 tsk = current;
426 mm = tsk->mm;
427 prefetchw(&mm->mmap_sem);
428
Linus Torvalds1da177e2005-04-16 15:20:36 -0700429 /* get the address */
Glauber de Oliveira Costaf51c9452007-07-22 11:12:29 +0200430 address = read_cr2();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700431
Linus Torvalds1da177e2005-04-16 15:20:36 -0700432 info.si_code = SEGV_MAPERR;
433
434
435 /*
436 * We fault-in kernel-space virtual memory on-demand. The
437 * 'reference' page table is init_mm.pgd.
438 *
439 * NOTE! We MUST NOT take any locks for this case. We may
440 * be in an interrupt or a critical region, and should
441 * only copy the information from the master page table,
442 * nothing more.
443 *
444 * This verifies that the fault happens in kernel space
445 * (error_code & 4) == 0, and that the fault was not a
Jan Beulich8b1bde92006-01-11 22:42:23 +0100446 * protection error (error_code & 9) == 0.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700447 */
Suresh Siddha84929802005-06-21 17:14:32 -0700448 if (unlikely(address >= TASK_SIZE64)) {
Andi Kleenf95190b2006-01-11 22:44:00 +0100449 /*
450 * Don't check for the module range here: its PML4
451 * is always initialized because it's shared with the main
452 * kernel text. Only vmalloc may need PML4 syncups.
453 */
Andi Kleen66c58152006-01-11 22:44:09 +0100454 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
Andi Kleenf95190b2006-01-11 22:44:00 +0100455 ((address >= VMALLOC_START && address < VMALLOC_END))) {
Jan Beulich8c914cb2006-03-25 16:29:40 +0100456 if (vmalloc_fault(address) >= 0)
457 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700458 }
Christoph Hellwig74a0b572007-10-16 01:24:07 -0700459 if (notify_page_fault(regs))
Jan Beulich8c914cb2006-03-25 16:29:40 +0100460 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700461 /*
462 * Don't take the mm semaphore here. If we fixup a prefetch
463 * fault we could otherwise deadlock.
464 */
465 goto bad_area_nosemaphore;
466 }
467
Christoph Hellwig74a0b572007-10-16 01:24:07 -0700468 if (notify_page_fault(regs))
Jan Beulich8c914cb2006-03-25 16:29:40 +0100469 return;
470
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100471 if (likely(regs->flags & X86_EFLAGS_IF))
Jan Beulich8c914cb2006-03-25 16:29:40 +0100472 local_irq_enable();
473
Andi Kleen66c58152006-01-11 22:44:09 +0100474 if (unlikely(error_code & PF_RSVD))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700475 pgtable_bad(address, regs, error_code);
476
477 /*
Harvey Harrison33cb5242008-01-30 13:32:19 +0100478 * If we're in an interrupt, have no user context or are running in an
479 * atomic region then we must not take the fault.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700480 */
481 if (unlikely(in_atomic() || !mm))
482 goto bad_area_nosemaphore;
483
Linus Torvaldsdbe3ed12007-09-19 11:37:14 -0700484 /*
485 * User-mode registers count as a user access even for any
486 * potential system fault or CPU buglet.
487 */
488 if (user_mode_vm(regs))
489 error_code |= PF_USER;
490
Linus Torvalds1da177e2005-04-16 15:20:36 -0700491 again:
492 /* When running in the kernel we expect faults to occur only to
493 * addresses in user space. All other faults represent errors in the
Simon Arlott676b1852007-10-20 01:25:36 +0200494 * kernel and should generate an OOPS. Unfortunately, in the case of an
Adrian Bunk80f72282006-06-30 18:27:16 +0200495 * erroneous fault occurring in a code path which already holds mmap_sem
Linus Torvalds1da177e2005-04-16 15:20:36 -0700496 * we will deadlock attempting to validate the fault against the
497 * address space. Luckily the kernel only validly references user
498 * space from well defined areas of code, which are listed in the
499 * exceptions table.
500 *
501 * As the vast majority of faults will be valid we will only perform
Simon Arlott676b1852007-10-20 01:25:36 +0200502 * the source reference check when there is a possibility of a deadlock.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700503 * Attempt to lock the address space, if we cannot we then validate the
504 * source. If this is invalid we can skip the address space check,
505 * thus avoiding the deadlock.
506 */
507 if (!down_read_trylock(&mm->mmap_sem)) {
Andi Kleen66c58152006-01-11 22:44:09 +0100508 if ((error_code & PF_USER) == 0 &&
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100509 !search_exception_tables(regs->ip))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700510 goto bad_area_nosemaphore;
511 down_read(&mm->mmap_sem);
512 }
513
514 vma = find_vma(mm, address);
515 if (!vma)
516 goto bad_area;
517 if (likely(vma->vm_start <= address))
518 goto good_area;
519 if (!(vma->vm_flags & VM_GROWSDOWN))
520 goto bad_area;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100521 if (error_code & PF_USER) {
Chuck Ebbert03fdc2c2006-06-26 13:59:50 +0200522 /* Allow userspace just enough access below the stack pointer
523 * to let the 'enter' instruction work.
524 */
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100525 if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700526 goto bad_area;
527 }
528 if (expand_stack(vma, address))
529 goto bad_area;
530/*
531 * Ok, we have a good vm_area for this memory access, so
532 * we can handle it..
533 */
534good_area:
535 info.si_code = SEGV_ACCERR;
536 write = 0;
Andi Kleen66c58152006-01-11 22:44:09 +0100537 switch (error_code & (PF_PROT|PF_WRITE)) {
Harvey Harrison33cb5242008-01-30 13:32:19 +0100538 default: /* 3: write, present */
539 /* fall through */
540 case PF_WRITE: /* write, not present */
541 if (!(vma->vm_flags & VM_WRITE))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700542 goto bad_area;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100543 write++;
544 break;
545 case PF_PROT: /* read, present */
546 goto bad_area;
547 case 0: /* read, not present */
548 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
549 goto bad_area;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700550 }
551
552 /*
553 * If for any reason at all we couldn't handle the fault,
554 * make sure we exit gracefully rather than endlessly redo
555 * the fault.
556 */
Nick Piggin83c54072007-07-19 01:47:05 -0700557 fault = handle_mm_fault(mm, vma, address, write);
558 if (unlikely(fault & VM_FAULT_ERROR)) {
559 if (fault & VM_FAULT_OOM)
560 goto out_of_memory;
561 else if (fault & VM_FAULT_SIGBUS)
562 goto do_sigbus;
563 BUG();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700564 }
Nick Piggin83c54072007-07-19 01:47:05 -0700565 if (fault & VM_FAULT_MAJOR)
566 tsk->maj_flt++;
567 else
568 tsk->min_flt++;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700569 up_read(&mm->mmap_sem);
570 return;
571
572/*
573 * Something tried to access memory that isn't in our memory map..
574 * Fix it, but check if it's kernel or user first..
575 */
576bad_area:
577 up_read(&mm->mmap_sem);
578
579bad_area_nosemaphore:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700580 /* User mode accesses just cause a SIGSEGV */
Andi Kleen66c58152006-01-11 22:44:09 +0100581 if (error_code & PF_USER) {
Steven Rostedte5e3c842007-06-06 23:34:04 -0400582
583 /*
584 * It's possible to have interrupts off here.
585 */
586 local_irq_enable();
587
Linus Torvalds1da177e2005-04-16 15:20:36 -0700588 if (is_prefetch(regs, address, error_code))
589 return;
590
591 /* Work around K8 erratum #100 K8 in compat mode
592 occasionally jumps to illegal addresses >4GB. We
593 catch this here in the page fault handler because
594 these addresses are not reachable. Just detect this
595 case and return. Any code segment in LDT is
596 compatibility mode. */
597 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
598 (address >> 32))
599 return;
600
Masoud Asgharifard Sharbianiabd4f752007-07-22 11:12:28 +0200601 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
602 printk_ratelimit()) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700603 printk(
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100604 "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -0700605 tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100606 tsk->comm, tsk->pid, address, regs->ip,
607 regs->sp, error_code);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700608 }
Harvey Harrison33cb5242008-01-30 13:32:19 +0100609
Linus Torvalds1da177e2005-04-16 15:20:36 -0700610 tsk->thread.cr2 = address;
611 /* Kernel addresses are always protection faults */
612 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
613 tsk->thread.trap_no = 14;
614 info.si_signo = SIGSEGV;
615 info.si_errno = 0;
616 /* info.si_code has been set above */
617 info.si_addr = (void __user *)address;
618 force_sig_info(SIGSEGV, &info, tsk);
619 return;
620 }
621
622no_context:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700623 /* Are we prepared to handle this kernel fault? */
Harvey Harrison33cb5242008-01-30 13:32:19 +0100624 if (fixup_exception(regs))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700625 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700626
Harvey Harrison33cb5242008-01-30 13:32:19 +0100627 /*
Linus Torvalds1da177e2005-04-16 15:20:36 -0700628 * Hall of shame of CPU/BIOS bugs.
629 */
630
Harvey Harrison33cb5242008-01-30 13:32:19 +0100631 if (is_prefetch(regs, address, error_code))
632 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700633
634 if (is_errata93(regs, address))
Harvey Harrison33cb5242008-01-30 13:32:19 +0100635 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700636
637/*
638 * Oops. The kernel tried to access some bad page. We'll have to
639 * terminate things with extreme prejudice.
640 */
641
Jan Beulich12091402005-09-12 18:49:24 +0200642 flags = oops_begin();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700643
644 if (address < PAGE_SIZE)
645 printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
646 else
647 printk(KERN_ALERT "Unable to handle kernel paging request");
Harvey Harrison33cb5242008-01-30 13:32:19 +0100648 printk(" at %016lx RIP: \n" KERN_ALERT, address);
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100649 printk_address(regs->ip);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700650 dump_pagetable(address);
Jan Beulich6e3f3612006-01-11 22:42:14 +0100651 tsk->thread.cr2 = address;
652 tsk->thread.trap_no = 14;
653 tsk->thread.error_code = error_code;
Jan Beulich22f59912008-01-30 13:31:23 +0100654 if (__die("Oops", regs, error_code))
655 regs = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700656 /* Executive summary in case the body of the oops scrolled away */
657 printk(KERN_EMERG "CR2: %016lx\n", address);
Jan Beulich22f59912008-01-30 13:31:23 +0100658 oops_end(flags, regs, SIGKILL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700659
660/*
661 * We ran out of memory, or some other thing happened to us that made
662 * us unable to handle the page fault gracefully.
663 */
664out_of_memory:
665 up_read(&mm->mmap_sem);
Serge E. Hallynb460cbc2007-10-18 23:39:52 -0700666 if (is_global_init(current)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700667 yield();
668 goto again;
669 }
670 printk("VM: killing process %s\n", tsk->comm);
671 if (error_code & 4)
Will Schmidt021daae2007-07-21 17:11:17 +0200672 do_group_exit(SIGKILL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700673 goto no_context;
674
675do_sigbus:
676 up_read(&mm->mmap_sem);
677
678 /* Kernel mode? Handle exceptions or die */
Andi Kleen66c58152006-01-11 22:44:09 +0100679 if (!(error_code & PF_USER))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700680 goto no_context;
681
682 tsk->thread.cr2 = address;
683 tsk->thread.error_code = error_code;
684 tsk->thread.trap_no = 14;
685 info.si_signo = SIGBUS;
686 info.si_errno = 0;
687 info.si_code = BUS_ADRERR;
688 info.si_addr = (void __user *)address;
689 force_sig_info(SIGBUS, &info, tsk);
690 return;
691}
Andi Kleen9e43e1b2005-11-05 17:25:54 +0100692
/*
 * pgd_list is walked by vmalloc_sync_all() with pgd_lock held.  Its entries
 * are struct pages linked through ->lru; page_address() of each one is
 * used there as the base of a pgd table.
 */
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);
Jan Beulich8c914cb2006-03-25 16:29:40 +0100695
696void vmalloc_sync_all(void)
697{
Harvey Harrison33cb5242008-01-30 13:32:19 +0100698 /* Note that races in the updates of insync and start aren't
Jan Beulich8c914cb2006-03-25 16:29:40 +0100699 problematic:
700 insync can only get set bits added, and updates to start are only
701 improving performance (without affecting correctness if undone). */
702 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
703 static unsigned long start = VMALLOC_START & PGDIR_MASK;
704 unsigned long address;
705
706 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
707 if (!test_bit(pgd_index(address), insync)) {
708 const pgd_t *pgd_ref = pgd_offset_k(address);
709 struct page *page;
710
711 if (pgd_none(*pgd_ref))
712 continue;
713 spin_lock(&pgd_lock);
Christoph Lameter2bff7382007-05-02 19:27:10 +0200714 list_for_each_entry(page, &pgd_list, lru) {
Jan Beulich8c914cb2006-03-25 16:29:40 +0100715 pgd_t *pgd;
716 pgd = (pgd_t *)page_address(page) + pgd_index(address);
717 if (pgd_none(*pgd))
718 set_pgd(pgd, *pgd_ref);
719 else
Dave McCracken46a82b22006-09-25 23:31:48 -0700720 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
Jan Beulich8c914cb2006-03-25 16:29:40 +0100721 }
722 spin_unlock(&pgd_lock);
723 set_bit(pgd_index(address), insync);
724 }
725 if (address == start)
726 start = address + PGDIR_SIZE;
727 }
728 /* Check that there is no need to do the same for the modules area. */
729 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
Harvey Harrison33cb5242008-01-30 13:32:19 +0100730 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
Jan Beulich8c914cb2006-03-25 16:29:40 +0100731 (__START_KERNEL & PGDIR_MASK)));
732}