/*
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */
5
Linus Torvalds1da177e2005-04-16 15:20:36 -07006#include <linux/signal.h>
7#include <linux/sched.h>
8#include <linux/kernel.h>
9#include <linux/errno.h>
10#include <linux/string.h>
11#include <linux/types.h>
12#include <linux/ptrace.h>
13#include <linux/mman.h>
14#include <linux/mm.h>
15#include <linux/smp.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070016#include <linux/interrupt.h>
17#include <linux/init.h>
18#include <linux/tty.h>
19#include <linux/vt_kern.h> /* For unblank_screen() */
20#include <linux/compiler.h>
Christoph Hellwig1eeb66a2007-05-08 00:27:03 -070021#include <linux/vmalloc.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070022#include <linux/module.h>
Prasanna S Panchamukhi0f2fbdc2005-09-06 15:19:28 -070023#include <linux/kprobes.h>
Andi Kleenab2bf0c2006-12-07 02:14:06 +010024#include <linux/uaccess.h>
Christoph Hellwig1eeb66a2007-05-08 00:27:03 -070025#include <linux/kdebug.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070026
27#include <asm/system.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070028#include <asm/pgalloc.h>
29#include <asm/smp.h>
30#include <asm/tlbflush.h>
31#include <asm/proto.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070032#include <asm-generic/sections.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070033
/*
 * Page fault error code bits:
 *
 *   bit 0 (PF_PROT):  0 = no page found,  1 = protection fault
 *   bit 1 (PF_WRITE): 0 = read access,    1 = write access
 *   bit 2 (PF_USER):  0 = kernel mode,    1 = user mode
 *   bit 3 (PF_RSVD):  1 = use of a reserved bit was detected
 *   bit 4 (PF_INSTR): 1 = fault was an instruction fetch
 */
#define PF_PROT		(1 << 0)
#define PF_WRITE	(1 << 1)
#define PF_USER		(1 << 2)
#define PF_RSVD		(1 << 3)
#define PF_INSTR	(1 << 4)
47
/*
 * Offer a page fault to kprobes.
 *
 * Returns 1 when a kprobe is running on this CPU and its fault handler
 * claimed the fault (trap number 14), 0 otherwise.  Faults taken in user
 * mode, and kernels built without CONFIG_KPROBES, always yield 0.
 */
static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int handled;

	if (user_mode(regs))
		return 0;

	/* kprobe_running() needs smp_processor_id(), so keep preemption off */
	preempt_disable();
	handled = kprobe_running() && kprobe_fault_handler(regs, 14);
	preempt_enable();

	return handled;
#else
	return 0;
#endif
}
Anil S Keshavamurthy1bd858a2006-06-26 00:25:25 -070066
#ifdef CONFIG_X86_32
/*
 * Compute the linear address of the faulting instruction: EIP plus the
 * base of the CS segment.  The usable limit is stored in *eip_limit; it
 * is the segment limit adjusted by the base and clamped to the kernel or
 * user address space limit, whichever applies.
 *
 * The segment is re-validated here because it might have been changed
 * by another task between the original faulting instruction and now.
 *
 * When CS is no longer a valid code segment, when EIP lies beyond the
 * segment limit, or when the address is a kernel address while CS is not
 * a kernel segment, the value returned is greater than *eip_limit.
 *
 * This is slow, but is very rarely executed.
 */
static inline unsigned long get_segment_eip(struct pt_regs *regs,
					    unsigned long *eip_limit)
{
	unsigned long eip = regs->ip;
	unsigned sel = regs->cs & 0xffff;
	int in_ldt = (sel & (1<<2)) != 0;
	u32 access, limit, seg_base, *entry;

	/* vm86 mode: unlikely, but must be handled before segment checks. */
	if (unlikely(regs->flags & VM_MASK)) {
		seg_base = sel << 4;
		*eip_limit = seg_base + 0xffff;
		return seg_base + (eip & 0xffff);
	}

	/* Start from the standard kernel/user address space limit. */
	if (user_mode(regs))
		*eip_limit = USER_DS.seg;
	else
		*eip_limit = KERNEL_DS.seg;

	/* Flat code segments are by far the most common case. */
	if (likely(SEGMENT_IS_FLAT_CODE(sel)))
		return eip;

	/*
	 * Check the segment exists, is within the current LDT/GDT size,
	 * that kernel/user (ring 0..3) has the appropriate privilege,
	 * that it's a code segment, and fetch the limit.
	 */
	__asm__("larl %3,%0; lsll %3,%1"
		: "=&r" (access), "=r" (limit) : "0" (0), "rm" (sel));
	if ((~access & 0x9800) || eip > limit) {
		*eip_limit = 0;
		return 1;	/* So that returned ip > *eip_limit. */
	}

	/*
	 * Locate the descriptor in the LDT or GDT.
	 * When you look for races in this code remember that
	 * LDT and other horrors are only used in user space.
	 */
	if (in_ldt) {
		/* The LDT must stay locked while it is read. */
		mutex_lock(&current->mm->context.lock);
		entry = (void *)current->mm->context.ldt + (sel & ~7);
	} else {
		/* get_cpu() keeps preemption off while the GDT is read. */
		entry = (void *)get_cpu_gdt_table(get_cpu()) + (sel & ~7);
	}

	/* Decode the code segment base from the descriptor. */
	seg_base = get_desc_base((struct desc_struct *)entry);

	if (in_ldt)
		mutex_unlock(&current->mm->context.lock);
	else
		put_cpu();

	/*
	 * Adjust EIP and segment limit, and clamp at the kernel limit.
	 * It's legitimate for segments to wrap at 0xffffffff.
	 */
	limit += seg_base;
	if (limit < *eip_limit && limit >= seg_base)
		*eip_limit = limit;
	return eip + seg_base;
}
#endif
143
144/*
145 * X86_32
146 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
147 * Check that here and ignore it.
148 *
149 * X86_64
150 * Sometimes the CPU reports invalid exceptions on prefetch.
151 * Check that here and ignore it.
152 *
153 * Opcode checker based on code by Richard Brunner
154 */
155static int is_prefetch(struct pt_regs *regs, unsigned long addr,
156 unsigned long error_code)
Harvey Harrison33cb5242008-01-30 13:32:19 +0100157{
Andi Kleenab2bf0c2006-12-07 02:14:06 +0100158 unsigned char *instr;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700159 int scan_more = 1;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100160 int prefetch = 0;
Andi Kleenf1290ec2005-04-16 15:24:59 -0700161 unsigned char *max_instr;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700162
Harvey Harrison1dc85be2008-01-30 13:32:35 +0100163#ifdef CONFIG_X86_32
164 unsigned long limit;
165 if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
166 boot_cpu_data.x86 >= 6)) {
167 /* Catch an obscure case of prefetch inside an NX page. */
168 if (nx_enabled && (error_code & PF_INSTR))
169 return 0;
170 } else {
171 return 0;
172 }
173 instr = (unsigned char *)get_segment_eip(regs, &limit);
174#else
Linus Torvalds1da177e2005-04-16 15:20:36 -0700175 /* If it was a exec fault ignore */
Andi Kleen66c58152006-01-11 22:44:09 +0100176 if (error_code & PF_INSTR)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700177 return 0;
Andi Kleendd2994f2006-09-26 10:52:33 +0200178 instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
Harvey Harrison1dc85be2008-01-30 13:32:35 +0100179#endif
180
Andi Kleenf1290ec2005-04-16 15:24:59 -0700181 max_instr = instr + 15;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700182
Harvey Harrison1dc85be2008-01-30 13:32:35 +0100183#ifdef CONFIG_X86_64
Vincent Hanquez76381fe2005-06-23 00:08:46 -0700184 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700185 return 0;
Harvey Harrison1dc85be2008-01-30 13:32:35 +0100186#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700187
Harvey Harrison33cb5242008-01-30 13:32:19 +0100188 while (scan_more && instr < max_instr) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700189 unsigned char opcode;
190 unsigned char instr_hi;
191 unsigned char instr_lo;
192
Harvey Harrison1dc85be2008-01-30 13:32:35 +0100193#ifdef CONFIG_X86_32
194 if (instr > (unsigned char *)limit)
195 break;
196#endif
Andi Kleenab2bf0c2006-12-07 02:14:06 +0100197 if (probe_kernel_address(instr, opcode))
Harvey Harrison33cb5242008-01-30 13:32:19 +0100198 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700199
Harvey Harrison33cb5242008-01-30 13:32:19 +0100200 instr_hi = opcode & 0xf0;
201 instr_lo = opcode & 0x0f;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700202 instr++;
203
Harvey Harrison33cb5242008-01-30 13:32:19 +0100204 switch (instr_hi) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700205 case 0x20:
206 case 0x30:
Harvey Harrison33cb5242008-01-30 13:32:19 +0100207 /*
208 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
209 * In X86_64 long mode, the CPU will signal invalid
210 * opcode if some of these prefixes are present so
211 * X86_64 will never get here anyway
212 */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700213 scan_more = ((instr_lo & 7) == 0x6);
214 break;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100215#ifdef CONFIG_X86_64
Linus Torvalds1da177e2005-04-16 15:20:36 -0700216 case 0x40:
Harvey Harrison33cb5242008-01-30 13:32:19 +0100217 /*
218 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
219 * Need to figure out under what instruction mode the
220 * instruction was issued. Could check the LDT for lm,
221 * but for now it's good enough to assume that long
222 * mode only uses well known segments or kernel.
223 */
Vincent Hanquez76381fe2005-06-23 00:08:46 -0700224 scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700225 break;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100226#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700227 case 0x60:
228 /* 0x64 thru 0x67 are valid prefixes in all modes. */
229 scan_more = (instr_lo & 0xC) == 0x4;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100230 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700231 case 0xF0:
Harvey Harrison1dc85be2008-01-30 13:32:35 +0100232 /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700233 scan_more = !instr_lo || (instr_lo>>1) == 1;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100234 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700235 case 0x00:
236 /* Prefetch instruction is 0x0F0D or 0x0F18 */
237 scan_more = 0;
Harvey Harrison1dc85be2008-01-30 13:32:35 +0100238#ifdef CONFIG_X86_32
239 if (instr > (unsigned char *)limit)
240 break;
241#endif
Andi Kleenab2bf0c2006-12-07 02:14:06 +0100242 if (probe_kernel_address(instr, opcode))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700243 break;
244 prefetch = (instr_lo == 0xF) &&
245 (opcode == 0x0D || opcode == 0x18);
Harvey Harrison33cb5242008-01-30 13:32:19 +0100246 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700247 default:
248 scan_more = 0;
249 break;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100250 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700251 }
252 return prefetch;
253}
254
Harvey Harrisonc4aba4a2008-01-30 13:32:35 +0100255static void force_sig_info_fault(int si_signo, int si_code,
256 unsigned long address, struct task_struct *tsk)
257{
258 siginfo_t info;
259
260 info.si_signo = si_signo;
261 info.si_errno = 0;
262 info.si_code = si_code;
263 info.si_addr = (void __user *)address;
264 force_sig_info(si_signo, &info, tsk);
265}
266
/*
 * Probe whether one unsigned long can be read at @p.
 * Returns the result of probe_kernel_address(): non-zero when the
 * read failed.
 */
static int bad_address(void *p)
{
	unsigned long scratch;

	return probe_kernel_address((unsigned long *)p, scratch);
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700272
273void dump_pagetable(unsigned long address)
274{
275 pgd_t *pgd;
276 pud_t *pud;
277 pmd_t *pmd;
278 pte_t *pte;
279
Glauber de Oliveira Costaf51c9452007-07-22 11:12:29 +0200280 pgd = (pgd_t *)read_cr3();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700281
Harvey Harrison33cb5242008-01-30 13:32:19 +0100282 pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700283 pgd += pgd_index(address);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700284 if (bad_address(pgd)) goto bad;
Jan Beulichd646bce2006-02-03 21:51:47 +0100285 printk("PGD %lx ", pgd_val(*pgd));
Harvey Harrison33cb5242008-01-30 13:32:19 +0100286 if (!pgd_present(*pgd)) goto ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700287
Andi Kleend2ae5b52006-06-26 13:57:56 +0200288 pud = pud_offset(pgd, address);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700289 if (bad_address(pud)) goto bad;
290 printk("PUD %lx ", pud_val(*pud));
291 if (!pud_present(*pud)) goto ret;
292
293 pmd = pmd_offset(pud, address);
294 if (bad_address(pmd)) goto bad;
295 printk("PMD %lx ", pmd_val(*pmd));
Jan Beulichb1992df2007-10-19 20:35:03 +0200296 if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700297
298 pte = pte_offset_kernel(pmd, address);
299 if (bad_address(pte)) goto bad;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100300 printk("PTE %lx", pte_val(*pte));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700301ret:
302 printk("\n");
303 return;
304bad:
305 printk("BAD\n");
306}
307
#ifdef CONFIG_X86_64
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";

/*
 * Workaround for K8 erratum #93 & buggy BIOS.
 *
 * BIOS SMM functions are required to use a specific workaround to avoid
 * corruption of the 64bit RIP register on C stepping K8.  A lot of
 * BIOSes that didn't get tested properly miss this.  The OS sees the
 * result as a page fault with the upper 32 bits of RIP cleared.
 *
 * If the fault address equals the IP, fits in 32 bits, and setting the
 * upper 32 bits puts it inside the kernel text or the module area, the
 * IP is repaired in @regs, a warning is printed once, and 1 is returned.
 * Otherwise 0 is returned and @regs is left alone.
 *
 * Note we only handle faults in kernel here.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
	static int warned;
	unsigned long fixed;
	int in_text, in_modules;

	if (address != regs->ip || (address >> 32) != 0)
		return 0;

	fixed = address | (0xffffffffUL << 32);
	in_text = fixed >= (u64)_stext && fixed <= (u64)_etext;
	in_modules = fixed >= MODULES_VADDR && fixed <= MODULES_END;
	if (!in_text && !in_modules)
		return 0;

	if (!warned) {
		printk(errata93_warning);
		warned = 1;
	}
	regs->ip = fixed;
	return 1;
}
#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700343
Linus Torvalds1da177e2005-04-16 15:20:36 -0700344static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
345 unsigned long error_code)
346{
Jan Beulich12091402005-09-12 18:49:24 +0200347 unsigned long flags = oops_begin();
Jan Beulich6e3f3612006-01-11 22:42:14 +0100348 struct task_struct *tsk;
Jan Beulich12091402005-09-12 18:49:24 +0200349
Linus Torvalds1da177e2005-04-16 15:20:36 -0700350 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
351 current->comm, address);
352 dump_pagetable(address);
Jan Beulich6e3f3612006-01-11 22:42:14 +0100353 tsk = current;
354 tsk->thread.cr2 = address;
355 tsk->thread.trap_no = 14;
356 tsk->thread.error_code = error_code;
Jan Beulich22f59912008-01-30 13:31:23 +0100357 if (__die("Bad pagetable", regs, error_code))
358 regs = NULL;
359 oops_end(flags, regs, SIGKILL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700360}
361
362/*
Andi Kleenf95190b2006-01-11 22:44:00 +0100363 * Handle a fault on the vmalloc area
Andi Kleen3b9ba4d2005-05-16 21:53:31 -0700364 *
365 * This assumes no large pages in there.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700366 */
367static int vmalloc_fault(unsigned long address)
368{
369 pgd_t *pgd, *pgd_ref;
370 pud_t *pud, *pud_ref;
371 pmd_t *pmd, *pmd_ref;
372 pte_t *pte, *pte_ref;
373
374 /* Copy kernel mappings over when needed. This can also
375 happen within a race in page table update. In the later
376 case just flush. */
377
378 pgd = pgd_offset(current->mm ?: &init_mm, address);
379 pgd_ref = pgd_offset_k(address);
380 if (pgd_none(*pgd_ref))
381 return -1;
382 if (pgd_none(*pgd))
383 set_pgd(pgd, *pgd_ref);
Jan Beulich8c914cb2006-03-25 16:29:40 +0100384 else
Dave McCracken46a82b22006-09-25 23:31:48 -0700385 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700386
387 /* Below here mismatches are bugs because these lower tables
388 are shared */
389
390 pud = pud_offset(pgd, address);
391 pud_ref = pud_offset(pgd_ref, address);
392 if (pud_none(*pud_ref))
393 return -1;
Dave McCracken46a82b22006-09-25 23:31:48 -0700394 if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700395 BUG();
396 pmd = pmd_offset(pud, address);
397 pmd_ref = pmd_offset(pud_ref, address);
398 if (pmd_none(*pmd_ref))
399 return -1;
400 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
401 BUG();
402 pte_ref = pte_offset_kernel(pmd_ref, address);
403 if (!pte_present(*pte_ref))
404 return -1;
405 pte = pte_offset_kernel(pmd, address);
Andi Kleen3b9ba4d2005-05-16 21:53:31 -0700406 /* Don't use pte_page here, because the mappings can point
407 outside mem_map, and the NUMA hash lookup cannot handle
408 that. */
409 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700410 BUG();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700411 return 0;
412}
413
Masoud Asgharifard Sharbianiabd4f752007-07-22 11:12:28 +0200414int show_unhandled_signals = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700415
416/*
417 * This routine handles page faults. It determines the address,
418 * and the problem, and then passes it off to one of the appropriate
419 * routines.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700420 */
Prasanna S Panchamukhi0f2fbdc2005-09-06 15:19:28 -0700421asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
422 unsigned long error_code)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700423{
424 struct task_struct *tsk;
425 struct mm_struct *mm;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100426 struct vm_area_struct *vma;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700427 unsigned long address;
Nick Piggin83c54072007-07-19 01:47:05 -0700428 int write, fault;
Jan Beulich12091402005-09-12 18:49:24 +0200429 unsigned long flags;
Harvey Harrisonc4aba4a2008-01-30 13:32:35 +0100430 int si_code;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700431
Peter Zijlstra143a5d32007-10-25 14:01:10 +0200432 /*
433 * We can fault from pretty much anywhere, with unknown IRQ state.
434 */
435 trace_hardirqs_fixup();
436
Arjan van de Vena9ba9a32006-03-25 16:30:10 +0100437 tsk = current;
438 mm = tsk->mm;
439 prefetchw(&mm->mmap_sem);
440
Linus Torvalds1da177e2005-04-16 15:20:36 -0700441 /* get the address */
Glauber de Oliveira Costaf51c9452007-07-22 11:12:29 +0200442 address = read_cr2();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700443
Harvey Harrisonc4aba4a2008-01-30 13:32:35 +0100444 si_code = SEGV_MAPERR;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700445
446
447 /*
448 * We fault-in kernel-space virtual memory on-demand. The
449 * 'reference' page table is init_mm.pgd.
450 *
451 * NOTE! We MUST NOT take any locks for this case. We may
452 * be in an interrupt or a critical region, and should
453 * only copy the information from the master page table,
454 * nothing more.
455 *
456 * This verifies that the fault happens in kernel space
457 * (error_code & 4) == 0, and that the fault was not a
Jan Beulich8b1bde92006-01-11 22:42:23 +0100458 * protection error (error_code & 9) == 0.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700459 */
Suresh Siddha84929802005-06-21 17:14:32 -0700460 if (unlikely(address >= TASK_SIZE64)) {
Andi Kleenf95190b2006-01-11 22:44:00 +0100461 /*
462 * Don't check for the module range here: its PML4
463 * is always initialized because it's shared with the main
464 * kernel text. Only vmalloc may need PML4 syncups.
465 */
Andi Kleen66c58152006-01-11 22:44:09 +0100466 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
Andi Kleenf95190b2006-01-11 22:44:00 +0100467 ((address >= VMALLOC_START && address < VMALLOC_END))) {
Jan Beulich8c914cb2006-03-25 16:29:40 +0100468 if (vmalloc_fault(address) >= 0)
469 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700470 }
Christoph Hellwig74a0b572007-10-16 01:24:07 -0700471 if (notify_page_fault(regs))
Jan Beulich8c914cb2006-03-25 16:29:40 +0100472 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700473 /*
474 * Don't take the mm semaphore here. If we fixup a prefetch
475 * fault we could otherwise deadlock.
476 */
477 goto bad_area_nosemaphore;
478 }
479
Christoph Hellwig74a0b572007-10-16 01:24:07 -0700480 if (notify_page_fault(regs))
Jan Beulich8c914cb2006-03-25 16:29:40 +0100481 return;
482
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100483 if (likely(regs->flags & X86_EFLAGS_IF))
Jan Beulich8c914cb2006-03-25 16:29:40 +0100484 local_irq_enable();
485
Andi Kleen66c58152006-01-11 22:44:09 +0100486 if (unlikely(error_code & PF_RSVD))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700487 pgtable_bad(address, regs, error_code);
488
489 /*
Harvey Harrison33cb5242008-01-30 13:32:19 +0100490 * If we're in an interrupt, have no user context or are running in an
491 * atomic region then we must not take the fault.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700492 */
493 if (unlikely(in_atomic() || !mm))
494 goto bad_area_nosemaphore;
495
Linus Torvaldsdbe3ed12007-09-19 11:37:14 -0700496 /*
497 * User-mode registers count as a user access even for any
498 * potential system fault or CPU buglet.
499 */
500 if (user_mode_vm(regs))
501 error_code |= PF_USER;
502
Linus Torvalds1da177e2005-04-16 15:20:36 -0700503 again:
504 /* When running in the kernel we expect faults to occur only to
505 * addresses in user space. All other faults represent errors in the
Simon Arlott676b1852007-10-20 01:25:36 +0200506 * kernel and should generate an OOPS. Unfortunately, in the case of an
Adrian Bunk80f72282006-06-30 18:27:16 +0200507 * erroneous fault occurring in a code path which already holds mmap_sem
Linus Torvalds1da177e2005-04-16 15:20:36 -0700508 * we will deadlock attempting to validate the fault against the
509 * address space. Luckily the kernel only validly references user
510 * space from well defined areas of code, which are listed in the
511 * exceptions table.
512 *
513 * As the vast majority of faults will be valid we will only perform
Simon Arlott676b1852007-10-20 01:25:36 +0200514 * the source reference check when there is a possibility of a deadlock.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700515 * Attempt to lock the address space, if we cannot we then validate the
516 * source. If this is invalid we can skip the address space check,
517 * thus avoiding the deadlock.
518 */
519 if (!down_read_trylock(&mm->mmap_sem)) {
Andi Kleen66c58152006-01-11 22:44:09 +0100520 if ((error_code & PF_USER) == 0 &&
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100521 !search_exception_tables(regs->ip))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700522 goto bad_area_nosemaphore;
523 down_read(&mm->mmap_sem);
524 }
525
526 vma = find_vma(mm, address);
527 if (!vma)
528 goto bad_area;
529 if (likely(vma->vm_start <= address))
530 goto good_area;
531 if (!(vma->vm_flags & VM_GROWSDOWN))
532 goto bad_area;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100533 if (error_code & PF_USER) {
Chuck Ebbert03fdc2c2006-06-26 13:59:50 +0200534 /* Allow userspace just enough access below the stack pointer
535 * to let the 'enter' instruction work.
536 */
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100537 if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700538 goto bad_area;
539 }
540 if (expand_stack(vma, address))
541 goto bad_area;
542/*
543 * Ok, we have a good vm_area for this memory access, so
544 * we can handle it..
545 */
546good_area:
Harvey Harrisonc4aba4a2008-01-30 13:32:35 +0100547 si_code = SEGV_ACCERR;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700548 write = 0;
Andi Kleen66c58152006-01-11 22:44:09 +0100549 switch (error_code & (PF_PROT|PF_WRITE)) {
Harvey Harrison33cb5242008-01-30 13:32:19 +0100550 default: /* 3: write, present */
551 /* fall through */
552 case PF_WRITE: /* write, not present */
553 if (!(vma->vm_flags & VM_WRITE))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700554 goto bad_area;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100555 write++;
556 break;
557 case PF_PROT: /* read, present */
558 goto bad_area;
559 case 0: /* read, not present */
560 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
561 goto bad_area;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700562 }
563
564 /*
565 * If for any reason at all we couldn't handle the fault,
566 * make sure we exit gracefully rather than endlessly redo
567 * the fault.
568 */
Nick Piggin83c54072007-07-19 01:47:05 -0700569 fault = handle_mm_fault(mm, vma, address, write);
570 if (unlikely(fault & VM_FAULT_ERROR)) {
571 if (fault & VM_FAULT_OOM)
572 goto out_of_memory;
573 else if (fault & VM_FAULT_SIGBUS)
574 goto do_sigbus;
575 BUG();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700576 }
Nick Piggin83c54072007-07-19 01:47:05 -0700577 if (fault & VM_FAULT_MAJOR)
578 tsk->maj_flt++;
579 else
580 tsk->min_flt++;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700581 up_read(&mm->mmap_sem);
582 return;
583
584/*
585 * Something tried to access memory that isn't in our memory map..
586 * Fix it, but check if it's kernel or user first..
587 */
588bad_area:
589 up_read(&mm->mmap_sem);
590
591bad_area_nosemaphore:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700592 /* User mode accesses just cause a SIGSEGV */
Andi Kleen66c58152006-01-11 22:44:09 +0100593 if (error_code & PF_USER) {
Steven Rostedte5e3c842007-06-06 23:34:04 -0400594
595 /*
596 * It's possible to have interrupts off here.
597 */
598 local_irq_enable();
599
Linus Torvalds1da177e2005-04-16 15:20:36 -0700600 if (is_prefetch(regs, address, error_code))
601 return;
602
603 /* Work around K8 erratum #100 K8 in compat mode
604 occasionally jumps to illegal addresses >4GB. We
605 catch this here in the page fault handler because
606 these addresses are not reachable. Just detect this
607 case and return. Any code segment in LDT is
608 compatibility mode. */
609 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
610 (address >> 32))
611 return;
612
Masoud Asgharifard Sharbianiabd4f752007-07-22 11:12:28 +0200613 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
614 printk_ratelimit()) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700615 printk(
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100616 "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -0700617 tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100618 tsk->comm, tsk->pid, address, regs->ip,
619 regs->sp, error_code);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700620 }
Harvey Harrison33cb5242008-01-30 13:32:19 +0100621
Linus Torvalds1da177e2005-04-16 15:20:36 -0700622 tsk->thread.cr2 = address;
623 /* Kernel addresses are always protection faults */
624 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
625 tsk->thread.trap_no = 14;
Harvey Harrisonc4aba4a2008-01-30 13:32:35 +0100626
627 force_sig_info_fault(SIGSEGV, si_code, address, tsk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700628 return;
629 }
630
631no_context:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700632 /* Are we prepared to handle this kernel fault? */
Harvey Harrison33cb5242008-01-30 13:32:19 +0100633 if (fixup_exception(regs))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700634 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700635
Harvey Harrison33cb5242008-01-30 13:32:19 +0100636 /*
Linus Torvalds1da177e2005-04-16 15:20:36 -0700637 * Hall of shame of CPU/BIOS bugs.
638 */
639
Harvey Harrison33cb5242008-01-30 13:32:19 +0100640 if (is_prefetch(regs, address, error_code))
641 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700642
643 if (is_errata93(regs, address))
Harvey Harrison33cb5242008-01-30 13:32:19 +0100644 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700645
646/*
647 * Oops. The kernel tried to access some bad page. We'll have to
648 * terminate things with extreme prejudice.
649 */
650
Jan Beulich12091402005-09-12 18:49:24 +0200651 flags = oops_begin();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700652
653 if (address < PAGE_SIZE)
654 printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
655 else
656 printk(KERN_ALERT "Unable to handle kernel paging request");
Harvey Harrison33cb5242008-01-30 13:32:19 +0100657 printk(" at %016lx RIP: \n" KERN_ALERT, address);
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100658 printk_address(regs->ip);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700659 dump_pagetable(address);
Jan Beulich6e3f3612006-01-11 22:42:14 +0100660 tsk->thread.cr2 = address;
661 tsk->thread.trap_no = 14;
662 tsk->thread.error_code = error_code;
Jan Beulich22f59912008-01-30 13:31:23 +0100663 if (__die("Oops", regs, error_code))
664 regs = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700665 /* Executive summary in case the body of the oops scrolled away */
666 printk(KERN_EMERG "CR2: %016lx\n", address);
Jan Beulich22f59912008-01-30 13:31:23 +0100667 oops_end(flags, regs, SIGKILL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700668
669/*
670 * We ran out of memory, or some other thing happened to us that made
671 * us unable to handle the page fault gracefully.
672 */
673out_of_memory:
674 up_read(&mm->mmap_sem);
Serge E. Hallynb460cbc2007-10-18 23:39:52 -0700675 if (is_global_init(current)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700676 yield();
677 goto again;
678 }
679 printk("VM: killing process %s\n", tsk->comm);
Harvey Harrison318aa292008-01-30 13:32:59 +0100680 if (error_code & PF_USER)
Will Schmidt021daae2007-07-21 17:11:17 +0200681 do_group_exit(SIGKILL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700682 goto no_context;
683
684do_sigbus:
685 up_read(&mm->mmap_sem);
686
687 /* Kernel mode? Handle exceptions or die */
Andi Kleen66c58152006-01-11 22:44:09 +0100688 if (!(error_code & PF_USER))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700689 goto no_context;
690
691 tsk->thread.cr2 = address;
692 tsk->thread.error_code = error_code;
693 tsk->thread.trap_no = 14;
Harvey Harrisonc4aba4a2008-01-30 13:32:35 +0100694 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700695 return;
696}
Andi Kleen9e43e1b2005-11-05 17:25:54 +0100697
Jan Beulich8c914cb2006-03-25 16:29:40 +0100698DEFINE_SPINLOCK(pgd_lock);
Christoph Lameter2bff7382007-05-02 19:27:10 +0200699LIST_HEAD(pgd_list);
Jan Beulich8c914cb2006-03-25 16:29:40 +0100700
701void vmalloc_sync_all(void)
702{
Harvey Harrison33cb5242008-01-30 13:32:19 +0100703 /* Note that races in the updates of insync and start aren't
Jan Beulich8c914cb2006-03-25 16:29:40 +0100704 problematic:
705 insync can only get set bits added, and updates to start are only
706 improving performance (without affecting correctness if undone). */
707 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
708 static unsigned long start = VMALLOC_START & PGDIR_MASK;
709 unsigned long address;
710
711 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
712 if (!test_bit(pgd_index(address), insync)) {
713 const pgd_t *pgd_ref = pgd_offset_k(address);
714 struct page *page;
715
716 if (pgd_none(*pgd_ref))
717 continue;
718 spin_lock(&pgd_lock);
Christoph Lameter2bff7382007-05-02 19:27:10 +0200719 list_for_each_entry(page, &pgd_list, lru) {
Jan Beulich8c914cb2006-03-25 16:29:40 +0100720 pgd_t *pgd;
721 pgd = (pgd_t *)page_address(page) + pgd_index(address);
722 if (pgd_none(*pgd))
723 set_pgd(pgd, *pgd_ref);
724 else
Dave McCracken46a82b22006-09-25 23:31:48 -0700725 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
Jan Beulich8c914cb2006-03-25 16:29:40 +0100726 }
727 spin_unlock(&pgd_lock);
728 set_bit(pgd_index(address), insync);
729 }
730 if (address == start)
731 start = address + PGDIR_SIZE;
732 }
733 /* Check that there is no need to do the same for the modules area. */
734 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
Harvey Harrison33cb5242008-01-30 13:32:19 +0100735 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
Jan Beulich8c914cb2006-03-25 16:29:40 +0100736 (__START_KERNEL & PGDIR_MASK)));
737}