/*
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>

/*
 * Page fault error code bits
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)

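/*
 * Worked example (illustration only, not used by the code below): a
 * user-mode store to an unmapped address arrives with error_code ==
 * (PF_USER|PF_WRITE) == 0x6, while a user-mode write to a present but
 * read-only page arrives with (PF_USER|PF_WRITE|PF_PROT) == 0x7.
 */
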
static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (!user_mode(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
#else
	return 0;
#endif
}

/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner
 */
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
		       unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

#ifdef CONFIG_X86_32
	if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
		     boot_cpu_data.x86 >= 6)) {
		/* Catch an obscure case of prefetch inside an NX page. */
		if (nx_enabled && (error_code & PF_INSTR))
			return 0;
	} else {
		return 0;
	}
#else
	/* If it was an exec fault, ignore it. */
	if (error_code & PF_INSTR)
		return 0;
#endif

	instr = (unsigned char *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present so
			 * X86_64 will never get here anyway
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
#ifdef CONFIG_X86_64
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
#endif
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;

			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}
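
/*
 * Example of what the scanner above looks for (illustration only): a
 * faulting "prefetchnta (%rax)" is encoded as 0x0F 0x18 0x00, so once any
 * leading prefixes are skipped the 0x0F/0x18 byte pair makes is_prefetch()
 * return 1 and the spurious fault is silently ignored.
 */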

static void force_sig_info_fault(int si_signo, int si_code,
				 unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}

static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}

void dump_pagetable(unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}

#ifdef CONFIG_X86_64
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";

/* Workaround for K8 erratum #93 and buggy BIOSes.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   A lot of BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in the kernel here. */

static int is_errata93(struct pt_regs *regs, unsigned long address)
{
	static int warned;
	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->ip = address;
		return 1;
	}
	return 0;
}
#endif

static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}

/*
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the latter
	   case just flush. */

	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
}
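
/*
 * Typical scenario for the helper above (a sketch, not a guarantee): a
 * driver vmalloc()s a buffer, which may instantiate new upper-level page
 * table entries only in init_mm. The first task to touch that buffer
 * through a PGD that predates the allocation faults here, and
 * vmalloc_fault() copies the missing entry over from the reference tables.
 */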

int show_unhandled_signals = 1;

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
					unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	int write, fault;
	unsigned long flags;
	int si_code;

	/*
	 * We can fault from pretty much anywhere, with unknown IRQ state.
	 */
	trace_hardirqs_fixup();

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	si_code = SEGV_MAPERR;

	if (notify_page_fault(regs))
		return;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Don't check for the module range here: its PML4
		 * is always initialized because it's shared with the main
		 * kernel text. Only vmalloc may need PML4 syncups.
		 */
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		      ((address >= VMALLOC_START && address < VMALLOC_END))) {
			if (vmalloc_fault(address) >= 0)
				return;
		}
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}

	if (likely(regs->flags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

	/*
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs))
		error_code |= PF_USER;

 again:
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space. All other faults represent errors in the
	 * kernel and should generate an OOPS. Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space. Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source. If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/* Allow userspace just enough access below the stack pointer
		 * to let the 'enter' instruction work.
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:		/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:		/* read, present */
		goto bad_area;
	case 0:			/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;
	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {

		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		if (is_prefetch(regs, address, error_code))
			return;

		/* Work around K8 erratum #100: K8 in compat mode
		   occasionally jumps to illegal addresses >4GB.  We
		   catch this here in the page fault handler because
		   these addresses are not reachable. Just detect this
		   case and return.  Any code segment in LDT is
		   compatibility mode. */
		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
		    (address >> 32))
			return;

		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk(
		       "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx\n",
					tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
					tsk->comm, tsk->pid, address, regs->ip,
					regs->sp, error_code);
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;

		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}

no_context:
	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs))
		return;

	/*
	 * Hall of shame of CPU/BIOS bugs.
	 */

	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */

	flags = oops_begin();

	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at %016lx RIP: \n" KERN_ALERT, address);
	printk_address(regs->ip, regs->bp);
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Oops", regs, error_code))
		regs = NULL;
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags, regs, SIGKILL);

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_global_init(current)) {
		yield();
		goto again;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_group_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
	return;
}

DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

void vmalloc_sync_all(void)
{
	/* Note that races in the updates of insync and start aren't
	   problematic:
	   insync can only get set bits added, and updates to start are only
	   improving performance (without affecting correctness if undone). */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			list_for_each_entry(page, &pgd_list, lru) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
		       (__START_KERNEL & PGDIR_MASK)));
}