/*
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>

#include <asm/system.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm-generic/sections.h>

/*
 * Page fault error code bits
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 *	bit 3 == 1 means use of reserved bit detected
 *	bit 4 == 1 means fault was an instruction fetch
 */
#define PF_PROT		(1<<0)
#define PF_WRITE	(1<<1)
#define PF_USER		(1<<2)
#define PF_RSVD		(1<<3)
#define PF_INSTR	(1<<4)

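/*
 * Give any registered kprobe fault handler a chance to claim the fault
 * first. Returns 1 if a kprobe handled it and the normal page-fault
 * path should be skipped, 0 otherwise.
 */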
static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (!user_mode(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}

	return ret;
#else
	return 0;
#endif
}

/*
 * X86_32
 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * X86_64
 * Sometimes the CPU reports invalid exceptions on prefetch.
 * Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner
 */
static int is_prefetch(struct pt_regs *regs, unsigned long addr,
		       unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

#ifdef CONFIG_X86_32
	if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
		     boot_cpu_data.x86 >= 6)) {
		/* Catch an obscure case of prefetch inside an NX page. */
		if (nx_enabled && (error_code & PF_INSTR))
			return 0;
	} else {
		return 0;
	}
#else
	/* If it was an exec fault, ignore it. */
	if (error_code & PF_INSTR)
		return 0;
#endif

	instr = (unsigned char *)convert_ip_to_linear(current, regs);
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (probe_kernel_address(instr, opcode))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/*
			 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
			 * In X86_64 long mode, the CPU will signal invalid
			 * opcode if some of these prefixes are present so
			 * X86_64 will never get here anyway
			 */
			scan_more = ((instr_lo & 7) == 0x6);
			break;
#ifdef CONFIG_X86_64
		case 0x40:
			/*
			 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
			 * Need to figure out under what instruction mode the
			 * instruction was issued. Could check the LDT for lm,
			 * but for now it's good enough to assume that long
			 * mode only uses well known segments or kernel.
			 */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;
#endif
		case 0x60:
			/* 0x64 thru 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;
		case 0xF0:
			/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;
		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;

			if (probe_kernel_address(instr, opcode))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;
		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}

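/*
 * Fill in a siginfo describing the faulting address and deliver the
 * given signal (SIGSEGV or SIGBUS) to the task.
 */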
static void force_sig_info_fault(int si_signo, int si_code,
				 unsigned long address, struct task_struct *tsk)
{
	siginfo_t info;

	info.si_signo = si_signo;
	info.si_errno = 0;
	info.si_code = si_code;
	info.si_addr = (void __user *)address;
	force_sig_info(si_signo, &info, tsk);
}

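/*
 * Returns non-zero if the page-table entry at p cannot be read safely,
 * i.e. probe_kernel_address() faulted on it.
 */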
static int bad_address(void *p)
{
	unsigned long dummy;
	return probe_kernel_address((unsigned long *)p, dummy);
}

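/*
 * Print the page-table walk (PGD/PUD/PMD/PTE) for the given address,
 * as far as the entries are present, for oops diagnostics.
 */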
void dump_pagetable(unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = (pgd_t *)read_cr3();

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = pud_offset(pgd, address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}

#ifdef CONFIG_X86_64
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
#endif

/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64bit RIP register on C stepping K8.
   A lot of BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32bits of RIP cleared.
   Try to work around it here.
   Note we only handle faults in kernel here.
   Does nothing for X86_32.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	static int warned;
	if (address != regs->ip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->ip = address;
		return 1;
	}
#endif
	return 0;
}

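/*
 * A reserved bit was set in a page-table entry (PF_RSVD): the page
 * tables are corrupted, so report the corruption and oops.
 */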
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Bad pagetable", regs, error_code))
		regs = NULL;
	oops_end(flags, regs, SIGKILL);
}

/*
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
#ifdef CONFIG_X86_32
	unsigned long pgd_paddr;
	pmd_t *pmd_k;
	pte_t *pte_k;
	/*
	 * Synchronize this task's top level page-table
	 * with the 'reference' page table.
	 *
	 * Do _not_ use "current" here. We might be inside
	 * an interrupt in the middle of a task switch..
	 */
	pgd_paddr = read_cr3();
	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
	if (!pmd_k)
		return -1;
	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_present(*pte_k))
		return -1;
	return 0;
#else
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Copy kernel mappings over when needed. This can also
	   happen within a race in page table update. In the latter
	   case just flush. */

	pgd = pgd_offset(current->mm ?: &init_mm, address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);
	else
		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
#endif
}

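/*
 * When non-zero, unhandled user-space faults are reported via a
 * rate-limited printk below.
 */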
int show_unhandled_signals = 1;

/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
					unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	int write, fault;
	unsigned long flags;
	int si_code;

	/*
	 * We can fault from pretty much anywhere, with unknown IRQ state.
	 */
	trace_hardirqs_fixup();

	tsk = current;
	mm = tsk->mm;
	prefetchw(&mm->mmap_sem);

	/* get the address */
	address = read_cr2();

	si_code = SEGV_MAPERR;

	if (notify_page_fault(regs))
		return;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 9) == 0.
	 */
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Don't check for the module range here: its PML4
		 * is always initialized because it's shared with the main
		 * kernel text. Only vmalloc may need PML4 syncups.
		 */
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    ((address >= VMALLOC_START && address < VMALLOC_END))) {
			if (vmalloc_fault(address) >= 0)
				return;
		}
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}

	if (likely(regs->flags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt, have no user context or are running in an
	 * atomic region then we must not take the fault.
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;

	/*
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet.
	 */
	if (user_mode_vm(regs))
		error_code |= PF_USER;

again:
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space. All other faults represent errors in the
	 * kernel and should generate an OOPS. Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space. Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source. If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & PF_USER) {
		/*
		 * Accessing the stack below %sp is always a bug.
		 * The large cushion allows instructions like enter
		 * and pusha to work. ("enter $65535,$31" pushes
		 * 32 pointers and then decrements %sp by 65535.)
		 */
		if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:		/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:		/* read, present */
		goto bad_area;
	case 0:			/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
			goto bad_area;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(mm, vma, address, write);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		if (fault & VM_FAULT_OOM)
			goto out_of_memory;
		else if (fault & VM_FAULT_SIGBUS)
			goto do_sigbus;
		BUG();
	}
	if (fault & VM_FAULT_MAJOR)
		tsk->maj_flt++;
	else
		tsk->min_flt++;
	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {

		/*
		 * It's possible to have interrupts off here.
		 */
		local_irq_enable();

		if (is_prefetch(regs, address, error_code))
			return;

		/* Work around K8 erratum #100: K8 in compat mode
		   occasionally jumps to illegal addresses >4GB. We
		   catch this here in the page fault handler because
		   these addresses are not reachable. Just detect this
		   case and return. Any code segment in LDT is
		   compatibility mode. */
		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
		    (address >> 32))
			return;

		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
		    printk_ratelimit()) {
			printk(
#ifdef CONFIG_X86_32
			"%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx\n",
#else
			"%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx\n",
#endif
			task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
			tsk->comm, task_pid_nr(tsk), address, regs->ip,
			regs->sp, error_code);
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;

		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
		return;
	}

no_context:
	/* Are we prepared to handle this kernel fault? */
	if (fixup_exception(regs))
		return;

	/*
	 * Hall of shame of CPU/BIOS bugs.
	 */

	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */

	flags = oops_begin();

	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at %016lx RIP: \n" KERN_ALERT, address);
	printk_address(regs->ip, regs->bp);
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	if (__die("Oops", regs, error_code))
		regs = NULL;
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags, regs, SIGKILL);

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (is_global_init(current)) {
		yield();
		goto again;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_group_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
	return;
}

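/*
 * pgd_list links the top-level page tables of all processes, protected
 * by pgd_lock, so that kernel mapping updates can be propagated to
 * each of them (see vmalloc_sync_all() below).
 */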
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

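/*
 * Sync the vmalloc-area PGD entries of the reference page table
 * (init_mm) into every process page table on pgd_list, so that later
 * vmalloc faults find the entries already present.
 */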
void vmalloc_sync_all(void)
{
	/*
	 * Note that races in the updates of insync and start aren't
	 * problematic: insync can only get set bits added, and updates to
	 * start are only improving performance (without affecting correctness
	 * if undone).
	 */
	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
	static unsigned long start = VMALLOC_START & PGDIR_MASK;
	unsigned long address;

	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
		if (!test_bit(pgd_index(address), insync)) {
			const pgd_t *pgd_ref = pgd_offset_k(address);
			struct page *page;

			if (pgd_none(*pgd_ref))
				continue;
			spin_lock(&pgd_lock);
			list_for_each_entry(page, &pgd_list, lru) {
				pgd_t *pgd;
				pgd = (pgd_t *)page_address(page) + pgd_index(address);
				if (pgd_none(*pgd))
					set_pgd(pgd, *pgd_ref);
				else
					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			}
			spin_unlock(&pgd_lock);
			set_bit(pgd_index(address), insync);
		}
		if (address == start)
			start = address + PGDIR_SIZE;
	}
	/* Check that there is no need to do the same for the modules area. */
	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
		       (__START_KERNEL & PGDIR_MASK)));
}