blob: 7e98a76912837c92b02b1d83f0fdb35daf59fc39 [file] [log] [blame]
/*
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */
5
Linus Torvalds1da177e2005-04-16 15:20:36 -07006#include <linux/signal.h>
7#include <linux/sched.h>
8#include <linux/kernel.h>
9#include <linux/errno.h>
10#include <linux/string.h>
11#include <linux/types.h>
12#include <linux/ptrace.h>
13#include <linux/mman.h>
14#include <linux/mm.h>
15#include <linux/smp.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070016#include <linux/interrupt.h>
17#include <linux/init.h>
18#include <linux/tty.h>
19#include <linux/vt_kern.h> /* For unblank_screen() */
20#include <linux/compiler.h>
Christoph Hellwig1eeb66a2007-05-08 00:27:03 -070021#include <linux/vmalloc.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070022#include <linux/module.h>
Prasanna S Panchamukhi0f2fbdc2005-09-06 15:19:28 -070023#include <linux/kprobes.h>
Andi Kleenab2bf0c2006-12-07 02:14:06 +010024#include <linux/uaccess.h>
Christoph Hellwig1eeb66a2007-05-08 00:27:03 -070025#include <linux/kdebug.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070026
27#include <asm/system.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070028#include <asm/pgalloc.h>
29#include <asm/smp.h>
30#include <asm/tlbflush.h>
31#include <asm/proto.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070032#include <asm-generic/sections.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070033
/*
 * Bits of the error code passed to the page fault handler:
 *
 *   bit 0:  0 = no page found,  1 = protection fault
 *   bit 1:  0 = read access,    1 = write access
 *   bit 2:  0 = kernel mode,    1 = user mode
 *   bit 3:  1 = use of a reserved bit was detected
 *   bit 4:  1 = the fault was an instruction fetch
 */
#define PF_PROT		(1 << 0)	/* protection fault */
#define PF_WRITE	(1 << 1)	/* write access */
#define PF_USER		(1 << 2)	/* user-mode access */
#define PF_RSVD		(1 << 3)	/* reserved bit detected */
#define PF_INSTR	(1 << 4)	/* instruction fetch */
47
/*
 * Give kprobes a chance to handle a page fault.
 *
 * Only faults taken outside user mode are offered to kprobes.  Returns 1
 * when a kprobe is running on this CPU and its fault handler claimed the
 * fault (trap number 14), 0 otherwise.  Without CONFIG_KPROBES this always
 * returns 0.
 */
static inline int notify_page_fault(struct pt_regs *regs)
{
#ifdef CONFIG_KPROBES
	int handled;

	if (user_mode(regs))
		return 0;

	/* kprobe_running() needs smp_processor_id(), so disable preemption. */
	preempt_disable();
	handled = kprobe_running() && kprobe_fault_handler(regs, 14);
	preempt_enable();

	return handled;
#else
	return 0;
#endif
}
Anil S Keshavamurthy1bd858a2006-06-26 00:25:25 -070066
Linus Torvalds1da177e2005-04-16 15:20:36 -070067/* Sometimes the CPU reports invalid exceptions on prefetch.
68 Check that here and ignore.
69 Opcode checker based on code by Richard Brunner */
70static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
71 unsigned long error_code)
Harvey Harrison33cb5242008-01-30 13:32:19 +010072{
Andi Kleenab2bf0c2006-12-07 02:14:06 +010073 unsigned char *instr;
Linus Torvalds1da177e2005-04-16 15:20:36 -070074 int scan_more = 1;
Harvey Harrison33cb5242008-01-30 13:32:19 +010075 int prefetch = 0;
Andi Kleenf1290ec2005-04-16 15:24:59 -070076 unsigned char *max_instr;
Linus Torvalds1da177e2005-04-16 15:20:36 -070077
78 /* If it was a exec fault ignore */
Andi Kleen66c58152006-01-11 22:44:09 +010079 if (error_code & PF_INSTR)
Linus Torvalds1da177e2005-04-16 15:20:36 -070080 return 0;
Harvey Harrison33cb5242008-01-30 13:32:19 +010081
Andi Kleendd2994f2006-09-26 10:52:33 +020082 instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
Andi Kleenf1290ec2005-04-16 15:24:59 -070083 max_instr = instr + 15;
Linus Torvalds1da177e2005-04-16 15:20:36 -070084
Vincent Hanquez76381fe2005-06-23 00:08:46 -070085 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
Linus Torvalds1da177e2005-04-16 15:20:36 -070086 return 0;
87
Harvey Harrison33cb5242008-01-30 13:32:19 +010088 while (scan_more && instr < max_instr) {
Linus Torvalds1da177e2005-04-16 15:20:36 -070089 unsigned char opcode;
90 unsigned char instr_hi;
91 unsigned char instr_lo;
92
Andi Kleenab2bf0c2006-12-07 02:14:06 +010093 if (probe_kernel_address(instr, opcode))
Harvey Harrison33cb5242008-01-30 13:32:19 +010094 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -070095
Harvey Harrison33cb5242008-01-30 13:32:19 +010096 instr_hi = opcode & 0xf0;
97 instr_lo = opcode & 0x0f;
Linus Torvalds1da177e2005-04-16 15:20:36 -070098 instr++;
99
Harvey Harrison33cb5242008-01-30 13:32:19 +0100100 switch (instr_hi) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700101 case 0x20:
102 case 0x30:
Harvey Harrison33cb5242008-01-30 13:32:19 +0100103 /*
104 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
105 * In X86_64 long mode, the CPU will signal invalid
106 * opcode if some of these prefixes are present so
107 * X86_64 will never get here anyway
108 */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700109 scan_more = ((instr_lo & 7) == 0x6);
110 break;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100111#ifdef CONFIG_X86_64
Linus Torvalds1da177e2005-04-16 15:20:36 -0700112 case 0x40:
Harvey Harrison33cb5242008-01-30 13:32:19 +0100113 /*
114 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
115 * Need to figure out under what instruction mode the
116 * instruction was issued. Could check the LDT for lm,
117 * but for now it's good enough to assume that long
118 * mode only uses well known segments or kernel.
119 */
Vincent Hanquez76381fe2005-06-23 00:08:46 -0700120 scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700121 break;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100122#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700123 case 0x60:
124 /* 0x64 thru 0x67 are valid prefixes in all modes. */
125 scan_more = (instr_lo & 0xC) == 0x4;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100126 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700127 case 0xF0:
128 /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
129 scan_more = !instr_lo || (instr_lo>>1) == 1;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100130 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700131 case 0x00:
132 /* Prefetch instruction is 0x0F0D or 0x0F18 */
133 scan_more = 0;
Andi Kleenab2bf0c2006-12-07 02:14:06 +0100134 if (probe_kernel_address(instr, opcode))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700135 break;
136 prefetch = (instr_lo == 0xF) &&
137 (opcode == 0x0D || opcode == 0x18);
Harvey Harrison33cb5242008-01-30 13:32:19 +0100138 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700139 default:
140 scan_more = 0;
141 break;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100142 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700143 }
144 return prefetch;
145}
146
/*
 * Check whether an unsigned long can be read at @p.
 * Returns the result of probe_kernel_address(): zero means the read
 * succeeded, non-zero means it failed.  The value read is discarded.
 */
static int bad_address(void *p)
{
	unsigned long scratch;

	return probe_kernel_address((unsigned long *)p, scratch);
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700152
153void dump_pagetable(unsigned long address)
154{
155 pgd_t *pgd;
156 pud_t *pud;
157 pmd_t *pmd;
158 pte_t *pte;
159
Glauber de Oliveira Costaf51c9452007-07-22 11:12:29 +0200160 pgd = (pgd_t *)read_cr3();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700161
Harvey Harrison33cb5242008-01-30 13:32:19 +0100162 pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700163 pgd += pgd_index(address);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700164 if (bad_address(pgd)) goto bad;
Jan Beulichd646bce2006-02-03 21:51:47 +0100165 printk("PGD %lx ", pgd_val(*pgd));
Harvey Harrison33cb5242008-01-30 13:32:19 +0100166 if (!pgd_present(*pgd)) goto ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700167
Andi Kleend2ae5b52006-06-26 13:57:56 +0200168 pud = pud_offset(pgd, address);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700169 if (bad_address(pud)) goto bad;
170 printk("PUD %lx ", pud_val(*pud));
171 if (!pud_present(*pud)) goto ret;
172
173 pmd = pmd_offset(pud, address);
174 if (bad_address(pmd)) goto bad;
175 printk("PMD %lx ", pmd_val(*pmd));
Jan Beulichb1992df2007-10-19 20:35:03 +0200176 if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700177
178 pte = pte_offset_kernel(pmd, address);
179 if (bad_address(pte)) goto bad;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100180 printk("PTE %lx", pte_val(*pte));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700181ret:
182 printk("\n");
183 return;
184bad:
185 printk("BAD\n");
186}
187
Harvey Harrison33cb5242008-01-30 13:32:19 +0100188static const char errata93_warning[] =
Linus Torvalds1da177e2005-04-16 15:20:36 -0700189KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
190KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
191KERN_ERR "******* Please consider a BIOS update.\n"
192KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
193
194/* Workaround for K8 erratum #93 & buggy BIOS.
195 BIOS SMM functions are required to use a specific workaround
Harvey Harrison33cb5242008-01-30 13:32:19 +0100196 to avoid corruption of the 64bit RIP register on C stepping K8.
197 A lot of BIOS that didn't get tested properly miss this.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700198 The OS sees this as a page fault with the upper 32bits of RIP cleared.
199 Try to work around it here.
200 Note we only handle faults in kernel here. */
201
Harvey Harrison33cb5242008-01-30 13:32:19 +0100202static int is_errata93(struct pt_regs *regs, unsigned long address)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700203{
204 static int warned;
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100205 if (address != regs->ip)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700206 return 0;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100207 if ((address >> 32) != 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700208 return 0;
209 address |= 0xffffffffUL << 32;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100210 if ((address >= (u64)_stext && address <= (u64)_etext) ||
211 (address >= MODULES_VADDR && address <= MODULES_END)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700212 if (!warned) {
Harvey Harrison33cb5242008-01-30 13:32:19 +0100213 printk(errata93_warning);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700214 warned = 1;
215 }
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100216 regs->ip = address;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700217 return 1;
218 }
219 return 0;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100220}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700221
Linus Torvalds1da177e2005-04-16 15:20:36 -0700222static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
223 unsigned long error_code)
224{
Jan Beulich12091402005-09-12 18:49:24 +0200225 unsigned long flags = oops_begin();
Jan Beulich6e3f3612006-01-11 22:42:14 +0100226 struct task_struct *tsk;
Jan Beulich12091402005-09-12 18:49:24 +0200227
Linus Torvalds1da177e2005-04-16 15:20:36 -0700228 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
229 current->comm, address);
230 dump_pagetable(address);
Jan Beulich6e3f3612006-01-11 22:42:14 +0100231 tsk = current;
232 tsk->thread.cr2 = address;
233 tsk->thread.trap_no = 14;
234 tsk->thread.error_code = error_code;
Jan Beulich22f59912008-01-30 13:31:23 +0100235 if (__die("Bad pagetable", regs, error_code))
236 regs = NULL;
237 oops_end(flags, regs, SIGKILL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700238}
239
240/*
Andi Kleenf95190b2006-01-11 22:44:00 +0100241 * Handle a fault on the vmalloc area
Andi Kleen3b9ba4d2005-05-16 21:53:31 -0700242 *
243 * This assumes no large pages in there.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700244 */
245static int vmalloc_fault(unsigned long address)
246{
247 pgd_t *pgd, *pgd_ref;
248 pud_t *pud, *pud_ref;
249 pmd_t *pmd, *pmd_ref;
250 pte_t *pte, *pte_ref;
251
252 /* Copy kernel mappings over when needed. This can also
253 happen within a race in page table update. In the later
254 case just flush. */
255
256 pgd = pgd_offset(current->mm ?: &init_mm, address);
257 pgd_ref = pgd_offset_k(address);
258 if (pgd_none(*pgd_ref))
259 return -1;
260 if (pgd_none(*pgd))
261 set_pgd(pgd, *pgd_ref);
Jan Beulich8c914cb2006-03-25 16:29:40 +0100262 else
Dave McCracken46a82b22006-09-25 23:31:48 -0700263 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700264
265 /* Below here mismatches are bugs because these lower tables
266 are shared */
267
268 pud = pud_offset(pgd, address);
269 pud_ref = pud_offset(pgd_ref, address);
270 if (pud_none(*pud_ref))
271 return -1;
Dave McCracken46a82b22006-09-25 23:31:48 -0700272 if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700273 BUG();
274 pmd = pmd_offset(pud, address);
275 pmd_ref = pmd_offset(pud_ref, address);
276 if (pmd_none(*pmd_ref))
277 return -1;
278 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
279 BUG();
280 pte_ref = pte_offset_kernel(pmd_ref, address);
281 if (!pte_present(*pte_ref))
282 return -1;
283 pte = pte_offset_kernel(pmd, address);
Andi Kleen3b9ba4d2005-05-16 21:53:31 -0700284 /* Don't use pte_page here, because the mappings can point
285 outside mem_map, and the NUMA hash lookup cannot handle
286 that. */
287 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700288 BUG();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700289 return 0;
290}
291
Masoud Asgharifard Sharbianiabd4f752007-07-22 11:12:28 +0200292int show_unhandled_signals = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700293
294/*
295 * This routine handles page faults. It determines the address,
296 * and the problem, and then passes it off to one of the appropriate
297 * routines.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700298 */
Prasanna S Panchamukhi0f2fbdc2005-09-06 15:19:28 -0700299asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
300 unsigned long error_code)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700301{
302 struct task_struct *tsk;
303 struct mm_struct *mm;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100304 struct vm_area_struct *vma;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700305 unsigned long address;
Nick Piggin83c54072007-07-19 01:47:05 -0700306 int write, fault;
Jan Beulich12091402005-09-12 18:49:24 +0200307 unsigned long flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700308 siginfo_t info;
309
Peter Zijlstra143a5d32007-10-25 14:01:10 +0200310 /*
311 * We can fault from pretty much anywhere, with unknown IRQ state.
312 */
313 trace_hardirqs_fixup();
314
Arjan van de Vena9ba9a32006-03-25 16:30:10 +0100315 tsk = current;
316 mm = tsk->mm;
317 prefetchw(&mm->mmap_sem);
318
Linus Torvalds1da177e2005-04-16 15:20:36 -0700319 /* get the address */
Glauber de Oliveira Costaf51c9452007-07-22 11:12:29 +0200320 address = read_cr2();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700321
Linus Torvalds1da177e2005-04-16 15:20:36 -0700322 info.si_code = SEGV_MAPERR;
323
324
325 /*
326 * We fault-in kernel-space virtual memory on-demand. The
327 * 'reference' page table is init_mm.pgd.
328 *
329 * NOTE! We MUST NOT take any locks for this case. We may
330 * be in an interrupt or a critical region, and should
331 * only copy the information from the master page table,
332 * nothing more.
333 *
334 * This verifies that the fault happens in kernel space
335 * (error_code & 4) == 0, and that the fault was not a
Jan Beulich8b1bde92006-01-11 22:42:23 +0100336 * protection error (error_code & 9) == 0.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700337 */
Suresh Siddha84929802005-06-21 17:14:32 -0700338 if (unlikely(address >= TASK_SIZE64)) {
Andi Kleenf95190b2006-01-11 22:44:00 +0100339 /*
340 * Don't check for the module range here: its PML4
341 * is always initialized because it's shared with the main
342 * kernel text. Only vmalloc may need PML4 syncups.
343 */
Andi Kleen66c58152006-01-11 22:44:09 +0100344 if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
Andi Kleenf95190b2006-01-11 22:44:00 +0100345 ((address >= VMALLOC_START && address < VMALLOC_END))) {
Jan Beulich8c914cb2006-03-25 16:29:40 +0100346 if (vmalloc_fault(address) >= 0)
347 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700348 }
Christoph Hellwig74a0b572007-10-16 01:24:07 -0700349 if (notify_page_fault(regs))
Jan Beulich8c914cb2006-03-25 16:29:40 +0100350 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700351 /*
352 * Don't take the mm semaphore here. If we fixup a prefetch
353 * fault we could otherwise deadlock.
354 */
355 goto bad_area_nosemaphore;
356 }
357
Christoph Hellwig74a0b572007-10-16 01:24:07 -0700358 if (notify_page_fault(regs))
Jan Beulich8c914cb2006-03-25 16:29:40 +0100359 return;
360
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100361 if (likely(regs->flags & X86_EFLAGS_IF))
Jan Beulich8c914cb2006-03-25 16:29:40 +0100362 local_irq_enable();
363
Andi Kleen66c58152006-01-11 22:44:09 +0100364 if (unlikely(error_code & PF_RSVD))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700365 pgtable_bad(address, regs, error_code);
366
367 /*
Harvey Harrison33cb5242008-01-30 13:32:19 +0100368 * If we're in an interrupt, have no user context or are running in an
369 * atomic region then we must not take the fault.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700370 */
371 if (unlikely(in_atomic() || !mm))
372 goto bad_area_nosemaphore;
373
Linus Torvaldsdbe3ed12007-09-19 11:37:14 -0700374 /*
375 * User-mode registers count as a user access even for any
376 * potential system fault or CPU buglet.
377 */
378 if (user_mode_vm(regs))
379 error_code |= PF_USER;
380
Linus Torvalds1da177e2005-04-16 15:20:36 -0700381 again:
382 /* When running in the kernel we expect faults to occur only to
383 * addresses in user space. All other faults represent errors in the
Simon Arlott676b1852007-10-20 01:25:36 +0200384 * kernel and should generate an OOPS. Unfortunately, in the case of an
Adrian Bunk80f72282006-06-30 18:27:16 +0200385 * erroneous fault occurring in a code path which already holds mmap_sem
Linus Torvalds1da177e2005-04-16 15:20:36 -0700386 * we will deadlock attempting to validate the fault against the
387 * address space. Luckily the kernel only validly references user
388 * space from well defined areas of code, which are listed in the
389 * exceptions table.
390 *
391 * As the vast majority of faults will be valid we will only perform
Simon Arlott676b1852007-10-20 01:25:36 +0200392 * the source reference check when there is a possibility of a deadlock.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700393 * Attempt to lock the address space, if we cannot we then validate the
394 * source. If this is invalid we can skip the address space check,
395 * thus avoiding the deadlock.
396 */
397 if (!down_read_trylock(&mm->mmap_sem)) {
Andi Kleen66c58152006-01-11 22:44:09 +0100398 if ((error_code & PF_USER) == 0 &&
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100399 !search_exception_tables(regs->ip))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700400 goto bad_area_nosemaphore;
401 down_read(&mm->mmap_sem);
402 }
403
404 vma = find_vma(mm, address);
405 if (!vma)
406 goto bad_area;
407 if (likely(vma->vm_start <= address))
408 goto good_area;
409 if (!(vma->vm_flags & VM_GROWSDOWN))
410 goto bad_area;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100411 if (error_code & PF_USER) {
Chuck Ebbert03fdc2c2006-06-26 13:59:50 +0200412 /* Allow userspace just enough access below the stack pointer
413 * to let the 'enter' instruction work.
414 */
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100415 if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700416 goto bad_area;
417 }
418 if (expand_stack(vma, address))
419 goto bad_area;
420/*
421 * Ok, we have a good vm_area for this memory access, so
422 * we can handle it..
423 */
424good_area:
425 info.si_code = SEGV_ACCERR;
426 write = 0;
Andi Kleen66c58152006-01-11 22:44:09 +0100427 switch (error_code & (PF_PROT|PF_WRITE)) {
Harvey Harrison33cb5242008-01-30 13:32:19 +0100428 default: /* 3: write, present */
429 /* fall through */
430 case PF_WRITE: /* write, not present */
431 if (!(vma->vm_flags & VM_WRITE))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700432 goto bad_area;
Harvey Harrison33cb5242008-01-30 13:32:19 +0100433 write++;
434 break;
435 case PF_PROT: /* read, present */
436 goto bad_area;
437 case 0: /* read, not present */
438 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
439 goto bad_area;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700440 }
441
442 /*
443 * If for any reason at all we couldn't handle the fault,
444 * make sure we exit gracefully rather than endlessly redo
445 * the fault.
446 */
Nick Piggin83c54072007-07-19 01:47:05 -0700447 fault = handle_mm_fault(mm, vma, address, write);
448 if (unlikely(fault & VM_FAULT_ERROR)) {
449 if (fault & VM_FAULT_OOM)
450 goto out_of_memory;
451 else if (fault & VM_FAULT_SIGBUS)
452 goto do_sigbus;
453 BUG();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700454 }
Nick Piggin83c54072007-07-19 01:47:05 -0700455 if (fault & VM_FAULT_MAJOR)
456 tsk->maj_flt++;
457 else
458 tsk->min_flt++;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700459 up_read(&mm->mmap_sem);
460 return;
461
462/*
463 * Something tried to access memory that isn't in our memory map..
464 * Fix it, but check if it's kernel or user first..
465 */
466bad_area:
467 up_read(&mm->mmap_sem);
468
469bad_area_nosemaphore:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700470 /* User mode accesses just cause a SIGSEGV */
Andi Kleen66c58152006-01-11 22:44:09 +0100471 if (error_code & PF_USER) {
Steven Rostedte5e3c842007-06-06 23:34:04 -0400472
473 /*
474 * It's possible to have interrupts off here.
475 */
476 local_irq_enable();
477
Linus Torvalds1da177e2005-04-16 15:20:36 -0700478 if (is_prefetch(regs, address, error_code))
479 return;
480
481 /* Work around K8 erratum #100 K8 in compat mode
482 occasionally jumps to illegal addresses >4GB. We
483 catch this here in the page fault handler because
484 these addresses are not reachable. Just detect this
485 case and return. Any code segment in LDT is
486 compatibility mode. */
487 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
488 (address >> 32))
489 return;
490
Masoud Asgharifard Sharbianiabd4f752007-07-22 11:12:28 +0200491 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
492 printk_ratelimit()) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700493 printk(
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100494 "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -0700495 tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100496 tsk->comm, tsk->pid, address, regs->ip,
497 regs->sp, error_code);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700498 }
Harvey Harrison33cb5242008-01-30 13:32:19 +0100499
Linus Torvalds1da177e2005-04-16 15:20:36 -0700500 tsk->thread.cr2 = address;
501 /* Kernel addresses are always protection faults */
502 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
503 tsk->thread.trap_no = 14;
504 info.si_signo = SIGSEGV;
505 info.si_errno = 0;
506 /* info.si_code has been set above */
507 info.si_addr = (void __user *)address;
508 force_sig_info(SIGSEGV, &info, tsk);
509 return;
510 }
511
512no_context:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700513 /* Are we prepared to handle this kernel fault? */
Harvey Harrison33cb5242008-01-30 13:32:19 +0100514 if (fixup_exception(regs))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700515 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700516
Harvey Harrison33cb5242008-01-30 13:32:19 +0100517 /*
Linus Torvalds1da177e2005-04-16 15:20:36 -0700518 * Hall of shame of CPU/BIOS bugs.
519 */
520
Harvey Harrison33cb5242008-01-30 13:32:19 +0100521 if (is_prefetch(regs, address, error_code))
522 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700523
524 if (is_errata93(regs, address))
Harvey Harrison33cb5242008-01-30 13:32:19 +0100525 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700526
527/*
528 * Oops. The kernel tried to access some bad page. We'll have to
529 * terminate things with extreme prejudice.
530 */
531
Jan Beulich12091402005-09-12 18:49:24 +0200532 flags = oops_begin();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700533
534 if (address < PAGE_SIZE)
535 printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
536 else
537 printk(KERN_ALERT "Unable to handle kernel paging request");
Harvey Harrison33cb5242008-01-30 13:32:19 +0100538 printk(" at %016lx RIP: \n" KERN_ALERT, address);
H. Peter Anvin65ea5b02008-01-30 13:30:56 +0100539 printk_address(regs->ip);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700540 dump_pagetable(address);
Jan Beulich6e3f3612006-01-11 22:42:14 +0100541 tsk->thread.cr2 = address;
542 tsk->thread.trap_no = 14;
543 tsk->thread.error_code = error_code;
Jan Beulich22f59912008-01-30 13:31:23 +0100544 if (__die("Oops", regs, error_code))
545 regs = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700546 /* Executive summary in case the body of the oops scrolled away */
547 printk(KERN_EMERG "CR2: %016lx\n", address);
Jan Beulich22f59912008-01-30 13:31:23 +0100548 oops_end(flags, regs, SIGKILL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700549
550/*
551 * We ran out of memory, or some other thing happened to us that made
552 * us unable to handle the page fault gracefully.
553 */
554out_of_memory:
555 up_read(&mm->mmap_sem);
Serge E. Hallynb460cbc2007-10-18 23:39:52 -0700556 if (is_global_init(current)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700557 yield();
558 goto again;
559 }
560 printk("VM: killing process %s\n", tsk->comm);
561 if (error_code & 4)
Will Schmidt021daae2007-07-21 17:11:17 +0200562 do_group_exit(SIGKILL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700563 goto no_context;
564
565do_sigbus:
566 up_read(&mm->mmap_sem);
567
568 /* Kernel mode? Handle exceptions or die */
Andi Kleen66c58152006-01-11 22:44:09 +0100569 if (!(error_code & PF_USER))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700570 goto no_context;
571
572 tsk->thread.cr2 = address;
573 tsk->thread.error_code = error_code;
574 tsk->thread.trap_no = 14;
575 info.si_signo = SIGBUS;
576 info.si_errno = 0;
577 info.si_code = BUS_ADRERR;
578 info.si_addr = (void __user *)address;
579 force_sig_info(SIGBUS, &info, tsk);
580 return;
581}
Andi Kleen9e43e1b2005-11-05 17:25:54 +0100582
Jan Beulich8c914cb2006-03-25 16:29:40 +0100583DEFINE_SPINLOCK(pgd_lock);
Christoph Lameter2bff7382007-05-02 19:27:10 +0200584LIST_HEAD(pgd_list);
Jan Beulich8c914cb2006-03-25 16:29:40 +0100585
586void vmalloc_sync_all(void)
587{
Harvey Harrison33cb5242008-01-30 13:32:19 +0100588 /* Note that races in the updates of insync and start aren't
Jan Beulich8c914cb2006-03-25 16:29:40 +0100589 problematic:
590 insync can only get set bits added, and updates to start are only
591 improving performance (without affecting correctness if undone). */
592 static DECLARE_BITMAP(insync, PTRS_PER_PGD);
593 static unsigned long start = VMALLOC_START & PGDIR_MASK;
594 unsigned long address;
595
596 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
597 if (!test_bit(pgd_index(address), insync)) {
598 const pgd_t *pgd_ref = pgd_offset_k(address);
599 struct page *page;
600
601 if (pgd_none(*pgd_ref))
602 continue;
603 spin_lock(&pgd_lock);
Christoph Lameter2bff7382007-05-02 19:27:10 +0200604 list_for_each_entry(page, &pgd_list, lru) {
Jan Beulich8c914cb2006-03-25 16:29:40 +0100605 pgd_t *pgd;
606 pgd = (pgd_t *)page_address(page) + pgd_index(address);
607 if (pgd_none(*pgd))
608 set_pgd(pgd, *pgd_ref);
609 else
Dave McCracken46a82b22006-09-25 23:31:48 -0700610 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
Jan Beulich8c914cb2006-03-25 16:29:40 +0100611 }
612 spin_unlock(&pgd_lock);
613 set_bit(pgd_index(address), insync);
614 }
615 if (address == start)
616 start = address + PGDIR_SIZE;
617 }
618 /* Check that there is no need to do the same for the modules area. */
619 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
Harvey Harrison33cb5242008-01-30 13:32:19 +0100620 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
Jan Beulich8c914cb2006-03-25 16:29:40 +0100621 (__START_KERNEL & PGDIR_MASK)));
622}