/*
 * linux/arch/x86_64/mm/init.c
 *
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
 * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif

const struct dma_mapping_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

/*
 * NOTE: pagetable_init() allocates all the fixmap pagetables contiguously
 * in physical space so we can cache the location of the first one and move
 * around without checking the pgd every time.
 */

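/*
 * Dump a summary of memory usage to the kernel log: free areas, swap,
 * and per-node counts of reserved, swap-cached and shared pages.
 */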
void show_mem(void)
{
	long i, total = 0, reserved = 0;
	long shared = 0, cached = 0;
	pg_data_t *pgdat;
	struct page *page;

	printk(KERN_INFO "Mem-info:\n");
	show_free_areas();
	printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages << (PAGE_SHIFT-10));

	for_each_online_pgdat(pgdat) {
		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
			/* this loop can take a while with 256 GB and 4k pages
			   so update the NMI watchdog */
			if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
				touch_nmi_watchdog();
			}
			page = pfn_to_page(pgdat->node_start_pfn + i);
			total++;
			if (PageReserved(page))
				reserved++;
			else if (PageSwapCache(page))
				cached++;
			else if (page_count(page))
				shared += page_count(page) - 1;
		}
	}
	printk(KERN_INFO "%ld pages of RAM\n", total);
	printk(KERN_INFO "%ld reserved pages\n", reserved);
	printk(KERN_INFO "%ld pages shared\n", shared);
	printk(KERN_INFO "%ld pages swap cached\n", cached);
}

int after_bootmem;

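/*
 * Allocate one zeroed, page-aligned page for building kernel page tables:
 * from the page allocator once bootmem has been retired, from bootmem
 * before that.
 */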
static __init void *spp_getpage(void)
{
	void *ptr;
	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);
	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
		panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem ? "after bootmem" : "");

	Dprintk("spp_getpage %p\n", ptr);
	return ptr;
}

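/*
 * Install a single kernel PTE mapping vaddr to phys with the given
 * protection, allocating any missing intermediate page-table levels.
 */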
static __init void set_pte_phys(unsigned long vaddr,
				unsigned long phys, pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte, new_pte;

	Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		pmd = (pmd_t *) spp_getpage();
		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
		if (pmd != pmd_offset(pud, 0)) {
			printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
			return;
		}
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		pte = (pte_t *) spp_getpage();
		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
		if (pte != pte_offset_kernel(pmd, 0)) {
			printk("PAGETABLE BUG #02!\n");
			return;
		}
	}
	new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

	pte = pte_offset_kernel(pmd, vaddr);
	if (!pte_none(*pte) &&
	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
		pte_ERROR(*pte);
	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

/* NOTE: this is meant to be run only at boot */
void __init
__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
	unsigned long address = __fix_to_virt(idx);

	if (idx >= __end_of_fixed_addresses) {
		printk("Invalid __set_fixmap\n");
		return;
	}
	set_pte_phys(address, phys, prot);
}

unsigned long __meminitdata table_start, table_end;

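/*
 * Hand out one zeroed page for early page-table construction. Before
 * bootmem is up the page comes from the range found by
 * find_early_table_space() and is mapped via early_ioremap(); afterwards
 * it comes from the normal page allocator.
 */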
static __meminit void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = table_end++;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC);
		*phys = __pa(adr);
		return adr;
	}

	if (pfn >= end_pfn)
		panic("alloc_low_page: ran out of memory");

	adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
	memset(adr, 0, PAGE_SIZE);
	*phys = pfn * PAGE_SIZE;
	return adr;
}

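/* Undo the temporary early_ioremap() mapping set up by alloc_low_page();
   a no-op once bootmem is up. */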
static __meminit void unmap_low_page(void *adr)
{
	if (after_bootmem)
		return;

	early_iounmap(adr, PAGE_SIZE);
}

/* Must run before zap_low_mappings */
__meminit void *early_ioremap(unsigned long addr, unsigned long size)
{
	unsigned long vaddr;
	pmd_t *pmd, *last_pmd;
	int i, pmds;

	pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
	vaddr = __START_KERNEL_map;
	pmd = level2_kernel_pgt;
	last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
	for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
		for (i = 0; i < pmds; i++) {
			if (pmd_present(pmd[i]))
				goto next;
		}
		vaddr += addr & ~PMD_MASK;
		addr &= PMD_MASK;
		for (i = 0; i < pmds; i++, addr += PMD_SIZE)
			set_pmd(pmd + i, __pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
		__flush_tlb();
		return (void *)vaddr;
	next:
		;
	}
	printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
	return NULL;
}

/* To avoid virtual aliases later */
__meminit void early_iounmap(void *addr, unsigned long size)
{
	unsigned long vaddr;
	pmd_t *pmd;
	int i, pmds;

	vaddr = (unsigned long)addr;
	pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
	pmd = level2_kernel_pgt + pmd_index(vaddr);
	for (i = 0; i < pmds; i++)
		pmd_clear(pmd + i);
	__flush_tlb();
}

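/*
 * Populate one PMD page with 2MB kernel mappings covering [address, end).
 * Entries that are already present are left untouched.
 */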
static void __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
{
	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		unsigned long entry;
		pmd_t *pmd = pmd_page + pmd_index(address);

		if (address >= end) {
			if (!after_bootmem)
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			break;
		}

		if (pmd_val(*pmd))
			continue;

		entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address;
		entry &= __supported_pte_mask;
		set_pmd(pmd, __pmd(entry));
	}
}

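/*
 * Extend the mappings of an already-populated PUD entry (memory hotplug
 * path), holding init_mm's page_table_lock across the PMD update.
 */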
static void __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
	pmd_t *pmd = pmd_offset(pud, 0);
	spin_lock(&init_mm.page_table_lock);
	phys_pmd_init(pmd, address, end);
	spin_unlock(&init_mm.page_table_lock);
	__flush_tlb_all();
}

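/*
 * Populate one PUD page for [addr, end): allocate a PMD page per entry,
 * skip ranges with no RAM in the e820 map, and extend entries that are
 * already present.
 */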
static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
{
	int i = pud_index(addr);

	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;

		if (addr >= end)
			break;

		if (!after_bootmem && !e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			phys_pmd_update(pud, addr, end);
			continue;
		}

		pmd = alloc_low_page(&pmd_phys);
		spin_lock(&init_mm.page_table_lock);
		set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
		phys_pmd_init(pmd, addr, end);
		spin_unlock(&init_mm.page_table_lock);
		unmap_low_page(pmd);
	}
	__flush_tlb();
}

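/*
 * Find a physical range (via the e820 map) big enough to hold the PUD and
 * PMD pages needed for the direct mapping up to 'end', and record it in
 * table_start/table_end.
 */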
static void __init find_early_table_space(unsigned long end)
{
	unsigned long puds, pmds, tables, start;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
		 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);

	/* RED-PEN putting page tables only on node 0 could
	   cause a hotspot and fill up ZONE_DMA. The page tables
	   need roughly 0.5KB per GB. */
	start = 0x8000;
	table_start = find_e820_area(start, end, tables);
	if (table_start == -1UL)
		panic("Cannot find space for the kernel page tables");

	table_start >>= PAGE_SHIFT;
	table_end = table_start;

	early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
		end, table_start << PAGE_SHIFT,
		(table_start << PAGE_SHIFT) + tables);
}

/* Set up the direct mapping of the physical memory at PAGE_OFFSET.
   This runs before bootmem is initialized and gets pages directly from
   physical memory. To access them they are temporarily mapped. */
void __meminit init_memory_mapping(unsigned long start, unsigned long end)
{
	unsigned long next;

	Dprintk("init_memory_mapping\n");

	/*
	 * Find space for the kernel direct mapping tables.
	 * Later we should allocate these tables in the local node of the
	 * memory mapped. Unfortunately this is done currently before the
	 * nodes are discovered.
	 */
	if (!after_bootmem)
		find_early_table_space(end);

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	for (; start < end; start = next) {
		unsigned long pud_phys;
		pgd_t *pgd = pgd_offset_k(start);
		pud_t *pud;

		if (after_bootmem)
			pud = pud_offset(pgd, start & PGDIR_MASK);
		else
			pud = alloc_low_page(&pud_phys);

		next = start + PGDIR_SIZE;
		if (next > end)
			next = end;
		phys_pud_init(pud, __pa(start), __pa(next));
		if (!after_bootmem)
			set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
		unmap_low_page(pud);
	}

	if (!after_bootmem)
		asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
	__flush_tlb_all();
}

#ifndef CONFIG_NUMA
void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];
	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = end_pfn;

	memory_present(0, 0, end_pfn);
	sparse_init();
	free_area_init_nodes(max_zone_pfns);
}
#endif

/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
   from the CPU leading to inconsistent cache lines. address and size
   must be aligned to 2MB boundaries.
   Does nothing when the mapping doesn't exist. */
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
	unsigned long end = address + size;

	BUG_ON(address & ~LARGE_PAGE_MASK);
	BUG_ON(size & ~LARGE_PAGE_MASK);

	for (; address < end; address += LARGE_PAGE_SIZE) {
		pgd_t *pgd = pgd_offset_k(address);
		pud_t *pud;
		pmd_t *pmd;
		if (pgd_none(*pgd))
			continue;
		pud = pud_offset(pgd, address);
		if (pud_none(*pud))
			continue;
		pmd = pmd_offset(pud, address);
		if (!pmd || pmd_none(*pmd))
			continue;
		if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
			/* Could handle this, but it should not happen currently. */
			printk(KERN_ERR
		"clear_kernel_mapping: mapping has been split. will leak memory\n");
			pmd_ERROR(*pmd);
		}
		set_pmd(pmd, __pmd(0));
	}
	__flush_tlb_all();
}

/*
 * Memory hotplug specific functions
 */
void online_page(struct page *page)
{
	ClearPageReserved(page);
	init_page_count(page);
	__free_page(page);
	totalram_pages++;
	num_physpages++;
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Memory is added always to NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	init_memory_mapping(start, (start + size - 1));

	ret = __add_pages(zone, start_pfn, nr_pages);
	if (ret)
		goto error;

	return ret;
error:
	printk("%s: Problem encountered in __add_pages!\n", __func__);
	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

int remove_memory(u64 start, u64 size)
{
	return -EINVAL;
}
EXPORT_SYMBOL_GPL(remove_memory);

#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
int memory_add_physaddr_to_nid(u64 start)
{
	return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

#endif /* CONFIG_MEMORY_HOTPLUG */

#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
/*
 * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
 * just online the pages.
 */
int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
{
	int err = -EIO;
	unsigned long pfn;
	unsigned long total = 0, mem = 0;
	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
		if (pfn_valid(pfn)) {
			online_page(pfn_to_page(pfn));
			err = 0;
			mem++;
		}
		total++;
	}
	if (!err) {
		z->spanned_pages += total;
		z->present_pages += mem;
		z->zone_pgdat->node_spanned_pages += total;
		z->zone_pgdat->node_present_pages += mem;
	}
	return err;
}
#endif

static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
			 kcore_vsyscall;

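/*
 * Final memory setup: set up the IOMMU, release bootmem to the page
 * allocator, register the /proc/kcore areas and print the memory banner.
 */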
void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;

	pci_iommu_alloc();

	/* clear the zero-page */
	memset(empty_zero_page, 0, PAGE_SIZE);

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif
	reservedpages = end_pfn - totalram_pages -
					absent_pages_in_range(0, end_pfn);

	after_bootmem = 1;

	codesize = (unsigned long) &_etext - (unsigned long) &_text;
	datasize = (unsigned long) &_edata - (unsigned long) &_etext;
	initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);
	kclist_add(&kcore_kernel, &_stext, _end - _stext);
	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
		   VSYSCALL_END - VSYSCALL_START);

	printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		end_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);
}

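/*
 * Poison and free a range of init pages, returning them to the page
 * allocator.
 */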
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
	unsigned long addr;

	if (begin >= end)
		return;

	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
	for (addr = begin; addr < end; addr += PAGE_SIZE) {
		ClearPageReserved(virt_to_page(addr));
		init_page_count(virt_to_page(addr));
		memset((void *)(addr & ~(PAGE_SIZE-1)),
			POISON_FREE_INITMEM, PAGE_SIZE);
		if (addr >= __START_KERNEL_map)
			change_page_attr_addr(addr, 1, __pgprot(0));
		free_page(addr);
		totalram_pages++;
	}
	if (addr > __START_KERNEL_map)
		global_flush_tlb();
}

void free_initmem(void)
{
	free_init_pages("unused kernel memory",
			(unsigned long)(&__init_begin),
			(unsigned long)(&__init_end));
}

#ifdef CONFIG_DEBUG_RODATA

void mark_rodata_ro(void)
{
	unsigned long start = (unsigned long)_stext, end;

#ifdef CONFIG_HOTPLUG_CPU
	/* It must still be possible to apply SMP alternatives. */
	if (num_possible_cpus() > 1)
		start = (unsigned long)_etext;
#endif
	end = (unsigned long)__end_rodata;
	start = (start + PAGE_SIZE - 1) & PAGE_MASK;
	end &= PAGE_MASK;
	if (end <= start)
		return;

	change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);

	/*
	 * change_page_attr_addr() requires a global_flush_tlb() call after it.
	 * We do this after the printk so that if something went wrong in the
	 * change, the printk gets out at least to give a better debug hint
	 * of who is the culprit.
	 */
	global_flush_tlb();
}
#endif

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
	free_init_pages("initrd memory", start, end);
}
#endif

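/*
 * Reserve a physical range with the bootmem allocator (on the owning node
 * under NUMA), ignoring ranges beyond end_pfn, and account any part that
 * falls inside the DMA zone.
 */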
void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
#ifdef CONFIG_NUMA
	int nid = phys_to_nid(phys);
#endif
	unsigned long pfn = phys >> PAGE_SHIFT;
	if (pfn >= end_pfn) {
		/* This can happen with kdump kernels when accessing firmware
		   tables. */
		if (pfn < end_pfn_map)
			return;
		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
				phys, len);
		return;
	}

	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
	reserve_bootmem(phys, len);
#endif
	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
		dma_reserve += len / PAGE_SIZE;
		set_dma_reserve(dma_reserve);
	}
}

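/*
 * Walk the kernel page tables to check whether a kernel virtual address
 * is backed by a valid page.
 */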
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;
	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;
	return pfn_valid(pte_pfn(*pte));
}

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>

extern int exception_trace, page_fault_trace;

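/* Export the exception_trace flag as /proc/sys/debug/exception-trace. */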
static ctl_table debug_table2[] = {
	{
		.ctl_name = 99,
		.procname = "exception-trace",
		.data = &exception_trace,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec
	},
	{}
};

static ctl_table debug_root_table2[] = {
	{
		.ctl_name = CTL_DEBUG,
		.procname = "debug",
		.mode = 0555,
		.child = debug_table2
	},
	{}
};

static __init int x8664_sysctl_init(void)
{
	register_sysctl_table(debug_root_table2);
	return 0;
}
__initcall(x8664_sysctl_init);
#endif

/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
   not need special handling anymore. */

static struct vm_area_struct gate_vma = {
	.vm_start = VSYSCALL_START,
	.vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
	.vm_page_prot = PAGE_READONLY_EXEC,
	.vm_flags = VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);
	if (!vma)
		return 0;
	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/* Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives.
 */
int in_gate_area_no_task(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}

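/*
 * Node-local bootmem allocation with a 4GB goal, so memory above 4GB is
 * preferred when available.
 */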
void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
{
	return __alloc_bootmem_core(pgdat->bdata, size,
			SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
}