/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif

const struct dma_mapping_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously in
 * physical space, so we can cache the location of the first one and move
 * around without checking the pgd every time.
 */

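/* Dump a summary of free/reserved/shared/cached page counts to the console. */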
void show_mem(void)
{
        long i, total = 0, reserved = 0;
        long shared = 0, cached = 0;
        pg_data_t *pgdat;
        struct page *page;

        printk(KERN_INFO "Mem-info:\n");
        show_free_areas();
        printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));

        for_each_online_pgdat(pgdat) {
                for (i = 0; i < pgdat->node_spanned_pages; ++i) {
                        /* this loop can take a while with 256 GB and 4k pages
                           so update the NMI watchdog */
                        if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
                                touch_nmi_watchdog();
                        }
                        if (!pfn_valid(pgdat->node_start_pfn + i))
                                continue;
                        page = pfn_to_page(pgdat->node_start_pfn + i);
                        total++;
                        if (PageReserved(page))
                                reserved++;
                        else if (PageSwapCache(page))
                                cached++;
                        else if (page_count(page))
                                shared += page_count(page) - 1;
                }
        }
        printk(KERN_INFO "%lu pages of RAM\n", total);
        printk(KERN_INFO "%lu reserved pages\n", reserved);
        printk(KERN_INFO "%lu pages shared\n", shared);
        printk(KERN_INFO "%lu pages swap cached\n", cached);
}

int after_bootmem;

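/*
 * Allocate one zeroed, page-aligned page for a kernel page table:
 * from the page allocator once bootmem is up, from bootmem before that.
 */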
static __init void *spp_getpage(void)
{
        void *ptr;
        if (after_bootmem)
                ptr = (void *) get_zeroed_page(GFP_ATOMIC);
        else
                ptr = alloc_bootmem_pages(PAGE_SIZE);
        if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
                panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");

        Dprintk("spp_getpage %p\n", ptr);
        return ptr;
}

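/*
 * Install a single kernel PTE for a fixmap address, filling in any missing
 * pud/pmd levels along the way (the pgd entry must already exist from head.S).
 */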
static __init void set_pte_phys(unsigned long vaddr,
                                unsigned long phys, pgprot_t prot)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte, new_pte;

        Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

        pgd = pgd_offset_k(vaddr);
        if (pgd_none(*pgd)) {
                printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
                return;
        }
        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
                pmd = (pmd_t *) spp_getpage();
                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
                if (pmd != pmd_offset(pud, 0)) {
                        printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
                        return;
                }
        }
        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                pte = (pte_t *) spp_getpage();
                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
                if (pte != pte_offset_kernel(pmd, 0)) {
                        printk("PAGETABLE BUG #02!\n");
                        return;
                }
        }
        new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

        pte = pte_offset_kernel(pmd, vaddr);
        if (!pte_none(*pte) &&
            pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
                pte_ERROR(*pte);
        set_pte(pte, new_pte);

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}

/* NOTE: this is meant to be run only at boot */
void __init
__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                printk("Invalid __set_fixmap\n");
                return;
        }
        set_pte_phys(address, phys, prot);
}

unsigned long __meminitdata table_start, table_end;

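/*
 * Grab the next page reserved for early page tables (table_start..table_end)
 * and map it temporarily; after bootmem is up, fall back to get_zeroed_page().
 */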
static __meminit void *alloc_low_page(unsigned long *phys)
{
        unsigned long pfn = table_end++;
        void *adr;

        if (after_bootmem) {
                adr = (void *)get_zeroed_page(GFP_ATOMIC);
                *phys = __pa(adr);
                return adr;
        }

        if (pfn >= end_pfn)
                panic("alloc_low_page: ran out of memory");

        adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
        memset(adr, 0, PAGE_SIZE);
        *phys = pfn * PAGE_SIZE;
        return adr;
}

static __meminit void unmap_low_page(void *adr)
{

        if (after_bootmem)
                return;

        early_iounmap(adr, PAGE_SIZE);
}

/* Must run before zap_low_mappings */
__meminit void *early_ioremap(unsigned long addr, unsigned long size)
{
        unsigned long vaddr;
        pmd_t *pmd, *last_pmd;
        int i, pmds;

        pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
        vaddr = __START_KERNEL_map;
        pmd = level2_kernel_pgt;
        last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
        for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
                for (i = 0; i < pmds; i++) {
                        if (pmd_present(pmd[i]))
                                goto next;
                }
                vaddr += addr & ~PMD_MASK;
                addr &= PMD_MASK;
                for (i = 0; i < pmds; i++, addr += PMD_SIZE)
                        set_pmd(pmd + i, __pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
                __flush_tlb();
                return (void *)vaddr;
        next:
                ;
        }
        printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
        return NULL;
}

/* To avoid virtual aliases later */
__meminit void early_iounmap(void *addr, unsigned long size)
{
        unsigned long vaddr;
        pmd_t *pmd;
        int i, pmds;

        vaddr = (unsigned long)addr;
        pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
        pmd = level2_kernel_pgt + pmd_index(vaddr);
        for (i = 0; i < pmds; i++)
                pmd_clear(pmd + i);
        __flush_tlb();
}

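/*
 * Fill a pmd page with 2MB (PSE) mappings for the direct mapping,
 * starting at 'address' and stopping at 'end'.
 */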
static void __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
{
        int i = pmd_index(address);

        for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
                unsigned long entry;
                pmd_t *pmd = pmd_page + pmd_index(address);

                if (address >= end) {
                        if (!after_bootmem)
                                for (; i < PTRS_PER_PMD; i++, pmd++)
                                        set_pmd(pmd, __pmd(0));
                        break;
                }

                if (pmd_val(*pmd))
                        continue;

                entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address;
                entry &= __supported_pte_mask;
                set_pmd(pmd, __pmd(entry));
        }
}

static void __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
        pmd_t *pmd = pmd_offset(pud, 0);
        spin_lock(&init_mm.page_table_lock);
        phys_pmd_init(pmd, address, end);
        spin_unlock(&init_mm.page_table_lock);
        __flush_tlb_all();
}

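/*
 * Populate a pud page for the direct mapping: skip ranges with no RAM in the
 * e820 map, update entries that already exist, and allocate a fresh pmd page
 * for the rest.
 */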
static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
{
        int i = pud_index(addr);


        for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
                unsigned long pmd_phys;
                pud_t *pud = pud_page + pud_index(addr);
                pmd_t *pmd;

                if (addr >= end)
                        break;

                if (!after_bootmem && !e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
                        set_pud(pud, __pud(0));
                        continue;
                }

                if (pud_val(*pud)) {
                        phys_pmd_update(pud, addr, end);
                        continue;
                }

                pmd = alloc_low_page(&pmd_phys);
                spin_lock(&init_mm.page_table_lock);
                set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
                phys_pmd_init(pmd, addr, end);
                spin_unlock(&init_mm.page_table_lock);
                unmap_low_page(pmd);
        }
        __flush_tlb();
}

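/*
 * Estimate how much memory the direct-mapping page tables will need for
 * mappings up to 'end' and reserve a contiguous e820 area for them.
 */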
static void __init find_early_table_space(unsigned long end)
{
        unsigned long puds, pmds, tables, start;

        puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
        pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
        tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
                 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);

        /* RED-PEN putting page tables only on node 0 could
           cause a hotspot and fill up ZONE_DMA. The page tables
           need roughly 0.5KB per GB. */
        start = 0x8000;
        table_start = find_e820_area(start, end, tables);
        if (table_start == -1UL)
                panic("Cannot find space for the kernel page tables");

        table_start >>= PAGE_SHIFT;
        table_end = table_start;

        early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
                end, table_start << PAGE_SHIFT,
                (table_start << PAGE_SHIFT) + tables);
}

/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
   This runs before bootmem is initialized and gets pages directly from the
   physical memory. To access them they are temporarily mapped. */
void __meminit init_memory_mapping(unsigned long start, unsigned long end)
{
        unsigned long next;

        Dprintk("init_memory_mapping\n");

        /*
         * Find space for the kernel direct mapping tables.
         * Later we should allocate these tables in the local node of the memory
         * mapped. Unfortunately this is done currently before the nodes are
         * discovered.
         */
        if (!after_bootmem)
                find_early_table_space(end);

        start = (unsigned long)__va(start);
        end = (unsigned long)__va(end);

        for (; start < end; start = next) {
                unsigned long pud_phys;
                pgd_t *pgd = pgd_offset_k(start);
                pud_t *pud;

                if (after_bootmem)
                        pud = pud_offset(pgd, start & PGDIR_MASK);
                else
                        pud = alloc_low_page(&pud_phys);

                next = start + PGDIR_SIZE;
                if (next > end)
                        next = end;
                phys_pud_init(pud, __pa(start), __pa(next));
                if (!after_bootmem)
                        set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
                unmap_low_page(pud);
        }

        if (!after_bootmem)
                asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
        __flush_tlb_all();
}

#ifndef CONFIG_NUMA
void __init paging_init(void)
{
        unsigned long max_zone_pfns[MAX_NR_ZONES];
        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
        max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
        max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
        max_zone_pfns[ZONE_NORMAL] = end_pfn;

        memory_present(0, 0, end_pfn);
        sparse_init();
        free_area_init_nodes(max_zone_pfns);
}
#endif

/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
   from the CPU leading to inconsistent cache lines. address and size
   must be aligned to 2MB boundaries.
   Does nothing when the mapping doesn't exist. */
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
        unsigned long end = address + size;

        BUG_ON(address & ~LARGE_PAGE_MASK);
        BUG_ON(size & ~LARGE_PAGE_MASK);

        for (; address < end; address += LARGE_PAGE_SIZE) {
                pgd_t *pgd = pgd_offset_k(address);
                pud_t *pud;
                pmd_t *pmd;
                if (pgd_none(*pgd))
                        continue;
                pud = pud_offset(pgd, address);
                if (pud_none(*pud))
                        continue;
                pmd = pmd_offset(pud, address);
                if (!pmd || pmd_none(*pmd))
                        continue;
                if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
                        /* Could handle this, but it should not happen currently. */
                        printk(KERN_ERR
                               "clear_kernel_mapping: mapping has been split. will leak memory\n");
                        pmd_ERROR(*pmd);
                }
                set_pmd(pmd, __pmd(0));
        }
        __flush_tlb_all();
}

/*
 * Memory hotplug specific functions
 */
void online_page(struct page *page)
{
        ClearPageReserved(page);
        init_page_count(page);
        __free_page(page);
        totalram_pages++;
        num_physpages++;
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Memory is added always to NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
        struct pglist_data *pgdat = NODE_DATA(nid);
        struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;
        int ret;

        init_memory_mapping(start, (start + size - 1));

        ret = __add_pages(zone, start_pfn, nr_pages);
        if (ret)
                goto error;

        return ret;
error:
        printk("%s: Problem encountered in __add_pages!\n", __func__);
        return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

int remove_memory(u64 start, u64 size)
{
        return -EINVAL;
}
EXPORT_SYMBOL_GPL(remove_memory);

#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
int memory_add_physaddr_to_nid(u64 start)
{
        return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

#endif /* CONFIG_MEMORY_HOTPLUG */

#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
/*
 * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
 * just online the pages.
 */
int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
{
        int err = -EIO;
        unsigned long pfn;
        unsigned long total = 0, mem = 0;
        for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
                if (pfn_valid(pfn)) {
                        online_page(pfn_to_page(pfn));
                        err = 0;
                        mem++;
                }
                total++;
        }
        if (!err) {
                z->spanned_pages += total;
                z->present_pages += mem;
                z->zone_pgdat->node_spanned_pages += total;
                z->zone_pgdat->node_present_pages += mem;
        }
        return err;
}
#endif

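/* Segments exported through /proc/kcore */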
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
                         kcore_vsyscall;

void __init mem_init(void)
{
        long codesize, reservedpages, datasize, initsize;

        pci_iommu_alloc();

        /* clear the zero-page */
        memset(empty_zero_page, 0, PAGE_SIZE);

        reservedpages = 0;

        /* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
        totalram_pages = numa_free_all_bootmem();
#else
        totalram_pages = free_all_bootmem();
#endif
        reservedpages = end_pfn - totalram_pages -
                                        absent_pages_in_range(0, end_pfn);

        after_bootmem = 1;

        codesize = (unsigned long) &_etext - (unsigned long) &_text;
        datasize = (unsigned long) &_edata - (unsigned long) &_etext;
        initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

        /* Register memory areas for /proc/kcore */
        kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
        kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
                   VMALLOC_END-VMALLOC_START);
        kclist_add(&kcore_kernel, &_stext, _end - _stext);
        kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
        kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
                   VSYSCALL_END - VSYSCALL_START);

        printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
                (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
                end_pfn << (PAGE_SHIFT-10),
                codesize >> 10,
                reservedpages << (PAGE_SHIFT-10),
                datasize >> 10,
                initsize >> 10);
}

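/*
 * Return a range of init pages to the page allocator, poisoning them first
 * to catch late references.
 */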
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
        unsigned long addr;

        if (begin >= end)
                return;

        printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
        for (addr = begin; addr < end; addr += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(addr));
                init_page_count(virt_to_page(addr));
                memset((void *)(addr & ~(PAGE_SIZE-1)),
                        POISON_FREE_INITMEM, PAGE_SIZE);
                if (addr >= __START_KERNEL_map)
                        change_page_attr_addr(addr, 1, __pgprot(0));
                free_page(addr);
                totalram_pages++;
        }
        if (addr > __START_KERNEL_map)
                global_flush_tlb();
}

void free_initmem(void)
{
        free_init_pages("unused kernel memory",
                        (unsigned long)(&__init_begin),
                        (unsigned long)(&__init_end));
}

#ifdef CONFIG_DEBUG_RODATA

void mark_rodata_ro(void)
{
        unsigned long start = (unsigned long)_stext, end;

#ifdef CONFIG_HOTPLUG_CPU
        /* It must still be possible to apply SMP alternatives. */
        if (num_possible_cpus() > 1)
                start = (unsigned long)_etext;
#endif

#ifdef CONFIG_KPROBES
        start = (unsigned long)__start_rodata;
#endif

        end = (unsigned long)__end_rodata;
        start = (start + PAGE_SIZE - 1) & PAGE_MASK;
        end &= PAGE_MASK;
        if (end <= start)
                return;

        change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);

        printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
               (end - start) >> 10);

        /*
         * change_page_attr_addr() requires a global_flush_tlb() call after it.
         * We do this after the printk so that if something went wrong in the
         * change, the printk gets out at least to give a better debug hint
         * of who is the culprit.
         */
        global_flush_tlb();
}
#endif

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
        free_init_pages("initrd memory", start, end);
}
#endif

void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
#ifdef CONFIG_NUMA
        int nid = phys_to_nid(phys);
#endif
        unsigned long pfn = phys >> PAGE_SHIFT;
        if (pfn >= end_pfn) {
                /* This can happen with kdump kernels when accessing firmware
                   tables. */
                if (pfn < end_pfn_map)
                        return;
                printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
                                phys, len);
                return;
        }

        /* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
        reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
        reserve_bootmem(phys, len);
#endif
        if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
                dma_reserve += len / PAGE_SIZE;
                set_dma_reserve(dma_reserve);
        }
}

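/*
 * Walk the kernel page tables to check whether a virtual address is actually
 * mapped (handles 2MB large pages as well).
 */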
int kern_addr_valid(unsigned long addr)
{
        unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        if (above != 0 && above != -1UL)
                return 0;

        pgd = pgd_offset_k(addr);
        if (pgd_none(*pgd))
                return 0;

        pud = pud_offset(pgd, addr);
        if (pud_none(*pud))
                return 0;

        pmd = pmd_offset(pud, addr);
        if (pmd_none(*pmd))
                return 0;
        if (pmd_large(*pmd))
                return pfn_valid(pmd_pfn(*pmd));

        pte = pte_offset_kernel(pmd, addr);
        if (pte_none(*pte))
                return 0;
        return pfn_valid(pte_pfn(*pte));
}

/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
   not need special handling anymore. */

static struct vm_area_struct gate_vma = {
        .vm_start = VSYSCALL_START,
        .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
        .vm_page_prot = PAGE_READONLY_EXEC,
        .vm_flags = VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
        if (test_tsk_thread_flag(tsk, TIF_IA32))
                return NULL;
#endif
        return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
        struct vm_area_struct *vma = get_gate_vma(task);
        if (!vma)
                return 0;
        return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/* Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives.
 */
int in_gate_area_no_task(unsigned long addr)
{
        return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}

void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
{
        return __alloc_bootmem_core(pgdat->bdata, size,
                        SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
}

const char *arch_vma_name(struct vm_area_struct *vma)
{
        if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
                return "[vdso]";
        if (vma == &gate_vma)
                return "[vsyscall]";
        return NULL;
}