/*
 * linux/arch/x86_64/mm/init.c
 *
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
 * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif

const struct dma_mapping_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguous on the
 * physical space so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */

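/*
 * Dump a summary of page usage for every online node (called e.g. from
 * the sysrq-m handler and on out-of-memory conditions).
 */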
void show_mem(void)
{
        long i, total = 0, reserved = 0;
        long shared = 0, cached = 0;
        pg_data_t *pgdat;
        struct page *page;

        printk(KERN_INFO "Mem-info:\n");
        show_free_areas();
        printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));

        for_each_online_pgdat(pgdat) {
                for (i = 0; i < pgdat->node_spanned_pages; ++i) {
                        /* this loop can take a while with 256 GB and 4k pages
                           so update the NMI watchdog */
                        if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
                                touch_nmi_watchdog();
                        }
                        if (!pfn_valid(pgdat->node_start_pfn + i))
                                continue;
                        page = pfn_to_page(pgdat->node_start_pfn + i);
                        total++;
                        if (PageReserved(page))
                                reserved++;
                        else if (PageSwapCache(page))
                                cached++;
                        else if (page_count(page))
                                shared += page_count(page) - 1;
                }
        }
        printk(KERN_INFO "%ld pages of RAM\n", total);
        printk(KERN_INFO "%ld reserved pages\n", reserved);
        printk(KERN_INFO "%ld pages shared\n", shared);
        printk(KERN_INFO "%ld pages swap cached\n", cached);
}

int after_bootmem;

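/*
 * Allocate a zeroed page for an intermediate page table: from the bootmem
 * allocator while it is still up, from the page allocator afterwards.
 */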
static __init void *spp_getpage(void)
{
        void *ptr;
        if (after_bootmem)
                ptr = (void *) get_zeroed_page(GFP_ATOMIC);
        else
                ptr = alloc_bootmem_pages(PAGE_SIZE);
        if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
                panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem ? "after bootmem" : "");

        Dprintk("spp_getpage %p\n", ptr);
        return ptr;
}

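/*
 * Map a single kernel virtual address to the given physical address with
 * the requested protection, building any missing intermediate page-table
 * levels with spp_getpage(). Used by __set_fixmap() below.
 */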
static __init void set_pte_phys(unsigned long vaddr,
                        unsigned long phys, pgprot_t prot)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte, new_pte;

        Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

        pgd = pgd_offset_k(vaddr);
        if (pgd_none(*pgd)) {
                printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
                return;
        }
        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
                pmd = (pmd_t *) spp_getpage();
                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
                if (pmd != pmd_offset(pud, 0)) {
                        printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
                        return;
                }
        }
        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                pte = (pte_t *) spp_getpage();
                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
                if (pte != pte_offset_kernel(pmd, 0)) {
                        printk("PAGETABLE BUG #02!\n");
                        return;
                }
        }
        new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

        pte = pte_offset_kernel(pmd, vaddr);
        if (!pte_none(*pte) &&
            pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
                pte_ERROR(*pte);
        set_pte(pte, new_pte);

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}

/* NOTE: this is meant to be run only at boot */
void __init
__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                printk("Invalid __set_fixmap\n");
                return;
        }
        set_pte_phys(address, phys, prot);
}

static unsigned long __initdata table_start;
static unsigned long __meminitdata table_end;

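/*
 * Hand out a zeroed page for building the early page tables. Before
 * bootmem is up the pages come from the area reserved by
 * find_early_table_space() and are temporarily mapped via early_ioremap();
 * unmap_low_page() undoes that temporary mapping.
 */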
static __meminit void *alloc_low_page(unsigned long *phys)
{
        unsigned long pfn = table_end++;
        void *adr;

        if (after_bootmem) {
                adr = (void *)get_zeroed_page(GFP_ATOMIC);
                *phys = __pa(adr);
                return adr;
        }

        if (pfn >= end_pfn)
                panic("alloc_low_page: ran out of memory");

        adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
        memset(adr, 0, PAGE_SIZE);
        *phys = pfn * PAGE_SIZE;
        return adr;
}

static __meminit void unmap_low_page(void *adr)
{
        if (after_bootmem)
                return;

        early_iounmap(adr, PAGE_SIZE);
}

/* Must run before zap_low_mappings */
__meminit void *early_ioremap(unsigned long addr, unsigned long size)
{
        unsigned long vaddr;
        pmd_t *pmd, *last_pmd;
        int i, pmds;

        pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
        vaddr = __START_KERNEL_map;
        pmd = level2_kernel_pgt;
        last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
        for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
                for (i = 0; i < pmds; i++) {
                        if (pmd_present(pmd[i]))
                                goto next;
                }
                vaddr += addr & ~PMD_MASK;
                addr &= PMD_MASK;
                for (i = 0; i < pmds; i++, addr += PMD_SIZE)
                        set_pmd(pmd + i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
                __flush_tlb_all();
                return (void *)vaddr;
        next:
                ;
        }
        printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
        return NULL;
}

/* To avoid virtual aliases later */
__meminit void early_iounmap(void *addr, unsigned long size)
{
        unsigned long vaddr;
        pmd_t *pmd;
        int i, pmds;

        vaddr = (unsigned long)addr;
        pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
        pmd = level2_kernel_pgt + pmd_index(vaddr);
        for (i = 0; i < pmds; i++)
                pmd_clear(pmd + i);
        __flush_tlb_all();
}

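/*
 * Fill one PMD page with 2MB kernel mappings covering [address, end).
 * Slots that are already populated are left untouched.
 */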
static void __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
{
        int i = pmd_index(address);

        for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
                unsigned long entry;
                pmd_t *pmd = pmd_page + pmd_index(address);

                if (address >= end) {
                        if (!after_bootmem)
                                for (; i < PTRS_PER_PMD; i++, pmd++)
                                        set_pmd(pmd, __pmd(0));
                        break;
                }

                if (pmd_val(*pmd))
                        continue;

                entry = __PAGE_KERNEL_LARGE|_PAGE_GLOBAL|address;
                entry &= __supported_pte_mask;
                set_pmd(pmd, __pmd(entry));
        }
}

static void __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
        pmd_t *pmd = pmd_offset(pud, 0);
        spin_lock(&init_mm.page_table_lock);
        phys_pmd_init(pmd, address, end);
        spin_unlock(&init_mm.page_table_lock);
        __flush_tlb_all();
}

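/*
 * Populate one PUD page for the direct mapping of [addr, end), allocating
 * PMD pages as needed and skipping ranges that the e820 map says contain
 * no memory.
 */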
static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
{
        int i = pud_index(addr);

        for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
                unsigned long pmd_phys;
                pud_t *pud = pud_page + pud_index(addr);
                pmd_t *pmd;

                if (addr >= end)
                        break;

                if (!after_bootmem && !e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
                        set_pud(pud, __pud(0));
                        continue;
                }

                if (pud_val(*pud)) {
                        phys_pmd_update(pud, addr, end);
                        continue;
                }

                pmd = alloc_low_page(&pmd_phys);
                spin_lock(&init_mm.page_table_lock);
                set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
                phys_pmd_init(pmd, addr, end);
                spin_unlock(&init_mm.page_table_lock);
                unmap_low_page(pmd);
        }
        __flush_tlb_all();
}

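/*
 * Estimate how much memory the kernel direct-mapping page tables will need
 * for physical memory up to 'end' and reserve a suitable area from the
 * e820 map for them.
 */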
static void __init find_early_table_space(unsigned long end)
{
        unsigned long puds, pmds, tables, start;

        puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
        pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
        tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
                 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);

        /* RED-PEN putting page tables only on node 0 could
           cause a hotspot and fill up ZONE_DMA. The page tables
           need roughly 0.5KB per GB. */
        start = 0x8000;
        table_start = find_e820_area(start, end, tables);
        if (table_start == -1UL)
                panic("Cannot find space for the kernel page tables");

        table_start >>= PAGE_SHIFT;
        table_end = table_start;

        early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
                end, table_start << PAGE_SHIFT,
                (table_start << PAGE_SHIFT) + tables);
}

/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
   This runs before bootmem is initialized and gets pages directly from
   physical memory. To access them they are temporarily mapped. */
void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
{
        unsigned long next;

        Dprintk("init_memory_mapping\n");

        /*
         * Find space for the kernel direct mapping tables.
         * Later we should allocate these tables in the local node of the
         * memory mapped. Unfortunately this is currently done before the
         * nodes are discovered.
         */
        if (!after_bootmem)
                find_early_table_space(end);

        start = (unsigned long)__va(start);
        end = (unsigned long)__va(end);

        for (; start < end; start = next) {
                unsigned long pud_phys;
                pgd_t *pgd = pgd_offset_k(start);
                pud_t *pud;

                if (after_bootmem)
                        pud = pud_offset(pgd, start & PGDIR_MASK);
                else
                        pud = alloc_low_page(&pud_phys);

                next = start + PGDIR_SIZE;
                if (next > end)
                        next = end;
                phys_pud_init(pud, __pa(start), __pa(next));
                if (!after_bootmem)
                        set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
                unmap_low_page(pud);
        }

        if (!after_bootmem)
                mmu_cr4_features = read_cr4();
        __flush_tlb_all();

        reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
}

#ifndef CONFIG_NUMA
void __init paging_init(void)
{
        unsigned long max_zone_pfns[MAX_NR_ZONES];
        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
        max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
        max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
        max_zone_pfns[ZONE_NORMAL] = end_pfn;

        memory_present(0, 0, end_pfn);
        sparse_init();
        free_area_init_nodes(max_zone_pfns);
}
#endif

/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
   from the CPU leading to inconsistent cache lines. address and size
   must be aligned to 2MB boundaries.
   Does nothing when the mapping doesn't exist. */
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
        unsigned long end = address + size;

        BUG_ON(address & ~LARGE_PAGE_MASK);
        BUG_ON(size & ~LARGE_PAGE_MASK);

        for (; address < end; address += LARGE_PAGE_SIZE) {
                pgd_t *pgd = pgd_offset_k(address);
                pud_t *pud;
                pmd_t *pmd;
                if (pgd_none(*pgd))
                        continue;
                pud = pud_offset(pgd, address);
                if (pud_none(*pud))
                        continue;
                pmd = pmd_offset(pud, address);
                if (!pmd || pmd_none(*pmd))
                        continue;
                if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
                        /* Could handle this, but it should not happen currently. */
                        printk(KERN_ERR
                               "clear_kernel_mapping: mapping has been split. will leak memory\n");
                        pmd_ERROR(*pmd);
                }
                set_pmd(pmd, __pmd(0));
        }
        __flush_tlb_all();
}

/*
 * Memory hotplug specific functions
 */
void online_page(struct page *page)
{
        ClearPageReserved(page);
        init_page_count(page);
        __free_page(page);
        totalram_pages++;
        num_physpages++;
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Memory is always added to the NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
        struct pglist_data *pgdat = NODE_DATA(nid);
        struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;
        int ret;

        init_memory_mapping(start, (start + size - 1));

        ret = __add_pages(zone, start_pfn, nr_pages);
        if (ret)
                goto error;

        return ret;
error:
        printk("%s: Problem encountered in __add_pages!\n", __func__);
        return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
int memory_add_physaddr_to_nid(u64 start)
{
        return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

#endif /* CONFIG_MEMORY_HOTPLUG */

static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
                         kcore_vsyscall;

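/*
 * Hand all of bootmem over to the page allocator, register the /proc/kcore
 * regions and print the "Memory: ..." boot summary.
 */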
void __init mem_init(void)
{
        long codesize, reservedpages, datasize, initsize;

        pci_iommu_alloc();

        /* clear_bss() already cleared the empty_zero_page */

        /* temporary debugging - double check it's true: */
        {
                int i;

                for (i = 0; i < 1024; i++)
                        WARN_ON_ONCE(empty_zero_page[i]);
        }

        reservedpages = 0;

        /* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
        totalram_pages = numa_free_all_bootmem();
#else
        totalram_pages = free_all_bootmem();
#endif
        reservedpages = end_pfn - totalram_pages -
                                        absent_pages_in_range(0, end_pfn);

        after_bootmem = 1;

        codesize = (unsigned long) &_etext - (unsigned long) &_text;
        datasize = (unsigned long) &_edata - (unsigned long) &_etext;
        initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

        /* Register memory areas for /proc/kcore */
        kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
        kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
                   VMALLOC_END-VMALLOC_START);
        kclist_add(&kcore_kernel, &_stext, _end - _stext);
        kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
        kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
                   VSYSCALL_END - VSYSCALL_START);

        printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
                (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
                end_pfn << (PAGE_SHIFT-10),
                codesize >> 10,
                reservedpages << (PAGE_SHIFT-10),
                datasize >> 10,
                initsize >> 10);
}

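/*
 * Free the pages of an init section after poisoning them; with
 * CONFIG_DEBUG_PAGEALLOC the pages are instead left in place but unmapped
 * so that stray accesses fault.
 */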
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
        unsigned long addr;

        if (begin >= end)
                return;

        /*
         * If debugging page accesses then do not free this memory but
         * mark them not present - any buggy init-section access will
         * create a kernel page fault:
         */
#ifdef CONFIG_DEBUG_PAGEALLOC
        printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
                begin, PAGE_ALIGN(end));
        set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
#else
        printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
        for (addr = begin; addr < end; addr += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(addr));
                init_page_count(virt_to_page(addr));
                memset((void *)(addr & ~(PAGE_SIZE-1)),
                        POISON_FREE_INITMEM, PAGE_SIZE);
                free_page(addr);
                totalram_pages++;
        }
#ifdef CONFIG_DEBUG_RODATA
        /*
         * This will make the __init pages not present and
         * not executable, so that any attempt to use a
         * __init function from now on will fault immediately
         * rather than spuriously later when memory gets reused.
         *
         * We only do this for DEBUG_RODATA to not break up the
         * 2Mb kernel mapping just for this debug feature.
         */
        if (begin >= __START_KERNEL_map) {
                set_memory_rw(begin, (end - begin)/PAGE_SIZE);
                set_memory_np(begin, (end - begin)/PAGE_SIZE);
                set_memory_nx(begin, (end - begin)/PAGE_SIZE);
        }
#endif
#endif
}

void free_initmem(void)
{
        free_init_pages("unused kernel memory",
                        (unsigned long)(&__init_begin),
                        (unsigned long)(&__init_end));
}

#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);

void mark_rodata_ro(void)
{
        unsigned long start = (unsigned long)_stext, end;

#ifdef CONFIG_HOTPLUG_CPU
        /* It must still be possible to apply SMP alternatives. */
        if (num_possible_cpus() > 1)
                start = (unsigned long)_etext;
#endif

#ifdef CONFIG_KPROBES
        start = (unsigned long)__start_rodata;
#endif

        end = (unsigned long)__end_rodata;
        start = (start + PAGE_SIZE - 1) & PAGE_MASK;
        end &= PAGE_MASK;
        if (end <= start)
                return;

        set_memory_ro(start, (end - start) >> PAGE_SHIFT);

        printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
               (end - start) >> 10);

        rodata_test();

#ifdef CONFIG_CPA_DEBUG
        printk("Testing CPA: undo %lx-%lx\n", start, end);
        set_memory_rw(start, (end-start) >> PAGE_SHIFT);

        printk("Testing CPA: again\n");
        set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif
}
#endif

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
        free_init_pages("initrd memory", start, end);
}
#endif

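/*
 * Reserve a physical range with the bootmem allocator (on the owning node
 * when NUMA is enabled) and account any pages taken out of the DMA zone.
 */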
void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
#ifdef CONFIG_NUMA
        int nid = phys_to_nid(phys);
#endif
        unsigned long pfn = phys >> PAGE_SHIFT;
        if (pfn >= end_pfn) {
                /* This can happen with kdump kernels when accessing firmware
                   tables. */
                if (pfn < end_pfn_map)
                        return;
                printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
                        phys, len);
                return;
        }

        /* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
        reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
        reserve_bootmem(phys, len);
#endif
        if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
                dma_reserve += len / PAGE_SIZE;
                set_dma_reserve(dma_reserve);
        }
}

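/*
 * Walk the kernel page tables to check whether a virtual address is really
 * mapped (used e.g. when reading /proc/kcore).
 */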
int kern_addr_valid(unsigned long addr)
{
        unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        if (above != 0 && above != -1UL)
                return 0;

        pgd = pgd_offset_k(addr);
        if (pgd_none(*pgd))
                return 0;

        pud = pud_offset(pgd, addr);
        if (pud_none(*pud))
                return 0;

        pmd = pmd_offset(pud, addr);
        if (pmd_none(*pmd))
                return 0;
        if (pmd_large(*pmd))
                return pfn_valid(pmd_pfn(*pmd));

        pte = pte_offset_kernel(pmd, addr);
        if (pte_none(*pte))
                return 0;
        return pfn_valid(pte_pfn(*pte));
}

/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
   not need special handling anymore. */
static struct vm_area_struct gate_vma = {
        .vm_start = VSYSCALL_START,
        .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
        .vm_page_prot = PAGE_READONLY_EXEC,
        .vm_flags = VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
        if (test_tsk_thread_flag(tsk, TIF_IA32))
                return NULL;
#endif
        return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
        struct vm_area_struct *vma = get_gate_vma(task);
        if (!vma)
                return 0;
        return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/* Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives.
 */
int in_gate_area_no_task(unsigned long addr)
{
        return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}

const char *arch_vma_name(struct vm_area_struct *vma)
{
        if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
                return "[vdso]";
        if (vma == &gate_vma)
                return "[vsyscall]";
        return NULL;
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
int __meminit vmemmap_populate(struct page *start_page,
                               unsigned long size, int node)
{
        unsigned long addr = (unsigned long)start_page;
        unsigned long end = (unsigned long)(start_page + size);
        unsigned long next;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;

        for (; addr < end; addr = next) {
                next = pmd_addr_end(addr, end);

                pgd = vmemmap_pgd_populate(addr, node);
                if (!pgd)
                        return -ENOMEM;
                pud = vmemmap_pud_populate(pgd, addr, node);
                if (!pud)
                        return -ENOMEM;

                pmd = pmd_offset(pud, addr);
                if (pmd_none(*pmd)) {
                        pte_t entry;
                        void *p = vmemmap_alloc_block(PMD_SIZE, node);
                        if (!p)
                                return -ENOMEM;

                        entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL_LARGE);
                        set_pmd(pmd, __pmd(pte_val(entry)));

                        printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
                                addr, addr + PMD_SIZE - 1, p, node);
                } else
                        vmemmap_verify((pte_t *)pmd, node, addr, next);
        }

        return 0;
}
#endif