/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */
8
Linus Torvalds1da177e2005-04-16 15:20:36 -07009#include <linux/signal.h>
10#include <linux/sched.h>
11#include <linux/kernel.h>
12#include <linux/errno.h>
13#include <linux/string.h>
14#include <linux/types.h>
15#include <linux/ptrace.h>
16#include <linux/mman.h>
17#include <linux/mm.h>
18#include <linux/swap.h>
19#include <linux/smp.h>
20#include <linux/init.h>
21#include <linux/pagemap.h>
22#include <linux/bootmem.h>
23#include <linux/proc_fs.h>
Andi Kleen59170892005-11-05 17:25:53 +010024#include <linux/pci.h>
Jan Beulich6fb14752007-05-02 19:27:10 +020025#include <linux/pfn.h>
Randy Dunlapc9cf5522006-06-27 02:53:52 -070026#include <linux/poison.h>
Muli Ben-Yehuda17a941d2006-01-11 22:44:42 +010027#include <linux/dma-mapping.h>
Matt Tolentino44df75e2006-01-17 07:03:41 +010028#include <linux/module.h>
29#include <linux/memory_hotplug.h>
Konrad Rzeszutekae32b122007-05-02 19:27:11 +020030#include <linux/nmi.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070031
32#include <asm/processor.h>
33#include <asm/system.h>
34#include <asm/uaccess.h>
35#include <asm/pgtable.h>
36#include <asm/pgalloc.h>
37#include <asm/dma.h>
38#include <asm/fixmap.h>
39#include <asm/e820.h>
40#include <asm/apic.h>
41#include <asm/tlb.h>
42#include <asm/mmu_context.h>
43#include <asm/proto.h>
44#include <asm/smp.h>
Andi Kleen2bc04142005-11-05 17:25:53 +010045#include <asm/sections.h>
Thomas Gleixner718fc132008-01-30 13:30:17 +010046#include <asm/kdebug.h>
Thomas Gleixneraaa64e02008-01-30 13:30:17 +010047#include <asm/numa.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070048
/* Debug trace hook: expands to nothing unless Dprintk was already defined. */
#ifndef Dprintk
#define Dprintk(x...)
#endif
52
Stephen Hemmingere6584502007-05-02 19:27:06 +020053const struct dma_mapping_ops* dma_ops;
Muli Ben-Yehuda17a941d2006-01-11 22:44:42 +010054EXPORT_SYMBOL(dma_ops);
55
Andi Kleene18c6872005-11-05 17:25:53 +010056static unsigned long dma_reserve __initdata;
57
Linus Torvalds1da177e2005-04-16 15:20:36 -070058DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
59
/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously in
 * physical space, so we can cache the location of the first one and move
 * around without checking the pgd every time.
 */
65
66void show_mem(void)
67{
Andi Kleene92343c2005-09-12 18:49:24 +020068 long i, total = 0, reserved = 0;
69 long shared = 0, cached = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -070070 pg_data_t *pgdat;
71 struct page *page;
72
Andi Kleene92343c2005-09-12 18:49:24 +020073 printk(KERN_INFO "Mem-info:\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -070074 show_free_areas();
Andi Kleene92343c2005-09-12 18:49:24 +020075 printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
Linus Torvalds1da177e2005-04-16 15:20:36 -070076
KAMEZAWA Hiroyukiec936fc2006-03-27 01:15:59 -080077 for_each_online_pgdat(pgdat) {
Linus Torvalds1da177e2005-04-16 15:20:36 -070078 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
Konrad Rzeszutekae32b122007-05-02 19:27:11 +020079 /* this loop can take a while with 256 GB and 4k pages
80 so update the NMI watchdog */
81 if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
82 touch_nmi_watchdog();
83 }
Bob Picco12710a52007-06-08 13:47:00 -070084 if (!pfn_valid(pgdat->node_start_pfn + i))
85 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -070086 page = pfn_to_page(pgdat->node_start_pfn + i);
87 total++;
Andi Kleene92343c2005-09-12 18:49:24 +020088 if (PageReserved(page))
89 reserved++;
90 else if (PageSwapCache(page))
91 cached++;
92 else if (page_count(page))
93 shared += page_count(page) - 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -070094 }
95 }
Andi Kleene92343c2005-09-12 18:49:24 +020096 printk(KERN_INFO "%lu pages of RAM\n", total);
97 printk(KERN_INFO "%lu reserved pages\n",reserved);
98 printk(KERN_INFO "%lu pages shared\n",shared);
99 printk(KERN_INFO "%lu pages swap cached\n",cached);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700100}
101
/* Set to 1 by mem_init() once the bootmem pages have been released. */
int after_bootmem;
103
Andi Kleen5f44a662006-03-25 16:30:25 +0100104static __init void *spp_getpage(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700105{
106 void *ptr;
107 if (after_bootmem)
108 ptr = (void *) get_zeroed_page(GFP_ATOMIC);
109 else
110 ptr = alloc_bootmem_pages(PAGE_SIZE);
111 if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
112 panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
113
114 Dprintk("spp_getpage %p\n", ptr);
115 return ptr;
116}
117
Andi Kleen5f44a662006-03-25 16:30:25 +0100118static __init void set_pte_phys(unsigned long vaddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700119 unsigned long phys, pgprot_t prot)
120{
121 pgd_t *pgd;
122 pud_t *pud;
123 pmd_t *pmd;
124 pte_t *pte, new_pte;
125
126 Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
127
128 pgd = pgd_offset_k(vaddr);
129 if (pgd_none(*pgd)) {
130 printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
131 return;
132 }
133 pud = pud_offset(pgd, vaddr);
134 if (pud_none(*pud)) {
135 pmd = (pmd_t *) spp_getpage();
136 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
137 if (pmd != pmd_offset(pud, 0)) {
138 printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
139 return;
140 }
141 }
142 pmd = pmd_offset(pud, vaddr);
143 if (pmd_none(*pmd)) {
144 pte = (pte_t *) spp_getpage();
145 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
146 if (pte != pte_offset_kernel(pmd, 0)) {
147 printk("PAGETABLE BUG #02!\n");
148 return;
149 }
150 }
151 new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
152
153 pte = pte_offset_kernel(pmd, vaddr);
154 if (!pte_none(*pte) &&
155 pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
156 pte_ERROR(*pte);
157 set_pte(pte, new_pte);
158
159 /*
160 * It's enough to flush this one mapping.
161 * (PGE mappings get flushed as well)
162 */
163 __flush_tlb_one(vaddr);
164}
165
166/* NOTE: this is meant to be run only at boot */
Andi Kleen5f44a662006-03-25 16:30:25 +0100167void __init
168__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700169{
170 unsigned long address = __fix_to_virt(idx);
171
172 if (idx >= __end_of_fixed_addresses) {
173 printk("Invalid __set_fixmap\n");
174 return;
175 }
176 set_pte_phys(address, phys, prot);
177}
178
Andi Kleen75175272008-01-30 13:33:17 +0100179static unsigned long __initdata table_start;
180static unsigned long __meminitdata table_end;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700181
Vivek Goyaldafe41e2007-05-02 19:27:06 +0200182static __meminit void *alloc_low_page(unsigned long *phys)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700183{
Vivek Goyaldafe41e2007-05-02 19:27:06 +0200184 unsigned long pfn = table_end++;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700185 void *adr;
186
Matt Tolentino44df75e2006-01-17 07:03:41 +0100187 if (after_bootmem) {
188 adr = (void *)get_zeroed_page(GFP_ATOMIC);
189 *phys = __pa(adr);
190 return adr;
191 }
192
Linus Torvalds1da177e2005-04-16 15:20:36 -0700193 if (pfn >= end_pfn)
194 panic("alloc_low_page: ran out of memory");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700195
Vivek Goyaldafe41e2007-05-02 19:27:06 +0200196 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
197 memset(adr, 0, PAGE_SIZE);
198 *phys = pfn * PAGE_SIZE;
199 return adr;
200}
201
202static __meminit void unmap_low_page(void *adr)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700203{
Matt Tolentino44df75e2006-01-17 07:03:41 +0100204
205 if (after_bootmem)
206 return;
207
Vivek Goyaldafe41e2007-05-02 19:27:06 +0200208 early_iounmap(adr, PAGE_SIZE);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700209}
210
Andi Kleenf2d3efe2006-03-25 16:30:22 +0100211/* Must run before zap_low_mappings */
Yasunori Gotoa3142c82007-05-08 00:23:07 -0700212__meminit void *early_ioremap(unsigned long addr, unsigned long size)
Andi Kleenf2d3efe2006-03-25 16:30:22 +0100213{
Vivek Goyaldafe41e2007-05-02 19:27:06 +0200214 unsigned long vaddr;
215 pmd_t *pmd, *last_pmd;
216 int i, pmds;
Andi Kleenf2d3efe2006-03-25 16:30:22 +0100217
Vivek Goyaldafe41e2007-05-02 19:27:06 +0200218 pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
219 vaddr = __START_KERNEL_map;
220 pmd = level2_kernel_pgt;
221 last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
222 for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
223 for (i = 0; i < pmds; i++) {
224 if (pmd_present(pmd[i]))
225 goto next;
226 }
227 vaddr += addr & ~PMD_MASK;
228 addr &= PMD_MASK;
229 for (i = 0; i < pmds; i++, addr += PMD_SIZE)
Joerg Roedel929fd582008-01-30 13:31:08 +0100230 set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
Andi Kleen1a2b4412008-01-30 13:33:54 +0100231 __flush_tlb_all();
Vivek Goyaldafe41e2007-05-02 19:27:06 +0200232 return (void *)vaddr;
233 next:
234 ;
Andi Kleenf2d3efe2006-03-25 16:30:22 +0100235 }
Vivek Goyaldafe41e2007-05-02 19:27:06 +0200236 printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
237 return NULL;
Andi Kleenf2d3efe2006-03-25 16:30:22 +0100238}
239
240/* To avoid virtual aliases later */
Yasunori Gotoa3142c82007-05-08 00:23:07 -0700241__meminit void early_iounmap(void *addr, unsigned long size)
Andi Kleenf2d3efe2006-03-25 16:30:22 +0100242{
Vivek Goyaldafe41e2007-05-02 19:27:06 +0200243 unsigned long vaddr;
244 pmd_t *pmd;
245 int i, pmds;
246
247 vaddr = (unsigned long)addr;
248 pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
249 pmd = level2_kernel_pgt + pmd_index(vaddr);
250 for (i = 0; i < pmds; i++)
251 pmd_clear(pmd + i);
Andi Kleen1a2b4412008-01-30 13:33:54 +0100252 __flush_tlb_all();
Andi Kleenf2d3efe2006-03-25 16:30:22 +0100253}
254
Matt Tolentino44df75e2006-01-17 07:03:41 +0100255static void __meminit
Keith Mannthey6ad91652006-09-26 10:52:36 +0200256phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
Matt Tolentino44df75e2006-01-17 07:03:41 +0100257{
Keith Mannthey6ad91652006-09-26 10:52:36 +0200258 int i = pmd_index(address);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700259
Keith Mannthey6ad91652006-09-26 10:52:36 +0200260 for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
Matt Tolentino44df75e2006-01-17 07:03:41 +0100261 unsigned long entry;
Keith Mannthey6ad91652006-09-26 10:52:36 +0200262 pmd_t *pmd = pmd_page + pmd_index(address);
Matt Tolentino44df75e2006-01-17 07:03:41 +0100263
Jan Beulich5f51e132006-06-26 13:59:02 +0200264 if (address >= end) {
265 if (!after_bootmem)
266 for (; i < PTRS_PER_PMD; i++, pmd++)
267 set_pmd(pmd, __pmd(0));
Matt Tolentino44df75e2006-01-17 07:03:41 +0100268 break;
269 }
Keith Mannthey6ad91652006-09-26 10:52:36 +0200270
271 if (pmd_val(*pmd))
272 continue;
273
Joerg Roedel40842bf2008-01-30 13:31:02 +0100274 entry = __PAGE_KERNEL_LARGE|_PAGE_GLOBAL|address;
Matt Tolentino44df75e2006-01-17 07:03:41 +0100275 entry &= __supported_pte_mask;
276 set_pmd(pmd, __pmd(entry));
277 }
278}
279
280static void __meminit
281phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
282{
Keith Mannthey6ad91652006-09-26 10:52:36 +0200283 pmd_t *pmd = pmd_offset(pud,0);
284 spin_lock(&init_mm.page_table_lock);
285 phys_pmd_init(pmd, address, end);
286 spin_unlock(&init_mm.page_table_lock);
287 __flush_tlb_all();
Matt Tolentino44df75e2006-01-17 07:03:41 +0100288}
289
Keith Mannthey6ad91652006-09-26 10:52:36 +0200290static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
Matt Tolentino44df75e2006-01-17 07:03:41 +0100291{
Keith Mannthey6ad91652006-09-26 10:52:36 +0200292 int i = pud_index(addr);
Matt Tolentino44df75e2006-01-17 07:03:41 +0100293
Matt Tolentino44df75e2006-01-17 07:03:41 +0100294
Keith Mannthey6ad91652006-09-26 10:52:36 +0200295 for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
Keith Mannthey6ad91652006-09-26 10:52:36 +0200296 unsigned long pmd_phys;
297 pud_t *pud = pud_page + pud_index(addr);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700298 pmd_t *pmd;
299
Keith Mannthey6ad91652006-09-26 10:52:36 +0200300 if (addr >= end)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700301 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700302
Keith Mannthey6ad91652006-09-26 10:52:36 +0200303 if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700304 set_pud(pud, __pud(0));
305 continue;
306 }
307
Keith Mannthey6ad91652006-09-26 10:52:36 +0200308 if (pud_val(*pud)) {
309 phys_pmd_update(pud, addr, end);
310 continue;
311 }
312
Vivek Goyaldafe41e2007-05-02 19:27:06 +0200313 pmd = alloc_low_page(&pmd_phys);
Matt Tolentino44df75e2006-01-17 07:03:41 +0100314 spin_lock(&init_mm.page_table_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700315 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
Keith Mannthey6ad91652006-09-26 10:52:36 +0200316 phys_pmd_init(pmd, addr, end);
Matt Tolentino44df75e2006-01-17 07:03:41 +0100317 spin_unlock(&init_mm.page_table_lock);
Vivek Goyaldafe41e2007-05-02 19:27:06 +0200318 unmap_low_page(pmd);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700319 }
Andi Kleen1a2b4412008-01-30 13:33:54 +0100320 __flush_tlb_all();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700321}
322
323static void __init find_early_table_space(unsigned long end)
324{
Andi Kleen6c5acd12006-01-11 22:46:57 +0100325 unsigned long puds, pmds, tables, start;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700326
327 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
328 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
329 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
330 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
331
Andi Kleenee408c72006-01-16 01:56:51 +0100332 /* RED-PEN putting page tables only on node 0 could
333 cause a hotspot and fill up ZONE_DMA. The page tables
334 need roughly 0.5KB per GB. */
335 start = 0x8000;
336 table_start = find_e820_area(start, end, tables);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700337 if (table_start == -1UL)
338 panic("Cannot find space for the kernel page tables");
339
340 table_start >>= PAGE_SHIFT;
341 table_end = table_start;
Matt Tolentino44df75e2006-01-17 07:03:41 +0100342
343 early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
Jan Beulich5f51e132006-06-26 13:59:02 +0200344 end, table_start << PAGE_SHIFT,
345 (table_start << PAGE_SHIFT) + tables);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700346}
347
348/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
349 This runs before bootmem is initialized and gets pages directly from the
350 physical memory. To access them they are temporarily mapped. */
KAMEZAWA Hiroyukib6fd6ec2007-11-28 16:21:58 -0800351void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700352{
353 unsigned long next;
354
355 Dprintk("init_memory_mapping\n");
356
357 /*
358 * Find space for the kernel direct mapping tables.
359 * Later we should allocate these tables in the local node of the memory
360 * mapped. Unfortunately this is done currently before the nodes are
361 * discovered.
362 */
Matt Tolentino44df75e2006-01-17 07:03:41 +0100363 if (!after_bootmem)
364 find_early_table_space(end);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700365
366 start = (unsigned long)__va(start);
367 end = (unsigned long)__va(end);
368
369 for (; start < end; start = next) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700370 unsigned long pud_phys;
Matt Tolentino44df75e2006-01-17 07:03:41 +0100371 pgd_t *pgd = pgd_offset_k(start);
372 pud_t *pud;
373
374 if (after_bootmem)
Andi Kleend2ae5b52006-06-26 13:57:56 +0200375 pud = pud_offset(pgd, start & PGDIR_MASK);
Matt Tolentino44df75e2006-01-17 07:03:41 +0100376 else
Vivek Goyaldafe41e2007-05-02 19:27:06 +0200377 pud = alloc_low_page(&pud_phys);
Matt Tolentino44df75e2006-01-17 07:03:41 +0100378
Linus Torvalds1da177e2005-04-16 15:20:36 -0700379 next = start + PGDIR_SIZE;
380 if (next > end)
381 next = end;
382 phys_pud_init(pud, __pa(start), __pa(next));
Matt Tolentino44df75e2006-01-17 07:03:41 +0100383 if (!after_bootmem)
384 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
Vivek Goyaldafe41e2007-05-02 19:27:06 +0200385 unmap_low_page(pud);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700386 }
387
Matt Tolentino44df75e2006-01-17 07:03:41 +0100388 if (!after_bootmem)
Glauber de Oliveira Costaf51c9452007-07-22 11:12:29 +0200389 mmu_cr4_features = read_cr4();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700390 __flush_tlb_all();
Andi Kleen75175272008-01-30 13:33:17 +0100391
392 reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700393}
394
Matt Tolentino2b976902005-06-23 00:08:06 -0700395#ifndef CONFIG_NUMA
Linus Torvalds1da177e2005-04-16 15:20:36 -0700396void __init paging_init(void)
397{
Mel Gorman6391af12006-10-11 01:20:39 -0700398 unsigned long max_zone_pfns[MAX_NR_ZONES];
399 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
400 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
401 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
402 max_zone_pfns[ZONE_NORMAL] = end_pfn;
403
Matt Tolentino44df75e2006-01-17 07:03:41 +0100404 memory_present(0, 0, end_pfn);
405 sparse_init();
Mel Gorman5cb248a2006-09-27 01:49:52 -0700406 free_area_init_nodes(max_zone_pfns);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700407}
408#endif
409
410/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
411 from the CPU leading to inconsistent cache lines. address and size
412 must be aligned to 2MB boundaries.
413 Does nothing when the mapping doesn't exist. */
414void __init clear_kernel_mapping(unsigned long address, unsigned long size)
415{
416 unsigned long end = address + size;
417
418 BUG_ON(address & ~LARGE_PAGE_MASK);
419 BUG_ON(size & ~LARGE_PAGE_MASK);
420
421 for (; address < end; address += LARGE_PAGE_SIZE) {
422 pgd_t *pgd = pgd_offset_k(address);
423 pud_t *pud;
424 pmd_t *pmd;
425 if (pgd_none(*pgd))
426 continue;
427 pud = pud_offset(pgd, address);
428 if (pud_none(*pud))
429 continue;
430 pmd = pmd_offset(pud, address);
431 if (!pmd || pmd_none(*pmd))
432 continue;
433 if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
434 /* Could handle this, but it should not happen currently. */
435 printk(KERN_ERR
436 "clear_kernel_mapping: mapping has been split. will leak memory\n");
437 pmd_ERROR(*pmd);
438 }
439 set_pmd(pmd, __pmd(0));
440 }
441 __flush_tlb_all();
442}
443
Matt Tolentino44df75e2006-01-17 07:03:41 +0100444/*
445 * Memory hotplug specific functions
Matt Tolentino44df75e2006-01-17 07:03:41 +0100446 */
Matt Tolentino44df75e2006-01-17 07:03:41 +0100447void online_page(struct page *page)
448{
449 ClearPageReserved(page);
Nick Piggin7835e982006-03-22 00:08:40 -0800450 init_page_count(page);
Matt Tolentino44df75e2006-01-17 07:03:41 +0100451 __free_page(page);
452 totalram_pages++;
453 num_physpages++;
454}
455
Yasunori Gotobc02af92006-06-27 02:53:30 -0700456#ifdef CONFIG_MEMORY_HOTPLUG
457/*
Yasunori Gotobc02af92006-06-27 02:53:30 -0700458 * Memory is added always to NORMAL zone. This means you will never get
459 * additional DMA/DMA32 memory.
460 */
461int arch_add_memory(int nid, u64 start, u64 size)
462{
463 struct pglist_data *pgdat = NODE_DATA(nid);
Christoph Lameter776ed982006-09-25 23:31:09 -0700464 struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
Yasunori Gotobc02af92006-06-27 02:53:30 -0700465 unsigned long start_pfn = start >> PAGE_SHIFT;
466 unsigned long nr_pages = size >> PAGE_SHIFT;
467 int ret;
468
Keith Mannthey45e0b782006-09-30 23:27:09 -0700469 init_memory_mapping(start, (start + size -1));
470
Yasunori Gotobc02af92006-06-27 02:53:30 -0700471 ret = __add_pages(zone, start_pfn, nr_pages);
472 if (ret)
473 goto error;
474
Yasunori Gotobc02af92006-06-27 02:53:30 -0700475 return ret;
476error:
477 printk("%s: Problem encountered in __add_pages!\n", __func__);
478 return ret;
479}
480EXPORT_SYMBOL_GPL(arch_add_memory);
481
Yasunori Goto82432292006-11-18 22:19:40 -0800482#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
Keith Mannthey4942e992006-09-30 23:27:06 -0700483int memory_add_physaddr_to_nid(u64 start)
484{
485 return 0;
486}
Keith Mannthey8c2676a2006-09-30 23:27:07 -0700487EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
Keith Mannthey4942e992006-09-30 23:27:06 -0700488#endif
489
Keith Mannthey45e0b782006-09-30 23:27:09 -0700490#endif /* CONFIG_MEMORY_HOTPLUG */
491
Linus Torvalds1da177e2005-04-16 15:20:36 -0700492static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
493 kcore_vsyscall;
494
495void __init mem_init(void)
496{
Andi Kleen0a43e4b2005-09-12 18:49:24 +0200497 long codesize, reservedpages, datasize, initsize;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700498
Jon Mason0dc243a2006-06-26 13:58:11 +0200499 pci_iommu_alloc();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700500
Yinghai Lu48ddb152008-01-30 13:32:36 +0100501 /* clear_bss() already clear the empty_zero_page */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700502
Ingo Molnarf2633102008-01-30 13:32:36 +0100503 /* temporary debugging - double check it's true: */
504 {
505 int i;
506
507 for (i = 0; i < 1024; i++)
508 WARN_ON_ONCE(empty_zero_page[i]);
509 }
510
Linus Torvalds1da177e2005-04-16 15:20:36 -0700511 reservedpages = 0;
512
513 /* this will put all low memory onto the freelists */
Matt Tolentino2b976902005-06-23 00:08:06 -0700514#ifdef CONFIG_NUMA
Andi Kleen0a43e4b2005-09-12 18:49:24 +0200515 totalram_pages = numa_free_all_bootmem();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700516#else
Andi Kleen0a43e4b2005-09-12 18:49:24 +0200517 totalram_pages = free_all_bootmem();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700518#endif
Mel Gorman5cb248a2006-09-27 01:49:52 -0700519 reservedpages = end_pfn - totalram_pages -
520 absent_pages_in_range(0, end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700521
522 after_bootmem = 1;
523
524 codesize = (unsigned long) &_etext - (unsigned long) &_text;
525 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
526 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
527
528 /* Register memory areas for /proc/kcore */
529 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
530 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
531 VMALLOC_END-VMALLOC_START);
532 kclist_add(&kcore_kernel, &_stext, _end - _stext);
533 kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
534 kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
535 VSYSCALL_END - VSYSCALL_START);
536
Andi Kleen0a43e4b2005-09-12 18:49:24 +0200537 printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -0700538 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
539 end_pfn << (PAGE_SHIFT-10),
540 codesize >> 10,
541 reservedpages << (PAGE_SHIFT-10),
542 datasize >> 10,
543 initsize >> 10);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700544}
545
Gerd Hoffmannd167a512006-06-26 13:56:16 +0200546void free_init_pages(char *what, unsigned long begin, unsigned long end)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700547{
548 unsigned long addr;
549
Gerd Hoffmannd167a512006-06-26 13:56:16 +0200550 if (begin >= end)
551 return;
552
Jan Beulich6fb14752007-05-02 19:27:10 +0200553 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
Gerd Hoffmannd167a512006-06-26 13:56:16 +0200554 for (addr = begin; addr < end; addr += PAGE_SIZE) {
Linus Torvaldse3ebadd2007-05-07 08:44:24 -0700555 ClearPageReserved(virt_to_page(addr));
556 init_page_count(virt_to_page(addr));
557 memset((void *)(addr & ~(PAGE_SIZE-1)),
558 POISON_FREE_INITMEM, PAGE_SIZE);
Linus Torvaldse3ebadd2007-05-07 08:44:24 -0700559 free_page(addr);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700560 totalram_pages++;
561 }
Ingo Molnarf62d0f02008-01-30 13:34:07 +0100562#ifdef CONFIG_DEBUG_RODATA
563 /*
564 * This will make the __init pages not present and
565 * not executable, so that any attempt to use a
566 * __init function from now on will fault immediately
567 * rather than supriously later when memory gets reused.
568 *
569 * We only do this for DEBUG_RODATA to not break up the
570 * 2Mb kernel mapping just for this debug feature.
571 */
572 if (begin >= __START_KERNEL_map) {
Arjan van de Ven3c1df682008-01-30 13:34:07 +0100573 set_memory_rw(begin, (end - begin)/PAGE_SIZE);
Ingo Molnarf62d0f02008-01-30 13:34:07 +0100574 set_memory_np(begin, (end - begin)/PAGE_SIZE);
575 set_memory_nx(begin, (end - begin)/PAGE_SIZE);
576 }
577#endif
Gerd Hoffmannd167a512006-06-26 13:56:16 +0200578}
579
580void free_initmem(void)
581{
Gerd Hoffmannd167a512006-06-26 13:56:16 +0200582 free_init_pages("unused kernel memory",
Linus Torvaldse3ebadd2007-05-07 08:44:24 -0700583 (unsigned long)(&__init_begin),
584 (unsigned long)(&__init_end));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700585}
586
Arjan van de Ven67df1972006-01-06 00:12:04 -0800587#ifdef CONFIG_DEBUG_RODATA
588
Arjan van de Ven67df1972006-01-06 00:12:04 -0800589void mark_rodata_ro(void)
590{
Linus Torvaldse3ebadd2007-05-07 08:44:24 -0700591 unsigned long start = (unsigned long)_stext, end;
Arjan van de Ven67df1972006-01-06 00:12:04 -0800592
Linus Torvalds602033e2007-07-26 12:07:21 -0700593#ifdef CONFIG_HOTPLUG_CPU
594 /* It must still be possible to apply SMP alternatives. */
595 if (num_possible_cpus() > 1)
596 start = (unsigned long)_etext;
597#endif
598
599#ifdef CONFIG_KPROBES
600 start = (unsigned long)__start_rodata;
601#endif
602
Linus Torvaldse3ebadd2007-05-07 08:44:24 -0700603 end = (unsigned long)__end_rodata;
604 start = (start + PAGE_SIZE - 1) & PAGE_MASK;
605 end &= PAGE_MASK;
606 if (end <= start)
607 return;
608
Arjan van de Ven6d238cc2008-01-30 13:34:06 +0100609 set_memory_ro(start, (end - start) >> PAGE_SHIFT);
Arjan van de Ven67df1972006-01-06 00:12:04 -0800610
Jan Beulich6fb14752007-05-02 19:27:10 +0200611 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
Linus Torvaldse3ebadd2007-05-07 08:44:24 -0700612 (end - start) >> 10);
Arjan van de Ven67df1972006-01-06 00:12:04 -0800613
Andi Kleen0c42f392008-01-30 13:33:42 +0100614#ifdef CONFIG_CPA_DEBUG
615 printk("Testing CPA: undo %lx-%lx\n", start, end);
Arjan van de Ven6d238cc2008-01-30 13:34:06 +0100616 set_memory_rw(start, (end-start) >> PAGE_SHIFT);
Andi Kleen0c42f392008-01-30 13:33:42 +0100617
618 printk("Testing CPA: again\n");
Arjan van de Ven6d238cc2008-01-30 13:34:06 +0100619 set_memory_ro(start, (end-start) >> PAGE_SHIFT);
Andi Kleen0c42f392008-01-30 13:33:42 +0100620#endif
Arjan van de Ven67df1972006-01-06 00:12:04 -0800621}
622#endif
623
Linus Torvalds1da177e2005-04-16 15:20:36 -0700624#ifdef CONFIG_BLK_DEV_INITRD
/* Free the virtual range [start, end) that held the initrd. */
void free_initrd_mem(unsigned long start, unsigned long end)
{
	free_init_pages("initrd memory", start, end);
}
629#endif
630
631void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
632{
Matt Tolentino2b976902005-06-23 00:08:06 -0700633#ifdef CONFIG_NUMA
Linus Torvalds1da177e2005-04-16 15:20:36 -0700634 int nid = phys_to_nid(phys);
Andi Kleen5e58a022006-11-14 16:57:46 +0100635#endif
636 unsigned long pfn = phys >> PAGE_SHIFT;
637 if (pfn >= end_pfn) {
638 /* This can happen with kdump kernels when accessing firmware
639 tables. */
640 if (pfn < end_pfn_map)
641 return;
642 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
643 phys, len);
644 return;
645 }
646
647 /* Should check here against the e820 map to avoid double free */
648#ifdef CONFIG_NUMA
Linus Torvalds1da177e2005-04-16 15:20:36 -0700649 reserve_bootmem_node(NODE_DATA(nid), phys, len);
650#else
651 reserve_bootmem(phys, len);
652#endif
Mel Gorman0e0b8642006-09-27 01:49:56 -0700653 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
Andi Kleene18c6872005-11-05 17:25:53 +0100654 dma_reserve += len / PAGE_SIZE;
Mel Gorman0e0b8642006-09-27 01:49:56 -0700655 set_dma_reserve(dma_reserve);
656 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700657}
658
659int kern_addr_valid(unsigned long addr)
660{
661 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
662 pgd_t *pgd;
663 pud_t *pud;
664 pmd_t *pmd;
665 pte_t *pte;
666
667 if (above != 0 && above != -1UL)
668 return 0;
669
670 pgd = pgd_offset_k(addr);
671 if (pgd_none(*pgd))
672 return 0;
673
674 pud = pud_offset(pgd, addr);
675 if (pud_none(*pud))
676 return 0;
677
678 pmd = pmd_offset(pud, addr);
679 if (pmd_none(*pmd))
680 return 0;
681 if (pmd_large(*pmd))
682 return pfn_valid(pmd_pfn(*pmd));
683
684 pte = pte_offset_kernel(pmd, addr);
685 if (pte_none(*pte))
686 return 0;
687 return pfn_valid(pte_pfn(*pte));
688}
689
Ernie Petrides103efcd2006-12-07 02:14:09 +0100690/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
Andi Kleen1e014412005-04-16 15:24:55 -0700691 covers the 64bit vsyscall page now. 32bit has a real VMA now and does
692 not need special handling anymore. */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700693
694static struct vm_area_struct gate_vma = {
695 .vm_start = VSYSCALL_START,
Ernie Petrides103efcd2006-12-07 02:14:09 +0100696 .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
697 .vm_page_prot = PAGE_READONLY_EXEC,
698 .vm_flags = VM_READ | VM_EXEC
Linus Torvalds1da177e2005-04-16 15:20:36 -0700699};
700
Linus Torvalds1da177e2005-04-16 15:20:36 -0700701struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
702{
703#ifdef CONFIG_IA32_EMULATION
Andi Kleen1e014412005-04-16 15:24:55 -0700704 if (test_tsk_thread_flag(tsk, TIF_IA32))
705 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700706#endif
707 return &gate_vma;
708}
709
710int in_gate_area(struct task_struct *task, unsigned long addr)
711{
712 struct vm_area_struct *vma = get_gate_vma(task);
Andi Kleen1e014412005-04-16 15:24:55 -0700713 if (!vma)
714 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700715 return (addr >= vma->vm_start) && (addr < vma->vm_end);
716}
717
718/* Use this when you have no reliable task/vma, typically from interrupt
719 * context. It is less reliable than using the task's vma and may give
720 * false positives.
721 */
722int in_gate_area_no_task(unsigned long addr)
723{
Andi Kleen1e014412005-04-16 15:24:55 -0700724 return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700725}
Zou Nan hai2e1c49d2007-06-01 00:46:28 -0700726
Andi Kleen2aae9502007-07-21 17:10:01 +0200727const char *arch_vma_name(struct vm_area_struct *vma)
728{
729 if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
730 return "[vdso]";
731 if (vma == &gate_vma)
732 return "[vsyscall]";
733 return NULL;
734}
Christoph Lameter0889eba2007-10-16 01:24:15 -0700735
736#ifdef CONFIG_SPARSEMEM_VMEMMAP
737/*
738 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
739 */
740int __meminit vmemmap_populate(struct page *start_page,
741 unsigned long size, int node)
742{
743 unsigned long addr = (unsigned long)start_page;
744 unsigned long end = (unsigned long)(start_page + size);
745 unsigned long next;
746 pgd_t *pgd;
747 pud_t *pud;
748 pmd_t *pmd;
749
750 for (; addr < end; addr = next) {
751 next = pmd_addr_end(addr, end);
752
753 pgd = vmemmap_pgd_populate(addr, node);
754 if (!pgd)
755 return -ENOMEM;
756 pud = vmemmap_pud_populate(pgd, addr, node);
757 if (!pud)
758 return -ENOMEM;
759
760 pmd = pmd_offset(pud, addr);
761 if (pmd_none(*pmd)) {
762 pte_t entry;
763 void *p = vmemmap_alloc_block(PMD_SIZE, node);
764 if (!p)
765 return -ENOMEM;
766
Jeremy Fitzhardinge27ec1612008-01-30 13:31:09 +0100767 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL_LARGE);
Christoph Lameter0889eba2007-10-16 01:24:15 -0700768 set_pmd(pmd, __pmd(pte_val(entry)));
769
770 printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
771 addr, addr + PMD_SIZE - 1, p, node);
772 } else
773 vmemmap_verify((pte_t *)pmd, node, addr, next);
774 }
775
776 return 0;
777}
778#endif