/*
 * linux/arch/x86_64/mm/init.c
 *
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
 * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif

const struct dma_mapping_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
 * in physical space so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */

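/*
 * Dump a summary of page usage: total, reserved, swap-cached and shared
 * pages across all online nodes.
 */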
void show_mem(void)
{
	long i, total = 0, reserved = 0;
	long shared = 0, cached = 0;
	pg_data_t *pgdat;
	struct page *page;

	printk(KERN_INFO "Mem-info:\n");
	show_free_areas();
	printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));

	for_each_online_pgdat(pgdat) {
		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
			/* this loop can take a while with 256 GB and 4k pages
			   so update the NMI watchdog */
			if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
				touch_nmi_watchdog();
			}
			if (!pfn_valid(pgdat->node_start_pfn + i))
				continue;
			page = pfn_to_page(pgdat->node_start_pfn + i);
			total++;
			if (PageReserved(page))
				reserved++;
			else if (PageSwapCache(page))
				cached++;
			else if (page_count(page))
				shared += page_count(page) - 1;
		}
	}
	printk(KERN_INFO "%lu pages of RAM\n", total);
	printk(KERN_INFO "%lu reserved pages\n", reserved);
	printk(KERN_INFO "%lu pages shared\n", shared);
	printk(KERN_INFO "%lu pages swap cached\n", cached);
}
101
Linus Torvalds1da177e2005-04-16 15:20:36 -0700102int after_bootmem;
103
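/*
 * Get a page-table page for set_pte_phys(): from the bootmem allocator
 * during early boot, from the page allocator once it is up.
 */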
static __init void *spp_getpage(void)
{
	void *ptr;
	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);
	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
		panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem ? "after bootmem" : "");

	Dprintk("spp_getpage %p\n", ptr);
	return ptr;
}

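/*
 * Map a single page of physical memory at a fixed kernel virtual address,
 * filling in any missing intermediate page-table levels. Used by
 * __set_fixmap().
 */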
static __init void set_pte_phys(unsigned long vaddr,
			unsigned long phys, pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte, new_pte;

	Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		pmd = (pmd_t *) spp_getpage();
		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
		if (pmd != pmd_offset(pud, 0)) {
			printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
			return;
		}
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		pte = (pte_t *) spp_getpage();
		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
		if (pte != pte_offset_kernel(pmd, 0)) {
			printk("PAGETABLE BUG #02!\n");
			return;
		}
	}
	new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

	pte = pte_offset_kernel(pmd, vaddr);
	if (!pte_none(*pte) &&
	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
		pte_ERROR(*pte);
	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

/* NOTE: this is meant to be run only at boot */
void __init
__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
	unsigned long address = __fix_to_virt(idx);

	if (idx >= __end_of_fixed_addresses) {
		printk("Invalid __set_fixmap\n");
		return;
	}
	set_pte_phys(address, phys, prot);
}

static unsigned long __initdata table_start;
static unsigned long __meminitdata table_end;

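/*
 * Return a zeroed page for the early direct-mapping page tables. Before
 * bootmem is up the page comes from the window reserved by
 * find_early_table_space() and is mapped temporarily; *phys receives its
 * physical address.
 */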
static __meminit void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = table_end++;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC);
		*phys = __pa(adr);
		return adr;
	}

	if (pfn >= end_pfn)
		panic("alloc_low_page: ran out of memory");

	adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
	memset(adr, 0, PAGE_SIZE);
	*phys = pfn * PAGE_SIZE;
	return adr;
}

static __meminit void unmap_low_page(void *adr)
{
	if (after_bootmem)
		return;

	early_iounmap(adr, PAGE_SIZE);
}

/* Must run before zap_low_mappings */
__meminit void *early_ioremap(unsigned long addr, unsigned long size)
{
	unsigned long vaddr;
	pmd_t *pmd, *last_pmd;
	int i, pmds;

	pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
	vaddr = __START_KERNEL_map;
	pmd = level2_kernel_pgt;
	last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
	for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
		for (i = 0; i < pmds; i++) {
			if (pmd_present(pmd[i]))
				goto next;
		}
		vaddr += addr & ~PMD_MASK;
		addr &= PMD_MASK;
		for (i = 0; i < pmds; i++, addr += PMD_SIZE)
			set_pmd(pmd + i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
		__flush_tlb();
		return (void *)vaddr;
	next:
		;
	}
	printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
	return NULL;
}

/* To avoid virtual aliases later */
__meminit void early_iounmap(void *addr, unsigned long size)
{
	unsigned long vaddr;
	pmd_t *pmd;
	int i, pmds;

	vaddr = (unsigned long)addr;
	pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
	pmd = level2_kernel_pgt + pmd_index(vaddr);
	for (i = 0; i < pmds; i++)
		pmd_clear(pmd + i);
	__flush_tlb();
}

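/*
 * Create 2MB kernel mappings in a PMD page for the physical range
 * [address, end).
 */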
static void __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
{
	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		unsigned long entry;
		pmd_t *pmd = pmd_page + pmd_index(address);

		if (address >= end) {
			if (!after_bootmem)
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			break;
		}

		if (pmd_val(*pmd))
			continue;

		entry = __PAGE_KERNEL_LARGE|_PAGE_GLOBAL|address;
		entry &= __supported_pte_mask;
		set_pmd(pmd, __pmd(entry));
	}
}

static void __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
	pmd_t *pmd = pmd_offset(pud, 0);

	spin_lock(&init_mm.page_table_lock);
	phys_pmd_init(pmd, address, end);
	spin_unlock(&init_mm.page_table_lock);
	__flush_tlb_all();
}

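/*
 * Populate a PUD page with mappings for the physical range [addr, end),
 * allocating and filling PMD pages as needed.
 */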
static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
{
	int i = pud_index(addr);

	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;

		if (addr >= end)
			break;

		if (!after_bootmem && !e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			phys_pmd_update(pud, addr, end);
			continue;
		}

		pmd = alloc_low_page(&pmd_phys);
		spin_lock(&init_mm.page_table_lock);
		set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
		phys_pmd_init(pmd, addr, end);
		spin_unlock(&init_mm.page_table_lock);
		unmap_low_page(pmd);
	}
	__flush_tlb();
}

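/*
 * Find a physically contiguous area in the e820 map large enough to hold
 * the PUD and PMD pages needed to direct-map memory up to 'end'.
 */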
static void __init find_early_table_space(unsigned long end)
{
	unsigned long puds, pmds, tables, start;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
		 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);

	/* RED-PEN putting page tables only on node 0 could
	   cause a hotspot and fill up ZONE_DMA. The page tables
	   need roughly 0.5KB per GB. */
	start = 0x8000;
	table_start = find_e820_area(start, end, tables);
	if (table_start == -1UL)
		panic("Cannot find space for the kernel page tables");

	table_start >>= PAGE_SHIFT;
	table_end = table_start;

	early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
		end, table_start << PAGE_SHIFT,
		(table_start << PAGE_SHIFT) + tables);
}

/* Set up the direct mapping of the physical memory at PAGE_OFFSET.
   This runs before bootmem is initialized and gets pages directly from
   the physical memory. To access them they are temporarily mapped. */
void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
{
	unsigned long next;

	Dprintk("init_memory_mapping\n");

	/*
	 * Find space for the kernel direct mapping tables.
	 * Later we should allocate these tables in the local node of the
	 * memory mapped. Unfortunately this is currently done before the
	 * nodes are discovered.
	 */
	if (!after_bootmem)
		find_early_table_space(end);

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	for (; start < end; start = next) {
		unsigned long pud_phys;
		pgd_t *pgd = pgd_offset_k(start);
		pud_t *pud;

		if (after_bootmem)
			pud = pud_offset(pgd, start & PGDIR_MASK);
		else
			pud = alloc_low_page(&pud_phys);

		next = start + PGDIR_SIZE;
		if (next > end)
			next = end;
		phys_pud_init(pud, __pa(start), __pa(next));
		if (!after_bootmem)
			set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
		unmap_low_page(pud);
	}

	if (!after_bootmem)
		mmu_cr4_features = read_cr4();
	__flush_tlb_all();

	reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
}

#ifndef CONFIG_NUMA
void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];
	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = end_pfn;

	memory_present(0, 0, end_pfn);
	sparse_init();
	free_area_init_nodes(max_zone_pfns);
}
#endif

/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
   from the CPU leading to inconsistent cache lines. address and size
   must be aligned to 2MB boundaries.
   Does nothing when the mapping doesn't exist. */
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
	unsigned long end = address + size;

	BUG_ON(address & ~LARGE_PAGE_MASK);
	BUG_ON(size & ~LARGE_PAGE_MASK);

	for (; address < end; address += LARGE_PAGE_SIZE) {
		pgd_t *pgd = pgd_offset_k(address);
		pud_t *pud;
		pmd_t *pmd;
		if (pgd_none(*pgd))
			continue;
		pud = pud_offset(pgd, address);
		if (pud_none(*pud))
			continue;
		pmd = pmd_offset(pud, address);
		if (!pmd || pmd_none(*pmd))
			continue;
		if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
			/* Could handle this, but it should not happen currently. */
			printk(KERN_ERR
			       "clear_kernel_mapping: mapping has been split. will leak memory\n");
			pmd_ERROR(*pmd);
		}
		set_pmd(pmd, __pmd(0));
	}
	__flush_tlb_all();
}

/*
 * Memory hotplug specific functions
 */
void online_page(struct page *page)
{
	ClearPageReserved(page);
	init_page_count(page);
	__free_page(page);
	totalram_pages++;
	num_physpages++;
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Memory is always added to the NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	init_memory_mapping(start, (start + size - 1));

	ret = __add_pages(zone, start_pfn, nr_pages);
	if (ret)
		goto error;

	return ret;
error:
	printk("%s: Problem encountered in __add_pages!\n", __func__);
	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
int memory_add_physaddr_to_nid(u64 start)
{
	return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

#endif /* CONFIG_MEMORY_HOTPLUG */

static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
			 kcore_vsyscall;

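/*
 * Release boot memory to the page allocator, register the /proc/kcore
 * areas and print the final memory statistics.
 */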
void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;

	pci_iommu_alloc();

	/* clear_bss() already cleared the empty_zero_page */

	/* temporary debugging - double check it's true: */
	{
		int i;

		for (i = 0; i < 1024; i++)
			WARN_ON_ONCE(empty_zero_page[i]);
	}

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif
	reservedpages = end_pfn - totalram_pages -
					absent_pages_in_range(0, end_pfn);

	after_bootmem = 1;

	codesize = (unsigned long) &_etext - (unsigned long) &_text;
	datasize = (unsigned long) &_edata - (unsigned long) &_etext;
	initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END - VMALLOC_START);
	kclist_add(&kcore_kernel, &_stext, _end - _stext);
	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
		   VSYSCALL_END - VSYSCALL_START);

	printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		end_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);
}

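/*
 * Poison and free the pages in [begin, end), returning them to the page
 * allocator.
 */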
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
	unsigned long addr;

	if (begin >= end)
		return;

	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
	for (addr = begin; addr < end; addr += PAGE_SIZE) {
		ClearPageReserved(virt_to_page(addr));
		init_page_count(virt_to_page(addr));
		memset((void *)(addr & ~(PAGE_SIZE-1)),
			POISON_FREE_INITMEM, PAGE_SIZE);
		if (addr >= __START_KERNEL_map)
			change_page_attr_addr(addr, 1, __pgprot(0));
		free_page(addr);
		totalram_pages++;
	}
	if (addr > __START_KERNEL_map)
		global_flush_tlb();
}

void free_initmem(void)
{
	free_init_pages("unused kernel memory",
			(unsigned long)(&__init_begin),
			(unsigned long)(&__init_end));
}

#ifdef CONFIG_DEBUG_RODATA

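/* Write-protect the kernel's read-only data section. */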
void mark_rodata_ro(void)
{
	unsigned long start = (unsigned long)_stext, end;

#ifdef CONFIG_HOTPLUG_CPU
	/* It must still be possible to apply SMP alternatives. */
	if (num_possible_cpus() > 1)
		start = (unsigned long)_etext;
#endif

#ifdef CONFIG_KPROBES
	start = (unsigned long)__start_rodata;
#endif

	end = (unsigned long)__end_rodata;
	start = (start + PAGE_SIZE - 1) & PAGE_MASK;
	end &= PAGE_MASK;
	if (end <= start)
		return;

	change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);

	/*
	 * change_page_attr_addr() requires a global_flush_tlb() call after it.
	 * We do this after the printk so that if something went wrong in the
	 * change, the printk gets out at least to give a better debug hint
	 * of who is the culprit.
	 */
	global_flush_tlb();

#ifdef CONFIG_CPA_DEBUG
	printk("Testing CPA: undo %lx-%lx\n", start, end);
	change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL);
	global_flush_tlb();

	printk("Testing CPA: again\n");
	change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);
	global_flush_tlb();
#endif
}
#endif

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
	free_init_pages("initrd memory", start, end);
}
#endif

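/*
 * Reserve a physical range in the bootmem allocator, handling the NUMA
 * and kdump corner cases.
 */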
void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
#ifdef CONFIG_NUMA
	int nid = phys_to_nid(phys);
#endif
	unsigned long pfn = phys >> PAGE_SHIFT;
	if (pfn >= end_pfn) {
		/* This can happen with kdump kernels when accessing firmware
		   tables. */
		if (pfn < end_pfn_map)
			return;
		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
				phys, len);
		return;
	}

	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
	reserve_bootmem(phys, len);
#endif
	if (phys + len <= MAX_DMA_PFN*PAGE_SIZE) {
		dma_reserve += len / PAGE_SIZE;
		set_dma_reserve(dma_reserve);
	}
}

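/*
 * Check whether a kernel virtual address is covered by a valid page-table
 * entry.
 */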
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;
	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;
	return pfn_valid(pte_pfn(*pte));
}

/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
   not need special handling anymore. */

static struct vm_area_struct gate_vma = {
	.vm_start = VSYSCALL_START,
	.vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
	.vm_page_prot = PAGE_READONLY_EXEC,
	.vm_flags = VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);
	if (!vma)
		return 0;
	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/* Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives.
 */
int in_gate_area_no_task(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}

const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		return "[vdso]";
	if (vma == &gate_vma)
		return "[vsyscall]";
	return NULL;
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
int __meminit vmemmap_populate(struct page *start_page,
						unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		next = pmd_addr_end(addr, end);

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;
		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd)) {
			pte_t entry;
			void *p = vmemmap_alloc_block(PMD_SIZE, node);
			if (!p)
				return -ENOMEM;

			entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL_LARGE);
			set_pmd(pmd, __pmd(pte_val(entry)));

			printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
				addr, addr + PMD_SIZE - 1, p, node);
		} else
			vmemmap_verify((pte_t *)pmd, node, addr, next);
	}

	return 0;
}
#endif