/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif

const struct dma_mapping_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously in
 * physical space, so we can cache the location of the first one and move
 * around without checking the pgd every time.
 */

void show_mem(void)
{
	long i, total = 0, reserved = 0;
	long shared = 0, cached = 0;
	pg_data_t *pgdat;
	struct page *page;

	printk(KERN_INFO "Mem-info:\n");
	show_free_areas();
	printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));

	for_each_online_pgdat(pgdat) {
		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
			/* this loop can take a while with 256 GB and 4k pages
			   so update the NMI watchdog */
			if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
				touch_nmi_watchdog();
			}
			if (!pfn_valid(pgdat->node_start_pfn + i))
				continue;
			page = pfn_to_page(pgdat->node_start_pfn + i);
			total++;
			if (PageReserved(page))
				reserved++;
			else if (PageSwapCache(page))
				cached++;
			else if (page_count(page))
				shared += page_count(page) - 1;
		}
	}
	printk(KERN_INFO "%ld pages of RAM\n", total);
	printk(KERN_INFO "%ld reserved pages\n", reserved);
	printk(KERN_INFO "%ld pages shared\n", shared);
	printk(KERN_INFO "%ld pages swap cached\n", cached);
}

int after_bootmem;

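/*
 * Allocate a zeroed, page-aligned page for an intermediate page-table
 * level: from the page allocator once bootmem has been torn down,
 * otherwise from the bootmem allocator.  Panics if no page is available.
 */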
static __init void *spp_getpage(void)
{
	void *ptr;
	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);
	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
		panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem ? "after bootmem" : "");

	Dprintk("spp_getpage %p\n", ptr);
	return ptr;
}

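/*
 * Install a single kernel mapping of vaddr to phys with the given
 * protection, filling in any missing pud/pmd/pte levels with pages from
 * spp_getpage() and flushing just that one TLB entry.
 */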
static __init void set_pte_phys(unsigned long vaddr,
				unsigned long phys, pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte, new_pte;

	Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		pmd = (pmd_t *) spp_getpage();
		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
		if (pmd != pmd_offset(pud, 0)) {
			printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
			return;
		}
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		pte = (pte_t *) spp_getpage();
		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
		if (pte != pte_offset_kernel(pmd, 0)) {
			printk("PAGETABLE BUG #02!\n");
			return;
		}
	}
	new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

	pte = pte_offset_kernel(pmd, vaddr);
	if (!pte_none(*pte) &&
	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
		pte_ERROR(*pte);
	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

/* NOTE: this is meant to be run only at boot */
void __init
__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
	unsigned long address = __fix_to_virt(idx);

	if (idx >= __end_of_fixed_addresses) {
		printk("Invalid __set_fixmap\n");
		return;
	}
	set_pte_phys(address, phys, prot);
}

unsigned long __meminitdata table_start, table_end;

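/*
 * Hand out one zeroed page for early page-table construction and return
 * its physical address in *phys.  After bootmem is up this is just
 * get_zeroed_page(); before that it takes the next frame from the
 * table_start..table_end window reserved by find_early_table_space() and
 * maps it temporarily with early_ioremap().
 */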
static __meminit void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = table_end++;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC);
		*phys = __pa(adr);
		return adr;
	}

	if (pfn >= end_pfn)
		panic("alloc_low_page: ran out of memory");

	adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
	memset(adr, 0, PAGE_SIZE);
	*phys = pfn * PAGE_SIZE;
	return adr;
}

static __meminit void unmap_low_page(void *adr)
{
	if (after_bootmem)
		return;

	early_iounmap(adr, PAGE_SIZE);
}

/* Must run before zap_low_mappings */
__meminit void *early_ioremap(unsigned long addr, unsigned long size)
{
	unsigned long vaddr;
	pmd_t *pmd, *last_pmd;
	int i, pmds;

	pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
	vaddr = __START_KERNEL_map;
	pmd = level2_kernel_pgt;
	last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
	for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
		for (i = 0; i < pmds; i++) {
			if (pmd_present(pmd[i]))
				goto next;
		}
		vaddr += addr & ~PMD_MASK;
		addr &= PMD_MASK;
		for (i = 0; i < pmds; i++, addr += PMD_SIZE)
			set_pmd(pmd + i, __pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
		__flush_tlb();
		return (void *)vaddr;
	next:
		;
	}
	printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
	return NULL;
}

/* To avoid virtual aliases later */
__meminit void early_iounmap(void *addr, unsigned long size)
{
	unsigned long vaddr;
	pmd_t *pmd;
	int i, pmds;

	vaddr = (unsigned long)addr;
	pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
	pmd = level2_kernel_pgt + pmd_index(vaddr);
	for (i = 0; i < pmds; i++)
		pmd_clear(pmd + i);
	__flush_tlb();
}

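/*
 * Fill the PMD entries covering [address, end) with 2MB (PSE) kernel
 * mappings of the corresponding physical range.  Entries that are already
 * populated are left untouched; when called before bootmem, entries past
 * 'end' are explicitly cleared.
 */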
static void __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
{
	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		unsigned long entry;
		pmd_t *pmd = pmd_page + pmd_index(address);

		if (address >= end) {
			if (!after_bootmem)
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			break;
		}

		if (pmd_val(*pmd))
			continue;

		entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address;
		entry &= __supported_pte_mask;
		set_pmd(pmd, __pmd(entry));
	}
}

static void __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
	pmd_t *pmd = pmd_offset(pud, 0);
	spin_lock(&init_mm.page_table_lock);
	phys_pmd_init(pmd, address, end);
	spin_unlock(&init_mm.page_table_lock);
	__flush_tlb_all();
}

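/*
 * Populate the PUD entries covering [addr, end): skip (and clear) ranges
 * that the e820 map says contain no RAM, reuse an already present PMD
 * table via phys_pmd_update(), and otherwise allocate a fresh PMD page
 * with alloc_low_page() and fill it with phys_pmd_init().
 */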
static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
{
	int i = pud_index(addr);

	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;

		if (addr >= end)
			break;

		if (!after_bootmem && !e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			phys_pmd_update(pud, addr, end);
			continue;
		}

		pmd = alloc_low_page(&pmd_phys);
		spin_lock(&init_mm.page_table_lock);
		set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
		phys_pmd_init(pmd, addr, end);
		spin_unlock(&init_mm.page_table_lock);
		unmap_low_page(pmd);
	}
	__flush_tlb();
}

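/*
 * Work out the worst-case amount of memory the direct-mapping page tables
 * for [0, end) can need and reserve a physically contiguous area for them
 * from the e820 map (searching upwards from 0x8000).  The result is
 * recorded in table_start/table_end as page frame numbers.
 */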
static void __init find_early_table_space(unsigned long end)
{
	unsigned long puds, pmds, tables, start;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
		 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);

	/* RED-PEN putting page tables only on node 0 could
	   cause a hotspot and fill up ZONE_DMA. The page tables
	   need roughly 0.5KB per GB. */
	start = 0x8000;
	table_start = find_e820_area(start, end, tables);
	if (table_start == -1UL)
		panic("Cannot find space for the kernel page tables");

	table_start >>= PAGE_SHIFT;
	table_end = table_start;

	early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
		end, table_start << PAGE_SHIFT,
		(table_start << PAGE_SHIFT) + tables);
}

/* Set up the direct mapping of the physical memory at PAGE_OFFSET.
   This runs before bootmem is initialized and gets pages directly from
   the physical memory.  To access them they are temporarily mapped. */
void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
{
	unsigned long next;

	Dprintk("init_memory_mapping\n");

	/*
	 * Find space for the kernel direct mapping tables.
	 * Later we should allocate these tables in the local node of the
	 * memory mapped.  Unfortunately this is done currently before the
	 * nodes are discovered.
	 */
	if (!after_bootmem)
		find_early_table_space(end);

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	for (; start < end; start = next) {
		unsigned long pud_phys;
		pgd_t *pgd = pgd_offset_k(start);
		pud_t *pud;

		if (after_bootmem)
			pud = pud_offset(pgd, start & PGDIR_MASK);
		else
			pud = alloc_low_page(&pud_phys);

		next = start + PGDIR_SIZE;
		if (next > end)
			next = end;
		phys_pud_init(pud, __pa(start), __pa(next));
		if (!after_bootmem)
			set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
		unmap_low_page(pud);
	}

	if (!after_bootmem)
		mmu_cr4_features = read_cr4();
	__flush_tlb_all();
}

#ifndef CONFIG_NUMA
void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];
	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = end_pfn;

	memory_present(0, 0, end_pfn);
	sparse_init();
	free_area_init_nodes(max_zone_pfns);
}
#endif

/* Unmap a kernel mapping if it exists.  This is useful to avoid
   prefetches from the CPU leading to inconsistent cache lines.
   address and size must be aligned to 2MB boundaries.
   Does nothing when the mapping doesn't exist. */
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
	unsigned long end = address + size;

	BUG_ON(address & ~LARGE_PAGE_MASK);
	BUG_ON(size & ~LARGE_PAGE_MASK);

	for (; address < end; address += LARGE_PAGE_SIZE) {
		pgd_t *pgd = pgd_offset_k(address);
		pud_t *pud;
		pmd_t *pmd;
		if (pgd_none(*pgd))
			continue;
		pud = pud_offset(pgd, address);
		if (pud_none(*pud))
			continue;
		pmd = pmd_offset(pud, address);
		if (!pmd || pmd_none(*pmd))
			continue;
		if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
			/* Could handle this, but it should not happen currently. */
			printk(KERN_ERR
			       "clear_kernel_mapping: mapping has been split. will leak memory\n");
			pmd_ERROR(*pmd);
		}
		set_pmd(pmd, __pmd(0));
	}
	__flush_tlb_all();
}

/*
 * Memory hotplug specific functions
 */
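/*
 * Hand a previously reserved page over to the page allocator and update
 * the global page accounting (used by the memory hotplug path).
 */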
void online_page(struct page *page)
{
	ClearPageReserved(page);
	init_page_count(page);
	__free_page(page);
	totalram_pages++;
	num_physpages++;
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Memory is always added to the NORMAL zone.  This means you will never
 * get additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	init_memory_mapping(start, (start + size - 1));

	ret = __add_pages(zone, start_pfn, nr_pages);
	if (ret)
		goto error;

	return ret;
error:
	printk("%s: Problem encountered in __add_pages!\n", __func__);
	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
int memory_add_physaddr_to_nid(u64 start)
{
	return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

#endif /* CONFIG_MEMORY_HOTPLUG */

#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
/*
 * Memory hotadd without sparsemem.  The mem_maps have been allocated in
 * advance, just online the pages.
 */
int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
{
	int err = -EIO;
	unsigned long pfn;
	unsigned long total = 0, mem = 0;
	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
		if (pfn_valid(pfn)) {
			online_page(pfn_to_page(pfn));
			err = 0;
			mem++;
		}
		total++;
	}
	if (!err) {
		z->spanned_pages += total;
		z->present_pages += mem;
		z->zone_pgdat->node_spanned_pages += total;
		z->zone_pgdat->node_present_pages += mem;
	}
	return err;
}
#endif

static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
			 kcore_vsyscall;

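/*
 * Late memory setup: initialise the IOMMU, release all bootmem pages to
 * the buddy allocator, register the /proc/kcore regions and print the
 * final memory layout summary.
 */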
void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;

	pci_iommu_alloc();

	/* clear the zero-page */
	memset(empty_zero_page, 0, PAGE_SIZE);

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif
	reservedpages = end_pfn - totalram_pages -
			absent_pages_in_range(0, end_pfn);

	after_bootmem = 1;

	codesize = (unsigned long) &_etext - (unsigned long) &_text;
	datasize = (unsigned long) &_edata - (unsigned long) &_etext;
	initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);
	kclist_add(&kcore_kernel, &_stext, _end - _stext);
	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
		   VSYSCALL_END - VSYSCALL_START);

	printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		end_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);
}

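/*
 * Return the pages in [begin, end) to the page allocator, poisoning their
 * contents first.  Pages that live in the kernel text mapping also get
 * their page attributes reset.
 */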
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
	unsigned long addr;

	if (begin >= end)
		return;

	printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
	for (addr = begin; addr < end; addr += PAGE_SIZE) {
		ClearPageReserved(virt_to_page(addr));
		init_page_count(virt_to_page(addr));
		memset((void *)(addr & ~(PAGE_SIZE-1)),
			POISON_FREE_INITMEM, PAGE_SIZE);
		if (addr >= __START_KERNEL_map)
			change_page_attr_addr(addr, 1, __pgprot(0));
		free_page(addr);
		totalram_pages++;
	}
	if (addr > __START_KERNEL_map)
		global_flush_tlb();
}

void free_initmem(void)
{
	free_init_pages("unused kernel memory",
			(unsigned long)(&__init_begin),
			(unsigned long)(&__init_end));
}

#ifdef CONFIG_DEBUG_RODATA

void mark_rodata_ro(void)
{
	unsigned long start = (unsigned long)_stext, end;

#ifdef CONFIG_HOTPLUG_CPU
	/* It must still be possible to apply SMP alternatives. */
	if (num_possible_cpus() > 1)
		start = (unsigned long)_etext;
#endif

#ifdef CONFIG_KPROBES
	start = (unsigned long)__start_rodata;
#endif

	end = (unsigned long)__end_rodata;
	start = (start + PAGE_SIZE - 1) & PAGE_MASK;
	end &= PAGE_MASK;
	if (end <= start)
		return;

	change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);

	/*
	 * change_page_attr_addr() requires a global_flush_tlb() call after it.
	 * We do this after the printk so that if something went wrong in the
	 * change, the printk gets out at least to give a better debug hint
	 * of who is the culprit.
	 */
	global_flush_tlb();
}
#endif

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
	free_init_pages("initrd memory", start, end);
}
#endif

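/*
 * Reserve [phys, phys+len) in the bootmem allocator (on the right node
 * under NUMA).  Ranges above end_pfn, which kdump kernels can hit when
 * accessing firmware tables, are skipped.  Reservations that fall entirely
 * below the 16MB DMA limit are also accounted via set_dma_reserve().
 */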
void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
#ifdef CONFIG_NUMA
	int nid = phys_to_nid(phys);
#endif
	unsigned long pfn = phys >> PAGE_SHIFT;
	if (pfn >= end_pfn) {
		/* This can happen with kdump kernels when accessing firmware
		   tables. */
		if (pfn < end_pfn_map)
			return;
		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
				phys, len);
		return;
	}

	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
	reserve_bootmem(phys, len);
#endif
	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
		dma_reserve += len / PAGE_SIZE;
		set_dma_reserve(dma_reserve);
	}
}

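/*
 * Return nonzero if addr is a canonical address that is mapped in the
 * kernel page tables with a valid backing page frame.  2MB large-page
 * mappings are handled at the pmd level.
 */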
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;
	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;
	return pfn_valid(pte_pfn(*pte));
}

/* A pseudo VMA to allow ptrace access for the vsyscall page.  This only
   covers the 64bit vsyscall page now.  32bit has a real VMA now and does
   not need special handling anymore. */

static struct vm_area_struct gate_vma = {
	.vm_start = VSYSCALL_START,
	.vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
	.vm_page_prot = PAGE_READONLY_EXEC,
	.vm_flags = VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);
	if (!vma)
		return 0;
	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/* Use this when you have no reliable task/vma, typically from interrupt
 * context.  It is less reliable than using the task's vma and may give
 * false positives.
 */
int in_gate_area_no_task(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}

const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		return "[vdso]";
	if (vma == &gate_vma)
		return "[vsyscall]";
	return NULL;
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
int __meminit vmemmap_populate(struct page *start_page,
			       unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		next = pmd_addr_end(addr, end);

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;
		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		pmd = pmd_offset(pud, addr);
		if (pmd_none(*pmd)) {
			pte_t entry;
			void *p = vmemmap_alloc_block(PMD_SIZE, node);
			if (!p)
				return -ENOMEM;

			entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
			mk_pte_huge(entry);
			set_pmd(pmd, __pmd(pte_val(entry)));

			printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
				addr, addr + PMD_SIZE - 1, p, node);
		} else
			vmemmap_verify((pte_t *)pmd, node, addr, next);
	}

	return 0;
}
#endif