blob: 07471a3eb19076d9989b1f955e32bd593a4c495f [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/init.h>
9#include <linux/bootmem.h>
10#include <linux/mmzone.h>
11#include <linux/ctype.h>
12#include <linux/module.h>
13#include <linux/nodemask.h>
14
15#include <asm/e820.h>
16#include <asm/proto.h>
17#include <asm/dma.h>
18#include <asm/numa.h>
19#include <asm/acpi.h>
20
/* Debug printk: compiled out unless a real Dprintk is defined elsewhere. */
#ifndef Dprintk
#define Dprintk(x...)
#endif

/* Per-node pglist_data pointers, allocated in setup_node_bootmem(). */
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
/* Per-node bootmem allocator state; NODE_DATA(nid)->bdata points here. */
bootmem_data_t plat_node_bdata[MAX_NUMNODES];

/* Shift for the physical-address -> node hash lookup below. */
int memnode_shift;
/* memnodemap[addr >> memnode_shift] == node id; 0xff means unmapped. */
u8 memnodemap[NODEMAPSIZE];

/* CPU -> node mapping; entries start out as NUMA_NO_NODE. */
unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
	[0 ... NR_CPUS-1] = NUMA_NO_NODE
};
/* Local APIC id -> node mapping, filled in by firmware table parsing. */
unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
/* Node -> mask of CPUs on that node. */
cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;

/* Set by the "numa=off" command line option. */
int numa_off __initdata;
Eric Dumazet529a3402005-11-05 17:25:54 +010041
42/*
43 * Given a shift value, try to populate memnodemap[]
44 * Returns :
45 * 1 if OK
46 * 0 if memnodmap[] too small (of shift too small)
47 * -1 if node overlap or lost ram (shift too big)
48 */
Andi Kleend18ff472006-01-11 22:44:30 +010049static int __init
Andi Kleenabe059e2006-03-25 16:29:12 +010050populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
Linus Torvalds1da177e2005-04-16 15:20:36 -070051{
52 int i;
Eric Dumazet529a3402005-11-05 17:25:54 +010053 int res = -1;
54 unsigned long addr, end;
Keith Manntheyb6846642005-07-28 21:15:38 -070055
Eric Dumazet8309cf62005-12-12 22:17:14 -080056 if (shift >= 64)
57 return -1;
Eric Dumazet529a3402005-11-05 17:25:54 +010058 memset(memnodemap, 0xff, sizeof(memnodemap));
59 for (i = 0; i < numnodes; i++) {
60 addr = nodes[i].start;
61 end = nodes[i].end;
62 if (addr >= end)
63 continue;
64 if ((end >> shift) >= NODEMAPSIZE)
65 return 0;
66 do {
67 if (memnodemap[addr >> shift] != 0xff)
68 return -1;
69 memnodemap[addr >> shift] = i;
Eric Dumazet8309cf62005-12-12 22:17:14 -080070 addr += (1UL << shift);
Eric Dumazet529a3402005-11-05 17:25:54 +010071 } while (addr < end);
72 res = 1;
73 }
74 return res;
75}
76
Andi Kleenabe059e2006-03-25 16:29:12 +010077int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
Eric Dumazet529a3402005-11-05 17:25:54 +010078{
79 int shift = 20;
80
81 while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
Keith Manntheyb6846642005-07-28 21:15:38 -070082 shift++;
83
Andi Kleen6b050f82006-01-11 22:44:33 +010084 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
Eric Dumazet529a3402005-11-05 17:25:54 +010085 shift);
86
87 if (populate_memnodemap(nodes, numnodes, shift) != 1) {
88 printk(KERN_INFO
Keith Manntheyb6846642005-07-28 21:15:38 -070089 "Your memory is not aligned you need to rebuild your kernel "
Eric Dumazet529a3402005-11-05 17:25:54 +010090 "with a bigger NODEMAPSIZE shift=%d\n",
91 shift);
92 return -1;
93 }
Keith Manntheyb6846642005-07-28 21:15:38 -070094 return shift;
Linus Torvalds1da177e2005-04-16 15:20:36 -070095}
96
#ifdef CONFIG_SPARSEMEM
/* Early (boot-time) pfn -> node lookup via the memnodemap hash. */
int early_pfn_to_nid(unsigned long pfn)
{
	return phys_to_nid(pfn << PAGE_SHIFT);
}
#endif
103
/* Initialize bootmem allocator for a node */
void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
	unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
	unsigned long nodedata_phys;
	const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);

	start = round_up(start, ZONE_ALIGN);

	printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);

	start_pfn = start >> PAGE_SHIFT;
	end_pfn = end >> PAGE_SHIFT;

	/* Carve the pg_data_t itself out of this node's own memory. */
	nodedata_phys = find_e820_area(start, end, pgdat_size);
	if (nodedata_phys == -1L)
		panic("Cannot find memory pgdat in node %d\n", nodeid);

	Dprintk("nodedata_phys %lx\n", nodedata_phys);

	node_data[nodeid] = phys_to_virt(nodedata_phys);
	memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
	NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
	NODE_DATA(nodeid)->node_start_pfn = start_pfn;
	NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;

	/* Find a place for the bootmem map */
	bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
	/* Start searching right after the pg_data_t so both stay node-local. */
	bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
	bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
	if (bootmap_start == -1L)
		panic("Not enough continuous space for bootmap on node %d", nodeid);
	Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);

	bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
					 bootmap_start >> PAGE_SHIFT,
					 start_pfn, end_pfn);

	/* Hand all usable (e820) ranges in [start, end) to bootmem ... */
	e820_bootmem_free(NODE_DATA(nodeid), start, end);

	/* ... then re-reserve what we just used for pgdat and the bitmap. */
	reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
	reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
	node_set_online(nodeid);
}
148
149/* Initialize final allocator for a zone */
150void __init setup_node_zones(int nodeid)
151{
Andi Kleen267b4802006-03-25 16:31:10 +0100152 unsigned long start_pfn, end_pfn, memmapsize, limit;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700153 unsigned long zones[MAX_NR_ZONES];
Andi Kleen485761b2005-08-26 18:34:10 -0700154 unsigned long holes[MAX_NR_ZONES];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700155
Andi Kleena2f1b422005-11-05 17:25:53 +0100156 start_pfn = node_start_pfn(nodeid);
157 end_pfn = node_end_pfn(nodeid);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700158
Andi Kleen6b050f82006-01-11 22:44:33 +0100159 Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n",
Andi Kleena2f1b422005-11-05 17:25:53 +0100160 nodeid, start_pfn, end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700161
Andi Kleen267b4802006-03-25 16:31:10 +0100162 /* Try to allocate mem_map at end to not fill up precious <4GB
163 memory. */
164 memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
165 limit = end_pfn << PAGE_SHIFT;
166 NODE_DATA(nodeid)->node_mem_map =
167 __alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
168 memmapsize, SMP_CACHE_BYTES,
169 round_down(limit - memmapsize, PAGE_SIZE),
170 limit);
171
Andi Kleena2f1b422005-11-05 17:25:53 +0100172 size_zones(zones, holes, start_pfn, end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700173 free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
Andi Kleen485761b2005-08-26 18:34:10 -0700174 start_pfn, holes);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700175}
176
177void __init numa_init_array(void)
178{
179 int rr, i;
180 /* There are unfortunately some poorly designed mainboards around
181 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
182 mapping. To avoid this fill in the mapping for all possible
183 CPUs, as the number of CPUs is not known yet.
184 We round robin the existing nodes. */
Ravikiran G Thirumalai85cc5132005-09-30 11:59:22 -0700185 rr = first_node(node_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700186 for (i = 0; i < NR_CPUS; i++) {
187 if (cpu_to_node[i] != NUMA_NO_NODE)
188 continue;
Andi Kleen69d81fc2005-11-05 17:25:53 +0100189 numa_set_node(i, rr);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700190 rr = next_node(rr, node_online_map);
191 if (rr == MAX_NUMNODES)
192 rr = first_node(node_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700193 }
194
Linus Torvalds1da177e2005-04-16 15:20:36 -0700195}
196
#ifdef CONFIG_NUMA_EMU
/* Number of fake nodes requested via "numa=fake=N"; 0 disables emulation. */
int numa_fake __initdata = 0;

/* Numa emulation */
static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;
	struct bootnode nodes[MAX_NUMNODES];
	/* Equal share of memory for each fake node. */
	unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;

	/* Kludge needed for the hash function: round sz down to the
	   nearest power of two so node boundaries stay aligned to the
	   memnodemap granularity. */
	if (hweight64(sz) > 1) {
		unsigned long x = 1;
		while ((x << 1) < sz)
			x <<= 1;
		if (x < sz/2)
			printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
		sz = x;
	}

	memset(&nodes,0,sizeof(nodes));
	for (i = 0; i < numa_fake; i++) {
		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
		/* The last node absorbs whatever remainder is left. */
		if (i == numa_fake-1)
			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
		nodes[i].end = nodes[i].start + sz;
		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
		       i,
		       nodes[i].start, nodes[i].end,
		       (nodes[i].end - nodes[i].start) >> 20);
		node_set_online(i);
	}
	memnode_shift = compute_hash_shift(nodes, numa_fake);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
		return -1;
	}
	for_each_online_node(i)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	numa_init_array();
	return 0;
}
#endif
241
/* Top-level NUMA discovery: try emulation, then ACPI SRAT, then the K8
   northbridge; fall back to a single dummy node covering all memory. */
void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;

#ifdef CONFIG_NUMA_EMU
	if (numa_fake && !numa_emulation(start_pfn, end_pfn))
		return;
#endif

#ifdef CONFIG_ACPI_NUMA
	if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
					  end_pfn << PAGE_SHIFT))
		return;
#endif

#ifdef CONFIG_K8_NUMA
	if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
		return;
#endif
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");

	printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
	       start_pfn << PAGE_SHIFT,
	       end_pfn << PAGE_SHIFT);
	/* setup dummy node covering all memory */
	memnode_shift = 63;	/* any address >> 63 is 0, i.e. node 0 */
	memnodemap[0] = 0;
	nodes_clear(node_online_map);
	node_set_online(0);
	for (i = 0; i < NR_CPUS; i++)
		numa_set_node(i, 0);
	node_to_cpumask[0] = cpumask_of_cpu(0);
	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
}
277
/* Add a CPU to its node's cpumask in node_to_cpumask[]. */
__cpuinit void numa_add_cpu(int cpu)
{
	set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
}
282
/* Bind a CPU to a node in both the per-CPU PDA and cpu_to_node[]. */
void __cpuinit numa_set_node(int cpu, int node)
{
	cpu_pda(cpu)->nodenumber = node;
	cpu_to_node[cpu] = node;
}
288
Linus Torvalds1da177e2005-04-16 15:20:36 -0700289unsigned long __init numa_free_all_bootmem(void)
290{
291 int i;
292 unsigned long pages = 0;
293 for_each_online_node(i) {
294 pages += free_all_bootmem_node(NODE_DATA(i));
295 }
296 return pages;
297}
298
#ifdef CONFIG_SPARSEMEM
/* Register each online node's pfn range with sparsemem, then init it. */
static void __init arch_sparse_init(void)
{
	int i;

	for_each_online_node(i)
		memory_present(i, node_start_pfn(i), node_end_pfn(i));

	sparse_init();
}
#else
/* No sparsemem: nothing to do. */
#define arch_sparse_init() do {} while (0)
#endif
312
Linus Torvalds1da177e2005-04-16 15:20:36 -0700313void __init paging_init(void)
314{
315 int i;
Bob Piccod3ee8712005-11-05 17:25:54 +0100316
317 arch_sparse_init();
318
Linus Torvalds1da177e2005-04-16 15:20:36 -0700319 for_each_online_node(i) {
320 setup_node_zones(i);
321 }
322}
323
324/* [numa=off] */
325__init int numa_setup(char *opt)
326{
327 if (!strncmp(opt,"off",3))
328 numa_off = 1;
329#ifdef CONFIG_NUMA_EMU
330 if(!strncmp(opt, "fake=", 5)) {
331 numa_fake = simple_strtoul(opt+5,NULL,0); ;
332 if (numa_fake >= MAX_NUMNODES)
333 numa_fake = MAX_NUMNODES;
334 }
335#endif
336#ifdef CONFIG_ACPI_NUMA
337 if (!strncmp(opt,"noacpi",6))
338 acpi_numa = -1;
339#endif
340 return 1;
341}
342
Ravikiran Thirumalai05b3cbd2006-01-11 22:45:36 +0100343/*
344 * Setup early cpu_to_node.
345 *
346 * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
347 * and apicid_to_node[] tables have valid entries for a CPU.
348 * This means we skip cpu_to_node[] initialisation for NUMA
349 * emulation and faking node case (when running a kernel compiled
350 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
351 * is already initialized in a round robin manner at numa_init_array,
352 * prior to this call, and this initialization is good enough
353 * for the fake NUMA cases.
354 */
355void __init init_cpu_to_node(void)
356{
357 int i;
358 for (i = 0; i < NR_CPUS; i++) {
359 u8 apicid = x86_cpu_to_apicid[i];
360 if (apicid == BAD_APICID)
361 continue;
362 if (apicid_to_node[apicid] == NUMA_NO_NODE)
363 continue;
Daniel Yeisleyd1db4ec2006-02-15 15:17:41 -0800364 numa_set_node(i,apicid_to_node[apicid]);
Ravikiran Thirumalai05b3cbd2006-01-11 22:45:36 +0100365 }
366}
367
Linus Torvalds1da177e2005-04-16 15:20:36 -0700368EXPORT_SYMBOL(cpu_to_node);
369EXPORT_SYMBOL(node_to_cpumask);
370EXPORT_SYMBOL(memnode_shift);
371EXPORT_SYMBOL(memnodemap);
372EXPORT_SYMBOL(node_data);
Andi Kleencf050132006-01-11 22:46:27 +0100373
#ifdef CONFIG_DISCONTIGMEM
/*
 * Functions to convert PFNs from/to per node page addresses.
 * These are out of line because they are quite big.
 * They could be all tuned by pre caching more state.
 * Should do that.
 */

/* Requires pfn_valid(pfn) to be true */
struct page *pfn_to_page(unsigned long pfn)
{
	/* Hash the physical address to a node, then index its mem_map. */
	int nid = phys_to_nid(((unsigned long)(pfn)) << PAGE_SHIFT);
	return (pfn - node_start_pfn(nid)) + NODE_DATA(nid)->node_mem_map;
}
EXPORT_SYMBOL(pfn_to_page);

/* Inverse of pfn_to_page: offset within the zone's mem_map plus base pfn. */
unsigned long page_to_pfn(struct page *page)
{
	return (long)(((page) - page_zone(page)->zone_mem_map) +
		      page_zone(page)->zone_start_pfn);
}
EXPORT_SYMBOL(page_to_pfn);

/* True if pfn lies inside the node its physical address hashes to. */
int pfn_valid(unsigned long pfn)
{
	unsigned nid;
	if (pfn >= num_physpages)
		return 0;
	nid = pfn_to_nid(pfn);
	/* 0xff in memnodemap[] means the address maps to no node. */
	if (nid == 0xff)
		return 0;
	return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid);
}
EXPORT_SYMBOL(pfn_valid);
#endif