blob: 0ae2d9d5d7eae9c8ee0069429f5ffb4897887bca [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/init.h>
9#include <linux/bootmem.h>
10#include <linux/mmzone.h>
11#include <linux/ctype.h>
12#include <linux/module.h>
13#include <linux/nodemask.h>
14
15#include <asm/e820.h>
16#include <asm/proto.h>
17#include <asm/dma.h>
18#include <asm/numa.h>
19#include <asm/acpi.h>
20
21#ifndef Dprintk
22#define Dprintk(x...)
23#endif
24
Ravikiran G Thirumalai6c231b72005-09-06 15:17:45 -070025struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -070026bootmem_data_t plat_node_bdata[MAX_NUMNODES];
27
Eric Dumazetdcf36bf2006-03-25 16:31:46 +010028struct memnode memnode;
Linus Torvalds1da177e2005-04-16 15:20:36 -070029
Andi Kleen3f098c22005-09-12 18:49:24 +020030unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
31 [0 ... NR_CPUS-1] = NUMA_NO_NODE
Andi Kleen0b07e982005-09-12 18:49:24 +020032};
Andi Kleen3f098c22005-09-12 18:49:24 +020033unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
34 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
35};
36cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -070037
38int numa_off __initdata;
Amul Shah076422d2007-02-13 13:26:19 +010039unsigned long __initdata nodemap_addr;
40unsigned long __initdata nodemap_size;
Linus Torvalds1da177e2005-04-16 15:20:36 -070041
Eric Dumazet529a3402005-11-05 17:25:54 +010042
43/*
44 * Given a shift value, try to populate memnodemap[]
45 * Returns :
46 * 1 if OK
47 * 0 if memnodmap[] too small (of shift too small)
48 * -1 if node overlap or lost ram (shift too big)
49 */
Andi Kleend18ff472006-01-11 22:44:30 +010050static int __init
Andi Kleenabe059e2006-03-25 16:29:12 +010051populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
Linus Torvalds1da177e2005-04-16 15:20:36 -070052{
53 int i;
Eric Dumazet529a3402005-11-05 17:25:54 +010054 int res = -1;
55 unsigned long addr, end;
Keith Manntheyb6846642005-07-28 21:15:38 -070056
Amul Shah076422d2007-02-13 13:26:19 +010057 memset(memnodemap, 0xff, memnodemapsize);
Eric Dumazet529a3402005-11-05 17:25:54 +010058 for (i = 0; i < numnodes; i++) {
59 addr = nodes[i].start;
60 end = nodes[i].end;
61 if (addr >= end)
62 continue;
Amul Shah076422d2007-02-13 13:26:19 +010063 if ((end >> shift) >= memnodemapsize)
Eric Dumazet529a3402005-11-05 17:25:54 +010064 return 0;
65 do {
66 if (memnodemap[addr >> shift] != 0xff)
67 return -1;
68 memnodemap[addr >> shift] = i;
Amul Shah076422d2007-02-13 13:26:19 +010069 addr += (1UL << shift);
Eric Dumazet529a3402005-11-05 17:25:54 +010070 } while (addr < end);
71 res = 1;
72 }
73 return res;
74}
75
Amul Shah076422d2007-02-13 13:26:19 +010076static int __init allocate_cachealigned_memnodemap(void)
77{
78 unsigned long pad, pad_addr;
79
80 memnodemap = memnode.embedded_map;
Amul Shah54413922007-02-13 13:26:20 +010081 if (memnodemapsize <= 48)
Amul Shah076422d2007-02-13 13:26:19 +010082 return 0;
Amul Shah076422d2007-02-13 13:26:19 +010083
84 pad = L1_CACHE_BYTES - 1;
85 pad_addr = 0x8000;
86 nodemap_size = pad + memnodemapsize;
87 nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
88 nodemap_size);
89 if (nodemap_addr == -1UL) {
90 printk(KERN_ERR
91 "NUMA: Unable to allocate Memory to Node hash map\n");
92 nodemap_addr = nodemap_size = 0;
93 return -1;
94 }
95 pad_addr = (nodemap_addr + pad) & ~pad;
96 memnodemap = phys_to_virt(pad_addr);
97
98 printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
99 nodemap_addr, nodemap_addr + nodemap_size);
100 return 0;
101}
102
103/*
104 * The LSB of all start and end addresses in the node map is the value of the
105 * maximum possible shift.
106 */
107static int __init
108extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes)
109{
Amul Shah54413922007-02-13 13:26:20 +0100110 int i, nodes_used = 0;
Amul Shah076422d2007-02-13 13:26:19 +0100111 unsigned long start, end;
112 unsigned long bitfield = 0, memtop = 0;
113
114 for (i = 0; i < numnodes; i++) {
115 start = nodes[i].start;
116 end = nodes[i].end;
117 if (start >= end)
118 continue;
Amul Shah54413922007-02-13 13:26:20 +0100119 bitfield |= start;
120 nodes_used++;
Amul Shah076422d2007-02-13 13:26:19 +0100121 if (end > memtop)
122 memtop = end;
123 }
Amul Shah54413922007-02-13 13:26:20 +0100124 if (nodes_used <= 1)
125 i = 63;
126 else
127 i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
Amul Shah076422d2007-02-13 13:26:19 +0100128 memnodemapsize = (memtop >> i)+1;
129 return i;
130}
131
Andi Kleenabe059e2006-03-25 16:29:12 +0100132int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
Eric Dumazet529a3402005-11-05 17:25:54 +0100133{
Amul Shah076422d2007-02-13 13:26:19 +0100134 int shift;
Eric Dumazet529a3402005-11-05 17:25:54 +0100135
Amul Shah076422d2007-02-13 13:26:19 +0100136 shift = extract_lsb_from_nodes(nodes, numnodes);
137 if (allocate_cachealigned_memnodemap())
138 return -1;
Andi Kleen6b050f82006-01-11 22:44:33 +0100139 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
Eric Dumazet529a3402005-11-05 17:25:54 +0100140 shift);
141
142 if (populate_memnodemap(nodes, numnodes, shift) != 1) {
143 printk(KERN_INFO
Keith Manntheyb6846642005-07-28 21:15:38 -0700144 "Your memory is not aligned you need to rebuild your kernel "
Eric Dumazet529a3402005-11-05 17:25:54 +0100145 "with a bigger NODEMAPSIZE shift=%d\n",
146 shift);
147 return -1;
148 }
Keith Manntheyb6846642005-07-28 21:15:38 -0700149 return shift;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700150}
151
#ifdef CONFIG_SPARSEMEM
/* Look up the node of a page frame via its physical address. */
int early_pfn_to_nid(unsigned long pfn)
{
	unsigned long phys = pfn << PAGE_SHIFT;

	return phys_to_nid(phys);
}
#endif
158
Andi Kleena8062232006-04-07 19:49:21 +0200159static void * __init
160early_node_mem(int nodeid, unsigned long start, unsigned long end,
161 unsigned long size)
162{
163 unsigned long mem = find_e820_area(start, end, size);
164 void *ptr;
165 if (mem != -1L)
166 return __va(mem);
167 ptr = __alloc_bootmem_nopanic(size,
168 SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
169 if (ptr == 0) {
170 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
171 size, nodeid);
172 return NULL;
173 }
174 return ptr;
175}
176
Linus Torvalds1da177e2005-04-16 15:20:36 -0700177/* Initialize bootmem allocator for a node */
178void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
179{
180 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
181 unsigned long nodedata_phys;
Andi Kleena8062232006-04-07 19:49:21 +0200182 void *bootmap;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700183 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
184
185 start = round_up(start, ZONE_ALIGN);
186
Andi Kleen6b050f82006-01-11 22:44:33 +0100187 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700188
189 start_pfn = start >> PAGE_SHIFT;
190 end_pfn = end >> PAGE_SHIFT;
191
Andi Kleena8062232006-04-07 19:49:21 +0200192 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
193 if (node_data[nodeid] == NULL)
194 return;
195 nodedata_phys = __pa(node_data[nodeid]);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700196
Linus Torvalds1da177e2005-04-16 15:20:36 -0700197 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
198 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
199 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
200 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
201
202 /* Find a place for the bootmem map */
203 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
204 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
Andi Kleena8062232006-04-07 19:49:21 +0200205 bootmap = early_node_mem(nodeid, bootmap_start, end,
206 bootmap_pages<<PAGE_SHIFT);
207 if (bootmap == NULL) {
208 if (nodedata_phys < start || nodedata_phys >= end)
209 free_bootmem((unsigned long)node_data[nodeid],pgdat_size);
210 node_data[nodeid] = NULL;
211 return;
212 }
213 bootmap_start = __pa(bootmap);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700214 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
215
216 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
217 bootmap_start >> PAGE_SHIFT,
218 start_pfn, end_pfn);
219
Mel Gorman5cb248a2006-09-27 01:49:52 -0700220 free_bootmem_with_active_regions(nodeid, end);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700221
222 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
223 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
Andi Kleen68a3a7f2006-04-07 19:49:18 +0200224#ifdef CONFIG_ACPI_NUMA
225 srat_reserve_add_area(nodeid);
226#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700227 node_set_online(nodeid);
228}
229
230/* Initialize final allocator for a zone */
231void __init setup_node_zones(int nodeid)
232{
Andi Kleen267b4802006-03-25 16:31:10 +0100233 unsigned long start_pfn, end_pfn, memmapsize, limit;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700234
Andi Kleena2f1b422005-11-05 17:25:53 +0100235 start_pfn = node_start_pfn(nodeid);
236 end_pfn = node_end_pfn(nodeid);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700237
Mel Gorman5cb248a2006-09-27 01:49:52 -0700238 Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
Andi Kleena2f1b422005-11-05 17:25:53 +0100239 nodeid, start_pfn, end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700240
Andi Kleen267b4802006-03-25 16:31:10 +0100241 /* Try to allocate mem_map at end to not fill up precious <4GB
242 memory. */
243 memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
244 limit = end_pfn << PAGE_SHIFT;
Andy Whitcroft3b5fd592006-04-22 02:35:41 -0700245#ifdef CONFIG_FLAT_NODE_MEM_MAP
Andi Kleen267b4802006-03-25 16:31:10 +0100246 NODE_DATA(nodeid)->node_mem_map =
247 __alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
248 memmapsize, SMP_CACHE_BYTES,
249 round_down(limit - memmapsize, PAGE_SIZE),
250 limit);
Andy Whitcroft3b5fd592006-04-22 02:35:41 -0700251#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700252}
253
254void __init numa_init_array(void)
255{
256 int rr, i;
257 /* There are unfortunately some poorly designed mainboards around
258 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
259 mapping. To avoid this fill in the mapping for all possible
260 CPUs, as the number of CPUs is not known yet.
261 We round robin the existing nodes. */
Ravikiran G Thirumalai85cc5132005-09-30 11:59:22 -0700262 rr = first_node(node_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700263 for (i = 0; i < NR_CPUS; i++) {
264 if (cpu_to_node[i] != NUMA_NO_NODE)
265 continue;
Andi Kleen69d81fc2005-11-05 17:25:53 +0100266 numa_set_node(i, rr);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700267 rr = next_node(rr, node_online_map);
268 if (rr == MAX_NUMNODES)
269 rr = first_node(node_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700270 }
271
Linus Torvalds1da177e2005-04-16 15:20:36 -0700272}
273
274#ifdef CONFIG_NUMA_EMU
Rohit Seth53fee042007-02-13 13:26:22 +0100275/* Numa emulation */
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200276#define E820_ADDR_HOLE_SIZE(start, end) \
277 (e820_hole_size((start) >> PAGE_SHIFT, (end) >> PAGE_SHIFT) << \
278 PAGE_SHIFT)
279char *cmdline __initdata;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700280
Rohit Seth53fee042007-02-13 13:26:22 +0100281/*
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200282 * Setups up nid to range from addr to addr + size. If the end boundary is
283 * greater than max_addr, then max_addr is used instead. The return value is 0
284 * if there is additional memory left for allocation past addr and -1 otherwise.
285 * addr is adjusted to be at the end of the node.
Rohit Seth53fee042007-02-13 13:26:22 +0100286 */
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200287static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
288 u64 size, u64 max_addr)
Rohit Seth53fee042007-02-13 13:26:22 +0100289{
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200290 int ret = 0;
291 nodes[nid].start = *addr;
292 *addr += size;
293 if (*addr >= max_addr) {
294 *addr = max_addr;
295 ret = -1;
296 }
297 nodes[nid].end = *addr;
298 node_set_online(nid);
299 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
300 nodes[nid].start, nodes[nid].end,
301 (nodes[nid].end - nodes[nid].start) >> 20);
302 return ret;
Rohit Seth53fee042007-02-13 13:26:22 +0100303}
304
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200305/*
306 * Splits num_nodes nodes up equally starting at node_start. The return value
307 * is the number of nodes split up and addr is adjusted to be at the end of the
308 * last node allocated.
309 */
310static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
311 u64 max_addr, int node_start,
312 int num_nodes)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700313{
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200314 unsigned int big;
315 u64 size;
316 int i;
Rohit Seth53fee042007-02-13 13:26:22 +0100317
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200318 if (num_nodes <= 0)
319 return -1;
320 if (num_nodes > MAX_NUMNODES)
321 num_nodes = MAX_NUMNODES;
322 size = (max_addr - *addr - E820_ADDR_HOLE_SIZE(*addr, max_addr)) /
323 num_nodes;
Rohit Seth53fee042007-02-13 13:26:22 +0100324 /*
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200325 * Calculate the number of big nodes that can be allocated as a result
326 * of consolidating the leftovers.
Rohit Seth53fee042007-02-13 13:26:22 +0100327 */
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200328 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
329 FAKE_NODE_MIN_SIZE;
Rohit Seth53fee042007-02-13 13:26:22 +0100330
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200331 /* Round down to nearest FAKE_NODE_MIN_SIZE. */
332 size &= FAKE_NODE_MIN_HASH_MASK;
333 if (!size) {
334 printk(KERN_ERR "Not enough memory for each node. "
335 "NUMA emulation disabled.\n");
336 return -1;
Rohit Seth53fee042007-02-13 13:26:22 +0100337 }
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200338
339 for (i = node_start; i < num_nodes + node_start; i++) {
340 u64 end = *addr + size;
Rohit Seth53fee042007-02-13 13:26:22 +0100341 if (i < big)
342 end += FAKE_NODE_MIN_SIZE;
343 /*
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200344 * The final node can have the remaining system RAM. Other
345 * nodes receive roughly the same amount of available pages.
Rohit Seth53fee042007-02-13 13:26:22 +0100346 */
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200347 if (i == num_nodes + node_start - 1)
Rohit Seth53fee042007-02-13 13:26:22 +0100348 end = max_addr;
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200349 else
350 while (end - *addr - E820_ADDR_HOLE_SIZE(*addr, end) <
351 size) {
352 end += FAKE_NODE_MIN_SIZE;
353 if (end > max_addr) {
354 end = max_addr;
355 break;
356 }
357 }
358 if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
359 break;
360 }
361 return i - node_start + 1;
362}
363
364/*
365 * Sets up the system RAM area from start_pfn to end_pfn according to the
366 * numa=fake command-line option.
367 */
368static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
369{
370 struct bootnode nodes[MAX_NUMNODES];
371 u64 addr = start_pfn << PAGE_SHIFT;
372 u64 max_addr = end_pfn << PAGE_SHIFT;
373 unsigned int coeff;
374 unsigned int num = 0;
375 int num_nodes = 0;
376 u64 size;
377 int i;
378
379 memset(&nodes, 0, sizeof(nodes));
380 /*
381 * If the numa=fake command-line is just a single number N, split the
382 * system RAM into N fake nodes.
383 */
384 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
385 num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0,
386 simple_strtol(cmdline, NULL, 0));
387 if (num_nodes < 0)
388 return num_nodes;
389 goto out;
390 }
391
392 /* Parse the command line. */
393 for (coeff = 1; ; cmdline++) {
394 if (*cmdline && isdigit(*cmdline)) {
395 num = num * 10 + *cmdline - '0';
396 continue;
397 }
398 if (*cmdline == '*')
399 coeff = num;
400 if (!*cmdline || *cmdline == ',') {
401 /*
402 * Round down to the nearest FAKE_NODE_MIN_SIZE.
403 * Command-line coefficients are in megabytes.
404 */
405 size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
406 if (size) {
407 for (i = 0; i < coeff; i++, num_nodes++)
408 if (setup_node_range(num_nodes, nodes,
409 &addr, size, max_addr) < 0)
410 goto done;
411 coeff = 1;
412 }
413 }
414 if (!*cmdline)
415 break;
416 num = 0;
417 }
418done:
419 if (!num_nodes)
420 return -1;
David Rientjes14694d72007-05-02 19:27:09 +0200421 /* Fill remainder of system RAM, if appropriate. */
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200422 if (addr < max_addr) {
David Rientjes14694d72007-05-02 19:27:09 +0200423 switch (*(cmdline - 1)) {
424 case '*':
425 /* Split remaining nodes into coeff chunks */
426 if (coeff <= 0)
427 break;
428 num_nodes += split_nodes_equally(nodes, &addr, max_addr,
429 num_nodes, coeff);
430 break;
431 case ',':
432 /* Do not allocate remaining system RAM */
433 break;
434 default:
435 /* Give one final node */
436 setup_node_range(num_nodes, nodes, &addr,
437 max_addr - addr, max_addr);
438 num_nodes++;
439 }
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200440 }
441out:
442 memnode_shift = compute_hash_shift(nodes, num_nodes);
443 if (memnode_shift < 0) {
444 memnode_shift = 0;
445 printk(KERN_ERR "No NUMA hash function found. NUMA emulation "
446 "disabled.\n");
447 return -1;
448 }
449
450 /*
451 * We need to vacate all active ranges that may have been registered by
452 * SRAT.
453 */
454 remove_all_active_ranges();
455 for_each_online_node(i) {
Mel Gorman5cb248a2006-09-27 01:49:52 -0700456 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
457 nodes[i].end >> PAGE_SHIFT);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700458 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
Mel Gorman5cb248a2006-09-27 01:49:52 -0700459 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700460 numa_init_array();
461 return 0;
462}
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200463#undef E820_ADDR_HOLE_SIZE
464#endif /* CONFIG_NUMA_EMU */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700465
466void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
467{
468 int i;
469
470#ifdef CONFIG_NUMA_EMU
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200471 if (cmdline && !numa_emulation(start_pfn, end_pfn))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700472 return;
473#endif
474
475#ifdef CONFIG_ACPI_NUMA
476 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
477 end_pfn << PAGE_SHIFT))
478 return;
479#endif
480
481#ifdef CONFIG_K8_NUMA
482 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
483 return;
484#endif
485 printk(KERN_INFO "%s\n",
486 numa_off ? "NUMA turned off" : "No NUMA configuration found");
487
488 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
489 start_pfn << PAGE_SHIFT,
490 end_pfn << PAGE_SHIFT);
491 /* setup dummy node covering all memory */
492 memnode_shift = 63;
Amul Shah076422d2007-02-13 13:26:19 +0100493 memnodemap = memnode.embedded_map;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700494 memnodemap[0] = 0;
495 nodes_clear(node_online_map);
496 node_set_online(0);
497 for (i = 0; i < NR_CPUS; i++)
Andi Kleen69d81fc2005-11-05 17:25:53 +0100498 numa_set_node(i, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700499 node_to_cpumask[0] = cpumask_of_cpu(0);
Mel Gorman5cb248a2006-09-27 01:49:52 -0700500 e820_register_active_regions(0, start_pfn, end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700501 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
502}
503
Ashok Raje6982c62005-06-25 14:54:58 -0700504__cpuinit void numa_add_cpu(int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700505{
Ravikiran G Thirumalaie6a045a2005-09-30 11:59:21 -0700506 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700507}
508
Andi Kleen69d81fc2005-11-05 17:25:53 +0100509void __cpuinit numa_set_node(int cpu, int node)
510{
Ravikiran G Thirumalaidf79efd2006-01-11 22:45:39 +0100511 cpu_pda(cpu)->nodenumber = node;
Andi Kleen69d81fc2005-11-05 17:25:53 +0100512 cpu_to_node[cpu] = node;
513}
514
Linus Torvalds1da177e2005-04-16 15:20:36 -0700515unsigned long __init numa_free_all_bootmem(void)
516{
517 int i;
518 unsigned long pages = 0;
519 for_each_online_node(i) {
520 pages += free_all_bootmem_node(NODE_DATA(i));
521 }
522 return pages;
523}
524
525void __init paging_init(void)
526{
527 int i;
Mel Gorman6391af12006-10-11 01:20:39 -0700528 unsigned long max_zone_pfns[MAX_NR_ZONES];
529 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
530 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
531 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
532 max_zone_pfns[ZONE_NORMAL] = end_pfn;
Bob Piccod3ee8712005-11-05 17:25:54 +0100533
Bob Piccof0a5a582007-02-13 13:26:25 +0100534 sparse_memory_present_with_active_regions(MAX_NUMNODES);
535 sparse_init();
Bob Piccod3ee8712005-11-05 17:25:54 +0100536
Linus Torvalds1da177e2005-04-16 15:20:36 -0700537 for_each_online_node(i) {
538 setup_node_zones(i);
539 }
Mel Gorman5cb248a2006-09-27 01:49:52 -0700540
541 free_area_init_nodes(max_zone_pfns);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700542}
543
Andi Kleen2c8c0e62006-09-26 10:52:32 +0200544static __init int numa_setup(char *opt)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700545{
Andi Kleen2c8c0e62006-09-26 10:52:32 +0200546 if (!opt)
547 return -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700548 if (!strncmp(opt,"off",3))
549 numa_off = 1;
550#ifdef CONFIG_NUMA_EMU
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200551 if (!strncmp(opt, "fake=", 5))
552 cmdline = opt + 5;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700553#endif
554#ifdef CONFIG_ACPI_NUMA
555 if (!strncmp(opt,"noacpi",6))
556 acpi_numa = -1;
Andi Kleen68a3a7f2006-04-07 19:49:18 +0200557 if (!strncmp(opt,"hotadd=", 7))
558 hotadd_percent = simple_strtoul(opt+7, NULL, 10);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700559#endif
Andi Kleen2c8c0e62006-09-26 10:52:32 +0200560 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700561}
562
Andi Kleen2c8c0e62006-09-26 10:52:32 +0200563early_param("numa", numa_setup);
564
Ravikiran Thirumalai05b3cbd2006-01-11 22:45:36 +0100565/*
566 * Setup early cpu_to_node.
567 *
568 * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
569 * and apicid_to_node[] tables have valid entries for a CPU.
570 * This means we skip cpu_to_node[] initialisation for NUMA
571 * emulation and faking node case (when running a kernel compiled
572 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
573 * is already initialized in a round robin manner at numa_init_array,
574 * prior to this call, and this initialization is good enough
575 * for the fake NUMA cases.
576 */
577void __init init_cpu_to_node(void)
578{
579 int i;
580 for (i = 0; i < NR_CPUS; i++) {
581 u8 apicid = x86_cpu_to_apicid[i];
582 if (apicid == BAD_APICID)
583 continue;
584 if (apicid_to_node[apicid] == NUMA_NO_NODE)
585 continue;
Daniel Yeisleyd1db4ec2006-02-15 15:17:41 -0800586 numa_set_node(i,apicid_to_node[apicid]);
Ravikiran Thirumalai05b3cbd2006-01-11 22:45:36 +0100587 }
588}
589
/* Tables defined in this file that are exported to modules. */
EXPORT_SYMBOL(node_data);
EXPORT_SYMBOL(memnode);
EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
Andi Kleencf050132006-01-11 22:46:27 +0100594
595#ifdef CONFIG_DISCONTIGMEM
596/*
597 * Functions to convert PFNs from/to per node page addresses.
598 * These are out of line because they are quite big.
599 * They could be all tuned by pre caching more state.
600 * Should do that.
601 */
602
Andi Kleencf050132006-01-11 22:46:27 +0100603int pfn_valid(unsigned long pfn)
604{
605 unsigned nid;
606 if (pfn >= num_physpages)
607 return 0;
608 nid = pfn_to_nid(pfn);
609 if (nid == 0xff)
610 return 0;
611 return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid);
612}
613EXPORT_SYMBOL(pfn_valid);
614#endif