blob: b2fac14baac0fc151838b32ad54f6eaf741b8047 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/init.h>
9#include <linux/bootmem.h>
10#include <linux/mmzone.h>
11#include <linux/ctype.h>
12#include <linux/module.h>
13#include <linux/nodemask.h>
14
15#include <asm/e820.h>
16#include <asm/proto.h>
17#include <asm/dma.h>
18#include <asm/numa.h>
19#include <asm/acpi.h>
20
21#ifndef Dprintk
22#define Dprintk(x...)
23#endif
24
Ravikiran G Thirumalai6c231b72005-09-06 15:17:45 -070025struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -070026bootmem_data_t plat_node_bdata[MAX_NUMNODES];
27
Eric Dumazetdcf36bf2006-03-25 16:31:46 +010028struct memnode memnode;
Linus Torvalds1da177e2005-04-16 15:20:36 -070029
Andi Kleen3f098c22005-09-12 18:49:24 +020030unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
31 [0 ... NR_CPUS-1] = NUMA_NO_NODE
Andi Kleen0b07e982005-09-12 18:49:24 +020032};
Andi Kleen3f098c22005-09-12 18:49:24 +020033unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
34 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
35};
36cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -070037
38int numa_off __initdata;
39
Eric Dumazet529a3402005-11-05 17:25:54 +010040
41/*
42 * Given a shift value, try to populate memnodemap[]
43 * Returns :
44 * 1 if OK
45 * 0 if memnodmap[] too small (of shift too small)
46 * -1 if node overlap or lost ram (shift too big)
47 */
Andi Kleend18ff472006-01-11 22:44:30 +010048static int __init
Andi Kleenabe059e2006-03-25 16:29:12 +010049populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
Linus Torvalds1da177e2005-04-16 15:20:36 -070050{
51 int i;
Eric Dumazet529a3402005-11-05 17:25:54 +010052 int res = -1;
53 unsigned long addr, end;
Keith Manntheyb6846642005-07-28 21:15:38 -070054
Eric Dumazet8309cf62005-12-12 22:17:14 -080055 if (shift >= 64)
56 return -1;
Eric Dumazet529a3402005-11-05 17:25:54 +010057 memset(memnodemap, 0xff, sizeof(memnodemap));
58 for (i = 0; i < numnodes; i++) {
59 addr = nodes[i].start;
60 end = nodes[i].end;
61 if (addr >= end)
62 continue;
63 if ((end >> shift) >= NODEMAPSIZE)
64 return 0;
65 do {
66 if (memnodemap[addr >> shift] != 0xff)
67 return -1;
68 memnodemap[addr >> shift] = i;
Eric Dumazet8309cf62005-12-12 22:17:14 -080069 addr += (1UL << shift);
Eric Dumazet529a3402005-11-05 17:25:54 +010070 } while (addr < end);
71 res = 1;
72 }
73 return res;
74}
75
Andi Kleenabe059e2006-03-25 16:29:12 +010076int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
Eric Dumazet529a3402005-11-05 17:25:54 +010077{
78 int shift = 20;
79
80 while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
Keith Manntheyb6846642005-07-28 21:15:38 -070081 shift++;
82
Andi Kleen6b050f82006-01-11 22:44:33 +010083 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
Eric Dumazet529a3402005-11-05 17:25:54 +010084 shift);
85
86 if (populate_memnodemap(nodes, numnodes, shift) != 1) {
87 printk(KERN_INFO
Keith Manntheyb6846642005-07-28 21:15:38 -070088 "Your memory is not aligned you need to rebuild your kernel "
Eric Dumazet529a3402005-11-05 17:25:54 +010089 "with a bigger NODEMAPSIZE shift=%d\n",
90 shift);
91 return -1;
92 }
Keith Manntheyb6846642005-07-28 21:15:38 -070093 return shift;
Linus Torvalds1da177e2005-04-16 15:20:36 -070094}
95
#ifdef CONFIG_SPARSEMEM
/* Translate a page frame number to a node id via its physical address. */
int early_pfn_to_nid(unsigned long pfn)
{
	unsigned long phys = pfn << PAGE_SHIFT;

	return phys_to_nid(phys);
}
#endif
102
Andi Kleena8062232006-04-07 19:49:21 +0200103static void * __init
104early_node_mem(int nodeid, unsigned long start, unsigned long end,
105 unsigned long size)
106{
107 unsigned long mem = find_e820_area(start, end, size);
108 void *ptr;
109 if (mem != -1L)
110 return __va(mem);
111 ptr = __alloc_bootmem_nopanic(size,
112 SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
113 if (ptr == 0) {
114 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
115 size, nodeid);
116 return NULL;
117 }
118 return ptr;
119}
120
Linus Torvalds1da177e2005-04-16 15:20:36 -0700121/* Initialize bootmem allocator for a node */
122void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
123{
124 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
125 unsigned long nodedata_phys;
Andi Kleena8062232006-04-07 19:49:21 +0200126 void *bootmap;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700127 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
128
129 start = round_up(start, ZONE_ALIGN);
130
Andi Kleen6b050f82006-01-11 22:44:33 +0100131 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700132
133 start_pfn = start >> PAGE_SHIFT;
134 end_pfn = end >> PAGE_SHIFT;
135
Andi Kleena8062232006-04-07 19:49:21 +0200136 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
137 if (node_data[nodeid] == NULL)
138 return;
139 nodedata_phys = __pa(node_data[nodeid]);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700140
Linus Torvalds1da177e2005-04-16 15:20:36 -0700141 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
142 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
143 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
144 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
145
146 /* Find a place for the bootmem map */
147 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
148 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
Andi Kleena8062232006-04-07 19:49:21 +0200149 bootmap = early_node_mem(nodeid, bootmap_start, end,
150 bootmap_pages<<PAGE_SHIFT);
151 if (bootmap == NULL) {
152 if (nodedata_phys < start || nodedata_phys >= end)
153 free_bootmem((unsigned long)node_data[nodeid],pgdat_size);
154 node_data[nodeid] = NULL;
155 return;
156 }
157 bootmap_start = __pa(bootmap);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700158 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
159
160 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
161 bootmap_start >> PAGE_SHIFT,
162 start_pfn, end_pfn);
163
164 e820_bootmem_free(NODE_DATA(nodeid), start, end);
165
166 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
167 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
Andi Kleen68a3a7f2006-04-07 19:49:18 +0200168#ifdef CONFIG_ACPI_NUMA
169 srat_reserve_add_area(nodeid);
170#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700171 node_set_online(nodeid);
172}
173
174/* Initialize final allocator for a zone */
175void __init setup_node_zones(int nodeid)
176{
Andi Kleen267b4802006-03-25 16:31:10 +0100177 unsigned long start_pfn, end_pfn, memmapsize, limit;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700178 unsigned long zones[MAX_NR_ZONES];
Andi Kleen485761b2005-08-26 18:34:10 -0700179 unsigned long holes[MAX_NR_ZONES];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700180
Andi Kleena2f1b422005-11-05 17:25:53 +0100181 start_pfn = node_start_pfn(nodeid);
182 end_pfn = node_end_pfn(nodeid);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700183
Andi Kleen6b050f82006-01-11 22:44:33 +0100184 Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n",
Andi Kleena2f1b422005-11-05 17:25:53 +0100185 nodeid, start_pfn, end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700186
Andi Kleen267b4802006-03-25 16:31:10 +0100187 /* Try to allocate mem_map at end to not fill up precious <4GB
188 memory. */
189 memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
190 limit = end_pfn << PAGE_SHIFT;
Andy Whitcroft3b5fd592006-04-22 02:35:41 -0700191#ifdef CONFIG_FLAT_NODE_MEM_MAP
Andi Kleen267b4802006-03-25 16:31:10 +0100192 NODE_DATA(nodeid)->node_mem_map =
193 __alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
194 memmapsize, SMP_CACHE_BYTES,
195 round_down(limit - memmapsize, PAGE_SIZE),
196 limit);
Andy Whitcroft3b5fd592006-04-22 02:35:41 -0700197#endif
Andi Kleen267b4802006-03-25 16:31:10 +0100198
Andi Kleena2f1b422005-11-05 17:25:53 +0100199 size_zones(zones, holes, start_pfn, end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700200 free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
Andi Kleen485761b2005-08-26 18:34:10 -0700201 start_pfn, holes);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700202}
203
204void __init numa_init_array(void)
205{
206 int rr, i;
207 /* There are unfortunately some poorly designed mainboards around
208 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
209 mapping. To avoid this fill in the mapping for all possible
210 CPUs, as the number of CPUs is not known yet.
211 We round robin the existing nodes. */
Ravikiran G Thirumalai85cc5132005-09-30 11:59:22 -0700212 rr = first_node(node_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700213 for (i = 0; i < NR_CPUS; i++) {
214 if (cpu_to_node[i] != NUMA_NO_NODE)
215 continue;
Andi Kleen69d81fc2005-11-05 17:25:53 +0100216 numa_set_node(i, rr);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700217 rr = next_node(rr, node_online_map);
218 if (rr == MAX_NUMNODES)
219 rr = first_node(node_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700220 }
221
Linus Torvalds1da177e2005-04-16 15:20:36 -0700222}
223
#ifdef CONFIG_NUMA_EMU
/* Number of fake nodes requested with "fake=<n>"; 0 means no emulation. */
int numa_fake __initdata = 0;

/*
 * Numa emulation: split [start_pfn, end_pfn) into numa_fake fake nodes.
 *
 * The caller must ensure numa_fake is non-zero (it is used as a divisor).
 * The per-node size is rounded down to a power of two for the benefit of
 * the hash function; the last node absorbs whatever memory is left over.
 *
 * Returns 0 on success, -1 if no hash shift could be computed.
 *
 * Marked __init: it reads __initdata (numa_fake), calls only __init
 * functions, and is itself only called from numa_initmem_init().
 */
static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;
	struct bootnode nodes[MAX_NUMNODES];
	unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;

	/* Kludge needed for the hash function */
	if (hweight64(sz) > 1) {
		unsigned long x = 1;
		while ((x << 1) < sz)
			x <<= 1;
		if (x < sz/2)
			printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
		sz = x;
	}

	memset(&nodes,0,sizeof(nodes));
	for (i = 0; i < numa_fake; i++) {
		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
		/* The final node runs to the end of memory. */
		if (i == numa_fake-1)
			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
		nodes[i].end = nodes[i].start + sz;
		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
		       i,
		       nodes[i].start, nodes[i].end,
		       (nodes[i].end - nodes[i].start) >> 20);
		node_set_online(i);
	}
	memnode_shift = compute_hash_shift(nodes, numa_fake);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
		return -1;
	}
	for_each_online_node(i)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	numa_init_array();
	return 0;
}
#endif
268
269void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
270{
271 int i;
272
273#ifdef CONFIG_NUMA_EMU
274 if (numa_fake && !numa_emulation(start_pfn, end_pfn))
275 return;
276#endif
277
278#ifdef CONFIG_ACPI_NUMA
279 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
280 end_pfn << PAGE_SHIFT))
281 return;
282#endif
283
284#ifdef CONFIG_K8_NUMA
285 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
286 return;
287#endif
288 printk(KERN_INFO "%s\n",
289 numa_off ? "NUMA turned off" : "No NUMA configuration found");
290
291 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
292 start_pfn << PAGE_SHIFT,
293 end_pfn << PAGE_SHIFT);
294 /* setup dummy node covering all memory */
295 memnode_shift = 63;
296 memnodemap[0] = 0;
297 nodes_clear(node_online_map);
298 node_set_online(0);
299 for (i = 0; i < NR_CPUS; i++)
Andi Kleen69d81fc2005-11-05 17:25:53 +0100300 numa_set_node(i, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700301 node_to_cpumask[0] = cpumask_of_cpu(0);
302 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
303}
304
Ashok Raje6982c62005-06-25 14:54:58 -0700305__cpuinit void numa_add_cpu(int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700306{
Ravikiran G Thirumalaie6a045a2005-09-30 11:59:21 -0700307 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700308}
309
Andi Kleen69d81fc2005-11-05 17:25:53 +0100310void __cpuinit numa_set_node(int cpu, int node)
311{
Ravikiran G Thirumalaidf79efd2006-01-11 22:45:39 +0100312 cpu_pda(cpu)->nodenumber = node;
Andi Kleen69d81fc2005-11-05 17:25:53 +0100313 cpu_to_node[cpu] = node;
314}
315
Linus Torvalds1da177e2005-04-16 15:20:36 -0700316unsigned long __init numa_free_all_bootmem(void)
317{
318 int i;
319 unsigned long pages = 0;
320 for_each_online_node(i) {
321 pages += free_all_bootmem_node(NODE_DATA(i));
322 }
323 return pages;
324}
325
#ifdef CONFIG_SPARSEMEM
/*
 * Report the pfn range of each online node via memory_present(), then call
 * sparse_init().
 */
static void __init arch_sparse_init(void)
{
	int nid;

	for_each_online_node(nid) {
		memory_present(nid, node_start_pfn(nid), node_end_pfn(nid));
	}

	sparse_init();
}
#else
/* Nothing to do without SPARSEMEM. */
#define arch_sparse_init() do {} while (0)
#endif
339
Linus Torvalds1da177e2005-04-16 15:20:36 -0700340void __init paging_init(void)
341{
342 int i;
Bob Piccod3ee8712005-11-05 17:25:54 +0100343
344 arch_sparse_init();
345
Linus Torvalds1da177e2005-04-16 15:20:36 -0700346 for_each_online_node(i) {
347 setup_node_zones(i);
348 }
349}
350
351/* [numa=off] */
352__init int numa_setup(char *opt)
353{
354 if (!strncmp(opt,"off",3))
355 numa_off = 1;
356#ifdef CONFIG_NUMA_EMU
357 if(!strncmp(opt, "fake=", 5)) {
358 numa_fake = simple_strtoul(opt+5,NULL,0); ;
359 if (numa_fake >= MAX_NUMNODES)
360 numa_fake = MAX_NUMNODES;
361 }
362#endif
363#ifdef CONFIG_ACPI_NUMA
364 if (!strncmp(opt,"noacpi",6))
365 acpi_numa = -1;
Andi Kleen68a3a7f2006-04-07 19:49:18 +0200366 if (!strncmp(opt,"hotadd=", 7))
367 hotadd_percent = simple_strtoul(opt+7, NULL, 10);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700368#endif
369 return 1;
370}
371
Ravikiran Thirumalai05b3cbd2006-01-11 22:45:36 +0100372/*
373 * Setup early cpu_to_node.
374 *
375 * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
376 * and apicid_to_node[] tables have valid entries for a CPU.
377 * This means we skip cpu_to_node[] initialisation for NUMA
378 * emulation and faking node case (when running a kernel compiled
379 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
380 * is already initialized in a round robin manner at numa_init_array,
381 * prior to this call, and this initialization is good enough
382 * for the fake NUMA cases.
383 */
384void __init init_cpu_to_node(void)
385{
386 int i;
387 for (i = 0; i < NR_CPUS; i++) {
388 u8 apicid = x86_cpu_to_apicid[i];
389 if (apicid == BAD_APICID)
390 continue;
391 if (apicid_to_node[apicid] == NUMA_NO_NODE)
392 continue;
Daniel Yeisleyd1db4ec2006-02-15 15:17:41 -0800393 numa_set_node(i,apicid_to_node[apicid]);
Ravikiran Thirumalai05b3cbd2006-01-11 22:45:36 +0100394 }
395}
396
/* NUMA lookup tables exported for use by modules. */
EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode);
EXPORT_SYMBOL(node_data);
Andi Kleencf050132006-01-11 22:46:27 +0100401
#ifdef CONFIG_DISCONTIGMEM
/*
 * Functions to convert PFNs from/to per node page addresses.
 * These are out of line because they are quite big.
 * They could all be tuned by pre-caching more state.
 * Should do that.
 */

/*
 * Returns non-zero when pfn is below num_physpages, maps to a node
 * (pfn_to_nid() does not yield 0xff) and lies inside that node's
 * [node_start_pfn, node_end_pfn) range; 0 otherwise.
 */
int pfn_valid(unsigned long pfn)
{
	unsigned nid;

	if (pfn >= num_physpages)
		return 0;

	nid = pfn_to_nid(pfn);
	if (nid == 0xff)
		return 0;

	if (pfn < node_start_pfn(nid))
		return 0;
	return pfn < node_end_pfn(nid);
}
EXPORT_SYMBOL(pfn_valid);
#endif