/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>

#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/acpi.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
bootmem_data_t plat_node_bdata[MAX_NUMNODES];

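/*
 * memnode holds the hash state used by phys_to_nid(): a physical address
 * resolves to its node via memnodemap[addr >> memnode_shift], with 0xff
 * marking an unmapped slot (see populate_memnodemap() below).
 */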
struct memnode memnode;

unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
        [0 ... NR_CPUS-1] = NUMA_NO_NODE
};
unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
        [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;

int numa_off __initdata;

/*
 * Given a shift value, try to populate memnodemap[]
 * Returns:
 * 1 if OK
 * 0 if memnodemap[] too small (or shift too small)
 * -1 if node overlap or lost RAM (shift too big)
 */
static int __init
populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
{
        int i;
        int res = -1;
        unsigned long addr, end;

        if (shift >= 64)
                return -1;
        memset(memnodemap, 0xff, sizeof(memnodemap));
        for (i = 0; i < numnodes; i++) {
                addr = nodes[i].start;
                end = nodes[i].end;
                if (addr >= end)
                        continue;
                if ((end >> shift) >= NODEMAPSIZE)
                        return 0;
                do {
                        if (memnodemap[addr >> shift] != 0xff)
                                return -1;
                        memnodemap[addr >> shift] = i;
                        addr += (1UL << shift);
                } while (addr < end);
                res = 1;
        }
        return res;
}

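/*
 * Pick the largest shift (coarsest granularity, hence the fewest
 * memnodemap[] entries) for which no two nodes collide in the map,
 * then verify that the map is actually big enough at that shift.
 */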
int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
{
        int shift = 20;

        while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
                shift++;

        printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
                shift);

        if (populate_memnodemap(nodes, numnodes, shift) != 1) {
                printk(KERN_INFO
                        "Your memory is not aligned; you need to rebuild your kernel "
                        "with a bigger NODEMAPSIZE shift=%d\n",
                        shift);
                return -1;
        }
        return shift;
}

#ifdef CONFIG_SPARSEMEM
int early_pfn_to_nid(unsigned long pfn)
{
        return phys_to_nid(pfn << PAGE_SHIFT);
}
#endif

/* Initialize bootmem allocator for a node */
void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
        unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
        unsigned long nodedata_phys;
        const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);

        start = round_up(start, ZONE_ALIGN);

        printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);

        start_pfn = start >> PAGE_SHIFT;
        end_pfn = end >> PAGE_SHIFT;

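        /* Carve the node's pg_data_t out of the node's own memory range,
           as reported by the e820 map. */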
        nodedata_phys = find_e820_area(start, end, pgdat_size);
        if (nodedata_phys == -1L)
                panic("Cannot find memory pgdat in node %d\n", nodeid);

        Dprintk("nodedata_phys %lx\n", nodedata_phys);

        node_data[nodeid] = phys_to_virt(nodedata_phys);
        memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
        NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
        NODE_DATA(nodeid)->node_start_pfn = start_pfn;
        NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;

        /* Find a place for the bootmem map */
        bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
        bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
        bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
        if (bootmap_start == -1L)
                panic("Not enough contiguous space for bootmap on node %d", nodeid);
        Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);

        bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
                                         bootmap_start >> PAGE_SHIFT,
                                         start_pfn, end_pfn);

        e820_bootmem_free(NODE_DATA(nodeid), start, end);

        reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
        reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
#ifdef CONFIG_ACPI_NUMA
        srat_reserve_add_area(nodeid);
#endif
        node_set_online(nodeid);
}

/* Initialize final allocator for a zone */
void __init setup_node_zones(int nodeid)
{
        unsigned long start_pfn, end_pfn, memmapsize, limit;
        unsigned long zones[MAX_NR_ZONES];
        unsigned long holes[MAX_NR_ZONES];

        start_pfn = node_start_pfn(nodeid);
        end_pfn = node_end_pfn(nodeid);

        Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n",
                nodeid, start_pfn, end_pfn);

        /* Try to allocate the mem_map at the end of the node to avoid
           filling up precious memory below 4GB. */
        memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
        limit = end_pfn << PAGE_SHIFT;
        NODE_DATA(nodeid)->node_mem_map =
                __alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
                                memmapsize, SMP_CACHE_BYTES,
                                round_down(limit - memmapsize, PAGE_SIZE),
                                limit);

        size_zones(zones, holes, start_pfn, end_pfn);
        free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
                            start_pfn, holes);
}

void __init numa_init_array(void)
{
        int rr, i;
        /* There are unfortunately some poorly designed mainboards around
           that only connect memory to a single CPU. This breaks the 1:1
           cpu->node mapping. To avoid this, fill in the mapping for all
           possible CPUs, since the number of CPUs is not known yet.
           We round-robin over the existing nodes. */
        rr = first_node(node_online_map);
        for (i = 0; i < NR_CPUS; i++) {
                if (cpu_to_node[i] != NUMA_NO_NODE)
                        continue;
                numa_set_node(i, rr);
                rr = next_node(rr, node_online_map);
                if (rr == MAX_NUMNODES)
                        rr = first_node(node_online_map);
        }
}

#ifdef CONFIG_NUMA_EMU
int numa_fake __initdata = 0;

/* Numa emulation */
static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
        int i;
        struct bootnode nodes[MAX_NUMNODES];
        unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;

        /* Kludge needed for the hash function: round the node size down to
           a power of two so that compute_hash_shift() can find a single
           shift that maps every fake node without collisions. */
        if (hweight64(sz) > 1) {
                unsigned long x = 1;
                while ((x << 1) < sz)
                        x <<= 1;
                if (x < sz/2)
                        printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
                sz = x;
        }

        memset(&nodes, 0, sizeof(nodes));
        for (i = 0; i < numa_fake; i++) {
                nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
                if (i == numa_fake-1)
                        sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
                nodes[i].end = nodes[i].start + sz;
                printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
                       i,
                       nodes[i].start, nodes[i].end,
                       (nodes[i].end - nodes[i].start) >> 20);
                node_set_online(i);
        }
        memnode_shift = compute_hash_shift(nodes, numa_fake);
        if (memnode_shift < 0) {
                memnode_shift = 0;
                printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
                return -1;
        }
        for_each_online_node(i)
                setup_node_bootmem(i, nodes[i].start, nodes[i].end);
        numa_init_array();
        return 0;
}
#endif

void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
        int i;

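        /* Try NUMA emulation first, then the ACPI SRAT, then the AMD K8
           northbridge information; fall back to a single dummy node
           covering all memory if none of them yields a configuration. */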
#ifdef CONFIG_NUMA_EMU
        if (numa_fake && !numa_emulation(start_pfn, end_pfn))
                return;
#endif

#ifdef CONFIG_ACPI_NUMA
        if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
                                          end_pfn << PAGE_SHIFT))
                return;
#endif

#ifdef CONFIG_K8_NUMA
        if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
                return;
#endif
        printk(KERN_INFO "%s\n",
               numa_off ? "NUMA turned off" : "No NUMA configuration found");

        printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
               start_pfn << PAGE_SHIFT,
               end_pfn << PAGE_SHIFT);
        /* setup dummy node covering all memory */
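        /* A shift of 63 makes every physical address index memnodemap[0],
           so all of memory hashes to node 0. */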
        memnode_shift = 63;
        memnodemap[0] = 0;
        nodes_clear(node_online_map);
        node_set_online(0);
        for (i = 0; i < NR_CPUS; i++)
                numa_set_node(i, 0);
        node_to_cpumask[0] = cpumask_of_cpu(0);
        setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
}

__cpuinit void numa_add_cpu(int cpu)
{
        set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
}

void __cpuinit numa_set_node(int cpu, int node)
{
        cpu_pda(cpu)->nodenumber = node;
        cpu_to_node[cpu] = node;
}

unsigned long __init numa_free_all_bootmem(void)
{
        int i;
        unsigned long pages = 0;
        for_each_online_node(i) {
                pages += free_all_bootmem_node(NODE_DATA(i));
        }
        return pages;
}

#ifdef CONFIG_SPARSEMEM
static void __init arch_sparse_init(void)
{
        int i;

        for_each_online_node(i)
                memory_present(i, node_start_pfn(i), node_end_pfn(i));

        sparse_init();
}
#else
#define arch_sparse_init() do {} while (0)
#endif

void __init paging_init(void)
{
        int i;

        arch_sparse_init();

        for_each_online_node(i) {
                setup_node_zones(i);
        }
}

/* numa=off | fake=<nodes> | noacpi | hotadd=<percent> */
__init int numa_setup(char *opt)
{
        if (!strncmp(opt, "off", 3))
                numa_off = 1;
#ifdef CONFIG_NUMA_EMU
        if (!strncmp(opt, "fake=", 5)) {
                numa_fake = simple_strtoul(opt+5, NULL, 0);
                if (numa_fake >= MAX_NUMNODES)
                        numa_fake = MAX_NUMNODES;
        }
#endif
#ifdef CONFIG_ACPI_NUMA
        if (!strncmp(opt, "noacpi", 6))
                acpi_numa = -1;
        if (!strncmp(opt, "hotadd=", 7))
                hotadd_percent = simple_strtoul(opt+7, NULL, 10);
#endif
        return 1;
}

/*
 * Set up early cpu_to_node.
 *
 * Populate cpu_to_node[] only if the x86_cpu_to_apicid[] and
 * apicid_to_node[] tables have valid entries for a CPU.
 * This means we skip cpu_to_node[] initialisation for NUMA emulation
 * and the fake-node case (when running a kernel compiled for NUMA on
 * a non-NUMA box), which is OK because cpu_to_node[] is already
 * initialized in a round-robin manner by numa_init_array() prior to
 * this call, and that initialization is good enough for the fake
 * NUMA cases.
 */
void __init init_cpu_to_node(void)
{
        int i;
        for (i = 0; i < NR_CPUS; i++) {
                u8 apicid = x86_cpu_to_apicid[i];
                if (apicid == BAD_APICID)
                        continue;
                if (apicid_to_node[apicid] == NUMA_NO_NODE)
                        continue;
                numa_set_node(i, apicid_to_node[apicid]);
        }
}

EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode);
EXPORT_SYMBOL(node_data);

#ifdef CONFIG_DISCONTIGMEM
/*
 * Functions to convert PFNs from/to per-node page addresses.
 * These are out of line because they are quite big.
 * They could all be tuned further by pre-caching more state;
 * that remains to be done.
 */

int pfn_valid(unsigned long pfn)
{
        unsigned nid;
        if (pfn >= num_physpages)
                return 0;
        nid = pfn_to_nid(pfn);
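        /* 0xff marks a hole in memnodemap, i.e. no node owns this pfn */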
        if (nid == 0xff)
                return 0;
        return pfn >= node_start_pfn(nid) && pfn < node_end_pfn(nid);
}
EXPORT_SYMBOL(pfn_valid);
#endif