blob: a828a01739cc791015d1bd02e8e04a555bdf288f [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/init.h>
9#include <linux/bootmem.h>
10#include <linux/mmzone.h>
11#include <linux/ctype.h>
12#include <linux/module.h>
13#include <linux/nodemask.h>
14
15#include <asm/e820.h>
16#include <asm/proto.h>
17#include <asm/dma.h>
18#include <asm/numa.h>
19#include <asm/acpi.h>
20
21#ifndef Dprintk
22#define Dprintk(x...)
23#endif
24
Ravikiran G Thirumalai6c231b72005-09-06 15:17:45 -070025struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -070026bootmem_data_t plat_node_bdata[MAX_NUMNODES];
27
28int memnode_shift;
29u8 memnodemap[NODEMAPSIZE];
30
Andi Kleen3f098c22005-09-12 18:49:24 +020031unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
32 [0 ... NR_CPUS-1] = NUMA_NO_NODE
Andi Kleen0b07e982005-09-12 18:49:24 +020033};
Andi Kleen3f098c22005-09-12 18:49:24 +020034unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
35 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
36};
37cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -070038
39int numa_off __initdata;
40
Eric Dumazet529a3402005-11-05 17:25:54 +010041
42/*
43 * Given a shift value, try to populate memnodemap[]
44 * Returns :
45 * 1 if OK
46 * 0 if memnodmap[] too small (of shift too small)
47 * -1 if node overlap or lost ram (shift too big)
48 */
49static int __init populate_memnodemap(
50 const struct node *nodes, int numnodes, int shift)
Linus Torvalds1da177e2005-04-16 15:20:36 -070051{
52 int i;
Eric Dumazet529a3402005-11-05 17:25:54 +010053 int res = -1;
54 unsigned long addr, end;
Keith Manntheyb6846642005-07-28 21:15:38 -070055
Eric Dumazet529a3402005-11-05 17:25:54 +010056 memset(memnodemap, 0xff, sizeof(memnodemap));
57 for (i = 0; i < numnodes; i++) {
58 addr = nodes[i].start;
59 end = nodes[i].end;
60 if (addr >= end)
61 continue;
62 if ((end >> shift) >= NODEMAPSIZE)
63 return 0;
64 do {
65 if (memnodemap[addr >> shift] != 0xff)
66 return -1;
67 memnodemap[addr >> shift] = i;
68 addr += (1 << shift);
69 } while (addr < end);
70 res = 1;
71 }
72 return res;
73}
74
75int __init compute_hash_shift(struct node *nodes, int numnodes)
76{
77 int shift = 20;
78
79 while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
Keith Manntheyb6846642005-07-28 21:15:38 -070080 shift++;
81
Eric Dumazet529a3402005-11-05 17:25:54 +010082 printk(KERN_DEBUG "Using %d for the hash shift.\n",
83 shift);
84
85 if (populate_memnodemap(nodes, numnodes, shift) != 1) {
86 printk(KERN_INFO
Keith Manntheyb6846642005-07-28 21:15:38 -070087 "Your memory is not aligned you need to rebuild your kernel "
Eric Dumazet529a3402005-11-05 17:25:54 +010088 "with a bigger NODEMAPSIZE shift=%d\n",
89 shift);
90 return -1;
91 }
Keith Manntheyb6846642005-07-28 21:15:38 -070092 return shift;
Linus Torvalds1da177e2005-04-16 15:20:36 -070093}
94
#ifdef CONFIG_SPARSEMEM
/* Look up the node of a page frame number through its physical address. */
int early_pfn_to_nid(unsigned long pfn)
{
	unsigned long phys = pfn << PAGE_SHIFT;

	return phys_to_nid(phys);
}
#endif
101
Linus Torvalds1da177e2005-04-16 15:20:36 -0700102/* Initialize bootmem allocator for a node */
103void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
104{
105 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
106 unsigned long nodedata_phys;
107 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
108
109 start = round_up(start, ZONE_ALIGN);
110
111 printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
112
113 start_pfn = start >> PAGE_SHIFT;
114 end_pfn = end >> PAGE_SHIFT;
115
Linus Torvalds1da177e2005-04-16 15:20:36 -0700116 nodedata_phys = find_e820_area(start, end, pgdat_size);
117 if (nodedata_phys == -1L)
118 panic("Cannot find memory pgdat in node %d\n", nodeid);
119
120 Dprintk("nodedata_phys %lx\n", nodedata_phys);
121
122 node_data[nodeid] = phys_to_virt(nodedata_phys);
123 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
124 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
125 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
126 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
127
128 /* Find a place for the bootmem map */
129 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
130 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
131 bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
132 if (bootmap_start == -1L)
133 panic("Not enough continuous space for bootmap on node %d", nodeid);
134 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
135
136 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
137 bootmap_start >> PAGE_SHIFT,
138 start_pfn, end_pfn);
139
140 e820_bootmem_free(NODE_DATA(nodeid), start, end);
141
142 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
143 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
144 node_set_online(nodeid);
145}
146
147/* Initialize final allocator for a zone */
148void __init setup_node_zones(int nodeid)
149{
150 unsigned long start_pfn, end_pfn;
151 unsigned long zones[MAX_NR_ZONES];
Andi Kleen485761b2005-08-26 18:34:10 -0700152 unsigned long holes[MAX_NR_ZONES];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700153
Andi Kleena2f1b422005-11-05 17:25:53 +0100154 start_pfn = node_start_pfn(nodeid);
155 end_pfn = node_end_pfn(nodeid);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700156
Andi Kleena2f1b422005-11-05 17:25:53 +0100157 Dprintk(KERN_INFO "setting up node %d %lx-%lx\n",
158 nodeid, start_pfn, end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700159
Andi Kleena2f1b422005-11-05 17:25:53 +0100160 size_zones(zones, holes, start_pfn, end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700161 free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
Andi Kleen485761b2005-08-26 18:34:10 -0700162 start_pfn, holes);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700163}
164
165void __init numa_init_array(void)
166{
167 int rr, i;
168 /* There are unfortunately some poorly designed mainboards around
169 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
170 mapping. To avoid this fill in the mapping for all possible
171 CPUs, as the number of CPUs is not known yet.
172 We round robin the existing nodes. */
Ravikiran G Thirumalai85cc5132005-09-30 11:59:22 -0700173 rr = first_node(node_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700174 for (i = 0; i < NR_CPUS; i++) {
175 if (cpu_to_node[i] != NUMA_NO_NODE)
176 continue;
Andi Kleen69d81fc2005-11-05 17:25:53 +0100177 numa_set_node(i, rr);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700178 rr = next_node(rr, node_online_map);
179 if (rr == MAX_NUMNODES)
180 rr = first_node(node_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700181 }
182
Linus Torvalds1da177e2005-04-16 15:20:36 -0700183}
184
#ifdef CONFIG_NUMA_EMU
/* Number of emulated nodes requested; 0 leaves emulation off. */
int numa_fake __initdata = 0;

/*
 * Numa emulation: split [start_pfn, end_pfn) into numa_fake fake nodes.
 *
 * The per-node size is reduced to a power of two for the benefit of the
 * hash function; the last node absorbs whatever is left up to end_pfn.
 * Must only be called with numa_fake != 0 (it is used as a divisor).
 *
 * Returns 0 on success, -1 if no hash shift could be computed.
 *
 * Marked __init because it reads __initdata and calls __init functions and
 * is itself only called from __init code.
 */
static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;
	struct node nodes[MAX_NUMNODES];
	unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;

	/* Kludge needed for the hash function */
	if (hweight64(sz) > 1) {
		/* More than one bit set: fall back to a power of two below sz */
		unsigned long x = 1;
		while ((x << 1) < sz)
			x <<= 1;
		if (x < sz/2)
			printk("Numa emulation unbalanced. Complain to maintainer\n");
		sz = x;
	}

	memset(&nodes,0,sizeof(nodes));
	for (i = 0; i < numa_fake; i++) {
		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
		/* The last node takes everything that remains */
		if (i == numa_fake-1)
			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
		nodes[i].end = nodes[i].start + sz;
		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
		       i,
		       nodes[i].start, nodes[i].end,
		       (nodes[i].end - nodes[i].start) >> 20);
		node_set_online(i);
	}
	memnode_shift = compute_hash_shift(nodes, numa_fake);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
		return -1;
	}
	for_each_online_node(i)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	numa_init_array();
	return 0;
}
#endif
229
230void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
231{
232 int i;
233
234#ifdef CONFIG_NUMA_EMU
235 if (numa_fake && !numa_emulation(start_pfn, end_pfn))
236 return;
237#endif
238
239#ifdef CONFIG_ACPI_NUMA
240 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
241 end_pfn << PAGE_SHIFT))
242 return;
243#endif
244
245#ifdef CONFIG_K8_NUMA
246 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
247 return;
248#endif
249 printk(KERN_INFO "%s\n",
250 numa_off ? "NUMA turned off" : "No NUMA configuration found");
251
252 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
253 start_pfn << PAGE_SHIFT,
254 end_pfn << PAGE_SHIFT);
255 /* setup dummy node covering all memory */
256 memnode_shift = 63;
257 memnodemap[0] = 0;
258 nodes_clear(node_online_map);
259 node_set_online(0);
260 for (i = 0; i < NR_CPUS; i++)
Andi Kleen69d81fc2005-11-05 17:25:53 +0100261 numa_set_node(i, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700262 node_to_cpumask[0] = cpumask_of_cpu(0);
263 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
264}
265
Ashok Raje6982c62005-06-25 14:54:58 -0700266__cpuinit void numa_add_cpu(int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700267{
Ravikiran G Thirumalaie6a045a2005-09-30 11:59:21 -0700268 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700269}
270
Andi Kleen69d81fc2005-11-05 17:25:53 +0100271void __cpuinit numa_set_node(int cpu, int node)
272{
273 cpu_pda[cpu].nodenumber = node;
274 cpu_to_node[cpu] = node;
275}
276
Linus Torvalds1da177e2005-04-16 15:20:36 -0700277unsigned long __init numa_free_all_bootmem(void)
278{
279 int i;
280 unsigned long pages = 0;
281 for_each_online_node(i) {
282 pages += free_all_bootmem_node(NODE_DATA(i));
283 }
284 return pages;
285}
286
#ifdef CONFIG_SPARSEMEM
/*
 * Report the pfn range of every online node through memory_present(),
 * then call sparse_init().
 */
static void __init arch_sparse_init(void)
{
	int nid;

	for_each_online_node(nid) {
		memory_present(nid, node_start_pfn(nid), node_end_pfn(nid));
	}

	sparse_init();
}
#else
/* Nothing to do without CONFIG_SPARSEMEM. */
#define arch_sparse_init() do {} while (0)
#endif
300
Linus Torvalds1da177e2005-04-16 15:20:36 -0700301void __init paging_init(void)
302{
303 int i;
Bob Piccod3ee8712005-11-05 17:25:54 +0100304
305 arch_sparse_init();
306
Linus Torvalds1da177e2005-04-16 15:20:36 -0700307 for_each_online_node(i) {
308 setup_node_zones(i);
309 }
310}
311
312/* [numa=off] */
313__init int numa_setup(char *opt)
314{
315 if (!strncmp(opt,"off",3))
316 numa_off = 1;
317#ifdef CONFIG_NUMA_EMU
318 if(!strncmp(opt, "fake=", 5)) {
319 numa_fake = simple_strtoul(opt+5,NULL,0); ;
320 if (numa_fake >= MAX_NUMNODES)
321 numa_fake = MAX_NUMNODES;
322 }
323#endif
324#ifdef CONFIG_ACPI_NUMA
325 if (!strncmp(opt,"noacpi",6))
326 acpi_numa = -1;
327#endif
328 return 1;
329}
330
/* NUMA lookup tables exported for use by modules. */
EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode_shift);
EXPORT_SYMBOL(memnodemap);
EXPORT_SYMBOL(node_data);