blob: 629ff0621b3d07709617c06bd838d10fd2be53b6 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/init.h>
9#include <linux/bootmem.h>
10#include <linux/mmzone.h>
11#include <linux/ctype.h>
12#include <linux/module.h>
13#include <linux/nodemask.h>
14
15#include <asm/e820.h>
16#include <asm/proto.h>
17#include <asm/dma.h>
18#include <asm/numa.h>
19#include <asm/acpi.h>
20
21#ifndef Dprintk
22#define Dprintk(x...)
23#endif
24
Ravikiran G Thirumalai6c231b72005-09-06 15:17:45 -070025struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -070026bootmem_data_t plat_node_bdata[MAX_NUMNODES];
27
28int memnode_shift;
29u8 memnodemap[NODEMAPSIZE];
30
Andi Kleen3f098c22005-09-12 18:49:24 +020031unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
32 [0 ... NR_CPUS-1] = NUMA_NO_NODE
Andi Kleen0b07e982005-09-12 18:49:24 +020033};
Andi Kleen3f098c22005-09-12 18:49:24 +020034unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
35 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
36};
37cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -070038
39int numa_off __initdata;
40
Eric Dumazet529a3402005-11-05 17:25:54 +010041
42/*
43 * Given a shift value, try to populate memnodemap[]
44 * Returns :
45 * 1 if OK
46 * 0 if memnodmap[] too small (of shift too small)
47 * -1 if node overlap or lost ram (shift too big)
48 */
49static int __init populate_memnodemap(
50 const struct node *nodes, int numnodes, int shift)
Linus Torvalds1da177e2005-04-16 15:20:36 -070051{
52 int i;
Eric Dumazet529a3402005-11-05 17:25:54 +010053 int res = -1;
54 unsigned long addr, end;
Keith Manntheyb6846642005-07-28 21:15:38 -070055
Eric Dumazet529a3402005-11-05 17:25:54 +010056 memset(memnodemap, 0xff, sizeof(memnodemap));
57 for (i = 0; i < numnodes; i++) {
58 addr = nodes[i].start;
59 end = nodes[i].end;
60 if (addr >= end)
61 continue;
62 if ((end >> shift) >= NODEMAPSIZE)
63 return 0;
64 do {
65 if (memnodemap[addr >> shift] != 0xff)
66 return -1;
67 memnodemap[addr >> shift] = i;
68 addr += (1 << shift);
69 } while (addr < end);
70 res = 1;
71 }
72 return res;
73}
74
75int __init compute_hash_shift(struct node *nodes, int numnodes)
76{
77 int shift = 20;
78
79 while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
Keith Manntheyb6846642005-07-28 21:15:38 -070080 shift++;
81
Eric Dumazet529a3402005-11-05 17:25:54 +010082 printk(KERN_DEBUG "Using %d for the hash shift.\n",
83 shift);
84
85 if (populate_memnodemap(nodes, numnodes, shift) != 1) {
86 printk(KERN_INFO
Keith Manntheyb6846642005-07-28 21:15:38 -070087 "Your memory is not aligned you need to rebuild your kernel "
Eric Dumazet529a3402005-11-05 17:25:54 +010088 "with a bigger NODEMAPSIZE shift=%d\n",
89 shift);
90 return -1;
91 }
Keith Manntheyb6846642005-07-28 21:15:38 -070092 return shift;
Linus Torvalds1da177e2005-04-16 15:20:36 -070093}
94
#ifdef CONFIG_SPARSEMEM
/*
 * Return the node id for a page frame number: the pfn is shifted left by
 * PAGE_SHIFT and the result is handed to phys_to_nid().
 */
int early_pfn_to_nid(unsigned long pfn)
{
	unsigned long phys = pfn << PAGE_SHIFT;

	return phys_to_nid(phys);
}
#endif
101
Linus Torvalds1da177e2005-04-16 15:20:36 -0700102/* Initialize bootmem allocator for a node */
103void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
104{
105 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
106 unsigned long nodedata_phys;
107 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
108
109 start = round_up(start, ZONE_ALIGN);
110
111 printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
112
113 start_pfn = start >> PAGE_SHIFT;
114 end_pfn = end >> PAGE_SHIFT;
115
Matt Tolentinobbfceef2005-06-23 00:08:07 -0700116 memory_present(nodeid, start_pfn, end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700117 nodedata_phys = find_e820_area(start, end, pgdat_size);
118 if (nodedata_phys == -1L)
119 panic("Cannot find memory pgdat in node %d\n", nodeid);
120
121 Dprintk("nodedata_phys %lx\n", nodedata_phys);
122
123 node_data[nodeid] = phys_to_virt(nodedata_phys);
124 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
125 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
126 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
127 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
128
129 /* Find a place for the bootmem map */
130 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
131 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
132 bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
133 if (bootmap_start == -1L)
134 panic("Not enough continuous space for bootmap on node %d", nodeid);
135 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
136
137 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
138 bootmap_start >> PAGE_SHIFT,
139 start_pfn, end_pfn);
140
141 e820_bootmem_free(NODE_DATA(nodeid), start, end);
142
143 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
144 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
145 node_set_online(nodeid);
146}
147
148/* Initialize final allocator for a zone */
149void __init setup_node_zones(int nodeid)
150{
151 unsigned long start_pfn, end_pfn;
152 unsigned long zones[MAX_NR_ZONES];
Andi Kleen485761b2005-08-26 18:34:10 -0700153 unsigned long holes[MAX_NR_ZONES];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700154
Andi Kleena2f1b422005-11-05 17:25:53 +0100155 start_pfn = node_start_pfn(nodeid);
156 end_pfn = node_end_pfn(nodeid);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700157
Andi Kleena2f1b422005-11-05 17:25:53 +0100158 Dprintk(KERN_INFO "setting up node %d %lx-%lx\n",
159 nodeid, start_pfn, end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700160
Andi Kleena2f1b422005-11-05 17:25:53 +0100161 size_zones(zones, holes, start_pfn, end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700162 free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
Andi Kleen485761b2005-08-26 18:34:10 -0700163 start_pfn, holes);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700164}
165
166void __init numa_init_array(void)
167{
168 int rr, i;
169 /* There are unfortunately some poorly designed mainboards around
170 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
171 mapping. To avoid this fill in the mapping for all possible
172 CPUs, as the number of CPUs is not known yet.
173 We round robin the existing nodes. */
Ravikiran G Thirumalai85cc5132005-09-30 11:59:22 -0700174 rr = first_node(node_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700175 for (i = 0; i < NR_CPUS; i++) {
176 if (cpu_to_node[i] != NUMA_NO_NODE)
177 continue;
Andi Kleen69d81fc2005-11-05 17:25:53 +0100178 numa_set_node(i, rr);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700179 rr = next_node(rr, node_online_map);
180 if (rr == MAX_NUMNODES)
181 rr = first_node(node_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700182 }
183
Linus Torvalds1da177e2005-04-16 15:20:36 -0700184}
185
#ifdef CONFIG_NUMA_EMU
/* Number of emulated nodes requested with "fake=" (see numa_setup()). */
int numa_fake __initdata = 0;

/*
 * Numa emulation: split [start_pfn, end_pfn) into numa_fake nodes.
 *
 * The caller must make sure numa_fake is non-zero; it is used as a divisor.
 *
 * Marked __init because it reads numa_fake (__initdata) and calls
 * compute_hash_shift(), setup_node_bootmem() and numa_init_array(), all of
 * which are __init themselves.
 *
 * Returns 0 on success, -1 if no hash shift could be found for the layout.
 */
static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;
	struct node nodes[MAX_NUMNODES];
	unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;

	/*
	 * Kludge needed for the hash function: when the per-node size is
	 * not a power of two, shrink it to a power of two below it.
	 */
	if (hweight64(sz) > 1) {
		unsigned long x = 1;
		while ((x << 1) < sz)
			x <<= 1;
		if (x < sz/2)
			printk("Numa emulation unbalanced. Complain to maintainer\n");
		sz = x;
	}

	memset(&nodes,0,sizeof(nodes));
	for (i = 0; i < numa_fake; i++) {
		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
		/* The last node takes whatever is left up to end_pfn. */
		if (i == numa_fake-1)
			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
		nodes[i].end = nodes[i].start + sz;
		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
		       i,
		       nodes[i].start, nodes[i].end,
		       (nodes[i].end - nodes[i].start) >> 20);
		node_set_online(i);
	}
	memnode_shift = compute_hash_shift(nodes, numa_fake);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
		return -1;
	}
	for_each_online_node(i)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	numa_init_array();
	return 0;
}
#endif
230
231void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
232{
233 int i;
234
235#ifdef CONFIG_NUMA_EMU
236 if (numa_fake && !numa_emulation(start_pfn, end_pfn))
237 return;
238#endif
239
240#ifdef CONFIG_ACPI_NUMA
241 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
242 end_pfn << PAGE_SHIFT))
243 return;
244#endif
245
246#ifdef CONFIG_K8_NUMA
247 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
248 return;
249#endif
250 printk(KERN_INFO "%s\n",
251 numa_off ? "NUMA turned off" : "No NUMA configuration found");
252
253 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
254 start_pfn << PAGE_SHIFT,
255 end_pfn << PAGE_SHIFT);
256 /* setup dummy node covering all memory */
257 memnode_shift = 63;
258 memnodemap[0] = 0;
259 nodes_clear(node_online_map);
260 node_set_online(0);
261 for (i = 0; i < NR_CPUS; i++)
Andi Kleen69d81fc2005-11-05 17:25:53 +0100262 numa_set_node(i, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700263 node_to_cpumask[0] = cpumask_of_cpu(0);
264 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
265}
266
Ashok Raje6982c62005-06-25 14:54:58 -0700267__cpuinit void numa_add_cpu(int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700268{
Ravikiran G Thirumalaie6a045a2005-09-30 11:59:21 -0700269 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700270}
271
Andi Kleen69d81fc2005-11-05 17:25:53 +0100272void __cpuinit numa_set_node(int cpu, int node)
273{
274 cpu_pda[cpu].nodenumber = node;
275 cpu_to_node[cpu] = node;
276}
277
Linus Torvalds1da177e2005-04-16 15:20:36 -0700278unsigned long __init numa_free_all_bootmem(void)
279{
280 int i;
281 unsigned long pages = 0;
282 for_each_online_node(i) {
283 pages += free_all_bootmem_node(NODE_DATA(i));
284 }
285 return pages;
286}
287
288void __init paging_init(void)
289{
290 int i;
291 for_each_online_node(i) {
292 setup_node_zones(i);
293 }
294}
295
296/* [numa=off] */
297__init int numa_setup(char *opt)
298{
299 if (!strncmp(opt,"off",3))
300 numa_off = 1;
301#ifdef CONFIG_NUMA_EMU
302 if(!strncmp(opt, "fake=", 5)) {
303 numa_fake = simple_strtoul(opt+5,NULL,0); ;
304 if (numa_fake >= MAX_NUMNODES)
305 numa_fake = MAX_NUMNODES;
306 }
307#endif
308#ifdef CONFIG_ACPI_NUMA
309 if (!strncmp(opt,"noacpi",6))
310 acpi_numa = -1;
311#endif
312 return 1;
313}
314
/* File-scope NUMA tables defined in this file and exported via EXPORT_SYMBOL. */
EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode_shift);
EXPORT_SYMBOL(memnodemap);
EXPORT_SYMBOL(node_data);