/*
 * ACPI 3.0 based NUMA setup
 * Copyright 2004 Andi Kleen, SuSE Labs.
 *
 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
 *
 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
 * Assumes all memory regions belonging to a single proximity domain
 * are in one chunk. Holes between them will be included in the node.
 */

#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/mmzone.h>
#include <linux/bitmap.h>
#include <linux/module.h>
#include <linux/topology.h>
#include <linux/bootmem.h>
#include <linux/mm.h>
#include <asm/proto.h>
#include <asm/numa.h>
#include <asm/e820.h>

#if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \
        defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \
                && !defined(CONFIG_MEMORY_HOTPLUG)
#define RESERVE_HOTADD 1
#endif

static struct acpi_table_slit *acpi_slit;

static nodemask_t nodes_parsed __initdata;
static nodemask_t nodes_found __initdata;
static struct bootnode nodes[MAX_NUMNODES] __initdata;
static struct bootnode nodes_add[MAX_NUMNODES] __initdata;
static int found_add_area __initdata;
int hotadd_percent __initdata = 10;
static u8 pxm2node[256] = { [0 ... 255] = 0xff };

/* Nodes that are too small confuse the VM badly. They usually
   result from BIOS bugs. */
#define NODE_MIN_SIZE (4*1024*1024)

static int node_to_pxm(int n);

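/* Map an ACPI proximity domain (PXM) to its Linux node number, or return -1
   if the PXM has not been assigned a node yet. */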
int pxm_to_node(int pxm)
{
        if ((unsigned)pxm >= 256)
                return -1;
        /* Extend 0xff to (int)-1 */
        return (signed char)pxm2node[pxm];
}

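/* Allocate a Linux node number for a PXM, reusing an existing mapping if
   there is one. Returns -1 once MAX_NUMNODES nodes are in use. */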
static __init int setup_node(int pxm)
{
        unsigned node = pxm2node[pxm];
        if (node == 0xff) {
                if (nodes_weight(nodes_found) >= MAX_NUMNODES)
                        return -1;
                node = first_unset_node(nodes_found);
                node_set(node, nodes_found);
                pxm2node[pxm] = node;
        }
        return pxm2node[pxm];
}

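/* Check whether [start, end) overlaps a node parsed so far. Returns the
   number of the conflicting node, or -1 if there is no overlap. */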
static __init int conflicting_nodes(unsigned long start, unsigned long end)
{
        int i;
        for_each_node_mask(i, nodes_parsed) {
                struct bootnode *nd = &nodes[i];
                if (nd->start == nd->end)
                        continue;
                if (nd->end > start && nd->start < end)
                        return i;
                if (nd->end == end && nd->start == start)
                        return i;
        }
        return -1;
}

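/* Clamp node i's memory range to [start, end). Becomes a no-op once a
   hot-add area has been found. */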
static __init void cutoff_node(int i, unsigned long start, unsigned long end)
{
        struct bootnode *nd = &nodes[i];

        if (found_add_area)
                return;

        if (nd->start < start) {
                nd->start = start;
                if (nd->end < nd->start)
                        nd->start = nd->end;
        }
        if (nd->end > end) {
                nd->end = end;
                if (nd->start > nd->end)
                        nd->start = nd->end;
        }
}

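/* Discard the SRAT and clear all state derived from it. */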
static __init void bad_srat(void)
{
        int i;
        printk(KERN_ERR "SRAT: SRAT not used.\n");
        acpi_numa = -1;
        for (i = 0; i < MAX_LOCAL_APIC; i++)
                apicid_to_node[i] = NUMA_NO_NODE;
        for (i = 0; i < MAX_NUMNODES; i++)
                nodes_add[i].start = nodes_add[i].end = 0;
}

static __init inline int srat_disabled(void)
{
        return numa_off || acpi_numa < 0;
}

/*
 * A lot of BIOSes fill in 10 (= no distance) everywhere. This messes
 * up the NUMA heuristics, which want the local node to have a smaller
 * distance than the others.
 * Do some quick checks here and only use the SLIT if it passes.
 */
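/*
 * For example, a sane two-node SLIT looks like this (10 = local):
 *
 *                node 0  node 1
 *        node 0    10      20
 *        node 1    20      10
 *
 * A table that reports 10 (or less) for remote nodes fails the check.
 */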
static __init int slit_valid(struct acpi_table_slit *slit)
{
        int i, j;
        int d = slit->localities;
        for (i = 0; i < d; i++) {
                for (j = 0; j < d; j++) {
                        u8 val = slit->entry[d*i + j];
                        if (i == j) {
                                if (val != 10)
                                        return 0;
                        } else if (val <= 10)
                                return 0;
                }
        }
        return 1;
}

/* Callback for SLIT parsing */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
        if (!slit_valid(slit)) {
                printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
                return;
        }
        acpi_slit = slit;
}

/* Callback for Proximity Domain -> LAPIC mapping */
void __init
acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
{
        int pxm, node;
        if (srat_disabled())
                return;
        if (pa->header.length != sizeof(struct acpi_table_processor_affinity)) {
                bad_srat();
                return;
        }
        if (pa->flags.enabled == 0)
                return;
        pxm = pa->proximity_domain;
        node = setup_node(pxm);
        if (node < 0) {
                printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
                bad_srat();
                return;
        }
        apicid_to_node[pa->apic_id] = node;
        acpi_numa = 1;
        printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
               pxm, pa->apic_id, node);
}

#ifdef RESERVE_HOTADD
/*
 * Protect against too large hotadd areas that would fill up memory.
 */
static int hotadd_enough_memory(struct bootnode *nd)
{
        static unsigned long allocated;
        static unsigned long last_area_end;
        unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
        long mem = pages * sizeof(struct page);
        unsigned long addr;
        unsigned long allowed;
        unsigned long oldpages = pages;

        if (mem < 0)
                return 0;
        allowed = (end_pfn - e820_hole_size(0, end_pfn)) * PAGE_SIZE;
        allowed = (allowed / 100) * hotadd_percent;
        if (allocated + mem > allowed) {
                /* Give them at least part of their hotadd memory, up to
                   hotadd_percent. It would be better to spread the limit
                   out over multiple hotplug areas, but that is too
                   complicated right now. */
                if (allocated >= allowed)
                        return 0;
                pages = (allowed - allocated + mem) / sizeof(struct page);
                mem = pages * sizeof(struct page);
                nd->end = nd->start + pages*PAGE_SIZE;
        }
        /* Not completely foolproof, but a good sanity check */
        addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
        if (addr == -1UL)
                return 0;
        if (pages != oldpages)
                printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
                        pages << PAGE_SHIFT);
        last_area_end = addr + mem;
        allocated += mem;
        return 1;
}

/*
 * It is fine to add this area to the nodes data; it will be used later.
 * This code supports one contiguous hot add area per node.
 */
static int reserve_hotadd(int node, unsigned long start, unsigned long end)
{
        unsigned long s_pfn = start >> PAGE_SHIFT;
        unsigned long e_pfn = end >> PAGE_SHIFT;
        int changed = 0;
        struct bootnode *nd = &nodes_add[node];

        /* I had some trouble with strange memory hotadd regions breaking
           the boot. Be very strict here and reject anything unexpected.
           If you want working memory hotadd, write correct SRATs.

           The node size check is a basic sanity check to guard against
           mistakes. */
        if ((signed long)(end - start) < NODE_MIN_SIZE) {
                printk(KERN_ERR "SRAT: Hotplug area too small\n");
                return -1;
        }

        /* This check might be a bit too strict, but I'm keeping it for now. */
        if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) {
                printk(KERN_ERR "SRAT: Hotplug area has existing memory\n");
                return -1;
        }

        if (!hotadd_enough_memory(&nodes_add[node])) {
                printk(KERN_ERR "SRAT: Hotplug area too large\n");
                return -1;
        }

        /* Looks good */

        found_add_area = 1;
        if (nd->start == nd->end) {
                nd->start = start;
                nd->end = end;
                changed = 1;
        } else {
                if (nd->start == end) {
                        nd->start = start;
                        changed = 1;
                }
                if (nd->end == start) {
                        nd->end = end;
                        changed = 1;
                }
                if (!changed)
                        printk(KERN_ERR "SRAT: Hotplug zone not contiguous. Partly ignored\n");
        }

        if ((nd->end >> PAGE_SHIFT) > end_pfn)
                end_pfn = nd->end >> PAGE_SHIFT;

        if (changed)
                printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
        return 0;
}
#endif

/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
{
        struct bootnode *nd, oldnode;
        unsigned long start, end;
        int node, pxm;
        int i;

        if (srat_disabled())
                return;
        if (ma->header.length != sizeof(struct acpi_table_memory_affinity)) {
                bad_srat();
                return;
        }
        if (ma->flags.enabled == 0)
                return;
        if (ma->flags.hot_pluggable && hotadd_percent == 0)
                return;
        start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
        end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
        pxm = ma->proximity_domain;
        node = setup_node(pxm);
        if (node < 0) {
                printk(KERN_ERR "SRAT: Too many proximity domains.\n");
                bad_srat();
                return;
        }
        i = conflicting_nodes(start, end);
        if (i == node) {
                printk(KERN_WARNING
                "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
                        pxm, start, end, nodes[i].start, nodes[i].end);
        } else if (i >= 0) {
                printk(KERN_ERR
                       "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
                       pxm, start, end, node_to_pxm(i),
                        nodes[i].start, nodes[i].end);
                bad_srat();
                return;
        }
        nd = &nodes[node];
        oldnode = *nd;
        if (!node_test_and_set(node, nodes_parsed)) {
                nd->start = start;
                nd->end = end;
        } else {
                if (start < nd->start)
                        nd->start = start;
                if (nd->end < end)
                        nd->end = end;
        }

        printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
                nd->start, nd->end);

#ifdef RESERVE_HOTADD
        if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) {
                /* Ignore hotadd region. Undo damage */
                printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
                *nd = oldnode;
                if ((nd->start | nd->end) == 0)
                        node_clear(node, nodes_parsed);
        }
#endif
}

/* Sanity check to catch more bad SRATs (they are amazingly common).
   Make sure the PXMs cover all memory. */
static int nodes_cover_memory(void)
{
        int i;
        unsigned long pxmram, e820ram;

        pxmram = 0;
        for_each_node_mask(i, nodes_parsed) {
                unsigned long s = nodes[i].start >> PAGE_SHIFT;
                unsigned long e = nodes[i].end >> PAGE_SHIFT;
                pxmram += e - s;
                pxmram -= e820_hole_size(s, e);
                pxmram -= (nodes_add[i].end - nodes_add[i].start) >> PAGE_SHIFT;
                if ((long)pxmram < 0)
                        pxmram = 0;
        }

        e820ram = end_pfn - e820_hole_size(0, end_pfn);
        /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
        if ((long)(e820ram - pxmram) >= 1*1024*1024) {
                printk(KERN_ERR
"SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
                        (pxmram << PAGE_SHIFT) >> 20,
                        (e820ram << PAGE_SHIFT) >> 20);
                return 0;
        }
        return 1;
}

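/* Remove a node from the parsed set and clear its APIC-to-node mappings. */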
static void unparse_node(int node)
{
        int i;
        node_clear(node, nodes_parsed);
        for (i = 0; i < MAX_LOCAL_APIC; i++) {
                if (apicid_to_node[i] == node)
                        apicid_to_node[i] = NUMA_NO_NODE;
        }
}

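/* No arch-specific fixups are needed here. */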
void __init acpi_numa_arch_fixup(void) {}

/* Use the information discovered above to actually set up the nodes. */
int __init acpi_scan_nodes(unsigned long start, unsigned long end)
{
        int i;

        /* First clean up the node list */
        for (i = 0; i < MAX_NUMNODES; i++) {
                cutoff_node(i, start, end);
                if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE)
                        unparse_node(i);
        }

        if (acpi_numa <= 0)
                return -1;

        if (!nodes_cover_memory()) {
                bad_srat();
                return -1;
        }

        memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
        if (memnode_shift < 0) {
                printk(KERN_ERR
                     "SRAT: No NUMA node hash function found. Contact maintainer\n");
                bad_srat();
                return -1;
        }

        /* Finally register nodes */
        for_each_node_mask(i, nodes_parsed)
                setup_node_bootmem(i, nodes[i].start, nodes[i].end);
        /* Try again in case setup_node_bootmem missed one due
           to missing bootmem */
        for_each_node_mask(i, nodes_parsed)
                if (!node_online(i))
                        setup_node_bootmem(i, nodes[i].start, nodes[i].end);

        for (i = 0; i < NR_CPUS; i++) {
                if (cpu_to_node[i] == NUMA_NO_NODE)
                        continue;
                if (!node_isset(cpu_to_node[i], nodes_parsed))
                        numa_set_node(i, NUMA_NO_NODE);
        }
        numa_init_array();
        return 0;
}

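/* Reverse lookup: return the PXM that maps to node n, or 0 if none is found. */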
static int node_to_pxm(int n)
{
        int i;
        if (pxm2node[n] == n)
                return n;
        for (i = 0; i < 256; i++)
                if (pxm2node[i] == n)
                        return i;
        return 0;
}

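/* Reserve a node's hot-add memory area in the bootmem allocator and report
   how much memory the pre-allocation costs. */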
void __init srat_reserve_add_area(int nodeid)
{
        if (found_add_area && nodes_add[nodeid].end) {
                u64 total_mb;

                printk(KERN_INFO "SRAT: Reserving hot-add memory space "
                                "for node %d at %Lx-%Lx\n",
                        nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
                total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
                                        >> PAGE_SHIFT;
                total_mb *= sizeof(struct page);
                total_mb >>= 20;
                printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
                                "pre-allocated memory.\n", (unsigned long long)total_mb);
                reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
                               nodes_add[nodeid].end - nodes_add[nodeid].start);
        }
}

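/* Return the SLIT distance between two nodes. Without a valid SLIT, fall
   back to 10 (local) and 20 (remote). */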
int __node_distance(int a, int b)
{
        int index;

        if (!acpi_slit)
                return a == b ? 10 : 20;
        index = acpi_slit->localities * node_to_pxm(a);
        return acpi_slit->entry[index + node_to_pxm(b)];
}

EXPORT_SYMBOL(__node_distance);