/*
 * ACPI 3.0 based NUMA setup
 * Copyright 2004 Andi Kleen, SuSE Labs.
 *
 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
 *
 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
 * Assumes all memory regions belonging to a single proximity domain
 * are in one chunk. Holes between them will be included in the node.
 */

#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/mmzone.h>
#include <linux/bitmap.h>
#include <linux/module.h>
#include <linux/topology.h>
#include <linux/bootmem.h>
#include <linux/mm.h>
#include <asm/proto.h>
#include <asm/numa.h>
#include <asm/e820.h>

#if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \
	defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \
		&& !defined(CONFIG_MEMORY_HOTPLUG)
#define RESERVE_HOTADD 1
#endif

static struct acpi_table_slit *acpi_slit;

static nodemask_t nodes_parsed __initdata;
static nodemask_t nodes_found __initdata;
static struct bootnode nodes[MAX_NUMNODES] __initdata;
static struct bootnode nodes_add[MAX_NUMNODES] __initdata;
static int found_add_area __initdata;
int hotadd_percent __initdata = 0;
#ifndef RESERVE_HOTADD
#define hotadd_percent 0 /* Ignore all settings */
#endif
static u8 pxm2node[256] = { [0 ... 255] = 0xff };

/* Nodes that are too small confuse the VM badly; they usually
   result from BIOS bugs. */
#define NODE_MIN_SIZE (4*1024*1024)

static int node_to_pxm(int n);

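/*
 * Translate an ACPI proximity domain number into the logical node id
 * assigned by setup_node().  Unknown PXMs are left as 0xff in the
 * table, which sign-extends to -1 here.
 */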
int pxm_to_node(int pxm)
{
	if ((unsigned)pxm >= 256)
		return -1;
	/* Extend 0xff to (int)-1 */
	return (signed char)pxm2node[pxm];
}

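/*
 * Return the node id for a proximity domain, allocating a new one from
 * nodes_found on first use.  Returns -1 once all MAX_NUMNODES ids are
 * taken.
 */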
static __init int setup_node(int pxm)
{
	unsigned node = pxm2node[pxm];
	if (node == 0xff) {
		if (nodes_weight(nodes_found) >= MAX_NUMNODES)
			return -1;
		node = first_unset_node(nodes_found);
		node_set(node, nodes_found);
		pxm2node[pxm] = node;
	}
	return pxm2node[pxm];
}

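/*
 * Check whether [start, end) overlaps one of the nodes parsed so far.
 * Returns the conflicting node id, or -1 if there is no overlap.
 */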
static __init int conflicting_nodes(unsigned long start, unsigned long end)
{
	int i;
	for_each_node_mask(i, nodes_parsed) {
		struct bootnode *nd = &nodes[i];
		if (nd->start == nd->end)
			continue;
		if (nd->end > start && nd->start < end)
			return i;
		if (nd->end == end && nd->start == start)
			return i;
	}
	return -1;
}

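/*
 * Clamp node i's range to [start, end).  Skipped entirely once a
 * hot-add area has been found, so not-yet-present hotplug memory is
 * not truncated away.
 */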
static __init void cutoff_node(int i, unsigned long start, unsigned long end)
{
	struct bootnode *nd = &nodes[i];

	if (found_add_area)
		return;

	if (nd->start < start) {
		nd->start = start;
		if (nd->end < nd->start)
			nd->start = nd->end;
	}
	if (nd->end > end) {
		nd->end = end;
		if (nd->start > nd->end)
			nd->start = nd->end;
	}
}

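/* Discard all state parsed from the SRAT after an inconsistency is found. */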
static __init void bad_srat(void)
{
	int i;
	printk(KERN_ERR "SRAT: SRAT not used.\n");
	acpi_numa = -1;
	found_add_area = 0;
	for (i = 0; i < MAX_LOCAL_APIC; i++)
		apicid_to_node[i] = NUMA_NO_NODE;
	for (i = 0; i < MAX_NUMNODES; i++)
		nodes_add[i].start = nodes_add[i].end = 0;
}

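/* True if NUMA is disabled (numa_off) or the SRAT has been rejected. */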
static __init inline int srat_disabled(void)
{
	return numa_off || acpi_numa < 0;
}

/*
 * Many BIOSes fill in 10 (= local distance) everywhere.  This messes
 * up the NUMA heuristics, which want the local node to have a smaller
 * distance than the others.
 * Do some quick checks here and only use the SLIT if it passes.
 */
static __init int slit_valid(struct acpi_table_slit *slit)
{
	int i, j;
	int d = slit->localities;
	for (i = 0; i < d; i++) {
		for (j = 0; j < d; j++) {
			u8 val = slit->entry[d*i + j];
			if (i == j) {
				if (val != 10)
					return 0;
			} else if (val <= 10)
				return 0;
		}
	}
	return 1;
}

/* Callback for SLIT parsing */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
	if (!slit_valid(slit)) {
		printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
		return;
	}
	acpi_slit = slit;
}

/* Callback for Proximity Domain -> LAPIC mapping */
void __init
acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
{
	int pxm, node;
	if (srat_disabled())
		return;
	if (pa->header.length != sizeof(struct acpi_table_processor_affinity)) {
		bad_srat();
		return;
	}
	if (pa->flags.enabled == 0)
		return;
	pxm = pa->proximity_domain;
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
		bad_srat();
		return;
	}
	apicid_to_node[pa->apic_id] = node;
	acpi_numa = 1;
	printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
	       pxm, pa->apic_id, node);
}

#ifdef RESERVE_HOTADD
/*
 * Protect against too large hotadd areas that would fill up memory.
 */
static int hotadd_enough_memory(struct bootnode *nd)
{
	static unsigned long allocated;
	static unsigned long last_area_end;
	unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
	long mem = pages * sizeof(struct page);
	unsigned long addr;
	unsigned long allowed;
	unsigned long oldpages = pages;

	if (mem < 0)
		return 0;
	allowed = (end_pfn - e820_hole_size(0, end_pfn)) * PAGE_SIZE;
	allowed = (allowed / 100) * hotadd_percent;
	if (allocated + mem > allowed) {
		unsigned long range;
		/* Give them at least part of their hotadd memory, up to
		   hotadd_percent.  It would be better to spread the limit
		   out over multiple hotplug areas, but that is too
		   complicated right now. */
		if (allocated >= allowed)
			return 0;
		range = allowed - allocated;
		pages = (range / PAGE_SIZE);
		mem = pages * sizeof(struct page);
		nd->end = nd->start + range;
	}
	/* Not completely foolproof, but a good sanity check */
	addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
	if (addr == -1UL)
		return 0;
	if (pages != oldpages)
		printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
			pages << PAGE_SHIFT);
	last_area_end = addr + mem;
	allocated += mem;
	return 1;
}

/*
 * It is fine to add this area to the nodes data; it will be used later.
 * This code supports one contiguous hot-add area per node.
 */
static int reserve_hotadd(int node, unsigned long start, unsigned long end)
{
	unsigned long s_pfn = start >> PAGE_SHIFT;
	unsigned long e_pfn = end >> PAGE_SHIFT;
	int changed = 0;
	struct bootnode *nd = &nodes_add[node];

	/* I had some trouble with strange memory hotadd regions breaking
	   the boot.  Be very strict here and reject anything unexpected.
	   If you want working memory hotadd, write correct SRATs.

	   The node size check is a basic sanity check to guard against
	   mistakes. */
	if ((signed long)(end - start) < NODE_MIN_SIZE) {
		printk(KERN_ERR "SRAT: Hotplug area too small\n");
		return -1;
	}

	/* This check might be a bit too strict, but I'm keeping it for now. */
	if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) {
		printk(KERN_ERR "SRAT: Hotplug area has existing memory\n");
		return -1;
	}

	if (!hotadd_enough_memory(&nodes_add[node])) {
		printk(KERN_ERR "SRAT: Hotplug area too large\n");
		return -1;
	}

	/* Looks good */

	found_add_area = 1;
	if (nd->start == nd->end) {
		nd->start = start;
		nd->end = end;
		changed = 1;
	} else {
		if (nd->start == end) {
			nd->start = start;
			changed = 1;
		}
		if (nd->end == start) {
			nd->end = end;
			changed = 1;
		}
		if (!changed)
			printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
	}

	if ((nd->end >> PAGE_SHIFT) > end_pfn)
		end_pfn = nd->end >> PAGE_SHIFT;

	if (changed)
		printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
	return 0;
}
#endif

/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
{
	struct bootnode *nd, oldnode;
	unsigned long start, end;
	int node, pxm;
	int i;

	if (srat_disabled())
		return;
	if (ma->header.length != sizeof(struct acpi_table_memory_affinity)) {
		bad_srat();
		return;
	}
	if (ma->flags.enabled == 0)
		return;
	if (ma->flags.hot_pluggable && hotadd_percent == 0)
		return;
	start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
	end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
	pxm = ma->proximity_domain;
	node = setup_node(pxm);
	if (node < 0) {
		printk(KERN_ERR "SRAT: Too many proximity domains.\n");
		bad_srat();
		return;
	}
	i = conflicting_nodes(start, end);
	if (i == node) {
		printk(KERN_WARNING
		"SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
			pxm, start, end, nodes[i].start, nodes[i].end);
	} else if (i >= 0) {
		printk(KERN_ERR
		       "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
		       pxm, start, end, node_to_pxm(i),
		       nodes[i].start, nodes[i].end);
		bad_srat();
		return;
	}
	nd = &nodes[node];
	oldnode = *nd;
	if (!node_test_and_set(node, nodes_parsed)) {
		nd->start = start;
		nd->end = end;
	} else {
		if (start < nd->start)
			nd->start = start;
		if (nd->end < end)
			nd->end = end;
	}

	printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
	       nd->start, nd->end);

#ifdef RESERVE_HOTADD
	if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) {
		/* Ignore hotadd region. Undo damage */
		printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
		*nd = oldnode;
		if ((nd->start | nd->end) == 0)
			node_clear(node, nodes_parsed);
	}
#endif
}

/* Sanity check to catch more bad SRATs (they are amazingly common).
   Make sure the PXMs cover all memory. */
static int nodes_cover_memory(void)
{
	int i;
	unsigned long pxmram, e820ram;

	pxmram = 0;
	for_each_node_mask(i, nodes_parsed) {
		unsigned long s = nodes[i].start >> PAGE_SHIFT;
		unsigned long e = nodes[i].end >> PAGE_SHIFT;
		pxmram += e - s;
		pxmram -= e820_hole_size(s, e);
		pxmram -= nodes_add[i].end - nodes_add[i].start;
		if ((long)pxmram < 0)
			pxmram = 0;
	}

	e820ram = end_pfn - e820_hole_size(0, end_pfn);
	/* We seem to lose 3 pages somewhere. Allow a bit of slack. */
	if ((long)(e820ram - pxmram) >= 1*1024*1024) {
		printk(KERN_ERR
		"SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
			(pxmram << PAGE_SHIFT) >> 20,
			(e820ram << PAGE_SHIFT) >> 20);
		return 0;
	}
	return 1;
}

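/*
 * Forget a node that turned out to be unusable: drop it from
 * nodes_parsed and detach any APIC ids that were mapped to it.
 */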
static void unparse_node(int node)
{
	int i;
	node_clear(node, nodes_parsed);
	for (i = 0; i < MAX_LOCAL_APIC; i++) {
		if (apicid_to_node[i] == node)
			apicid_to_node[i] = NUMA_NO_NODE;
	}
}

void __init acpi_numa_arch_fixup(void) {}

/* Use the information discovered above to actually set up the nodes. */
int __init acpi_scan_nodes(unsigned long start, unsigned long end)
{
	int i;

	/* First clean up the node list */
	for (i = 0; i < MAX_NUMNODES; i++) {
		cutoff_node(i, start, end);
		if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
			unparse_node(i);
			node_set_offline(i);
		}
	}

	if (acpi_numa <= 0)
		return -1;

	if (!nodes_cover_memory()) {
		bad_srat();
		return -1;
	}

	memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
	if (memnode_shift < 0) {
		printk(KERN_ERR
		     "SRAT: No NUMA node hash function found. Contact maintainer\n");
		bad_srat();
		return -1;
	}

	/* Finally register nodes */
	for_each_node_mask(i, nodes_parsed)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	/* Try again in case setup_node_bootmem missed one due
	   to missing bootmem */
	for_each_node_mask(i, nodes_parsed)
		if (!node_online(i))
			setup_node_bootmem(i, nodes[i].start, nodes[i].end);

	for (i = 0; i < NR_CPUS; i++) {
		if (cpu_to_node[i] == NUMA_NO_NODE)
			continue;
		if (!node_isset(cpu_to_node[i], nodes_parsed))
			numa_set_node(i, NUMA_NO_NODE);
	}
	numa_init_array();
	return 0;
}

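/*
 * Reverse of pxm_to_node(): find the proximity domain that maps to a
 * given node id.  Returns 0 if none is found.
 */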
static int node_to_pxm(int n)
{
	int i;
	if (pxm2node[n] == n)
		return n;
	for (i = 0; i < 256; i++)
		if (pxm2node[i] == n)
			return i;
	return 0;
}

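/*
 * Reserve a node's parsed hot-add area in bootmem and report the
 * resulting struct page overhead.
 */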
void __init srat_reserve_add_area(int nodeid)
{
	if (found_add_area && nodes_add[nodeid].end) {
		u64 total_mb;

		printk(KERN_INFO "SRAT: Reserving hot-add memory space "
				"for node %d at %Lx-%Lx\n",
			nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
		total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
					>> PAGE_SHIFT;
		total_mb *= sizeof(struct page);
		total_mb >>= 20;
		printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
				"pre-allocated memory.\n", (unsigned long long)total_mb);
		reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
			       nodes_add[nodeid].end - nodes_add[nodeid].start);
	}
}

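/*
 * Node distance as reported by the SLIT, indexed through the PXM
 * mapping.  Without a valid SLIT, fall back to 10 (local) / 20 (remote).
 */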
int __node_distance(int a, int b)
{
	int index;

	if (!acpi_slit)
		return a == b ? 10 : 20;
	index = acpi_slit->localities * node_to_pxm(a);
	return acpi_slit->entry[index + node_to_pxm(b)];
}

EXPORT_SYMBOL(__node_distance);