/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/sched.h>
#include <linux/acpi.h>

#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/acpi.h>
#include <asm/amd_nb.h>

#include "numa_internal.h"

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

nodemask_t numa_nodes_parsed __initdata;

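/*
 * Memory layout parsed at boot.  Normally this would be __initdata, but
 * with memory hotplug it has to stay around so that
 * memory_add_physaddr_to_nid() below can map hot-added ranges to nodes.
 */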
static struct numa_meminfo numa_meminfo
#ifndef CONFIG_MEMORY_HOTPLUG
__initdata
#endif
;

static int numa_distance_cnt;
static u8 *numa_distance;

static void * __init early_node_mem(int nodeid, unsigned long start,
                                    unsigned long end, unsigned long size,
                                    unsigned long align)
{
        unsigned long mem;

        /*
         * Allocate as high as possible: NODE_DATA and similar node-local
         * structures end up here, and low memory is better left to the
         * DMA and DMA32 zones.
         */
        if (start < (MAX_DMA_PFN<<PAGE_SHIFT))
                start = MAX_DMA_PFN<<PAGE_SHIFT;
        if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
            end > (MAX_DMA32_PFN<<PAGE_SHIFT))
                start = MAX_DMA32_PFN<<PAGE_SHIFT;
        mem = memblock_x86_find_in_range_node(nodeid, start, end, size, align);
        if (mem != MEMBLOCK_ERROR)
                return __va(mem);

        /* extend the search scope */
        end = max_pfn_mapped << PAGE_SHIFT;
        start = MAX_DMA_PFN << PAGE_SHIFT;
        mem = memblock_find_in_range(start, end, size, align);
        if (mem != MEMBLOCK_ERROR)
                return __va(mem);

        printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
               size, nodeid);

        return NULL;
}

static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
                                     struct numa_meminfo *mi)
{
        /* ignore zero length blks */
        if (start == end)
                return 0;

        /* whine about and ignore invalid blks */
        if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
                pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
                           nid, start, end);
                return 0;
        }

        if (mi->nr_blks >= NR_NODE_MEMBLKS) {
                pr_err("NUMA: too many memblk ranges\n");
                return -EINVAL;
        }

        mi->blk[mi->nr_blks].start = start;
        mi->blk[mi->nr_blks].end = end;
        mi->blk[mi->nr_blks].nid = nid;
        mi->nr_blks++;
        return 0;
}

/**
 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
 * @idx: Index of memblk to remove
 * @mi: numa_meminfo to remove memblk from
 *
 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
 * decrementing @mi->nr_blks.
 */
void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
{
        mi->nr_blks--;
        memmove(&mi->blk[idx], &mi->blk[idx + 1],
                (mi->nr_blks - idx) * sizeof(mi->blk[0]));
}

/**
 * numa_add_memblk - Add one numa_memblk to numa_meminfo
 * @nid: NUMA node ID of the new memblk
 * @start: Start address of the new memblk
 * @end: End address of the new memblk
 *
 * Add a new memblk to the default numa_meminfo.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_add_memblk(int nid, u64 start, u64 end)
{
        return numa_add_memblk_to(nid, start, end, &numa_meminfo);
}

/* Initialize bootmem allocator for a node */
void __init
setup_node_bootmem(int nid, unsigned long start, unsigned long end)
{
        const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
        unsigned long nd_pa;
        int tnid;

        /*
         * Don't confuse VM with a node that doesn't have the
         * minimum amount of memory:
         */
        if (end && (end - start) < NODE_MIN_SIZE)
                return;

        start = roundup(start, ZONE_ALIGN);

        printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n",
               nid, start, end);

        node_data[nid] = early_node_mem(nid, start, end, nd_size,
                                        SMP_CACHE_BYTES);
        if (node_data[nid] == NULL)
                return;
        nd_pa = __pa(node_data[nid]);
        memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA");
        printk(KERN_INFO "  NODE_DATA [%016lx - %016lx]\n",
               nd_pa, nd_pa + nd_size - 1);
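        /*
         * early_node_mem() may have fallen back to another node's memory,
         * so report where NODE_DATA actually landed.
         */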
        tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
        if (tnid != nid)
                printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nid, tnid);

        memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
        NODE_DATA(nid)->node_id = nid;
        NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT;
        NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT;

        node_set_online(nid);
}

/**
 * numa_cleanup_meminfo - Cleanup a numa_meminfo
 * @mi: numa_meminfo to clean up
 *
 * Sanitize @mi by merging and removing unnecessary memblks.  Also check for
 * conflicts and clear unused memblks.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
{
        const u64 low = 0;
        const u64 high = (u64)max_pfn << PAGE_SHIFT;
        int i, j, k;

        for (i = 0; i < mi->nr_blks; i++) {
                struct numa_memblk *bi = &mi->blk[i];

                /* make sure all blocks are inside the limits */
                bi->start = max(bi->start, low);
                bi->end = min(bi->end, high);

                /* and there's no empty block */
                if (bi->start >= bi->end) {
                        numa_remove_memblk_from(i--, mi);
                        continue;
                }

                for (j = i + 1; j < mi->nr_blks; j++) {
                        struct numa_memblk *bj = &mi->blk[j];
                        unsigned long start, end;

                        /*
                         * See whether there are overlapping blocks.  Whine
                         * about but allow overlaps of the same nid.  They
                         * will be merged below.
                         */
                        if (bi->end > bj->start && bi->start < bj->end) {
                                if (bi->nid != bj->nid) {
                                        pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
                                               bi->nid, bi->start, bi->end,
                                               bj->nid, bj->start, bj->end);
                                        return -EINVAL;
                                }
                                pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
                                           bi->nid, bi->start, bi->end,
                                           bj->start, bj->end);
                        }

                        /*
                         * Join together blocks on the same node, holes
                         * between which don't overlap with memory on other
                         * nodes.
                         */
                        if (bi->nid != bj->nid)
                                continue;
                        start = max(min(bi->start, bj->start), low);
                        end = min(max(bi->end, bj->end), high);
                        for (k = 0; k < mi->nr_blks; k++) {
                                struct numa_memblk *bk = &mi->blk[k];

                                if (bi->nid == bk->nid)
                                        continue;
                                if (start < bk->end && end > bk->start)
                                        break;
                        }
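                        /*
                         * If the k loop broke out early, another node's
                         * memory lies inside the joined range; skip the
                         * merge in that case.
                         */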
                        if (k < mi->nr_blks)
                                continue;
                        printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
                               bi->nid, bi->start, bi->end, bj->start, bj->end,
                               start, end);
                        bi->start = start;
                        bi->end = end;
                        numa_remove_memblk_from(j--, mi);
                }
        }

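        /* wipe the unused tail of the memblk array */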
        for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
                mi->blk[i].start = mi->blk[i].end = 0;
                mi->blk[i].nid = NUMA_NO_NODE;
        }

        return 0;
}

/*
 * Mark all nodes that have memory in @mi in *@nodemask.
 */
static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
                                              const struct numa_meminfo *mi)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
                if (mi->blk[i].start != mi->blk[i].end &&
                    mi->blk[i].nid != NUMA_NO_NODE)
                        node_set(mi->blk[i].nid, *nodemask);
}

/**
 * numa_reset_distance - Reset NUMA distance table
 *
 * The current table is freed.  The next numa_set_distance() call will
 * create a new one.
 */
void __init numa_reset_distance(void)
{
        size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);

        /* numa_distance could be 1LU marking allocation failure, test cnt */
        if (numa_distance_cnt)
                memblock_x86_free_range(__pa(numa_distance),
                                        __pa(numa_distance) + size);
        numa_distance_cnt = 0;
        numa_distance = NULL;   /* enable table creation */
}

static int __init numa_alloc_distance(void)
{
        nodemask_t nodes_parsed;
        size_t size;
        int i, j, cnt = 0;
        u64 phys;

        /* size the new table and allocate it */
        nodes_parsed = numa_nodes_parsed;
        numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);

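        /*
         * The table is indexed directly by node id, so it must span up to
         * the highest parsed node id, holes in the id space included.
         */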
        for_each_node_mask(i, nodes_parsed)
                cnt = i;
        cnt++;
        size = cnt * cnt * sizeof(numa_distance[0]);

        phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT,
                                      size, PAGE_SIZE);
        if (phys == MEMBLOCK_ERROR) {
                pr_warning("NUMA: Warning: can't allocate distance table!\n");
                /* don't retry until explicitly reset */
                numa_distance = (void *)1LU;
                return -ENOMEM;
        }
        memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");

        numa_distance = __va(phys);
        numa_distance_cnt = cnt;

        /* fill with the default distances */
        for (i = 0; i < cnt; i++)
                for (j = 0; j < cnt; j++)
                        numa_distance[i * cnt + j] = i == j ?
                                LOCAL_DISTANCE : REMOTE_DISTANCE;
        printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);

        return 0;
}

/**
 * numa_set_distance - Set the NUMA distance from one node to another
 * @from: the 'from' node to set distance
 * @to: the 'to' node to set distance
 * @distance: NUMA distance
 *
 * Set the distance from node @from to @to to @distance.  If the distance
 * table doesn't exist, one large enough to accommodate all the currently
 * known nodes will be created.
 *
 * If such a table cannot be allocated, a warning is printed and further
 * calls are ignored until the distance table is reset with
 * numa_reset_distance().
 *
 * If @from or @to is higher than the highest known node at the time of
 * table creation or @distance doesn't make sense, the call is ignored.
 * This allows specific NUMA config implementations to stay simple.
 */
void __init numa_set_distance(int from, int to, int distance)
{
        if (!numa_distance && numa_alloc_distance() < 0)
                return;

        if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
                printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
                            from, to, distance);
                return;
        }

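        /*
         * Entries are stored as u8 and a node's distance to itself must
         * be LOCAL_DISTANCE, so anything else is rejected below.
         */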
        if ((u8)distance != distance ||
            (from == to && distance != LOCAL_DISTANCE)) {
                pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
                             from, to, distance);
                return;
        }

        numa_distance[from * numa_distance_cnt + to] = distance;
}

int __node_distance(int from, int to)
{
        if (from >= numa_distance_cnt || to >= numa_distance_cnt)
                return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
        return numa_distance[from * numa_distance_cnt + to];
}
EXPORT_SYMBOL(__node_distance);

/*
 * Sanity check to catch more bad NUMA configurations (they are amazingly
 * common).  Make sure the nodes cover all memory.
 */
static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
{
        unsigned long numaram, e820ram;
        int i;

        numaram = 0;
        for (i = 0; i < mi->nr_blks; i++) {
                unsigned long s = mi->blk[i].start >> PAGE_SHIFT;
                unsigned long e = mi->blk[i].end >> PAGE_SHIFT;
                numaram += e - s;
                numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
                if ((long)numaram < 0)
                        numaram = 0;
        }

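        /* usable RAM: everything below max_pfn minus the e820 holes */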
        e820ram = max_pfn - (memblock_x86_hole_size(0,
                                max_pfn << PAGE_SHIFT) >> PAGE_SHIFT);
        /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
        if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
                printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n",
                       (numaram << PAGE_SHIFT) >> 20,
                       (e820ram << PAGE_SHIFT) >> 20);
                return false;
        }
        return true;
}

static int __init numa_register_memblks(struct numa_meminfo *mi)
{
        int i, nid;

        /* Account for nodes with cpus and no memory */
        node_possible_map = numa_nodes_parsed;
        numa_nodemask_from_meminfo(&node_possible_map, mi);
        if (WARN_ON(nodes_empty(node_possible_map)))
                return -EINVAL;

        for (i = 0; i < mi->nr_blks; i++)
                memblock_x86_register_active_regions(mi->blk[i].nid,
                                        mi->blk[i].start >> PAGE_SHIFT,
                                        mi->blk[i].end >> PAGE_SHIFT);

        /* for out of order entries */
        sort_node_map();
        if (!numa_meminfo_cover_memory(mi))
                return -EINVAL;

        /* Finally register nodes. */
        for_each_node_mask(nid, node_possible_map) {
                u64 start = (u64)max_pfn << PAGE_SHIFT;
                u64 end = 0;

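                /* compute the span covering all of this node's memblks */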
                for (i = 0; i < mi->nr_blks; i++) {
                        if (nid != mi->blk[i].nid)
                                continue;
                        start = min(mi->blk[i].start, start);
                        end = max(mi->blk[i].end, end);
                }

                if (start < end)
                        setup_node_bootmem(nid, start, end);
        }

        return 0;
}

/**
 * dummy_numa_init - Fallback dummy NUMA init
 *
 * Used if there's no underlying NUMA architecture, NUMA initialization
 * fails, or NUMA is disabled on the command line.
 *
 * Must online at least one node and add memory blocks that cover all
 * allowed memory.  This function must not fail.
 */
static int __init dummy_numa_init(void)
{
        printk(KERN_INFO "%s\n",
               numa_off ? "NUMA turned off" : "No NUMA configuration found");
        printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
               0LU, max_pfn << PAGE_SHIFT);

        node_set(0, numa_nodes_parsed);
        numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT);

        return 0;
}

static int __init numa_init(int (*init_func)(void))
{
        int i;
        int ret;

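        /* wipe all state so a failed init method leaves nothing behind */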
        for (i = 0; i < MAX_LOCAL_APIC; i++)
                set_apicid_to_node(i, NUMA_NO_NODE);

        nodes_clear(numa_nodes_parsed);
        nodes_clear(node_possible_map);
        nodes_clear(node_online_map);
        memset(&numa_meminfo, 0, sizeof(numa_meminfo));
        remove_all_active_ranges();
        numa_reset_distance();

        ret = init_func();
        if (ret < 0)
                return ret;
        ret = numa_cleanup_meminfo(&numa_meminfo);
        if (ret < 0)
                return ret;

        numa_emulation(&numa_meminfo, numa_distance_cnt);

        ret = numa_register_memblks(&numa_meminfo);
        if (ret < 0)
                return ret;

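        /*
         * Some of the early-mapped CPUs may sit on nodes that never came
         * online; clear their mapping so they get assigned a valid node.
         */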
        for (i = 0; i < nr_cpu_ids; i++) {
                int nid = early_cpu_to_node(i);

                if (nid == NUMA_NO_NODE)
                        continue;
                if (!node_online(nid))
                        numa_clear_node(i);
        }
        numa_init_array();
        return 0;
}

void __init initmem_init(void)
{
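        /* try the firmware NUMA init methods in order; fall back to dummy */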
        if (!numa_off) {
#ifdef CONFIG_ACPI_NUMA
                if (!numa_init(x86_acpi_numa_init))
                        return;
#endif
#ifdef CONFIG_AMD_NUMA
                if (!numa_init(amd_numa_init))
                        return;
#endif
        }

        numa_init(dummy_numa_init);
}

unsigned long __init numa_free_all_bootmem(void)
{
        unsigned long pages = 0;
        int i;

        for_each_online_node(i)
                pages += free_all_bootmem_node(NODE_DATA(i));

        pages += free_all_memory_core_early(MAX_NUMNODES);

        return pages;
}

int __cpuinit numa_cpu_node(int cpu)
{
        int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);

        if (apicid != BAD_APICID)
                return __apicid_to_node[apicid];
        return NUMA_NO_NODE;
}

#ifdef CONFIG_MEMORY_HOTPLUG
int memory_add_physaddr_to_nid(u64 start)
{
        struct numa_meminfo *mi = &numa_meminfo;
        int nid = mi->blk[0].nid;
        int i;

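        /* find the memblk containing @start; default to blk[0]'s node */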
        for (i = 0; i < mi->nr_blks; i++)
                if (mi->blk[i].start <= start && mi->blk[i].end > start)
                        nid = mi->blk[i].nid;
        return nid;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif