/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/init.h>
9#include <linux/bootmem.h>
10#include <linux/mmzone.h>
11#include <linux/ctype.h>
12#include <linux/module.h>
13#include <linux/nodemask.h>
14
15#include <asm/e820.h>
16#include <asm/proto.h>
17#include <asm/dma.h>
18#include <asm/numa.h>
19#include <asm/acpi.h>
20
21#ifndef Dprintk
22#define Dprintk(x...)
23#endif
24
Ravikiran G Thirumalai6c231b72005-09-06 15:17:45 -070025struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -070026bootmem_data_t plat_node_bdata[MAX_NUMNODES];
27
28int memnode_shift;
29u8 memnodemap[NODEMAPSIZE];
30
Ravikiran G Thirumalai6c231b72005-09-06 15:17:45 -070031unsigned char cpu_to_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
32cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -070033
34int numa_off __initdata;
35
36int __init compute_hash_shift(struct node *nodes, int numnodes)
37{
38 int i;
Keith Manntheyb6846642005-07-28 21:15:38 -070039 int shift = 20;
40 unsigned long addr,maxend=0;
Linus Torvalds1da177e2005-04-16 15:20:36 -070041
Keith Manntheyb6846642005-07-28 21:15:38 -070042 for (i = 0; i < numnodes; i++)
43 if ((nodes[i].start != nodes[i].end) && (nodes[i].end > maxend))
44 maxend = nodes[i].end;
45
46 while ((1UL << shift) < (maxend / NODEMAPSIZE))
47 shift++;
48
49 printk (KERN_DEBUG"Using %d for the hash shift. Max adder is %lx \n",
50 shift,maxend);
51 memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE);
52 for (i = 0; i < numnodes; i++) {
53 if (nodes[i].start == nodes[i].end)
54 continue;
55 for (addr = nodes[i].start;
56 addr < nodes[i].end;
57 addr += (1UL << shift)) {
58 if (memnodemap[addr >> shift] != 0xff) {
59 printk(KERN_INFO
60 "Your memory is not aligned you need to rebuild your kernel "
61 "with a bigger NODEMAPSIZE shift=%d adder=%lu\n",
62 shift,addr);
63 return -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -070064 }
Keith Manntheyb6846642005-07-28 21:15:38 -070065 memnodemap[addr >> shift] = i;
Linus Torvalds1da177e2005-04-16 15:20:36 -070066 }
Linus Torvalds1da177e2005-04-16 15:20:36 -070067 }
Keith Manntheyb6846642005-07-28 21:15:38 -070068 return shift;
Linus Torvalds1da177e2005-04-16 15:20:36 -070069}
70
#ifdef CONFIG_SPARSEMEM
/*
 * Convert a page frame number to a physical address and return whatever
 * node phys_to_nid() reports for it.
 */
int early_pfn_to_nid(unsigned long pfn)
{
	unsigned long paddr = pfn << PAGE_SHIFT;

	return phys_to_nid(paddr);
}
#endif
77
Linus Torvalds1da177e2005-04-16 15:20:36 -070078/* Initialize bootmem allocator for a node */
79void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
80{
81 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
82 unsigned long nodedata_phys;
83 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
84
85 start = round_up(start, ZONE_ALIGN);
86
87 printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
88
89 start_pfn = start >> PAGE_SHIFT;
90 end_pfn = end >> PAGE_SHIFT;
91
Matt Tolentinobbfceef2005-06-23 00:08:07 -070092 memory_present(nodeid, start_pfn, end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -070093 nodedata_phys = find_e820_area(start, end, pgdat_size);
94 if (nodedata_phys == -1L)
95 panic("Cannot find memory pgdat in node %d\n", nodeid);
96
97 Dprintk("nodedata_phys %lx\n", nodedata_phys);
98
99 node_data[nodeid] = phys_to_virt(nodedata_phys);
100 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
101 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
102 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
103 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
104
105 /* Find a place for the bootmem map */
106 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
107 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
108 bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
109 if (bootmap_start == -1L)
110 panic("Not enough continuous space for bootmap on node %d", nodeid);
111 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
112
113 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
114 bootmap_start >> PAGE_SHIFT,
115 start_pfn, end_pfn);
116
117 e820_bootmem_free(NODE_DATA(nodeid), start, end);
118
119 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
120 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
121 node_set_online(nodeid);
122}
123
124/* Initialize final allocator for a zone */
125void __init setup_node_zones(int nodeid)
126{
127 unsigned long start_pfn, end_pfn;
128 unsigned long zones[MAX_NR_ZONES];
Andi Kleen485761b2005-08-26 18:34:10 -0700129 unsigned long holes[MAX_NR_ZONES];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700130 unsigned long dma_end_pfn;
131
132 memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES);
Andi Kleen485761b2005-08-26 18:34:10 -0700133 memset(holes, 0, sizeof(unsigned long) * MAX_NR_ZONES);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700134
135 start_pfn = node_start_pfn(nodeid);
136 end_pfn = node_end_pfn(nodeid);
137
138 Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn);
139
140 /* All nodes > 0 have a zero length zone DMA */
141 dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT;
142 if (start_pfn < dma_end_pfn) {
143 zones[ZONE_DMA] = dma_end_pfn - start_pfn;
Andi Kleen485761b2005-08-26 18:34:10 -0700144 holes[ZONE_DMA] = e820_hole_size(start_pfn, dma_end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700145 zones[ZONE_NORMAL] = end_pfn - dma_end_pfn;
Andi Kleen485761b2005-08-26 18:34:10 -0700146 holes[ZONE_NORMAL] = e820_hole_size(dma_end_pfn, end_pfn);
147
Linus Torvalds1da177e2005-04-16 15:20:36 -0700148 } else {
149 zones[ZONE_NORMAL] = end_pfn - start_pfn;
Andi Kleen485761b2005-08-26 18:34:10 -0700150 holes[ZONE_NORMAL] = e820_hole_size(start_pfn, end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700151 }
152
153 free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
Andi Kleen485761b2005-08-26 18:34:10 -0700154 start_pfn, holes);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700155}
156
157void __init numa_init_array(void)
158{
159 int rr, i;
160 /* There are unfortunately some poorly designed mainboards around
161 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
162 mapping. To avoid this fill in the mapping for all possible
163 CPUs, as the number of CPUs is not known yet.
164 We round robin the existing nodes. */
165 rr = 0;
166 for (i = 0; i < NR_CPUS; i++) {
167 if (cpu_to_node[i] != NUMA_NO_NODE)
168 continue;
169 rr = next_node(rr, node_online_map);
170 if (rr == MAX_NUMNODES)
171 rr = first_node(node_online_map);
172 cpu_to_node[i] = rr;
173 rr++;
174 }
175
176 set_bit(0, &node_to_cpumask[cpu_to_node(0)]);
177}
178
#ifdef CONFIG_NUMA_EMU
/* Number of fake nodes requested via "fake=" in numa_setup(); 0 disables emulation. */
int numa_fake __initdata = 0;

/*
 * Numa emulation.
 *
 * Carves the range between @start_pfn and @end_pfn into numa_fake nodes.
 * The per-node size is reduced to a power of two for the benefit of the
 * hash function; the last node absorbs whatever remains up to @end_pfn.
 *
 * The caller must ensure numa_fake is non-zero (it is used as a divisor).
 *
 * Returns 0 on success, or -1 if compute_hash_shift() could not build a
 * hash for the resulting layout.
 *
 * Marked __init: it reads __initdata and only calls __init functions, so it
 * must live in the init section as well.
 */
static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;
	struct node nodes[MAX_NUMNODES];
	unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;

	/* Kludge needed for the hash function */
	if (hweight64(sz) > 1) {
		unsigned long x = 1;
		while ((x << 1) < sz)
			x <<= 1;
		if (x < sz/2)
			printk("Numa emulation unbalanced. Complain to maintainer\n");
		sz = x;
	}

	memset(&nodes,0,sizeof(nodes));
	for (i = 0; i < numa_fake; i++) {
		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
		/* The final node takes everything left over. */
		if (i == numa_fake-1)
			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
		nodes[i].end = nodes[i].start + sz;
		if (i != numa_fake-1)
			nodes[i].end--;
		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
		       i,
		       nodes[i].start, nodes[i].end,
		       (nodes[i].end - nodes[i].start) >> 20);
		node_set_online(i);
	}
	memnode_shift = compute_hash_shift(nodes, numa_fake);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
		return -1;
	}
	for_each_online_node(i)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	numa_init_array();
	return 0;
}
#endif
225
226void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
227{
228 int i;
229
230#ifdef CONFIG_NUMA_EMU
231 if (numa_fake && !numa_emulation(start_pfn, end_pfn))
232 return;
233#endif
234
235#ifdef CONFIG_ACPI_NUMA
236 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
237 end_pfn << PAGE_SHIFT))
238 return;
239#endif
240
241#ifdef CONFIG_K8_NUMA
242 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
243 return;
244#endif
245 printk(KERN_INFO "%s\n",
246 numa_off ? "NUMA turned off" : "No NUMA configuration found");
247
248 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
249 start_pfn << PAGE_SHIFT,
250 end_pfn << PAGE_SHIFT);
251 /* setup dummy node covering all memory */
252 memnode_shift = 63;
253 memnodemap[0] = 0;
254 nodes_clear(node_online_map);
255 node_set_online(0);
256 for (i = 0; i < NR_CPUS; i++)
257 cpu_to_node[i] = 0;
258 node_to_cpumask[0] = cpumask_of_cpu(0);
259 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
260}
261
Ashok Raje6982c62005-06-25 14:54:58 -0700262__cpuinit void numa_add_cpu(int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700263{
264 /* BP is initialized elsewhere */
265 if (cpu)
266 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
267}
268
269unsigned long __init numa_free_all_bootmem(void)
270{
271 int i;
272 unsigned long pages = 0;
273 for_each_online_node(i) {
274 pages += free_all_bootmem_node(NODE_DATA(i));
275 }
276 return pages;
277}
278
279void __init paging_init(void)
280{
281 int i;
282 for_each_online_node(i) {
283 setup_node_zones(i);
284 }
285}
286
287/* [numa=off] */
288__init int numa_setup(char *opt)
289{
290 if (!strncmp(opt,"off",3))
291 numa_off = 1;
292#ifdef CONFIG_NUMA_EMU
293 if(!strncmp(opt, "fake=", 5)) {
294 numa_fake = simple_strtoul(opt+5,NULL,0); ;
295 if (numa_fake >= MAX_NUMNODES)
296 numa_fake = MAX_NUMNODES;
297 }
298#endif
299#ifdef CONFIG_ACPI_NUMA
300 if (!strncmp(opt,"noacpi",6))
301 acpi_numa = -1;
302#endif
303 return 1;
304}
305
/* Symbols exported from this file. */
EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode_shift);
EXPORT_SYMBOL(memnodemap);
EXPORT_SYMBOL(node_data);