/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>

#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/acpi.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif

struct pglist_data *node_data[MAX_NUMNODES];
bootmem_data_t plat_node_bdata[MAX_NUMNODES];

int memnode_shift;
u8 memnodemap[NODEMAPSIZE];

unsigned char cpu_to_node[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
cpumask_t node_to_cpumask[MAX_NUMNODES];

int numa_off __initdata;

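/*
 * memnodemap[] is a small physical-address-to-node lookup table: each of the
 * NODEMAPSIZE buckets covers 2^memnode_shift bytes and holds the owning node
 * id (0xff means "not mapped").  compute_hash_shift() below picks the
 * smallest shift (at least 20) that makes the whole range fit into
 * NODEMAPSIZE buckets, then verifies that every node boundary lands on a
 * bucket boundary.
 */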
int __init compute_hash_shift(struct node *nodes, int numnodes)
{
	int i;
	int shift = 20;
	unsigned long addr, maxend = 0;

	for (i = 0; i < numnodes; i++)
		if ((nodes[i].start != nodes[i].end) && (nodes[i].end > maxend))
			maxend = nodes[i].end;

	while ((1UL << shift) < (maxend / NODEMAPSIZE))
		shift++;

	printk(KERN_DEBUG "Using %d for the hash shift. Max addr is %lx\n",
		shift, maxend);
	memset(memnodemap, 0xff, sizeof(*memnodemap) * NODEMAPSIZE);
	for (i = 0; i < numnodes; i++) {
		if (nodes[i].start == nodes[i].end)
			continue;
		for (addr = nodes[i].start;
		     addr < nodes[i].end;
		     addr += (1UL << shift)) {
			if (memnodemap[addr >> shift] != 0xff) {
				printk(KERN_INFO
					"Your memory is not aligned; rebuild your kernel "
					"with a bigger NODEMAPSIZE. shift=%d addr=%lu\n",
					shift, addr);
				return -1;
			}
			memnodemap[addr >> shift] = i;
		}
	}
	return shift;
}

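/*
 * With CONFIG_SPARSEMEM the generic memory-model code needs pfn-to-node
 * lookups this early; phys_to_nid() (an arch helper) is expected to answer
 * them from the memnodemap[] table built above.
 */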
#ifdef CONFIG_SPARSEMEM
int early_pfn_to_nid(unsigned long pfn)
{
	return phys_to_nid(pfn << PAGE_SHIFT);
}
#endif

/* Initialize bootmem allocator for a node */
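/*
 * Rough layout created below: the node's pg_data_t is carved out of the
 * node's own memory with find_e820_area(), the bootmem bitmap goes on the
 * next page boundary after it, and both ranges are reserved again once the
 * node's bootmem allocator has been initialized.
 */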
void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
	unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
	unsigned long nodedata_phys;
	const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);

	start = round_up(start, ZONE_ALIGN);

	printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);

	start_pfn = start >> PAGE_SHIFT;
	end_pfn = end >> PAGE_SHIFT;

	memory_present(nodeid, start_pfn, end_pfn);
	nodedata_phys = find_e820_area(start, end, pgdat_size);
	if (nodedata_phys == -1L)
		panic("Cannot find memory pgdat in node %d\n", nodeid);

	Dprintk("nodedata_phys %lx\n", nodedata_phys);

	node_data[nodeid] = phys_to_virt(nodedata_phys);
	memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
	NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
	NODE_DATA(nodeid)->node_start_pfn = start_pfn;
	NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;

	/* Find a place for the bootmem map */
	bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
	bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
	bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
	if (bootmap_start == -1L)
		panic("Not enough contiguous space for bootmap on node %d", nodeid);
	Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);

	bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
					 bootmap_start >> PAGE_SHIFT,
					 start_pfn, end_pfn);

	e820_bootmem_free(NODE_DATA(nodeid), start, end);

	reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
	reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
	node_set_online(nodeid);
}

/* Initialize final allocator for a zone */
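/*
 * Zone split below: MAX_DMA_ADDRESS marks the top of the ISA DMA region,
 * so only the node that contains low memory gets a non-empty ZONE_DMA;
 * every other node is ZONE_NORMAL only.
 */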
void __init setup_node_zones(int nodeid)
{
	unsigned long start_pfn, end_pfn;
	unsigned long zones[MAX_NR_ZONES];
	unsigned long dma_end_pfn;

	memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES);

	start_pfn = node_start_pfn(nodeid);
	end_pfn = node_end_pfn(nodeid);

	Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn);

	/* All nodes > 0 have a zero length zone DMA */
	dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT;
	if (start_pfn < dma_end_pfn) {
		zones[ZONE_DMA] = dma_end_pfn - start_pfn;
		zones[ZONE_NORMAL] = end_pfn - dma_end_pfn;
	} else {
		zones[ZONE_NORMAL] = end_pfn - start_pfn;
	}

	free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
			    start_pfn, NULL);
}

void __init numa_init_array(void)
{
	int rr, i;
	/* There are unfortunately some poorly designed mainboards around
	   that only connect memory to a single CPU. This breaks the 1:1
	   cpu->node mapping. To avoid this, fill in the mapping for all
	   possible CPUs, as the number of CPUs is not known yet.  We
	   round-robin over the existing online nodes. */
	rr = 0;
	for (i = 0; i < NR_CPUS; i++) {
		if (cpu_to_node[i] != NUMA_NO_NODE)
			continue;
		rr = next_node(rr, node_online_map);
		if (rr == MAX_NUMNODES)
			rr = first_node(node_online_map);
		cpu_to_node[i] = rr;
		rr++;
	}

	set_bit(0, &node_to_cpumask[cpu_to_node(0)]);
}

#ifdef CONFIG_NUMA_EMU
int numa_fake __initdata = 0;

/* Numa emulation */
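/*
 * numa=fake=<N> carves [start_pfn, end_pfn) into N pseudo-nodes.  The chunk
 * size is rounded down to a power of two so the memnodemap hash can describe
 * the boundaries; the last node absorbs whatever remains.
 */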
static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;
	struct node nodes[MAX_NUMNODES];
	unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;

	/* Kludge needed for the hash function */
	if (hweight64(sz) > 1) {
		unsigned long x = 1;
		while ((x << 1) < sz)
			x <<= 1;
		if (x < sz/2)
			printk("Numa emulation unbalanced. Complain to maintainer\n");
		sz = x;
	}

	memset(&nodes, 0, sizeof(nodes));
	for (i = 0; i < numa_fake; i++) {
		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
		if (i == numa_fake-1)
			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
		nodes[i].end = nodes[i].start + sz;
		if (i != numa_fake-1)
			nodes[i].end--;
		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
		       i,
		       nodes[i].start, nodes[i].end,
		       (nodes[i].end - nodes[i].start) >> 20);
		node_set_online(i);
	}
	memnode_shift = compute_hash_shift(nodes, numa_fake);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
		return -1;
	}
	for_each_online_node(i)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	numa_init_array();
	return 0;
}
#endif

void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;

#ifdef CONFIG_NUMA_EMU
	if (numa_fake && !numa_emulation(start_pfn, end_pfn))
		return;
#endif

#ifdef CONFIG_ACPI_NUMA
	if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
					  end_pfn << PAGE_SHIFT))
		return;
#endif

#ifdef CONFIG_K8_NUMA
	if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
		return;
#endif
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");

	printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
	       start_pfn << PAGE_SHIFT,
	       end_pfn << PAGE_SHIFT);
	/* Set up a dummy node covering all memory */
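	/*
	 * With a shift of 63 every physical address hashes to bucket 0, and
	 * memnodemap[0] = 0 sends that bucket to node 0, so all of memory is
	 * treated as one node.
	 */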
	memnode_shift = 63;
	memnodemap[0] = 0;
	nodes_clear(node_online_map);
	node_set_online(0);
	for (i = 0; i < NR_CPUS; i++)
		cpu_to_node[i] = 0;
	node_to_cpumask[0] = cpumask_of_cpu(0);
	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
}

__cpuinit void numa_add_cpu(int cpu)
{
	/* The boot processor (BP) is initialized elsewhere */
	if (cpu)
		set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
}

unsigned long __init numa_free_all_bootmem(void)
{
	int i;
	unsigned long pages = 0;
	for_each_online_node(i) {
		pages += free_all_bootmem_node(NODE_DATA(i));
	}
	return pages;
}

void __init paging_init(void)
{
	int i;
	for_each_online_node(i) {
		setup_node_zones(i);
	}
}

/*
 * Parse the "numa=" boot option: numa=off, numa=fake=<N> (CONFIG_NUMA_EMU),
 * numa=noacpi (CONFIG_ACPI_NUMA).
 */
__init int numa_setup(char *opt)
{
	if (!strncmp(opt, "off", 3))
		numa_off = 1;
#ifdef CONFIG_NUMA_EMU
	if (!strncmp(opt, "fake=", 5)) {
		numa_fake = simple_strtoul(opt + 5, NULL, 0);
		if (numa_fake >= MAX_NUMNODES)
			numa_fake = MAX_NUMNODES;
	}
#endif
#ifdef CONFIG_ACPI_NUMA
	if (!strncmp(opt, "noacpi", 6))
		acpi_numa = -1;
#endif
	return 1;
}

EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode_shift);
EXPORT_SYMBOL(memnodemap);
EXPORT_SYMBOL(node_data);