blob: 68ad758535102a364fd77d0776bd56f12994fdac [file] [log] [blame]
/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/init.h>
9#include <linux/bootmem.h>
10#include <linux/mmzone.h>
11#include <linux/ctype.h>
12#include <linux/module.h>
13#include <linux/nodemask.h>
14
15#include <asm/e820.h>
16#include <asm/proto.h>
17#include <asm/dma.h>
18#include <asm/numa.h>
19#include <asm/acpi.h>
20
21#ifndef Dprintk
22#define Dprintk(x...)
23#endif
24
Ravikiran G Thirumalai6c231b72005-09-06 15:17:45 -070025struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -070026bootmem_data_t plat_node_bdata[MAX_NUMNODES];
27
28int memnode_shift;
29u8 memnodemap[NODEMAPSIZE];
30
Andi Kleen3f098c22005-09-12 18:49:24 +020031unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
32 [0 ... NR_CPUS-1] = NUMA_NO_NODE
Andi Kleen0b07e982005-09-12 18:49:24 +020033};
Andi Kleen3f098c22005-09-12 18:49:24 +020034unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
35 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
36};
37cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -070038
39int numa_off __initdata;
40
41int __init compute_hash_shift(struct node *nodes, int numnodes)
42{
43 int i;
Keith Manntheyb6846642005-07-28 21:15:38 -070044 int shift = 20;
45 unsigned long addr,maxend=0;
Linus Torvalds1da177e2005-04-16 15:20:36 -070046
Keith Manntheyb6846642005-07-28 21:15:38 -070047 for (i = 0; i < numnodes; i++)
48 if ((nodes[i].start != nodes[i].end) && (nodes[i].end > maxend))
49 maxend = nodes[i].end;
50
51 while ((1UL << shift) < (maxend / NODEMAPSIZE))
52 shift++;
53
54 printk (KERN_DEBUG"Using %d for the hash shift. Max adder is %lx \n",
55 shift,maxend);
56 memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE);
57 for (i = 0; i < numnodes; i++) {
58 if (nodes[i].start == nodes[i].end)
59 continue;
60 for (addr = nodes[i].start;
61 addr < nodes[i].end;
62 addr += (1UL << shift)) {
63 if (memnodemap[addr >> shift] != 0xff) {
64 printk(KERN_INFO
65 "Your memory is not aligned you need to rebuild your kernel "
66 "with a bigger NODEMAPSIZE shift=%d adder=%lu\n",
67 shift,addr);
68 return -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -070069 }
Keith Manntheyb6846642005-07-28 21:15:38 -070070 memnodemap[addr >> shift] = i;
Linus Torvalds1da177e2005-04-16 15:20:36 -070071 }
Linus Torvalds1da177e2005-04-16 15:20:36 -070072 }
Keith Manntheyb6846642005-07-28 21:15:38 -070073 return shift;
Linus Torvalds1da177e2005-04-16 15:20:36 -070074}
75
#ifdef CONFIG_SPARSEMEM
/*
 * Map a page frame number to a node id: the pfn is turned into a physical
 * address and handed to phys_to_nid().
 */
int early_pfn_to_nid(unsigned long pfn)
{
	unsigned long phys = pfn << PAGE_SHIFT;

	return phys_to_nid(phys);
}
#endif
82
Linus Torvalds1da177e2005-04-16 15:20:36 -070083/* Initialize bootmem allocator for a node */
84void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
85{
86 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
87 unsigned long nodedata_phys;
88 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
89
90 start = round_up(start, ZONE_ALIGN);
91
92 printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
93
94 start_pfn = start >> PAGE_SHIFT;
95 end_pfn = end >> PAGE_SHIFT;
96
Matt Tolentinobbfceef2005-06-23 00:08:07 -070097 memory_present(nodeid, start_pfn, end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -070098 nodedata_phys = find_e820_area(start, end, pgdat_size);
99 if (nodedata_phys == -1L)
100 panic("Cannot find memory pgdat in node %d\n", nodeid);
101
102 Dprintk("nodedata_phys %lx\n", nodedata_phys);
103
104 node_data[nodeid] = phys_to_virt(nodedata_phys);
105 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
106 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
107 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
108 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
109
110 /* Find a place for the bootmem map */
111 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
112 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
113 bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
114 if (bootmap_start == -1L)
115 panic("Not enough continuous space for bootmap on node %d", nodeid);
116 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
117
118 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
119 bootmap_start >> PAGE_SHIFT,
120 start_pfn, end_pfn);
121
122 e820_bootmem_free(NODE_DATA(nodeid), start, end);
123
124 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
125 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
126 node_set_online(nodeid);
127}
128
129/* Initialize final allocator for a zone */
130void __init setup_node_zones(int nodeid)
131{
132 unsigned long start_pfn, end_pfn;
133 unsigned long zones[MAX_NR_ZONES];
Andi Kleen485761b2005-08-26 18:34:10 -0700134 unsigned long holes[MAX_NR_ZONES];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700135 unsigned long dma_end_pfn;
136
137 memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES);
Andi Kleen485761b2005-08-26 18:34:10 -0700138 memset(holes, 0, sizeof(unsigned long) * MAX_NR_ZONES);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700139
140 start_pfn = node_start_pfn(nodeid);
141 end_pfn = node_end_pfn(nodeid);
142
143 Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn);
144
145 /* All nodes > 0 have a zero length zone DMA */
146 dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT;
147 if (start_pfn < dma_end_pfn) {
148 zones[ZONE_DMA] = dma_end_pfn - start_pfn;
Andi Kleen485761b2005-08-26 18:34:10 -0700149 holes[ZONE_DMA] = e820_hole_size(start_pfn, dma_end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700150 zones[ZONE_NORMAL] = end_pfn - dma_end_pfn;
Andi Kleen485761b2005-08-26 18:34:10 -0700151 holes[ZONE_NORMAL] = e820_hole_size(dma_end_pfn, end_pfn);
152
Linus Torvalds1da177e2005-04-16 15:20:36 -0700153 } else {
154 zones[ZONE_NORMAL] = end_pfn - start_pfn;
Andi Kleen485761b2005-08-26 18:34:10 -0700155 holes[ZONE_NORMAL] = e820_hole_size(start_pfn, end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700156 }
157
158 free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
Andi Kleen485761b2005-08-26 18:34:10 -0700159 start_pfn, holes);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700160}
161
162void __init numa_init_array(void)
163{
164 int rr, i;
165 /* There are unfortunately some poorly designed mainboards around
166 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
167 mapping. To avoid this fill in the mapping for all possible
168 CPUs, as the number of CPUs is not known yet.
169 We round robin the existing nodes. */
170 rr = 0;
171 for (i = 0; i < NR_CPUS; i++) {
172 if (cpu_to_node[i] != NUMA_NO_NODE)
173 continue;
174 rr = next_node(rr, node_online_map);
175 if (rr == MAX_NUMNODES)
176 rr = first_node(node_online_map);
177 cpu_to_node[i] = rr;
178 rr++;
179 }
180
Linus Torvalds1da177e2005-04-16 15:20:36 -0700181}
182
#ifdef CONFIG_NUMA_EMU
/* Number of fake nodes requested with "numa=fake=N"; 0 means no emulation. */
int numa_fake __initdata = 0;

/*
 * NUMA emulation: carve [start_pfn, end_pfn) into numa_fake fake nodes.
 *
 * Each node gets the same size, rounded down to a power of two when the
 * even split is not one already; the last node takes whatever remains up
 * to end_pfn.  The caller must ensure numa_fake is non-zero.
 *
 * Marked __init because it reads __initdata and calls __init helpers, and
 * is itself only called from numa_initmem_init().
 *
 * Returns 0 on success, -1 if no hash shift could be found for the layout.
 */
static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;
	struct node nodes[MAX_NUMNODES];
	unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;

	/* Kludge needed for the hash function */
	if (hweight64(sz) > 1) {
		unsigned long x = 1;

		/* Largest power of two strictly below sz. */
		while ((x << 1) < sz)
			x <<= 1;
		if (x < sz/2)
			printk("Numa emulation unbalanced. Complain to maintainer\n");
		sz = x;
	}

	memset(&nodes,0,sizeof(nodes));
	for (i = 0; i < numa_fake; i++) {
		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
		/* The last node absorbs everything left over by the rounding. */
		if (i == numa_fake-1)
			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
		nodes[i].end = nodes[i].start + sz;
		/* NOTE(review): non-final nodes end one byte early; reason not visible here. */
		if (i != numa_fake-1)
			nodes[i].end--;
		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
		       i,
		       nodes[i].start, nodes[i].end,
		       (nodes[i].end - nodes[i].start) >> 20);
		node_set_online(i);
	}
	memnode_shift = compute_hash_shift(nodes, numa_fake);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
		return -1;
	}
	for_each_online_node(i)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	numa_init_array();
	return 0;
}
#endif
229
230void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
231{
232 int i;
233
234#ifdef CONFIG_NUMA_EMU
235 if (numa_fake && !numa_emulation(start_pfn, end_pfn))
236 return;
237#endif
238
239#ifdef CONFIG_ACPI_NUMA
240 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
241 end_pfn << PAGE_SHIFT))
242 return;
243#endif
244
245#ifdef CONFIG_K8_NUMA
246 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
247 return;
248#endif
249 printk(KERN_INFO "%s\n",
250 numa_off ? "NUMA turned off" : "No NUMA configuration found");
251
252 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
253 start_pfn << PAGE_SHIFT,
254 end_pfn << PAGE_SHIFT);
255 /* setup dummy node covering all memory */
256 memnode_shift = 63;
257 memnodemap[0] = 0;
258 nodes_clear(node_online_map);
259 node_set_online(0);
260 for (i = 0; i < NR_CPUS; i++)
261 cpu_to_node[i] = 0;
262 node_to_cpumask[0] = cpumask_of_cpu(0);
263 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
264}
265
Ashok Raje6982c62005-06-25 14:54:58 -0700266__cpuinit void numa_add_cpu(int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700267{
Ravikiran G Thirumalaie6a045a2005-09-30 11:59:21 -0700268 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700269}
270
271unsigned long __init numa_free_all_bootmem(void)
272{
273 int i;
274 unsigned long pages = 0;
275 for_each_online_node(i) {
276 pages += free_all_bootmem_node(NODE_DATA(i));
277 }
278 return pages;
279}
280
281void __init paging_init(void)
282{
283 int i;
284 for_each_online_node(i) {
285 setup_node_zones(i);
286 }
287}
288
289/* [numa=off] */
290__init int numa_setup(char *opt)
291{
292 if (!strncmp(opt,"off",3))
293 numa_off = 1;
294#ifdef CONFIG_NUMA_EMU
295 if(!strncmp(opt, "fake=", 5)) {
296 numa_fake = simple_strtoul(opt+5,NULL,0); ;
297 if (numa_fake >= MAX_NUMNODES)
298 numa_fake = MAX_NUMNODES;
299 }
300#endif
301#ifdef CONFIG_ACPI_NUMA
302 if (!strncmp(opt,"noacpi",6))
303 acpi_numa = -1;
304#endif
305 return 1;
306}
307
/* NUMA lookup tables exported via EXPORT_SYMBOL. */
EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode_shift);
EXPORT_SYMBOL(memnodemap);
EXPORT_SYMBOL(node_data);