blob: 4bf64583ba3b696deeedb61fd049f1876cae7e20 [file] [log] [blame]
/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/init.h>
9#include <linux/bootmem.h>
10#include <linux/mmzone.h>
11#include <linux/ctype.h>
12#include <linux/module.h>
13#include <linux/nodemask.h>
14
15#include <asm/e820.h>
16#include <asm/proto.h>
17#include <asm/dma.h>
18#include <asm/numa.h>
19#include <asm/acpi.h>
20
21#ifndef Dprintk
22#define Dprintk(x...)
23#endif
24
Ravikiran G Thirumalai6c231b72005-09-06 15:17:45 -070025struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -070026bootmem_data_t plat_node_bdata[MAX_NUMNODES];
27
28int memnode_shift;
29u8 memnodemap[NODEMAPSIZE];
30
Andi Kleen3f098c22005-09-12 18:49:24 +020031unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
32 [0 ... NR_CPUS-1] = NUMA_NO_NODE
Andi Kleen0b07e982005-09-12 18:49:24 +020033};
Andi Kleen3f098c22005-09-12 18:49:24 +020034unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
35 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
36};
37cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -070038
39int numa_off __initdata;
40
41int __init compute_hash_shift(struct node *nodes, int numnodes)
42{
43 int i;
Keith Manntheyb6846642005-07-28 21:15:38 -070044 int shift = 20;
45 unsigned long addr,maxend=0;
Linus Torvalds1da177e2005-04-16 15:20:36 -070046
Keith Manntheyb6846642005-07-28 21:15:38 -070047 for (i = 0; i < numnodes; i++)
48 if ((nodes[i].start != nodes[i].end) && (nodes[i].end > maxend))
49 maxend = nodes[i].end;
50
51 while ((1UL << shift) < (maxend / NODEMAPSIZE))
52 shift++;
53
54 printk (KERN_DEBUG"Using %d for the hash shift. Max adder is %lx \n",
55 shift,maxend);
56 memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE);
57 for (i = 0; i < numnodes; i++) {
58 if (nodes[i].start == nodes[i].end)
59 continue;
60 for (addr = nodes[i].start;
61 addr < nodes[i].end;
62 addr += (1UL << shift)) {
63 if (memnodemap[addr >> shift] != 0xff) {
64 printk(KERN_INFO
65 "Your memory is not aligned you need to rebuild your kernel "
66 "with a bigger NODEMAPSIZE shift=%d adder=%lu\n",
67 shift,addr);
68 return -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -070069 }
Keith Manntheyb6846642005-07-28 21:15:38 -070070 memnodemap[addr >> shift] = i;
Linus Torvalds1da177e2005-04-16 15:20:36 -070071 }
Linus Torvalds1da177e2005-04-16 15:20:36 -070072 }
Keith Manntheyb6846642005-07-28 21:15:38 -070073 return shift;
Linus Torvalds1da177e2005-04-16 15:20:36 -070074}
75
#ifdef CONFIG_SPARSEMEM
/* Return the node for a page frame by looking up the frame's address. */
int early_pfn_to_nid(unsigned long pfn)
{
	unsigned long phys = pfn << PAGE_SHIFT;

	return phys_to_nid(phys);
}
#endif
82
Linus Torvalds1da177e2005-04-16 15:20:36 -070083/* Initialize bootmem allocator for a node */
84void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
85{
86 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
87 unsigned long nodedata_phys;
88 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
89
90 start = round_up(start, ZONE_ALIGN);
91
92 printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
93
94 start_pfn = start >> PAGE_SHIFT;
95 end_pfn = end >> PAGE_SHIFT;
96
Matt Tolentinobbfceef2005-06-23 00:08:07 -070097 memory_present(nodeid, start_pfn, end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -070098 nodedata_phys = find_e820_area(start, end, pgdat_size);
99 if (nodedata_phys == -1L)
100 panic("Cannot find memory pgdat in node %d\n", nodeid);
101
102 Dprintk("nodedata_phys %lx\n", nodedata_phys);
103
104 node_data[nodeid] = phys_to_virt(nodedata_phys);
105 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
106 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
107 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
108 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
109
110 /* Find a place for the bootmem map */
111 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
112 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
113 bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
114 if (bootmap_start == -1L)
115 panic("Not enough continuous space for bootmap on node %d", nodeid);
116 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
117
118 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
119 bootmap_start >> PAGE_SHIFT,
120 start_pfn, end_pfn);
121
122 e820_bootmem_free(NODE_DATA(nodeid), start, end);
123
124 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
125 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
126 node_set_online(nodeid);
127}
128
129/* Initialize final allocator for a zone */
130void __init setup_node_zones(int nodeid)
131{
132 unsigned long start_pfn, end_pfn;
133 unsigned long zones[MAX_NR_ZONES];
Andi Kleen485761b2005-08-26 18:34:10 -0700134 unsigned long holes[MAX_NR_ZONES];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700135
Andi Kleena2f1b422005-11-05 17:25:53 +0100136 start_pfn = node_start_pfn(nodeid);
137 end_pfn = node_end_pfn(nodeid);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700138
Andi Kleena2f1b422005-11-05 17:25:53 +0100139 Dprintk(KERN_INFO "setting up node %d %lx-%lx\n",
140 nodeid, start_pfn, end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700141
Andi Kleena2f1b422005-11-05 17:25:53 +0100142 size_zones(zones, holes, start_pfn, end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700143 free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
Andi Kleen485761b2005-08-26 18:34:10 -0700144 start_pfn, holes);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700145}
146
147void __init numa_init_array(void)
148{
149 int rr, i;
150 /* There are unfortunately some poorly designed mainboards around
151 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
152 mapping. To avoid this fill in the mapping for all possible
153 CPUs, as the number of CPUs is not known yet.
154 We round robin the existing nodes. */
Ravikiran G Thirumalai85cc5132005-09-30 11:59:22 -0700155 rr = first_node(node_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700156 for (i = 0; i < NR_CPUS; i++) {
157 if (cpu_to_node[i] != NUMA_NO_NODE)
158 continue;
Andi Kleen69d81fc2005-11-05 17:25:53 +0100159 numa_set_node(i, rr);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700160 rr = next_node(rr, node_online_map);
161 if (rr == MAX_NUMNODES)
162 rr = first_node(node_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700163 }
164
Linus Torvalds1da177e2005-04-16 15:20:36 -0700165}
166
#ifdef CONFIG_NUMA_EMU
/* Number of nodes to emulate; set by the "fake=" option in numa_setup(). */
int numa_fake __initdata = 0;

/*
 * Numa emulation.
 *
 * Splits the range start_pfn..end_pfn into numa_fake fake nodes, builds the
 * address hash for them and sets up bootmem on every online node.
 *
 * Marked __init because it reads __initdata (numa_fake) and calls __init
 * functions; without the annotation it would stay resident after boot while
 * pointing into discarded init sections.
 *
 * Returns 0 on success, -1 when no hash shift could be found (in which case
 * memnode_shift is reset to 0).
 */
static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;
	struct node nodes[MAX_NUMNODES];
	unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;

	/* Kludge needed for the hash function: use a power-of-two node size */
	if (hweight64(sz) > 1) {
		unsigned long x = 1;
		while ((x << 1) < sz)
			x <<= 1;
		if (x < sz/2)
			printk("Numa emulation unbalanced. Complain to maintainer\n");
		sz = x;
	}

	memset(&nodes,0,sizeof(nodes));
	for (i = 0; i < numa_fake; i++) {
		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
		/* The last node takes everything up to end_pfn. */
		if (i == numa_fake-1)
			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
		nodes[i].end = nodes[i].start + sz;
		if (i != numa_fake-1)
			nodes[i].end--;
		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
		       i,
		       nodes[i].start, nodes[i].end,
		       (nodes[i].end - nodes[i].start) >> 20);
		node_set_online(i);
	}
	memnode_shift = compute_hash_shift(nodes, numa_fake);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
		return -1;
	}
	for_each_online_node(i)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	numa_init_array();
	return 0;
}
#endif
213
214void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
215{
216 int i;
217
218#ifdef CONFIG_NUMA_EMU
219 if (numa_fake && !numa_emulation(start_pfn, end_pfn))
220 return;
221#endif
222
223#ifdef CONFIG_ACPI_NUMA
224 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
225 end_pfn << PAGE_SHIFT))
226 return;
227#endif
228
229#ifdef CONFIG_K8_NUMA
230 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
231 return;
232#endif
233 printk(KERN_INFO "%s\n",
234 numa_off ? "NUMA turned off" : "No NUMA configuration found");
235
236 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
237 start_pfn << PAGE_SHIFT,
238 end_pfn << PAGE_SHIFT);
239 /* setup dummy node covering all memory */
240 memnode_shift = 63;
241 memnodemap[0] = 0;
242 nodes_clear(node_online_map);
243 node_set_online(0);
244 for (i = 0; i < NR_CPUS; i++)
Andi Kleen69d81fc2005-11-05 17:25:53 +0100245 numa_set_node(i, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700246 node_to_cpumask[0] = cpumask_of_cpu(0);
247 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
248}
249
Ashok Raje6982c62005-06-25 14:54:58 -0700250__cpuinit void numa_add_cpu(int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700251{
Ravikiran G Thirumalaie6a045a2005-09-30 11:59:21 -0700252 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700253}
254
Andi Kleen69d81fc2005-11-05 17:25:53 +0100255void __cpuinit numa_set_node(int cpu, int node)
256{
257 cpu_pda[cpu].nodenumber = node;
258 cpu_to_node[cpu] = node;
259}
260
Linus Torvalds1da177e2005-04-16 15:20:36 -0700261unsigned long __init numa_free_all_bootmem(void)
262{
263 int i;
264 unsigned long pages = 0;
265 for_each_online_node(i) {
266 pages += free_all_bootmem_node(NODE_DATA(i));
267 }
268 return pages;
269}
270
271void __init paging_init(void)
272{
273 int i;
274 for_each_online_node(i) {
275 setup_node_zones(i);
276 }
277}
278
279/* [numa=off] */
280__init int numa_setup(char *opt)
281{
282 if (!strncmp(opt,"off",3))
283 numa_off = 1;
284#ifdef CONFIG_NUMA_EMU
285 if(!strncmp(opt, "fake=", 5)) {
286 numa_fake = simple_strtoul(opt+5,NULL,0); ;
287 if (numa_fake >= MAX_NUMNODES)
288 numa_fake = MAX_NUMNODES;
289 }
290#endif
291#ifdef CONFIG_ACPI_NUMA
292 if (!strncmp(opt,"noacpi",6))
293 acpi_numa = -1;
294#endif
295 return 1;
296}
297
/* NUMA lookup tables exported for use by modules. */
EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode_shift);
EXPORT_SYMBOL(memnodemap);
EXPORT_SYMBOL(node_data);