blob: 5b15186298938850ec4e90ea8e025f7e6a4ceeb2 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/init.h>
9#include <linux/bootmem.h>
10#include <linux/mmzone.h>
11#include <linux/ctype.h>
12#include <linux/module.h>
13#include <linux/nodemask.h>
14
15#include <asm/e820.h>
16#include <asm/proto.h>
17#include <asm/dma.h>
18#include <asm/numa.h>
19#include <asm/acpi.h>
20
21#ifndef Dprintk
22#define Dprintk(x...)
23#endif
24
Ravikiran G Thirumalai6c231b72005-09-06 15:17:45 -070025struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -070026bootmem_data_t plat_node_bdata[MAX_NUMNODES];
27
28int memnode_shift;
29u8 memnodemap[NODEMAPSIZE];
30
Ravikiran G Thirumalai6c231b72005-09-06 15:17:45 -070031unsigned char cpu_to_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
Andi Kleen0b07e982005-09-12 18:49:24 +020032unsigned char apicid_to_node[256] __cpuinitdata = {
33 [0 ... NR_CPUS-1] = NUMA_NO_NODE
34};
Ravikiran G Thirumalai6c231b72005-09-06 15:17:45 -070035cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -070036
37int numa_off __initdata;
38
39int __init compute_hash_shift(struct node *nodes, int numnodes)
40{
41 int i;
Keith Manntheyb6846642005-07-28 21:15:38 -070042 int shift = 20;
43 unsigned long addr,maxend=0;
Linus Torvalds1da177e2005-04-16 15:20:36 -070044
Keith Manntheyb6846642005-07-28 21:15:38 -070045 for (i = 0; i < numnodes; i++)
46 if ((nodes[i].start != nodes[i].end) && (nodes[i].end > maxend))
47 maxend = nodes[i].end;
48
49 while ((1UL << shift) < (maxend / NODEMAPSIZE))
50 shift++;
51
52 printk (KERN_DEBUG"Using %d for the hash shift. Max adder is %lx \n",
53 shift,maxend);
54 memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE);
55 for (i = 0; i < numnodes; i++) {
56 if (nodes[i].start == nodes[i].end)
57 continue;
58 for (addr = nodes[i].start;
59 addr < nodes[i].end;
60 addr += (1UL << shift)) {
61 if (memnodemap[addr >> shift] != 0xff) {
62 printk(KERN_INFO
63 "Your memory is not aligned you need to rebuild your kernel "
64 "with a bigger NODEMAPSIZE shift=%d adder=%lu\n",
65 shift,addr);
66 return -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -070067 }
Keith Manntheyb6846642005-07-28 21:15:38 -070068 memnodemap[addr >> shift] = i;
Linus Torvalds1da177e2005-04-16 15:20:36 -070069 }
Linus Torvalds1da177e2005-04-16 15:20:36 -070070 }
Keith Manntheyb6846642005-07-28 21:15:38 -070071 return shift;
Linus Torvalds1da177e2005-04-16 15:20:36 -070072}
73
#ifdef CONFIG_SPARSEMEM
/* Return phys_to_nid() of the physical address that page frame @pfn starts at. */
int early_pfn_to_nid(unsigned long pfn)
{
	unsigned long phys_addr = pfn << PAGE_SHIFT;

	return phys_to_nid(phys_addr);
}
#endif
80
Linus Torvalds1da177e2005-04-16 15:20:36 -070081/* Initialize bootmem allocator for a node */
82void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
83{
84 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
85 unsigned long nodedata_phys;
86 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
87
88 start = round_up(start, ZONE_ALIGN);
89
90 printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
91
92 start_pfn = start >> PAGE_SHIFT;
93 end_pfn = end >> PAGE_SHIFT;
94
Matt Tolentinobbfceef2005-06-23 00:08:07 -070095 memory_present(nodeid, start_pfn, end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -070096 nodedata_phys = find_e820_area(start, end, pgdat_size);
97 if (nodedata_phys == -1L)
98 panic("Cannot find memory pgdat in node %d\n", nodeid);
99
100 Dprintk("nodedata_phys %lx\n", nodedata_phys);
101
102 node_data[nodeid] = phys_to_virt(nodedata_phys);
103 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
104 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
105 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
106 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
107
108 /* Find a place for the bootmem map */
109 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
110 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
111 bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
112 if (bootmap_start == -1L)
113 panic("Not enough continuous space for bootmap on node %d", nodeid);
114 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
115
116 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
117 bootmap_start >> PAGE_SHIFT,
118 start_pfn, end_pfn);
119
120 e820_bootmem_free(NODE_DATA(nodeid), start, end);
121
122 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
123 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
124 node_set_online(nodeid);
125}
126
127/* Initialize final allocator for a zone */
128void __init setup_node_zones(int nodeid)
129{
130 unsigned long start_pfn, end_pfn;
131 unsigned long zones[MAX_NR_ZONES];
Andi Kleen485761b2005-08-26 18:34:10 -0700132 unsigned long holes[MAX_NR_ZONES];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700133 unsigned long dma_end_pfn;
134
135 memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES);
Andi Kleen485761b2005-08-26 18:34:10 -0700136 memset(holes, 0, sizeof(unsigned long) * MAX_NR_ZONES);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700137
138 start_pfn = node_start_pfn(nodeid);
139 end_pfn = node_end_pfn(nodeid);
140
141 Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn);
142
143 /* All nodes > 0 have a zero length zone DMA */
144 dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT;
145 if (start_pfn < dma_end_pfn) {
146 zones[ZONE_DMA] = dma_end_pfn - start_pfn;
Andi Kleen485761b2005-08-26 18:34:10 -0700147 holes[ZONE_DMA] = e820_hole_size(start_pfn, dma_end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700148 zones[ZONE_NORMAL] = end_pfn - dma_end_pfn;
Andi Kleen485761b2005-08-26 18:34:10 -0700149 holes[ZONE_NORMAL] = e820_hole_size(dma_end_pfn, end_pfn);
150
Linus Torvalds1da177e2005-04-16 15:20:36 -0700151 } else {
152 zones[ZONE_NORMAL] = end_pfn - start_pfn;
Andi Kleen485761b2005-08-26 18:34:10 -0700153 holes[ZONE_NORMAL] = e820_hole_size(start_pfn, end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700154 }
155
156 free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
Andi Kleen485761b2005-08-26 18:34:10 -0700157 start_pfn, holes);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700158}
159
160void __init numa_init_array(void)
161{
162 int rr, i;
163 /* There are unfortunately some poorly designed mainboards around
164 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
165 mapping. To avoid this fill in the mapping for all possible
166 CPUs, as the number of CPUs is not known yet.
167 We round robin the existing nodes. */
168 rr = 0;
169 for (i = 0; i < NR_CPUS; i++) {
170 if (cpu_to_node[i] != NUMA_NO_NODE)
171 continue;
172 rr = next_node(rr, node_online_map);
173 if (rr == MAX_NUMNODES)
174 rr = first_node(node_online_map);
175 cpu_to_node[i] = rr;
176 rr++;
177 }
178
179 set_bit(0, &node_to_cpumask[cpu_to_node(0)]);
180}
181
#ifdef CONFIG_NUMA_EMU
/* Number of fake nodes requested with "numa=fake=N"; 0 means no emulation. */
int numa_fake __initdata = 0;

/*
 * Numa emulation: split [start_pfn, end_pfn) into numa_fake fake nodes.
 *
 * The caller must only invoke this with numa_fake != 0 (it is used as a
 * divisor).  Marked __init because it reads __initdata and only calls
 * __init functions; it is itself only called from numa_initmem_init().
 *
 * Returns 0 on success, -1 if no hash shift could be found for the layout.
 */
static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;
	struct node nodes[MAX_NUMNODES];
	unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;

	/* Kludge needed for the hash function */
	if (hweight64(sz) > 1) {
		/* Round the per-node size down to a power of two */
		unsigned long x = 1;
		while ((x << 1) < sz)
			x <<= 1;
		if (x < sz/2)
			printk("Numa emulation unbalanced. Complain to maintainer\n");
		sz = x;
	}

	memset(&nodes,0,sizeof(nodes));
	for (i = 0; i < numa_fake; i++) {
		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
		/* The last node takes whatever is left up to end_pfn */
		if (i == numa_fake-1)
			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
		nodes[i].end = nodes[i].start + sz;
		if (i != numa_fake-1)
			nodes[i].end--;
		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
		       i,
		       nodes[i].start, nodes[i].end,
		       (nodes[i].end - nodes[i].start) >> 20);
		node_set_online(i);
	}
	memnode_shift = compute_hash_shift(nodes, numa_fake);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
		return -1;
	}
	for_each_online_node(i)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	numa_init_array();
	return 0;
}
#endif
228
229void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
230{
231 int i;
232
233#ifdef CONFIG_NUMA_EMU
234 if (numa_fake && !numa_emulation(start_pfn, end_pfn))
235 return;
236#endif
237
238#ifdef CONFIG_ACPI_NUMA
239 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
240 end_pfn << PAGE_SHIFT))
241 return;
242#endif
243
244#ifdef CONFIG_K8_NUMA
245 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
246 return;
247#endif
248 printk(KERN_INFO "%s\n",
249 numa_off ? "NUMA turned off" : "No NUMA configuration found");
250
251 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
252 start_pfn << PAGE_SHIFT,
253 end_pfn << PAGE_SHIFT);
254 /* setup dummy node covering all memory */
255 memnode_shift = 63;
256 memnodemap[0] = 0;
257 nodes_clear(node_online_map);
258 node_set_online(0);
259 for (i = 0; i < NR_CPUS; i++)
260 cpu_to_node[i] = 0;
261 node_to_cpumask[0] = cpumask_of_cpu(0);
262 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
263}
264
Ashok Raje6982c62005-06-25 14:54:58 -0700265__cpuinit void numa_add_cpu(int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700266{
267 /* BP is initialized elsewhere */
268 if (cpu)
269 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
270}
271
272unsigned long __init numa_free_all_bootmem(void)
273{
274 int i;
275 unsigned long pages = 0;
276 for_each_online_node(i) {
277 pages += free_all_bootmem_node(NODE_DATA(i));
278 }
279 return pages;
280}
281
282void __init paging_init(void)
283{
284 int i;
285 for_each_online_node(i) {
286 setup_node_zones(i);
287 }
288}
289
290/* [numa=off] */
291__init int numa_setup(char *opt)
292{
293 if (!strncmp(opt,"off",3))
294 numa_off = 1;
295#ifdef CONFIG_NUMA_EMU
296 if(!strncmp(opt, "fake=", 5)) {
297 numa_fake = simple_strtoul(opt+5,NULL,0); ;
298 if (numa_fake >= MAX_NUMNODES)
299 numa_fake = MAX_NUMNODES;
300 }
301#endif
302#ifdef CONFIG_ACPI_NUMA
303 if (!strncmp(opt,"noacpi",6))
304 acpi_numa = -1;
305#endif
306 return 1;
307}
308
309EXPORT_SYMBOL(cpu_to_node);
310EXPORT_SYMBOL(node_to_cpumask);
311EXPORT_SYMBOL(memnode_shift);
312EXPORT_SYMBOL(memnodemap);
313EXPORT_SYMBOL(node_data);