blob: 177659050fa0ab852da9d96a90bbddf8bbba407a
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * pSeries NUMA support
3 *
4 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
Nishanth Aravamudan2d73bae2014-10-10 09:04:49 -070011#define pr_fmt(fmt) "numa: " fmt
12
Linus Torvalds1da177e2005-04-16 15:20:36 -070013#include <linux/threads.h>
14#include <linux/bootmem.h>
15#include <linux/init.h>
16#include <linux/mm.h>
17#include <linux/mmzone.h>
Paul Gortmaker4b16f8e2011-07-22 18:24:23 -040018#include <linux/export.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070019#include <linux/nodemask.h>
20#include <linux/cpu.h>
21#include <linux/notifier.h>
Yinghai Lu95f72d12010-07-12 14:36:09 +100022#include <linux/memblock.h>
Michael Ellerman6df16462008-02-14 11:37:49 +110023#include <linux/of.h>
Dave Hansen06eccea2009-02-12 12:36:04 +000024#include <linux/pfn.h>
Jesse Larrew9eff1a32010-12-01 12:31:15 +000025#include <linux/cpuset.h>
26#include <linux/node.h>
Nathan Fontenot30c05352013-04-24 06:02:13 +000027#include <linux/stop_machine.h>
Nathan Fontenote04fa612013-04-24 06:07:39 +000028#include <linux/proc_fs.h>
29#include <linux/seq_file.h>
30#include <linux/uaccess.h>
Linus Torvalds191a7122013-04-29 19:14:20 -070031#include <linux/slab.h>
Robert Jennings3be7db62013-07-24 20:13:21 -050032#include <asm/cputhreads.h>
Anton Blanchard45fb6ce2005-11-11 14:22:35 +110033#include <asm/sparsemem.h>
David S. Millerd9b2b2a2008-02-13 16:56:49 -080034#include <asm/prom.h>
Paul Mackerras2249ca92005-11-07 13:18:13 +110035#include <asm/smp.h>
Srivatsa S. Bhatd4edc5b2013-12-30 17:05:34 +053036#include <asm/cputhreads.h>
37#include <asm/topology.h>
Jesse Larrew9eff1a32010-12-01 12:31:15 +000038#include <asm/firmware.h>
39#include <asm/paca.h>
Jesse Larrew39bf9902010-12-17 22:07:47 +000040#include <asm/hvcall.h>
David Howellsae3a1972012-03-28 18:30:02 +010041#include <asm/setup.h>
Jesse Larrew176bbf12013-04-24 06:03:48 +000042#include <asm/vdso.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070043
44static int numa_enabled = 1;
45
Balbir Singh1daa6d02008-02-01 15:57:31 +110046static char *cmdline __initdata;
47
Linus Torvalds1da177e2005-04-16 15:20:36 -070048static int numa_debug;
49#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }
50
Anton Blanchard45fb6ce2005-11-11 14:22:35 +110051int numa_cpu_lookup_table[NR_CPUS];
Anton Blanchard25863de2010-04-26 15:32:43 +000052cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
Linus Torvalds1da177e2005-04-16 15:20:36 -070053struct pglist_data *node_data[MAX_NUMNODES];
Anton Blanchard45fb6ce2005-11-11 14:22:35 +110054
55EXPORT_SYMBOL(numa_cpu_lookup_table);
Anton Blanchard25863de2010-04-26 15:32:43 +000056EXPORT_SYMBOL(node_to_cpumask_map);
Anton Blanchard45fb6ce2005-11-11 14:22:35 +110057EXPORT_SYMBOL(node_data);
58
Linus Torvalds1da177e2005-04-16 15:20:36 -070059static int min_common_depth;
Mike Kravetz237a09892005-12-05 12:06:42 -080060static int n_mem_addr_cells, n_mem_size_cells;
Anton Blanchard41eab6f2010-05-16 20:22:31 +000061static int form1_affinity;
62
63#define MAX_DISTANCE_REF_POINTS 4
64static int distance_ref_points_depth;
Alistair Poppleb08a2a12013-08-07 02:01:44 +100065static const __be32 *distance_ref_points;
Anton Blanchard41eab6f2010-05-16 20:22:31 +000066static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
Linus Torvalds1da177e2005-04-16 15:20:36 -070067
Anton Blanchard25863de2010-04-26 15:32:43 +000068/*
69 * Allocate node_to_cpumask_map based on number of available nodes
70 * Requires node_possible_map to be valid.
71 *
Wanlong Gao95129382012-01-12 17:20:09 -080072 * Note: cpumask_of_node() is not valid until after this is done.
Anton Blanchard25863de2010-04-26 15:32:43 +000073 */
74static void __init setup_node_to_cpumask_map(void)
75{
Cody P Schaferf9d531b2013-04-29 15:08:03 -070076 unsigned int node;
Anton Blanchard25863de2010-04-26 15:32:43 +000077
78 /* setup nr_node_ids if not done yet */
Cody P Schaferf9d531b2013-04-29 15:08:03 -070079 if (nr_node_ids == MAX_NUMNODES)
80 setup_nr_node_ids();
Anton Blanchard25863de2010-04-26 15:32:43 +000081
82 /* allocate the map */
83 for (node = 0; node < nr_node_ids; node++)
84 alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
85
86 /* cpumask_of_node() will now work */
87 dbg("Node to cpumask map for %d nodes\n", nr_node_ids);
88}
89
Stephen Rothwell55671f32013-03-25 18:44:44 +000090static int __init fake_numa_create_new_node(unsigned long end_pfn,
Balbir Singh1daa6d02008-02-01 15:57:31 +110091 unsigned int *nid)
92{
93 unsigned long long mem;
94 char *p = cmdline;
95 static unsigned int fake_nid;
96 static unsigned long long curr_boundary;
97
98 /*
 99 * Modify node id, iff we started creating NUMA nodes.
 100 * We want to continue from where we left off the last time.
101 */
102 if (fake_nid)
103 *nid = fake_nid;
104 /*
105 * In case there are no more arguments to parse, the
106 * node_id should be the same as the last fake node id
107 * (we've handled this above).
108 */
109 if (!p)
110 return 0;
111
112 mem = memparse(p, &p);
113 if (!mem)
114 return 0;
115
116 if (mem < curr_boundary)
117 return 0;
118
119 curr_boundary = mem;
120
121 if ((end_pfn << PAGE_SHIFT) > mem) {
122 /*
123 * Skip commas and spaces
124 */
125 while (*p == ',' || *p == ' ' || *p == '\t')
126 p++;
127
128 cmdline = p;
129 fake_nid++;
130 *nid = fake_nid;
131 dbg("created new fake_node with id %d\n", fake_nid);
132 return 1;
133 }
134 return 0;
135}
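/*
 * For illustration (hypothetical command line): booting with
 * "numa=fake=1G,4G" makes early_numa() point cmdline at "1G,4G".  Each
 * comma-separated value is parsed by memparse() as a cumulative upper
 * boundary, so memory below 1G stays in fake node 0, memory from 1G up
 * to 4G lands in fake node 1, and anything above the last boundary
 * remains in the most recently created fake node.
 */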
136
Jon Tollefson8f64e1f2008-10-09 10:18:40 +0000137/*
Tejun Heo5dfe8662011-07-14 09:46:10 +0200138 * get_node_active_region - Return active region containing pfn
Jon Tollefsone8170372008-10-16 18:59:43 +0000139 * Active range returned is empty if none found.
Tejun Heo5dfe8662011-07-14 09:46:10 +0200140 * @pfn: The page to return the region for
141 * @node_ar: Returned set to the active region containing @pfn
Jon Tollefson8f64e1f2008-10-09 10:18:40 +0000142 */
Tejun Heo5dfe8662011-07-14 09:46:10 +0200143static void __init get_node_active_region(unsigned long pfn,
144 struct node_active_region *node_ar)
Jon Tollefson8f64e1f2008-10-09 10:18:40 +0000145{
Tejun Heo5dfe8662011-07-14 09:46:10 +0200146 unsigned long start_pfn, end_pfn;
147 int i, nid;
Jon Tollefson8f64e1f2008-10-09 10:18:40 +0000148
Tejun Heo5dfe8662011-07-14 09:46:10 +0200149 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
150 if (pfn >= start_pfn && pfn < end_pfn) {
151 node_ar->nid = nid;
152 node_ar->start_pfn = start_pfn;
153 node_ar->end_pfn = end_pfn;
154 break;
155 }
156 }
Jon Tollefson8f64e1f2008-10-09 10:18:40 +0000157}
158
Srivatsa S. Bhatd4edc5b2013-12-30 17:05:34 +0530159static void reset_numa_cpu_lookup_table(void)
160{
161 unsigned int cpu;
162
163 for_each_possible_cpu(cpu)
164 numa_cpu_lookup_table[cpu] = -1;
165}
166
167static void update_numa_cpu_lookup_table(unsigned int cpu, int node)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700168{
169 numa_cpu_lookup_table[cpu] = node;
Srivatsa S. Bhatd4edc5b2013-12-30 17:05:34 +0530170}
171
172static void map_cpu_to_node(int cpu, int node)
173{
174 update_numa_cpu_lookup_table(cpu, node);
Anton Blanchard45fb6ce2005-11-11 14:22:35 +1100175
Nathan Lynchbf4b85b2006-03-20 18:34:45 -0600176 dbg("adding cpu %d to node %d\n", cpu, node);
177
Anton Blanchard25863de2010-04-26 15:32:43 +0000178 if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node])))
179 cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700180}
181
Jesse Larrew39bf9902010-12-17 22:07:47 +0000182#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700183static void unmap_cpu_from_node(unsigned long cpu)
184{
185 int node = numa_cpu_lookup_table[cpu];
186
187 dbg("removing cpu %lu from node %d\n", cpu, node);
188
Anton Blanchard25863de2010-04-26 15:32:43 +0000189 if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) {
Anton Blanchard429f4d82011-01-29 12:37:16 +0000190 cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700191 } else {
192 printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
193 cpu, node);
194 }
195}
Jesse Larrew39bf9902010-12-17 22:07:47 +0000196#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700197
Linus Torvalds1da177e2005-04-16 15:20:36 -0700198/* must hold reference to node during call */
Alistair Poppleb08a2a12013-08-07 02:01:44 +1000199static const __be32 *of_get_associativity(struct device_node *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700200{
Stephen Rothwelle2eb6392007-04-03 22:26:41 +1000201 return of_get_property(dev, "ibm,associativity", NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700202}
203
Chandrucf000852008-08-30 00:28:16 +1000204/*
205 * Returns the property linux,drconf-usable-memory if
206 * it exists (the property exists only in kexec/kdump kernels,
207 * added by kexec-tools)
208 */
Alistair Poppleb08a2a12013-08-07 02:01:44 +1000209static const __be32 *of_get_usable_memory(struct device_node *memory)
Chandrucf000852008-08-30 00:28:16 +1000210{
Alistair Poppleb08a2a12013-08-07 02:01:44 +1000211 const __be32 *prop;
Chandrucf000852008-08-30 00:28:16 +1000212 u32 len;
213 prop = of_get_property(memory, "linux,drconf-usable-memory", &len);
214 if (!prop || len < sizeof(unsigned int))
Robert Jenningsec32dd62013-10-28 09:20:50 -0500215 return NULL;
Chandrucf000852008-08-30 00:28:16 +1000216 return prop;
217}
218
Anton Blanchard41eab6f2010-05-16 20:22:31 +0000219int __node_distance(int a, int b)
220{
221 int i;
222 int distance = LOCAL_DISTANCE;
223
224 if (!form1_affinity)
Vaidyanathan Srinivasan7122bee2013-03-22 05:49:35 +0000225 return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
Anton Blanchard41eab6f2010-05-16 20:22:31 +0000226
227 for (i = 0; i < distance_ref_points_depth; i++) {
228 if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
229 break;
230
231 /* Double the distance for each NUMA level */
232 distance *= 2;
233 }
234
235 return distance;
236}
Mike Qiu12c743e2014-04-18 15:07:14 -0700237EXPORT_SYMBOL(__node_distance);
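/*
 * Worked example (hypothetical nodes): with form 1 affinity the loop
 * above starts from LOCAL_DISTANCE (10) and doubles once for every
 * leading reference level at which the two nodes' associativity values
 * differ.  If nodes A and B first match at the third level (i == 2),
 * the distance returned is 10 * 2 * 2 = 40.
 */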
Anton Blanchard41eab6f2010-05-16 20:22:31 +0000238
239static void initialize_distance_lookup_table(int nid,
Alistair Poppleb08a2a12013-08-07 02:01:44 +1000240 const __be32 *associativity)
Anton Blanchard41eab6f2010-05-16 20:22:31 +0000241{
242 int i;
243
244 if (!form1_affinity)
245 return;
246
247 for (i = 0; i < distance_ref_points_depth; i++) {
Alistair Poppleb08a2a12013-08-07 02:01:44 +1000248 const __be32 *entry;
249
250 entry = &associativity[be32_to_cpu(distance_ref_points[i])];
251 distance_lookup_table[nid][i] = of_read_number(entry, 1);
Anton Blanchard41eab6f2010-05-16 20:22:31 +0000252 }
253}
254
Nathan Lynch482ec7c2006-03-20 18:36:45 -0600255/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
256 * info is found.
257 */
Alistair Poppleb08a2a12013-08-07 02:01:44 +1000258static int associativity_to_nid(const __be32 *associativity)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700259{
Nathan Lynch482ec7c2006-03-20 18:36:45 -0600260 int nid = -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700261
262 if (min_common_depth == -1)
Nathan Lynch482ec7c2006-03-20 18:36:45 -0600263 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700264
Alistair Poppleb08a2a12013-08-07 02:01:44 +1000265 if (of_read_number(associativity, 1) >= min_common_depth)
266 nid = of_read_number(&associativity[min_common_depth], 1);
Nathan Lynchbc16a752006-03-20 18:36:15 -0600267
268 /* POWER4 LPAR uses 0xffff as invalid node */
Nathan Lynch482ec7c2006-03-20 18:36:45 -0600269 if (nid == 0xffff || nid >= MAX_NUMNODES)
270 nid = -1;
Anton Blanchard41eab6f2010-05-16 20:22:31 +0000271
Alistair Poppleb08a2a12013-08-07 02:01:44 +1000272 if (nid > 0 &&
273 of_read_number(associativity, 1) >= distance_ref_points_depth)
Jesse Larrew9eff1a32010-12-01 12:31:15 +0000274 initialize_distance_lookup_table(nid, associativity);
Anton Blanchard41eab6f2010-05-16 20:22:31 +0000275
Nathan Lynch482ec7c2006-03-20 18:36:45 -0600276out:
Nathan Lynchcf950b72006-03-20 18:35:45 -0600277 return nid;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700278}
279
Jesse Larrew9eff1a32010-12-01 12:31:15 +0000280/* Returns the nid associated with the given device tree node,
281 * or -1 if not found.
282 */
283static int of_node_to_nid_single(struct device_node *device)
284{
285 int nid = -1;
Alistair Poppleb08a2a12013-08-07 02:01:44 +1000286 const __be32 *tmp;
Jesse Larrew9eff1a32010-12-01 12:31:15 +0000287
288 tmp = of_get_associativity(device);
289 if (tmp)
290 nid = associativity_to_nid(tmp);
291 return nid;
292}
293
Jeremy Kerr953039c2006-05-01 12:16:12 -0700294/* Walk the device tree upwards, looking for an associativity id */
295int of_node_to_nid(struct device_node *device)
296{
297 struct device_node *tmp;
298 int nid = -1;
299
300 of_node_get(device);
301 while (device) {
302 nid = of_node_to_nid_single(device);
303 if (nid != -1)
304 break;
305
306 tmp = device;
307 device = of_get_parent(tmp);
308 of_node_put(tmp);
309 }
310 of_node_put(device);
311
312 return nid;
313}
314EXPORT_SYMBOL_GPL(of_node_to_nid);
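/*
 * Hypothetical example: an ibm,associativity property of <4 0 0 0 1>
 * describes four associativity levels (the first cell is the count).
 * With min_common_depth == 4, associativity_to_nid() reads cell index 4
 * and returns nid 1; a value of 0xffff or one >= MAX_NUMNODES there is
 * treated as "no useful node" and -1 is returned instead.
 */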
315
Linus Torvalds1da177e2005-04-16 15:20:36 -0700316static int __init find_min_common_depth(void)
317{
Anton Blanchard41eab6f2010-05-16 20:22:31 +0000318 int depth;
Michael Ellermane70606e2011-04-10 20:42:05 +0000319 struct device_node *root;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700320
Dipankar Sarma1c8ee732011-10-28 04:25:32 +0000321 if (firmware_has_feature(FW_FEATURE_OPAL))
322 root = of_find_node_by_path("/ibm,opal");
323 else
324 root = of_find_node_by_path("/rtas");
Michael Ellermane70606e2011-04-10 20:42:05 +0000325 if (!root)
326 root = of_find_node_by_path("/");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700327
328 /*
Anton Blanchard41eab6f2010-05-16 20:22:31 +0000329 * This property is a set of 32-bit integers, each representing
330 * an index into the ibm,associativity nodes.
331 *
332 * With form 0 affinity the first integer is for an SMP configuration
333 * (should be all 0's) and the second is for a normal NUMA
334 * configuration. We have only one level of NUMA.
335 *
336 * With form 1 affinity the first integer is the most significant
337 * NUMA boundary and the following are progressively less significant
338 * boundaries. There can be more than one level of NUMA.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700339 */
Michael Ellermane70606e2011-04-10 20:42:05 +0000340 distance_ref_points = of_get_property(root,
Anton Blanchard41eab6f2010-05-16 20:22:31 +0000341 "ibm,associativity-reference-points",
342 &distance_ref_points_depth);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700343
Anton Blanchard41eab6f2010-05-16 20:22:31 +0000344 if (!distance_ref_points) {
345 dbg("NUMA: ibm,associativity-reference-points not found.\n");
346 goto err;
347 }
348
349 distance_ref_points_depth /= sizeof(int);
350
Nathan Fontenot8002b0c2013-04-24 05:58:23 +0000351 if (firmware_has_feature(FW_FEATURE_OPAL) ||
352 firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
353 dbg("Using form 1 affinity\n");
Dipankar Sarma1c8ee732011-10-28 04:25:32 +0000354 form1_affinity = 1;
Anton Blanchard4b83c332010-04-07 15:33:44 +0000355 }
356
Anton Blanchard41eab6f2010-05-16 20:22:31 +0000357 if (form1_affinity) {
Alistair Poppleb08a2a12013-08-07 02:01:44 +1000358 depth = of_read_number(distance_ref_points, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700359 } else {
Anton Blanchard41eab6f2010-05-16 20:22:31 +0000360 if (distance_ref_points_depth < 2) {
361 printk(KERN_WARNING "NUMA: "
362 "short ibm,associativity-reference-points\n");
363 goto err;
364 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700365
Alistair Poppleb08a2a12013-08-07 02:01:44 +1000366 depth = of_read_number(&distance_ref_points[1], 1);
Anton Blanchard41eab6f2010-05-16 20:22:31 +0000367 }
368
369 /*
370 * Warn and cap if the hardware supports more than
371 * MAX_DISTANCE_REF_POINTS domains.
372 */
373 if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
374 printk(KERN_WARNING "NUMA: distance array capped at "
375 "%d entries\n", MAX_DISTANCE_REF_POINTS);
376 distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
377 }
378
Michael Ellermane70606e2011-04-10 20:42:05 +0000379 of_node_put(root);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700380 return depth;
Anton Blanchard41eab6f2010-05-16 20:22:31 +0000381
382err:
Michael Ellermane70606e2011-04-10 20:42:05 +0000383 of_node_put(root);
Anton Blanchard41eab6f2010-05-16 20:22:31 +0000384 return -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700385}
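/*
 * Hypothetical example: given an ibm,associativity-reference-points
 * property of <0x4 0x6>, a form 1 platform takes the first (most
 * significant) entry and this function returns a depth of 4, while a
 * form 0 platform uses the second entry and would return 6.
 */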
386
Mike Kravetz84c9fdd2005-11-30 13:47:23 -0800387static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700388{
389 struct device_node *memory = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700390
391 memory = of_find_node_by_type(memory, "memory");
Paul Mackerras54c23312005-12-05 15:50:39 +1100392 if (!memory)
Mike Kravetz84c9fdd2005-11-30 13:47:23 -0800393 panic("numa.c: No memory nodes found!");
Paul Mackerras54c23312005-12-05 15:50:39 +1100394
Stephen Rothwella8bda5d2007-04-03 10:56:50 +1000395 *n_addr_cells = of_n_addr_cells(memory);
Stephen Rothwell9213fee2007-04-03 10:57:48 +1000396 *n_size_cells = of_n_size_cells(memory);
Mike Kravetz84c9fdd2005-11-30 13:47:23 -0800397 of_node_put(memory);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700398}
399
Alistair Poppleb08a2a12013-08-07 02:01:44 +1000400static unsigned long read_n_cells(int n, const __be32 **buf)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700401{
402 unsigned long result = 0;
403
404 while (n--) {
Alistair Poppleb08a2a12013-08-07 02:01:44 +1000405 result = (result << 32) | of_read_number(*buf, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700406 (*buf)++;
407 }
408 return result;
409}
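/*
 * Worked example (hypothetical cells): with n == 2 and the cells
 * <0x00000001 0x00000000>, read_n_cells() shifts the first cell left by
 * 32 bits and ORs in the second, returning 0x100000000 (4 GB).
 */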
410
Nathan Fontenot83426812008-07-03 13:35:54 +1000411/*
Yinghai Lu95f72d12010-07-12 14:36:09 +1000412 * Read the next memblock list entry from the ibm,dynamic-memory property
Nathan Fontenot83426812008-07-03 13:35:54 +1000413 * and return the information in the provided of_drconf_cell structure.
414 */
Alistair Poppleb08a2a12013-08-07 02:01:44 +1000415static void read_drconf_cell(struct of_drconf_cell *drmem, const __be32 **cellp)
Nathan Fontenot83426812008-07-03 13:35:54 +1000416{
Alistair Poppleb08a2a12013-08-07 02:01:44 +1000417 const __be32 *cp;
Nathan Fontenot83426812008-07-03 13:35:54 +1000418
419 drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp);
420
421 cp = *cellp;
Alistair Poppleb08a2a12013-08-07 02:01:44 +1000422 drmem->drc_index = of_read_number(cp, 1);
423 drmem->reserved = of_read_number(&cp[1], 1);
424 drmem->aa_index = of_read_number(&cp[2], 1);
425 drmem->flags = of_read_number(&cp[3], 1);
Nathan Fontenot83426812008-07-03 13:35:54 +1000426
427 *cellp = cp + 4;
428}
429
430/*
Lucas De Marchi25985ed2011-03-30 22:57:33 -0300431 * Retrieve and validate the ibm,dynamic-memory property of the device tree.
Nathan Fontenot83426812008-07-03 13:35:54 +1000432 *
Yinghai Lu95f72d12010-07-12 14:36:09 +1000433 * The layout of the ibm,dynamic-memory property is a count N of memblock
 434 * list entries followed by the N entries themselves. Each memblock list entry
Lucas De Marchi25985ed2011-03-30 22:57:33 -0300435 * contains information as laid out in the of_drconf_cell struct above.
Nathan Fontenot83426812008-07-03 13:35:54 +1000436 */
Alistair Poppleb08a2a12013-08-07 02:01:44 +1000437static int of_get_drconf_memory(struct device_node *memory, const __be32 **dm)
Nathan Fontenot83426812008-07-03 13:35:54 +1000438{
Alistair Poppleb08a2a12013-08-07 02:01:44 +1000439 const __be32 *prop;
Nathan Fontenot83426812008-07-03 13:35:54 +1000440 u32 len, entries;
441
442 prop = of_get_property(memory, "ibm,dynamic-memory", &len);
443 if (!prop || len < sizeof(unsigned int))
444 return 0;
445
Alistair Poppleb08a2a12013-08-07 02:01:44 +1000446 entries = of_read_number(prop++, 1);
Nathan Fontenot83426812008-07-03 13:35:54 +1000447
448 /* Now that we know the number of entries, revalidate the size
449 * of the property read in to ensure we have everything
450 */
451 if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int))
452 return 0;
453
454 *dm = prop;
455 return entries;
456}
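/*
 * Illustrative layout (assuming n_mem_addr_cells == 2): the
 * ibm,dynamic-memory property is then
 *
 *   < N  base_hi base_lo drc_index reserved aa_index flags  ... >
 *
 * i.e. one count cell followed by N entries of (2 + 4) cells each,
 * which is what the length check above verifies before *dm is returned.
 */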
457
458/*
Lucas De Marchi25985ed2011-03-30 22:57:33 -0300459 * Retrieve and validate the ibm,lmb-size property for drconf memory
Nathan Fontenot83426812008-07-03 13:35:54 +1000460 * from the device tree.
461 */
Benjamin Herrenschmidt3fdfd992010-07-23 10:35:52 +1000462static u64 of_get_lmb_size(struct device_node *memory)
Nathan Fontenot83426812008-07-03 13:35:54 +1000463{
Alistair Poppleb08a2a12013-08-07 02:01:44 +1000464 const __be32 *prop;
Nathan Fontenot83426812008-07-03 13:35:54 +1000465 u32 len;
466
Benjamin Herrenschmidt3fdfd992010-07-23 10:35:52 +1000467 prop = of_get_property(memory, "ibm,lmb-size", &len);
Nathan Fontenot83426812008-07-03 13:35:54 +1000468 if (!prop || len < sizeof(unsigned int))
469 return 0;
470
471 return read_n_cells(n_mem_size_cells, &prop);
472}
473
474struct assoc_arrays {
475 u32 n_arrays;
476 u32 array_sz;
Alistair Poppleb08a2a12013-08-07 02:01:44 +1000477 const __be32 *arrays;
Nathan Fontenot83426812008-07-03 13:35:54 +1000478};
479
480/*
Lucas De Marchi25985ed2011-03-30 22:57:33 -0300481 * Retrieve and validate the list of associativity arrays for drconf
Nathan Fontenot83426812008-07-03 13:35:54 +1000482 * memory from the ibm,associativity-lookup-arrays property of the
 483 * device tree.
484 *
485 * The layout of the ibm,associativity-lookup-arrays property is a number N
486 * indicating the number of associativity arrays, followed by a number M
487 * indicating the size of each associativity array, followed by a list
488 * of N associativity arrays.
489 */
490static int of_get_assoc_arrays(struct device_node *memory,
491 struct assoc_arrays *aa)
492{
Alistair Poppleb08a2a12013-08-07 02:01:44 +1000493 const __be32 *prop;
Nathan Fontenot83426812008-07-03 13:35:54 +1000494 u32 len;
495
496 prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
497 if (!prop || len < 2 * sizeof(unsigned int))
498 return -1;
499
Alistair Poppleb08a2a12013-08-07 02:01:44 +1000500 aa->n_arrays = of_read_number(prop++, 1);
501 aa->array_sz = of_read_number(prop++, 1);
Nathan Fontenot83426812008-07-03 13:35:54 +1000502
Justin P. Mattock42b2aa82011-11-28 20:31:00 -0800503 /* Now that we know the number of arrays and size of each array,
Nathan Fontenot83426812008-07-03 13:35:54 +1000504 * revalidate the size of the property read in.
505 */
506 if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
507 return -1;
508
509 aa->arrays = prop;
510 return 0;
511}
512
513/*
514 * This is like of_node_to_nid_single() for memory represented in the
515 * ibm,dynamic-reconfiguration-memory node.
516 */
517static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
518 struct assoc_arrays *aa)
519{
520 int default_nid = 0;
521 int nid = default_nid;
522 int index;
523
524 if (min_common_depth > 0 && min_common_depth <= aa->array_sz &&
525 !(drmem->flags & DRCONF_MEM_AI_INVALID) &&
526 drmem->aa_index < aa->n_arrays) {
527 index = drmem->aa_index * aa->array_sz + min_common_depth - 1;
Alistair Poppleb08a2a12013-08-07 02:01:44 +1000528 nid = of_read_number(&aa->arrays[index], 1);
Nathan Fontenot83426812008-07-03 13:35:54 +1000529
530 if (nid == 0xffff || nid >= MAX_NUMNODES)
531 nid = default_nid;
532 }
533
534 return nid;
535}
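/*
 * Worked example (hypothetical values): with aa->array_sz == 5,
 * min_common_depth == 4 and an LMB whose aa_index is 2, the node id is
 * read from flat cell index 2 * 5 + 4 - 1 = 13 of the lookup-arrays
 * data, i.e. the fourth entry of the third per-LMB associativity array.
 */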
536
Linus Torvalds1da177e2005-04-16 15:20:36 -0700537/*
538 * Figure out to which domain a cpu belongs and stick it there.
539 * Return the id of the domain used.
540 */
Paul Gortmaker061d19f2013-06-24 15:30:09 -0400541static int numa_setup_cpu(unsigned long lcpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700542{
Li Zhong297cf502014-08-27 17:34:01 +0800543 int nid = -1;
Srivatsa S. Bhatd4edc5b2013-12-30 17:05:34 +0530544 struct device_node *cpu;
545
546 /*
547 * If a valid cpu-to-node mapping is already available, use it
548 * directly instead of querying the firmware, since it represents
549 * the most recent mapping notified to us by the platform (eg: VPHN).
550 */
551 if ((nid = numa_cpu_lookup_table[lcpu]) >= 0) {
552 map_cpu_to_node(lcpu, nid);
553 return nid;
554 }
555
556 cpu = of_get_cpu_node(lcpu, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700557
558 if (!cpu) {
559 WARN_ON(1);
Li Zhong297cf502014-08-27 17:34:01 +0800560 if (cpu_present(lcpu))
561 goto out_present;
562 else
563 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700564 }
565
Jeremy Kerr953039c2006-05-01 12:16:12 -0700566 nid = of_node_to_nid_single(cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700567
Li Zhong297cf502014-08-27 17:34:01 +0800568out_present:
Nathan Lynch482ec7c2006-03-20 18:36:45 -0600569 if (nid < 0 || !node_online(nid))
H Hartley Sweeten72c33682010-03-05 13:42:43 -0800570 nid = first_online_node;
Li Zhong297cf502014-08-27 17:34:01 +0800571
Nathan Lynchcf950b72006-03-20 18:35:45 -0600572 map_cpu_to_node(lcpu, nid);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700573 of_node_put(cpu);
Li Zhong297cf502014-08-27 17:34:01 +0800574out:
Nathan Lynchcf950b72006-03-20 18:35:45 -0600575 return nid;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700576}
577
Srivatsa S. Bhat68fb18aa2013-12-30 17:06:04 +0530578static void verify_cpu_node_mapping(int cpu, int node)
579{
580 int base, sibling, i;
581
582 /* Verify that all the threads in the core belong to the same node */
583 base = cpu_first_thread_sibling(cpu);
584
585 for (i = 0; i < threads_per_core; i++) {
586 sibling = base + i;
587
588 if (sibling == cpu || cpu_is_offline(sibling))
589 continue;
590
591 if (cpu_to_node(sibling) != node) {
592 WARN(1, "CPU thread siblings %d and %d don't belong"
593 " to the same node!\n", cpu, sibling);
594 break;
595 }
596 }
597}
598
Paul Gortmaker061d19f2013-06-24 15:30:09 -0400599static int cpu_numa_callback(struct notifier_block *nfb, unsigned long action,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700600 void *hcpu)
601{
602 unsigned long lcpu = (unsigned long)hcpu;
Srivatsa S. Bhat68fb18aa2013-12-30 17:06:04 +0530603 int ret = NOTIFY_DONE, nid;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700604
605 switch (action) {
606 case CPU_UP_PREPARE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -0700607 case CPU_UP_PREPARE_FROZEN:
Srivatsa S. Bhat68fb18aa2013-12-30 17:06:04 +0530608 nid = numa_setup_cpu(lcpu);
609 verify_cpu_node_mapping((int)lcpu, nid);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700610 ret = NOTIFY_OK;
611 break;
612#ifdef CONFIG_HOTPLUG_CPU
613 case CPU_DEAD:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -0700614 case CPU_DEAD_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700615 case CPU_UP_CANCELED:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -0700616 case CPU_UP_CANCELED_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700617 unmap_cpu_from_node(lcpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700618 ret = NOTIFY_OK;
Andrey Utkinb00fc6e2014-08-04 23:13:10 +0300619 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700620#endif
621 }
622 return ret;
623}
624
625/*
626 * Check and possibly modify a memory region to enforce the memory limit.
627 *
628 * Returns the size the region should have to enforce the memory limit.
629 * This will either be the original value of size, a truncated value,
630 * or zero. If the returned value of size is 0 the region should be
Lucas De Marchi25985ed2011-03-30 22:57:33 -0300631 * discarded as it lies wholly above the memory limit.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700632 */
Anton Blanchard45fb6ce2005-11-11 14:22:35 +1100633static unsigned long __init numa_enforce_memory_limit(unsigned long start,
634 unsigned long size)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700635{
636 /*
Yinghai Lu95f72d12010-07-12 14:36:09 +1000637 * We use memblock_end_of_DRAM() in here instead of memory_limit because
Linus Torvalds1da177e2005-04-16 15:20:36 -0700638 * we've already adjusted it for the limit and it takes care of
Milton Millerfe552492008-10-20 15:37:04 +0000639 * having memory holes below the limit. Also, in the case of
640 * iommu_is_off, memory_limit is not set but is implicitly enforced.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700641 */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700642
Yinghai Lu95f72d12010-07-12 14:36:09 +1000643 if (start + size <= memblock_end_of_DRAM())
Linus Torvalds1da177e2005-04-16 15:20:36 -0700644 return size;
645
Yinghai Lu95f72d12010-07-12 14:36:09 +1000646 if (start >= memblock_end_of_DRAM())
Linus Torvalds1da177e2005-04-16 15:20:36 -0700647 return 0;
648
Yinghai Lu95f72d12010-07-12 14:36:09 +1000649 return memblock_end_of_DRAM() - start;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700650}
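/*
 * Worked example (hypothetical values): if memblock_end_of_DRAM() is
 * 0x100000000 (4 GB) because of a mem= limit, a region starting at
 * 0xe0000000 with size 0x40000000 is trimmed to 0x20000000 (512 MB),
 * and a region starting at or above 4 GB is discarded (size 0).
 */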
651
Paul Mackerras02045682006-11-29 22:27:42 +1100652/*
Chandrucf000852008-08-30 00:28:16 +1000653 * Reads the counter for a given entry in
654 * linux,drconf-usable-memory property
655 */
Alistair Poppleb08a2a12013-08-07 02:01:44 +1000656static inline int __init read_usm_ranges(const __be32 **usm)
Chandrucf000852008-08-30 00:28:16 +1000657{
658 /*
Benjamin Herrenschmidt3fdfd992010-07-23 10:35:52 +1000659 * For each lmb in ibm,dynamic-memory a corresponding
Chandrucf000852008-08-30 00:28:16 +1000660 * entry in linux,drconf-usable-memory property contains
 661 * a counter followed by that many (base, size) duples.
 662 * Read the counter from linux,drconf-usable-memory.
663 */
664 return read_n_cells(n_mem_size_cells, usm);
665}
666
667/*
Paul Mackerras02045682006-11-29 22:27:42 +1100668 * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
669 * node. This assumes n_mem_{addr,size}_cells have been set.
670 */
671static void __init parse_drconf_memory(struct device_node *memory)
672{
Alistair Poppleb08a2a12013-08-07 02:01:44 +1000673 const __be32 *uninitialized_var(dm), *usm;
Chandrucf000852008-08-30 00:28:16 +1000674 unsigned int n, rc, ranges, is_kexec_kdump = 0;
Benjamin Herrenschmidt3fdfd992010-07-23 10:35:52 +1000675 unsigned long lmb_size, base, size, sz;
Nathan Fontenot83426812008-07-03 13:35:54 +1000676 int nid;
Benjamin Herrenschmidtaa709f32012-07-05 16:30:33 +0000677 struct assoc_arrays aa = { .arrays = NULL };
Paul Mackerras02045682006-11-29 22:27:42 +1100678
Nathan Fontenot83426812008-07-03 13:35:54 +1000679 n = of_get_drconf_memory(memory, &dm);
680 if (!n)
Paul Mackerras02045682006-11-29 22:27:42 +1100681 return;
682
Benjamin Herrenschmidt3fdfd992010-07-23 10:35:52 +1000683 lmb_size = of_get_lmb_size(memory);
684 if (!lmb_size)
Nathan Fontenot83426812008-07-03 13:35:54 +1000685 return;
686
687 rc = of_get_assoc_arrays(memory, &aa);
688 if (rc)
Paul Mackerras02045682006-11-29 22:27:42 +1100689 return;
690
Chandrucf000852008-08-30 00:28:16 +1000691 /* check if this is a kexec/kdump kernel */
692 usm = of_get_usable_memory(memory);
693 if (usm != NULL)
694 is_kexec_kdump = 1;
695
Paul Mackerras02045682006-11-29 22:27:42 +1100696 for (; n != 0; --n) {
Nathan Fontenot83426812008-07-03 13:35:54 +1000697 struct of_drconf_cell drmem;
Balbir Singh1daa6d02008-02-01 15:57:31 +1100698
Nathan Fontenot83426812008-07-03 13:35:54 +1000699 read_drconf_cell(&drmem, &dm);
700
701 /* skip this block if the reserved bit is set in flags (0x80)
702 or if the block is not assigned to this partition (0x8) */
703 if ((drmem.flags & DRCONF_MEM_RESERVED)
704 || !(drmem.flags & DRCONF_MEM_ASSIGNED))
705 continue;
706
Chandrucf000852008-08-30 00:28:16 +1000707 base = drmem.base_addr;
Benjamin Herrenschmidt3fdfd992010-07-23 10:35:52 +1000708 size = lmb_size;
Chandrucf000852008-08-30 00:28:16 +1000709 ranges = 1;
Nathan Fontenot83426812008-07-03 13:35:54 +1000710
Chandrucf000852008-08-30 00:28:16 +1000711 if (is_kexec_kdump) {
712 ranges = read_usm_ranges(&usm);
 713 if (!ranges) /* there are no (base, size) duples */
714 continue;
715 }
716 do {
717 if (is_kexec_kdump) {
718 base = read_n_cells(n_mem_addr_cells, &usm);
719 size = read_n_cells(n_mem_size_cells, &usm);
720 }
721 nid = of_drconf_to_nid_single(&drmem, &aa);
722 fake_numa_create_new_node(
723 ((base + size) >> PAGE_SHIFT),
Nathan Fontenot83426812008-07-03 13:35:54 +1000724 &nid);
Chandrucf000852008-08-30 00:28:16 +1000725 node_set_online(nid);
726 sz = numa_enforce_memory_limit(base, size);
727 if (sz)
Tang Chene7e8de52014-01-21 15:49:26 -0800728 memblock_set_node(base, sz,
729 &memblock.memory, nid);
Chandrucf000852008-08-30 00:28:16 +1000730 } while (--ranges);
Paul Mackerras02045682006-11-29 22:27:42 +1100731 }
732}
733
Linus Torvalds1da177e2005-04-16 15:20:36 -0700734static int __init parse_numa_properties(void)
735{
Anton Blanchard94db7c52011-08-10 20:44:22 +0000736 struct device_node *memory;
Nathan Lynch482ec7c2006-03-20 18:36:45 -0600737 int default_nid = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700738 unsigned long i;
739
740 if (numa_enabled == 0) {
741 printk(KERN_WARNING "NUMA disabled by user\n");
742 return -1;
743 }
744
Linus Torvalds1da177e2005-04-16 15:20:36 -0700745 min_common_depth = find_min_common_depth();
746
Linus Torvalds1da177e2005-04-16 15:20:36 -0700747 if (min_common_depth < 0)
748 return min_common_depth;
749
Nathan Lynchbf4b85b2006-03-20 18:34:45 -0600750 dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
751
Linus Torvalds1da177e2005-04-16 15:20:36 -0700752 /*
Nathan Lynch482ec7c2006-03-20 18:36:45 -0600753 * Even though we connect cpus to numa domains later in SMP
754 * init, we need to know the node ids now. This is because
755 * each node to be onlined must have NODE_DATA etc backing it.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700756 */
Nathan Lynch482ec7c2006-03-20 18:36:45 -0600757 for_each_present_cpu(i) {
Anton Blancharddfbe93a2011-08-10 20:44:23 +0000758 struct device_node *cpu;
Nathan Lynchcf950b72006-03-20 18:35:45 -0600759 int nid;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700760
Milton Miller8b16cd22009-01-08 02:19:45 +0000761 cpu = of_get_cpu_node(i, NULL);
Nathan Lynch482ec7c2006-03-20 18:36:45 -0600762 BUG_ON(!cpu);
Jeremy Kerr953039c2006-05-01 12:16:12 -0700763 nid = of_node_to_nid_single(cpu);
Nathan Lynch482ec7c2006-03-20 18:36:45 -0600764 of_node_put(cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700765
Nathan Lynch482ec7c2006-03-20 18:36:45 -0600766 /*
767 * Don't fall back to default_nid yet -- we will plug
768 * cpus into nodes once the memory scan has discovered
769 * the topology.
770 */
771 if (nid < 0)
772 continue;
773 node_set_online(nid);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700774 }
775
Mike Kravetz237a09892005-12-05 12:06:42 -0800776 get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
Anton Blanchard94db7c52011-08-10 20:44:22 +0000777
778 for_each_node_by_type(memory, "memory") {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700779 unsigned long start;
780 unsigned long size;
Nathan Lynchcf950b72006-03-20 18:35:45 -0600781 int nid;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700782 int ranges;
Alistair Poppleb08a2a12013-08-07 02:01:44 +1000783 const __be32 *memcell_buf;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700784 unsigned int len;
785
Stephen Rothwelle2eb6392007-04-03 22:26:41 +1000786 memcell_buf = of_get_property(memory,
Michael Ellermanba759482005-12-04 18:39:55 +1100787 "linux,usable-memory", &len);
788 if (!memcell_buf || len <= 0)
Stephen Rothwelle2eb6392007-04-03 22:26:41 +1000789 memcell_buf = of_get_property(memory, "reg", &len);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700790 if (!memcell_buf || len <= 0)
791 continue;
792
Benjamin Herrenschmidtcc5d0182005-12-13 18:01:21 +1100793 /* ranges in cell */
794 ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700795new_range:
796 /* these are order-sensitive, and modify the buffer pointer */
Mike Kravetz237a09892005-12-05 12:06:42 -0800797 start = read_n_cells(n_mem_addr_cells, &memcell_buf);
798 size = read_n_cells(n_mem_size_cells, &memcell_buf);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700799
Nathan Lynch482ec7c2006-03-20 18:36:45 -0600800 /*
801 * Assumption: either all memory nodes or none will
802 * have associativity properties. If none, then
803 * everything goes to default_nid.
804 */
Jeremy Kerr953039c2006-05-01 12:16:12 -0700805 nid = of_node_to_nid_single(memory);
Nathan Lynch482ec7c2006-03-20 18:36:45 -0600806 if (nid < 0)
807 nid = default_nid;
Balbir Singh1daa6d02008-02-01 15:57:31 +1100808
809 fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
Nathan Lynch482ec7c2006-03-20 18:36:45 -0600810 node_set_online(nid);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700811
Anton Blanchard45fb6ce2005-11-11 14:22:35 +1100812 if (!(size = numa_enforce_memory_limit(start, size))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700813 if (--ranges)
814 goto new_range;
815 else
816 continue;
817 }
818
Tang Chene7e8de52014-01-21 15:49:26 -0800819 memblock_set_node(start, size, &memblock.memory, nid);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700820
821 if (--ranges)
822 goto new_range;
823 }
824
Paul Mackerras02045682006-11-29 22:27:42 +1100825 /*
Anton Blancharddfbe93a2011-08-10 20:44:23 +0000826 * Now do the same thing for each MEMBLOCK listed in the
827 * ibm,dynamic-memory property in the
828 * ibm,dynamic-reconfiguration-memory node.
Paul Mackerras02045682006-11-29 22:27:42 +1100829 */
830 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
831 if (memory)
832 parse_drconf_memory(memory);
833
Linus Torvalds1da177e2005-04-16 15:20:36 -0700834 return 0;
835}
836
837static void __init setup_nonnuma(void)
838{
Yinghai Lu95f72d12010-07-12 14:36:09 +1000839 unsigned long top_of_ram = memblock_end_of_DRAM();
840 unsigned long total_ram = memblock_phys_mem_size();
Mel Gormanc67c3cb2006-09-27 01:49:49 -0700841 unsigned long start_pfn, end_pfn;
Benjamin Herrenschmidt28be7072010-08-04 13:43:53 +1000842 unsigned int nid = 0;
843 struct memblock_region *reg;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700844
Olof Johanssone110b282006-04-12 15:25:01 -0500845 printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -0700846 top_of_ram, total_ram);
Olof Johanssone110b282006-04-12 15:25:01 -0500847 printk(KERN_DEBUG "Memory hole size: %ldMB\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -0700848 (top_of_ram - total_ram) >> 20);
849
Benjamin Herrenschmidt28be7072010-08-04 13:43:53 +1000850 for_each_memblock(memory, reg) {
Yinghai Luc7fc2de2010-10-12 14:07:09 -0700851 start_pfn = memblock_region_memory_base_pfn(reg);
852 end_pfn = memblock_region_memory_end_pfn(reg);
Balbir Singh1daa6d02008-02-01 15:57:31 +1100853
854 fake_numa_create_new_node(end_pfn, &nid);
Tejun Heo1d7cfe12011-12-08 10:22:08 -0800855 memblock_set_node(PFN_PHYS(start_pfn),
Tang Chene7e8de52014-01-21 15:49:26 -0800856 PFN_PHYS(end_pfn - start_pfn),
857 &memblock.memory, nid);
Balbir Singh1daa6d02008-02-01 15:57:31 +1100858 node_set_online(nid);
Mel Gormanc67c3cb2006-09-27 01:49:49 -0700859 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700860}
861
Anton Blanchard4b703a22005-12-13 06:56:47 +1100862void __init dump_numa_cpu_topology(void)
863{
864 unsigned int node;
865 unsigned int cpu, count;
866
867 if (min_common_depth == -1 || !numa_enabled)
868 return;
869
870 for_each_online_node(node) {
Olof Johanssone110b282006-04-12 15:25:01 -0500871 printk(KERN_DEBUG "Node %d CPUs:", node);
Anton Blanchard4b703a22005-12-13 06:56:47 +1100872
873 count = 0;
874 /*
875 * If we used a CPU iterator here we would miss printing
876 * the holes in the cpumap.
877 */
Anton Blanchard25863de2010-04-26 15:32:43 +0000878 for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
879 if (cpumask_test_cpu(cpu,
880 node_to_cpumask_map[node])) {
Anton Blanchard4b703a22005-12-13 06:56:47 +1100881 if (count == 0)
882 printk(" %u", cpu);
883 ++count;
884 } else {
885 if (count > 1)
886 printk("-%u", cpu - 1);
887 count = 0;
888 }
889 }
890
891 if (count > 1)
Anton Blanchard25863de2010-04-26 15:32:43 +0000892 printk("-%u", nr_cpu_ids - 1);
Anton Blanchard4b703a22005-12-13 06:56:47 +1100893 printk("\n");
894 }
895}
896
897static void __init dump_numa_memory_topology(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700898{
899 unsigned int node;
900 unsigned int count;
901
902 if (min_common_depth == -1 || !numa_enabled)
903 return;
904
905 for_each_online_node(node) {
906 unsigned long i;
907
Olof Johanssone110b282006-04-12 15:25:01 -0500908 printk(KERN_DEBUG "Node %d Memory:", node);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700909
910 count = 0;
911
Yinghai Lu95f72d12010-07-12 14:36:09 +1000912 for (i = 0; i < memblock_end_of_DRAM();
Anton Blanchard45fb6ce2005-11-11 14:22:35 +1100913 i += (1 << SECTION_SIZE_BITS)) {
914 if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700915 if (count == 0)
916 printk(" 0x%lx", i);
917 ++count;
918 } else {
919 if (count > 0)
920 printk("-0x%lx", i);
921 count = 0;
922 }
923 }
924
925 if (count > 0)
926 printk("-0x%lx", i);
927 printk("\n");
928 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700929}
930
931/*
Yinghai Lu95f72d12010-07-12 14:36:09 +1000932 * Allocate some memory, satisfying the memblock or bootmem allocator where
Linus Torvalds1da177e2005-04-16 15:20:36 -0700933 * required. nid is the preferred node and end_pfn is the page frame number
 934 * of the highest address in the node.
935 *
Dave Hansen0be210f2008-12-09 08:21:35 +0000936 * Returns the virtual address of the memory.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700937 */
Dave Hansen893473d2008-12-09 08:21:36 +0000938static void __init *careful_zallocation(int nid, unsigned long size,
Anton Blanchard45fb6ce2005-11-11 14:22:35 +1100939 unsigned long align,
940 unsigned long end_pfn)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700941{
Dave Hansen0be210f2008-12-09 08:21:35 +0000942 void *ret;
Anton Blanchard45fb6ce2005-11-11 14:22:35 +1100943 int new_nid;
Dave Hansen0be210f2008-12-09 08:21:35 +0000944 unsigned long ret_paddr;
945
Yinghai Lu95f72d12010-07-12 14:36:09 +1000946 ret_paddr = __memblock_alloc_base(size, align, end_pfn << PAGE_SHIFT);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700947
948 /* retry over all memory */
Dave Hansen0be210f2008-12-09 08:21:35 +0000949 if (!ret_paddr)
Yinghai Lu95f72d12010-07-12 14:36:09 +1000950 ret_paddr = __memblock_alloc_base(size, align, memblock_end_of_DRAM());
Linus Torvalds1da177e2005-04-16 15:20:36 -0700951
Dave Hansen0be210f2008-12-09 08:21:35 +0000952 if (!ret_paddr)
Dave Hansen5d21ea22008-12-09 08:21:33 +0000953 panic("numa.c: cannot allocate %lu bytes for node %d",
Linus Torvalds1da177e2005-04-16 15:20:36 -0700954 size, nid);
955
Dave Hansen0be210f2008-12-09 08:21:35 +0000956 ret = __va(ret_paddr);
957
Linus Torvalds1da177e2005-04-16 15:20:36 -0700958 /*
Dave Hansenc555e5202008-12-09 08:21:32 +0000959 * We initialize the nodes in numeric order: 0, 1, 2...
Yinghai Lu95f72d12010-07-12 14:36:09 +1000960 * and hand over control from the MEMBLOCK allocator to the
Dave Hansenc555e5202008-12-09 08:21:32 +0000961 * bootmem allocator. If this function is called for
962 * node 5, then we know that all nodes <5 are using the
Yinghai Lu95f72d12010-07-12 14:36:09 +1000963 * bootmem allocator instead of the MEMBLOCK allocator.
Dave Hansenc555e5202008-12-09 08:21:32 +0000964 *
965 * So, check the nid from which this allocation came
966 * and double check to see if we need to use bootmem
Yinghai Lu95f72d12010-07-12 14:36:09 +1000967 * instead of the MEMBLOCK. We don't free the MEMBLOCK memory
Dave Hansenc555e5202008-12-09 08:21:32 +0000968 * since it would be useless.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700969 */
Dave Hansen0be210f2008-12-09 08:21:35 +0000970 new_nid = early_pfn_to_nid(ret_paddr >> PAGE_SHIFT);
Anton Blanchard45fb6ce2005-11-11 14:22:35 +1100971 if (new_nid < nid) {
Dave Hansen0be210f2008-12-09 08:21:35 +0000972 ret = __alloc_bootmem_node(NODE_DATA(new_nid),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700973 size, align, 0);
974
Dave Hansen0be210f2008-12-09 08:21:35 +0000975 dbg("alloc_bootmem %p %lx\n", ret, size);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700976 }
977
Dave Hansen893473d2008-12-09 08:21:36 +0000978 memset(ret, 0, size);
Dave Hansen0be210f2008-12-09 08:21:35 +0000979 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700980}
981
Paul Gortmaker061d19f2013-06-24 15:30:09 -0400982static struct notifier_block ppc64_numa_nb = {
Chandra Seetharaman74b85f32006-06-27 02:54:09 -0700983 .notifier_call = cpu_numa_callback,
984 .priority = 1 /* Must run before sched domains notifier. */
985};
986
David Rientjes28e86bd2011-12-08 12:33:29 +0000987static void __init mark_reserved_regions_for_nid(int nid)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700988{
Dave Hansen4a618662008-11-24 12:02:35 +0000989 struct pglist_data *node = NODE_DATA(nid);
Benjamin Herrenschmidt28be7072010-08-04 13:43:53 +1000990 struct memblock_region *reg;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700991
Benjamin Herrenschmidt28be7072010-08-04 13:43:53 +1000992 for_each_memblock(reserved, reg) {
993 unsigned long physbase = reg->base;
994 unsigned long size = reg->size;
Jon Tollefson8f64e1f2008-10-09 10:18:40 +0000995 unsigned long start_pfn = physbase >> PAGE_SHIFT;
Dave Hansen06eccea2009-02-12 12:36:04 +0000996 unsigned long end_pfn = PFN_UP(physbase + size);
Jon Tollefson8f64e1f2008-10-09 10:18:40 +0000997 struct node_active_region node_ar;
Xishi Qiu64080682013-11-12 15:07:17 -0800998 unsigned long node_end_pfn = pgdat_end_pfn(node);
Dave Hansen4a618662008-11-24 12:02:35 +0000999
1000 /*
Yinghai Lu95f72d12010-07-12 14:36:09 +10001001 * Check to make sure that this memblock.reserved area is
Dave Hansen4a618662008-11-24 12:02:35 +00001002 * within the bounds of the node that we care about.
1003 * Checking the nid of the start and end points is not
1004 * sufficient because the reserved area could span the
1005 * entire node.
1006 */
1007 if (end_pfn <= node->node_start_pfn ||
1008 start_pfn >= node_end_pfn)
1009 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001010
Jon Tollefson8f64e1f2008-10-09 10:18:40 +00001011 get_node_active_region(start_pfn, &node_ar);
Jon Tollefsone8170372008-10-16 18:59:43 +00001012 while (start_pfn < end_pfn &&
1013 node_ar.start_pfn < node_ar.end_pfn) {
1014 unsigned long reserve_size = size;
Jon Tollefson8f64e1f2008-10-09 10:18:40 +00001015 /*
1016 * if reserved region extends past active region
1017 * then trim size to active region
1018 */
1019 if (end_pfn > node_ar.end_pfn)
Jon Tollefsone8170372008-10-16 18:59:43 +00001020 reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
Dave Hansen06eccea2009-02-12 12:36:04 +00001021 - physbase;
Dave Hansena4c74dd2008-12-11 08:36:06 +00001022 /*
1023 * Only worry about *this* node, others may not
1024 * yet have valid NODE_DATA().
1025 */
1026 if (node_ar.nid == nid) {
1027 dbg("reserve_bootmem %lx %lx nid=%d\n",
1028 physbase, reserve_size, node_ar.nid);
1029 reserve_bootmem_node(NODE_DATA(node_ar.nid),
1030 physbase, reserve_size,
1031 BOOTMEM_DEFAULT);
1032 }
Jon Tollefson8f64e1f2008-10-09 10:18:40 +00001033 /*
1034 * if reserved region is contained in the active region
1035 * then done.
1036 */
1037 if (end_pfn <= node_ar.end_pfn)
1038 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001039
Jon Tollefson8f64e1f2008-10-09 10:18:40 +00001040 /*
1041 * reserved region extends past the active region
1042 * get next active region that contains this
1043 * reserved region
1044 */
1045 start_pfn = node_ar.end_pfn;
1046 physbase = start_pfn << PAGE_SHIFT;
Jon Tollefsone8170372008-10-16 18:59:43 +00001047 size = size - reserve_size;
Jon Tollefson8f64e1f2008-10-09 10:18:40 +00001048 get_node_active_region(start_pfn, &node_ar);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001049 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001050 }
Dave Hansen4a618662008-11-24 12:02:35 +00001051}
Jon Tollefson8f64e1f2008-10-09 10:18:40 +00001052
Dave Hansen4a618662008-11-24 12:02:35 +00001053
1054void __init do_init_bootmem(void)
1055{
Nishanth Aravamudan2fabf082014-07-17 16:15:12 -07001056 int nid, cpu;
Dave Hansen4a618662008-11-24 12:02:35 +00001057
1058 min_low_pfn = 0;
Yinghai Lu95f72d12010-07-12 14:36:09 +10001059 max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
Dave Hansen4a618662008-11-24 12:02:35 +00001060 max_pfn = max_low_pfn;
1061
1062 if (parse_numa_properties())
1063 setup_nonnuma();
1064 else
1065 dump_numa_memory_topology();
1066
Dave Hansen4a618662008-11-24 12:02:35 +00001067 for_each_online_node(nid) {
1068 unsigned long start_pfn, end_pfn;
Dave Hansen0be210f2008-12-09 08:21:35 +00001069 void *bootmem_vaddr;
Dave Hansen4a618662008-11-24 12:02:35 +00001070 unsigned long bootmap_pages;
1071
1072 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
1073
1074 /*
1075 * Allocate the node structure node local if possible
1076 *
1077 * Be careful moving this around, as it relies on all
1078 * previous nodes' bootmem to be initialized and have
1079 * all reserved areas marked.
1080 */
Dave Hansen893473d2008-12-09 08:21:36 +00001081 NODE_DATA(nid) = careful_zallocation(nid,
Dave Hansen4a618662008-11-24 12:02:35 +00001082 sizeof(struct pglist_data),
1083 SMP_CACHE_BYTES, end_pfn);
Dave Hansen4a618662008-11-24 12:02:35 +00001084
1085 dbg("node %d\n", nid);
1086 dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
1087
1088 NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
1089 NODE_DATA(nid)->node_start_pfn = start_pfn;
1090 NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
1091
1092 if (NODE_DATA(nid)->node_spanned_pages == 0)
1093 continue;
1094
1095 dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT);
1096 dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT);
1097
1098 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
Dave Hansen893473d2008-12-09 08:21:36 +00001099 bootmem_vaddr = careful_zallocation(nid,
Dave Hansen4a618662008-11-24 12:02:35 +00001100 bootmap_pages << PAGE_SHIFT,
1101 PAGE_SIZE, end_pfn);
Dave Hansen4a618662008-11-24 12:02:35 +00001102
Dave Hansen0be210f2008-12-09 08:21:35 +00001103 dbg("bootmap_vaddr = %p\n", bootmem_vaddr);
Dave Hansen4a618662008-11-24 12:02:35 +00001104
Dave Hansen0be210f2008-12-09 08:21:35 +00001105 init_bootmem_node(NODE_DATA(nid),
1106 __pa(bootmem_vaddr) >> PAGE_SHIFT,
Dave Hansen4a618662008-11-24 12:02:35 +00001107 start_pfn, end_pfn);
1108
1109 free_bootmem_with_active_regions(nid, end_pfn);
1110 /*
1111 * Be very careful about moving this around. Future
Dave Hansen893473d2008-12-09 08:21:36 +00001112 * calls to careful_zallocation() depend on this getting
Dave Hansen4a618662008-11-24 12:02:35 +00001113 * done correctly.
1114 */
1115 mark_reserved_regions_for_nid(nid);
Jon Tollefson8f64e1f2008-10-09 10:18:40 +00001116 sparse_memory_present_with_active_regions(nid);
Dave Hansen4a618662008-11-24 12:02:35 +00001117 }
Benjamin Herrenschmidtd3f62042009-06-02 21:16:38 +00001118
1119 init_bootmem_done = 1;
Anton Blanchard25863de2010-04-26 15:32:43 +00001120
1121 /*
1122 * Now bootmem is initialised we can create the node to cpumask
1123 * lookup tables and setup the cpu callback to populate them.
1124 */
1125 setup_node_to_cpumask_map();
1126
Srivatsa S. Bhatd4edc5b2013-12-30 17:05:34 +05301127 reset_numa_cpu_lookup_table();
Anton Blanchard25863de2010-04-26 15:32:43 +00001128 register_cpu_notifier(&ppc64_numa_nb);
Nishanth Aravamudan2fabf082014-07-17 16:15:12 -07001129 /*
1130 * We need the numa_cpu_lookup_table to be accurate for all CPUs,
1131 * even before we online them, so that we can use cpu_to_{node,mem}
1132 * early in boot, cf. smp_prepare_cpus().
1133 */
Li Zhongbc3c4322014-08-27 17:34:00 +08001134 for_each_present_cpu(cpu) {
Li Zhong70ad2372014-08-27 17:33:59 +08001135 numa_setup_cpu((unsigned long)cpu);
Nishanth Aravamudan2fabf082014-07-17 16:15:12 -07001136 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001137}
1138
Linus Torvalds1da177e2005-04-16 15:20:36 -07001139static int __init early_numa(char *p)
1140{
1141 if (!p)
1142 return 0;
1143
1144 if (strstr(p, "off"))
1145 numa_enabled = 0;
1146
1147 if (strstr(p, "debug"))
1148 numa_debug = 1;
1149
Balbir Singh1daa6d02008-02-01 15:57:31 +11001150 p = strstr(p, "fake=");
1151 if (p)
1152 cmdline = p + strlen("fake=");
1153
Linus Torvalds1da177e2005-04-16 15:20:36 -07001154 return 0;
1155}
1156early_param("numa", early_numa);
Mike Kravetz237a09892005-12-05 12:06:42 -08001157
Nishanth Aravamudan2d73bae2014-10-10 09:04:49 -07001158static bool topology_updates_enabled = true;
1159
1160static int __init early_topology_updates(char *p)
1161{
1162 if (!p)
1163 return 0;
1164
1165 if (!strcmp(p, "off")) {
1166 pr_info("Disabling topology updates\n");
1167 topology_updates_enabled = false;
1168 }
1169
1170 return 0;
1171}
1172early_param("topology_updates", early_topology_updates);
1173
Mike Kravetz237a09892005-12-05 12:06:42 -08001174#ifdef CONFIG_MEMORY_HOTPLUG
1175/*
Nathan Fontenot0f16ef72009-02-17 08:08:30 +00001176 * Find the node associated with a hot added memory section for
1177 * memory represented in the device tree by the property
1178 * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
Nathan Fontenot0db93602008-07-03 13:25:08 +10001179 */
1180static int hot_add_drconf_scn_to_nid(struct device_node *memory,
1181 unsigned long scn_addr)
1182{
Alistair Poppleb08a2a12013-08-07 02:01:44 +10001183 const __be32 *dm;
Nathan Fontenot0f16ef72009-02-17 08:08:30 +00001184 unsigned int drconf_cell_cnt, rc;
Benjamin Herrenschmidt3fdfd992010-07-23 10:35:52 +10001185 unsigned long lmb_size;
Nathan Fontenot0db93602008-07-03 13:25:08 +10001186 struct assoc_arrays aa;
Nathan Fontenot0f16ef72009-02-17 08:08:30 +00001187 int nid = -1;
Nathan Fontenot0db93602008-07-03 13:25:08 +10001188
Nathan Fontenot0f16ef72009-02-17 08:08:30 +00001189 drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
1190 if (!drconf_cell_cnt)
1191 return -1;
Nathan Fontenot0db93602008-07-03 13:25:08 +10001192
Benjamin Herrenschmidt3fdfd992010-07-23 10:35:52 +10001193 lmb_size = of_get_lmb_size(memory);
1194 if (!lmb_size)
Nathan Fontenot0f16ef72009-02-17 08:08:30 +00001195 return -1;
Nathan Fontenot0db93602008-07-03 13:25:08 +10001196
1197 rc = of_get_assoc_arrays(memory, &aa);
1198 if (rc)
Nathan Fontenot0f16ef72009-02-17 08:08:30 +00001199 return -1;
Nathan Fontenot0db93602008-07-03 13:25:08 +10001200
Nathan Fontenot0f16ef72009-02-17 08:08:30 +00001201 for (; drconf_cell_cnt != 0; --drconf_cell_cnt) {
Nathan Fontenot0db93602008-07-03 13:25:08 +10001202 struct of_drconf_cell drmem;
1203
1204 read_drconf_cell(&drmem, &dm);
1205
1206 /* skip this block if it is reserved or not assigned to
1207 * this partition */
1208 if ((drmem.flags & DRCONF_MEM_RESERVED)
1209 || !(drmem.flags & DRCONF_MEM_ASSIGNED))
1210 continue;
1211
Nathan Fontenot0f16ef72009-02-17 08:08:30 +00001212 if ((scn_addr < drmem.base_addr)
Benjamin Herrenschmidt3fdfd992010-07-23 10:35:52 +10001213 || (scn_addr >= (drmem.base_addr + lmb_size)))
Nathan Fontenot0f16ef72009-02-17 08:08:30 +00001214 continue;
Nathan Fontenot0db93602008-07-03 13:25:08 +10001215
Nathan Fontenot0f16ef72009-02-17 08:08:30 +00001216 nid = of_drconf_to_nid_single(&drmem, &aa);
1217 break;
Nathan Fontenot0db93602008-07-03 13:25:08 +10001218 }
1219
Nathan Fontenot0f16ef72009-02-17 08:08:30 +00001220 return nid;
Nathan Fontenot0db93602008-07-03 13:25:08 +10001221}
1222
1223/*
Nathan Fontenot0f16ef72009-02-17 08:08:30 +00001224 * Find the node associated with a hot added memory section for memory
1225 * represented in the device tree as a node (i.e. memory@XXXX) for
Yinghai Lu95f72d12010-07-12 14:36:09 +10001226 * each memblock.
Mike Kravetz237a09892005-12-05 12:06:42 -08001227 */
Robert Jenningsec32dd62013-10-28 09:20:50 -05001228static int hot_add_node_scn_to_nid(unsigned long scn_addr)
Mike Kravetz237a09892005-12-05 12:06:42 -08001229{
Anton Blanchard94db7c52011-08-10 20:44:22 +00001230 struct device_node *memory;
Nathan Fontenot0f16ef72009-02-17 08:08:30 +00001231 int nid = -1;
Mike Kravetz237a09892005-12-05 12:06:42 -08001232
Anton Blanchard94db7c52011-08-10 20:44:22 +00001233 for_each_node_by_type(memory, "memory") {
Mike Kravetz237a09892005-12-05 12:06:42 -08001234 unsigned long start, size;
Mike Kravetzb226e462005-12-16 14:30:35 -08001235 int ranges;
Alistair Poppleb08a2a12013-08-07 02:01:44 +10001236 const __be32 *memcell_buf;
Mike Kravetz237a09892005-12-05 12:06:42 -08001237 unsigned int len;
1238
Stephen Rothwelle2eb6392007-04-03 22:26:41 +10001239 memcell_buf = of_get_property(memory, "reg", &len);
Mike Kravetz237a09892005-12-05 12:06:42 -08001240 if (!memcell_buf || len <= 0)
1241 continue;
1242
Benjamin Herrenschmidtcc5d0182005-12-13 18:01:21 +11001243 /* ranges in cell */
1244 ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
Mike Kravetz237a09892005-12-05 12:06:42 -08001245
Nathan Fontenot0f16ef72009-02-17 08:08:30 +00001246 while (ranges--) {
1247 start = read_n_cells(n_mem_addr_cells, &memcell_buf);
1248 size = read_n_cells(n_mem_size_cells, &memcell_buf);
1249
1250 if ((scn_addr < start) || (scn_addr >= (start + size)))
1251 continue;
1252
1253 nid = of_node_to_nid_single(memory);
1254 break;
Mike Kravetz237a09892005-12-05 12:06:42 -08001255 }
1256
Nathan Fontenot0f16ef72009-02-17 08:08:30 +00001257 if (nid >= 0)
1258 break;
Mike Kravetz237a09892005-12-05 12:06:42 -08001259 }
Nathan Fontenot0f16ef72009-02-17 08:08:30 +00001260
Anton Blanchard60831842011-08-10 20:44:21 +00001261 of_node_put(memory);
1262
Nathan Fontenot0f16ef72009-02-17 08:08:30 +00001263 return nid;
Mike Kravetz237a09892005-12-05 12:06:42 -08001264}
Nathan Fontenot0f16ef72009-02-17 08:08:30 +00001265
1266/*
1267 * Find the node associated with a hot added memory section. A section
Yinghai Lu95f72d12010-07-12 14:36:09 +10001268 * corresponds to a SPARSEMEM section, not a MEMBLOCK. It is assumed that
1269 * sections are fully contained within a single MEMBLOCK.
Nathan Fontenot0f16ef72009-02-17 08:08:30 +00001270 */
1271int hot_add_scn_to_nid(unsigned long scn_addr)
1272{
1273 struct device_node *memory = NULL;
1274 int nid, found = 0;
1275
1276 if (!numa_enabled || (min_common_depth < 0))
H Hartley Sweeten72c33682010-03-05 13:42:43 -08001277 return first_online_node;
Nathan Fontenot0f16ef72009-02-17 08:08:30 +00001278
1279 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
1280 if (memory) {
1281 nid = hot_add_drconf_scn_to_nid(memory, scn_addr);
1282 of_node_put(memory);
1283 } else {
1284 nid = hot_add_node_scn_to_nid(scn_addr);
1285 }
1286
1287 if (nid < 0 || !node_online(nid))
H Hartley Sweeten72c33682010-03-05 13:42:43 -08001288 nid = first_online_node;
Nathan Fontenot0f16ef72009-02-17 08:08:30 +00001289
1290 if (NODE_DATA(nid)->node_spanned_pages)
1291 return nid;
1292
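	/* The chosen node has no memory; fall back to an online node that does. */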
1293 for_each_online_node(nid) {
1294 if (NODE_DATA(nid)->node_spanned_pages) {
1295 found = 1;
1296 break;
1297 }
1298 }
1299
1300 BUG_ON(!found);
1301 return nid;
1302}
1303
Nishanth Aravamudancd342062010-10-26 17:35:12 +00001304static u64 hot_add_drconf_memory_max(void)
1305{
1306 struct device_node *memory = NULL;
1307 unsigned int drconf_cell_cnt = 0;
1308 u64 lmb_size = 0;
Robert Jenningsec32dd62013-10-28 09:20:50 -05001309 const __be32 *dm = NULL;
Nishanth Aravamudancd342062010-10-26 17:35:12 +00001310
1311 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
1312 if (memory) {
1313 drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
1314 lmb_size = of_get_lmb_size(memory);
1315 of_node_put(memory);
1316 }
1317 return lmb_size * drconf_cell_cnt;
1318}
1319
1320/*
1321 * memory_hotplug_max - return max address of memory that may be added
1322 *
1323 * This is currently only used on systems that support drconfig memory
1324 * hotplug.
1325 */
1326u64 memory_hotplug_max(void)
1327{
1328 return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM());
1329}
Mike Kravetz237a09892005-12-05 12:06:42 -08001330#endif /* CONFIG_MEMORY_HOTPLUG */
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001331
Jesse Larrewbd034032011-01-20 19:00:51 +00001332/* Virtual Processor Home Node (VPHN) support */
Jesse Larrew39bf9902010-12-17 22:07:47 +00001333#ifdef CONFIG_PPC_SPLPAR
Nathan Fontenot30c05352013-04-24 06:02:13 +00001334struct topology_update_data {
1335 struct topology_update_data *next;
1336 unsigned int cpu;
1337 int old_nid;
1338 int new_nid;
1339};
1340
Anton Blanchard5de16692011-01-29 12:24:34 +00001341static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001342static cpumask_t cpu_associativity_changes_mask;
1343static int vphn_enabled;
Jesse Larrew5d88aa82013-04-24 06:00:35 +00001344static int prrn_enabled;
1345static void reset_topology_timer(void);
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001346
1347/*
1348 * Snapshot the current values of the associativity change counters that
1349 * the hypervisor maintains in each cpu's VPA.
1350 */
1351static void setup_cpu_associativity_change_counters(void)
1352{
Jesse Larrewcd9d6cc2011-01-20 19:01:35 +00001353 int cpu;
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001354
Anton Blanchard5de16692011-01-29 12:24:34 +00001355 /* The VPHN feature supports a maximum of 8 reference points */
1356 BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8);
1357
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001358 for_each_possible_cpu(cpu) {
Jesse Larrewcd9d6cc2011-01-20 19:01:35 +00001359 int i;
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001360 u8 *counts = vphn_cpu_change_counts[cpu];
1361 volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;
1362
Anton Blanchard5de16692011-01-29 12:24:34 +00001363 for (i = 0; i < distance_ref_points_depth; i++)
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001364 counts[i] = hypervisor_counts[i];
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001365 }
1366}
1367
1368/*
1369 * The hypervisor maintains a set of 8 associativity change counters in
1370 * the VPA of each cpu that correspond to the associativity levels in the
1371 * ibm,associativity-reference-points property. When an associativity
1372 * level changes, the corresponding counter is incremented.
1373 *
1374 * Set a bit in cpu_associativity_changes_mask for each cpu whose home
1375 * node associativity levels have changed.
1376 *
1377 * Returns the number of cpus with unhandled associativity changes.
1378 */
1379static int update_cpu_associativity_changes_mask(void)
1380{
Jesse Larrew5d88aa82013-04-24 06:00:35 +00001381 int cpu;
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001382 cpumask_t *changes = &cpu_associativity_changes_mask;
1383
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001384 for_each_possible_cpu(cpu) {
1385 int i, changed = 0;
1386 u8 *counts = vphn_cpu_change_counts[cpu];
1387 volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;
1388
Anton Blanchard5de16692011-01-29 12:24:34 +00001389 for (i = 0; i < distance_ref_points_depth; i++) {
Anton Blanchardd69043e2011-01-29 12:26:19 +00001390 if (hypervisor_counts[i] != counts[i]) {
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001391 counts[i] = hypervisor_counts[i];
1392 changed = 1;
1393 }
1394 }
1395 if (changed) {
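			/*
			 * A change on any thread flags the whole core, and we
			 * jump to the last sibling so each core is examined
			 * only once.
			 */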
Robert Jennings3be7db62013-07-24 20:13:21 -05001396 cpumask_or(changes, changes, cpu_sibling_mask(cpu));
1397 cpu = cpu_last_thread_sibling(cpu);
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001398 }
1399 }
1400
Jesse Larrew5d88aa82013-04-24 06:00:35 +00001401 return cpumask_weight(changes);
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001402}
1403
Anton Blanchardc0e5e462011-01-29 12:28:04 +00001404/*
1405 * 6 64-bit registers unpacked into 12 32-bit associativity values. To form
1406 * the complete property we have to add the length in the first cell.
1407 */
1408#define VPHN_ASSOC_BUFSIZE (6*sizeof(u64)/sizeof(u32) + 1)
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001409
1410/*
1411 * Convert the associativity domain numbers returned from the hypervisor
1412 * to the sequence they would appear in the ibm,associativity property.
1413 */
Alistair Poppleb08a2a12013-08-07 02:01:44 +10001414static int vphn_unpack_associativity(const long *packed, __be32 *unpacked)
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001415{
Jesse Larrewcd9d6cc2011-01-20 19:01:35 +00001416 int i, nr_assoc_doms = 0;
Alistair Poppleb08a2a12013-08-07 02:01:44 +10001417 const __be16 *field = (const __be16 *) packed;
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001418
1419#define VPHN_FIELD_UNUSED (0xffff)
1420#define VPHN_FIELD_MSB (0x8000)
1421#define VPHN_FIELD_MASK (~VPHN_FIELD_MSB)
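	/*
	 * Reading the loop below: a field with the MSB set (e.g. 0x8001)
	 * unpacks on its own to domain number 1; a field without the MSB set
	 * (e.g. 0x0001 followed by 0x0002) pairs with the next field to form
	 * the 32-bit domain number 0x00010002; 0xffff marks unused fields.
	 */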
1422
Anton Blanchardc0e5e462011-01-29 12:28:04 +00001423 for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) {
Alistair Poppleb08a2a12013-08-07 02:01:44 +10001424 if (be16_to_cpup(field) == VPHN_FIELD_UNUSED) {
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001425 /* All significant fields processed, and remaining
1426 * fields contain the reserved value of all 1's.
1427 * Just store them.
1428 */
Alistair Poppleb08a2a12013-08-07 02:01:44 +10001429 unpacked[i] = *((__be32 *)field);
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001430 field += 2;
Alistair Poppleb08a2a12013-08-07 02:01:44 +10001431 } else if (be16_to_cpup(field) & VPHN_FIELD_MSB) {
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001432 /* Data is in the lower 15 bits of this field */
Alistair Poppleb08a2a12013-08-07 02:01:44 +10001433 unpacked[i] = cpu_to_be32(
1434 be16_to_cpup(field) & VPHN_FIELD_MASK);
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001435 field++;
1436 nr_assoc_doms++;
Jesse Larrew7639ada2011-01-20 19:01:13 +00001437 } else {
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001438 /* Data is in the lower 15 bits of this field
1439			 * concatenated with the next 16-bit field
1440 */
Alistair Poppleb08a2a12013-08-07 02:01:44 +10001441 unpacked[i] = *((__be32 *)field);
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001442 field += 2;
1443 nr_assoc_doms++;
1444 }
1445 }
1446
Anton Blanchardc0e5e462011-01-29 12:28:04 +00001447 /* The first cell contains the length of the property */
Alistair Poppleb08a2a12013-08-07 02:01:44 +10001448 unpacked[0] = cpu_to_be32(nr_assoc_doms);
Anton Blanchardc0e5e462011-01-29 12:28:04 +00001449
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001450 return nr_assoc_doms;
1451}
1452
1453/*
1454 * Retrieve the new associativity information for a virtual processor's
1455 * home node.
1456 */
Alistair Poppleb08a2a12013-08-07 02:01:44 +10001457static long hcall_vphn(unsigned long cpu, __be32 *associativity)
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001458{
Jesse Larrewcd9d6cc2011-01-20 19:01:35 +00001459 long rc;
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001460 long retbuf[PLPAR_HCALL9_BUFSIZE] = {0};
1461 u64 flags = 1;
1462 int hwcpu = get_hard_smp_processor_id(cpu);
1463
1464 rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu);
1465 vphn_unpack_associativity(retbuf, associativity);
1466
1467 return rc;
1468}
1469
1470static long vphn_get_associativity(unsigned long cpu,
Alistair Poppleb08a2a12013-08-07 02:01:44 +10001471 __be32 *associativity)
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001472{
Jesse Larrewcd9d6cc2011-01-20 19:01:35 +00001473 long rc;
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001474
1475 rc = hcall_vphn(cpu, associativity);
1476
1477 switch (rc) {
1478 case H_FUNCTION:
1479 printk(KERN_INFO
1480 "VPHN is not supported. Disabling polling...\n");
1481 stop_topology_update();
1482 break;
1483 case H_HARDWARE:
1484 printk(KERN_ERR
1485 "hcall_vphn() experienced a hardware fault "
1486 "preventing VPHN. Disabling polling...\n");
1487 stop_topology_update();
1488 }
1489
1490 return rc;
1491}
1492
1493/*
Nathan Fontenot30c05352013-04-24 06:02:13 +00001494 * Update the CPU maps and sysfs entries for a single CPU when its NUMA
1495 * characteristics change. This function doesn't perform any locking and is
1496 * only safe to call from stop_machine().
1497 */
1498static int update_cpu_topology(void *data)
1499{
1500 struct topology_update_data *update;
1501 unsigned long cpu;
1502
1503 if (!data)
1504 return -EINVAL;
1505
Robert Jennings3be7db62013-07-24 20:13:21 -05001506 cpu = smp_processor_id();
Nathan Fontenot30c05352013-04-24 06:02:13 +00001507
1508 for (update = data; update; update = update->next) {
1509 if (cpu != update->cpu)
1510 continue;
1511
Nathan Fontenot30c05352013-04-24 06:02:13 +00001512 unmap_cpu_from_node(update->cpu);
1513 map_cpu_to_node(update->cpu, update->new_nid);
Jesse Larrew176bbf12013-04-24 06:03:48 +00001514 vdso_getcpu_init();
Nathan Fontenot30c05352013-04-24 06:02:13 +00001515 }
1516
1517 return 0;
1518}
1519
Srivatsa S. Bhatd4edc5b2013-12-30 17:05:34 +05301520static int update_lookup_table(void *data)
1521{
1522 struct topology_update_data *update;
1523
1524 if (!data)
1525 return -EINVAL;
1526
1527 /*
1528 * Upon topology update, the numa-cpu lookup table needs to be updated
1529 * for all threads in the core, including offline CPUs, to ensure that
1530 * future hotplug operations respect the cpu-to-node associativity
1531 * properly.
1532 */
1533 for (update = data; update; update = update->next) {
1534 int nid, base, j;
1535
1536 nid = update->new_nid;
1537 base = cpu_first_thread_sibling(update->cpu);
1538
1539 for (j = 0; j < threads_per_core; j++) {
1540 update_numa_cpu_lookup_table(base + j, nid);
1541 }
1542 }
1543
1544 return 0;
1545}
1546
Nathan Fontenot30c05352013-04-24 06:02:13 +00001547/*
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001548 * Update the node maps and sysfs entries for each cpu whose home node
Jesse Larrew79c5fce2012-06-07 16:04:34 -05001549 * has changed. Returns 1 when the topology has changed, and 0 otherwise.
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001550 */
1551int arch_update_cpu_topology(void)
1552{
Robert Jennings3be7db62013-07-24 20:13:21 -05001553 unsigned int cpu, sibling, changed = 0;
Nathan Fontenot30c05352013-04-24 06:02:13 +00001554 struct topology_update_data *updates, *ud;
Alistair Poppleb08a2a12013-08-07 02:01:44 +10001555 __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
Jesse Larrew176bbf12013-04-24 06:03:48 +00001556 cpumask_t updated_cpus;
Kay Sievers8a25a2f2011-12-21 14:29:42 -08001557 struct device *dev;
Robert Jennings3be7db62013-07-24 20:13:21 -05001558 int weight, new_nid, i = 0;
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001559
Nishanth Aravamudan2d73bae2014-10-10 09:04:49 -07001560 if (!prrn_enabled && !vphn_enabled)
1561 return 0;
1562
Nathan Fontenot30c05352013-04-24 06:02:13 +00001563 weight = cpumask_weight(&cpu_associativity_changes_mask);
1564 if (!weight)
1565 return 0;
1566
1567 updates = kzalloc(weight * (sizeof(*updates)), GFP_KERNEL);
1568 if (!updates)
1569 return 0;
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001570
Jesse Larrew176bbf12013-04-24 06:03:48 +00001571 cpumask_clear(&updated_cpus);
1572
Jesse Larrew5d88aa82013-04-24 06:00:35 +00001573 for_each_cpu(cpu, &cpu_associativity_changes_mask) {
Robert Jennings3be7db62013-07-24 20:13:21 -05001574 /*
1575		 * If the siblings aren't flagged for changes, the updates list
1576		 * will be too short. Skip this cpu for now and flag its siblings
1577		 * so the whole core is handled on the next update.
1578 */
1579 if (!cpumask_subset(cpu_sibling_mask(cpu),
1580 &cpu_associativity_changes_mask)) {
1581 pr_info("Sibling bits not set for associativity "
1582 "change, cpu%d\n", cpu);
1583 cpumask_or(&cpu_associativity_changes_mask,
1584 &cpu_associativity_changes_mask,
1585 cpu_sibling_mask(cpu));
1586 cpu = cpu_last_thread_sibling(cpu);
1587 continue;
1588 }
1589
1590 /* Use associativity from first thread for all siblings */
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001591 vphn_get_associativity(cpu, associativity);
Robert Jennings3be7db62013-07-24 20:13:21 -05001592 new_nid = associativity_to_nid(associativity);
1593 if (new_nid < 0 || !node_online(new_nid))
1594 new_nid = first_online_node;
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001595
Robert Jennings3be7db62013-07-24 20:13:21 -05001596 if (new_nid == numa_cpu_lookup_table[cpu]) {
1597 cpumask_andnot(&cpu_associativity_changes_mask,
1598 &cpu_associativity_changes_mask,
1599 cpu_sibling_mask(cpu));
1600 cpu = cpu_last_thread_sibling(cpu);
1601 continue;
1602 }
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001603
Robert Jennings3be7db62013-07-24 20:13:21 -05001604 for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
1605 ud = &updates[i++];
1606 ud->cpu = sibling;
1607 ud->new_nid = new_nid;
1608 ud->old_nid = numa_cpu_lookup_table[sibling];
1609 cpumask_set_cpu(sibling, &updated_cpus);
1610 if (i < weight)
1611 ud->next = &updates[i];
1612 }
1613 cpu = cpu_last_thread_sibling(cpu);
Nathan Fontenot30c05352013-04-24 06:02:13 +00001614 }
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001615
Nishanth Aravamudan2d73bae2014-10-10 09:04:49 -07001616 pr_debug("Topology update for the following CPUs:\n");
1617 if (cpumask_weight(&updated_cpus)) {
1618 for (ud = &updates[0]; ud; ud = ud->next) {
1619 pr_debug("cpu %d moving from node %d "
1620 "to %d\n", ud->cpu,
1621 ud->old_nid, ud->new_nid);
1622 }
1623 }
1624
Michael Wang9a013362014-04-08 11:19:36 +08001625 /*
1626 * In cases where we have nothing to update (because the updates list
1627	 * is too short or because the new topology is the same as the old one),
1628 * skip invoking update_cpu_topology() via stop-machine(). This is
1629 * necessary (and not just a fast-path optimization) since stop-machine
1630 * can end up electing a random CPU to run update_cpu_topology(), and
1631 * thus trick us into setting up incorrect cpu-node mappings (since
1632 * 'updates' is kzalloc()'ed).
1633 *
1634	 * For the same reason, we also skip all of the updates that follow.
1635 */
1636 if (!cpumask_weight(&updated_cpus))
1637 goto out;
1638
Jesse Larrew176bbf12013-04-24 06:03:48 +00001639 stop_machine(update_cpu_topology, &updates[0], &updated_cpus);
Nathan Fontenot30c05352013-04-24 06:02:13 +00001640
Srivatsa S. Bhatd4edc5b2013-12-30 17:05:34 +05301641 /*
1642 * Update the numa-cpu lookup table with the new mappings, even for
1643 * offline CPUs. It is best to perform this update from the stop-
1644 * machine context.
1645 */
1646 stop_machine(update_lookup_table, &updates[0],
1647 cpumask_of(raw_smp_processor_id()));
1648
Nathan Fontenot30c05352013-04-24 06:02:13 +00001649 for (ud = &updates[0]; ud; ud = ud->next) {
Nathan Fontenotdd023212013-06-24 22:08:05 -05001650 unregister_cpu_under_node(ud->cpu, ud->old_nid);
1651 register_cpu_under_node(ud->cpu, ud->new_nid);
1652
Nathan Fontenot30c05352013-04-24 06:02:13 +00001653 dev = get_cpu_device(ud->cpu);
Kay Sievers8a25a2f2011-12-21 14:29:42 -08001654 if (dev)
1655 kobject_uevent(&dev->kobj, KOBJ_CHANGE);
Nathan Fontenot30c05352013-04-24 06:02:13 +00001656 cpumask_clear_cpu(ud->cpu, &cpu_associativity_changes_mask);
Jesse Larrew79c5fce2012-06-07 16:04:34 -05001657 changed = 1;
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001658 }
1659
Michael Wang9a013362014-04-08 11:19:36 +08001660out:
Nathan Fontenot30c05352013-04-24 06:02:13 +00001661 kfree(updates);
Jesse Larrew79c5fce2012-06-07 16:04:34 -05001662 return changed;
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001663}
1664
1665static void topology_work_fn(struct work_struct *work)
1666{
1667 rebuild_sched_domains();
1668}
1669static DECLARE_WORK(topology_work, topology_work_fn);
1670
Robert Jenningsec32dd62013-10-28 09:20:50 -05001671static void topology_schedule_update(void)
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001672{
1673 schedule_work(&topology_work);
1674}
1675
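/*
 * Timer callback: with PRRN the changes mask is filled by the OF reconfig
 * notifier, so any pending bit triggers an update; with VPHN we poll the
 * hypervisor's change counters and re-arm the timer.
 */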
1676static void topology_timer_fn(unsigned long ignored)
1677{
Jesse Larrew5d88aa82013-04-24 06:00:35 +00001678 if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask))
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001679 topology_schedule_update();
Jesse Larrew5d88aa82013-04-24 06:00:35 +00001680 else if (vphn_enabled) {
1681 if (update_cpu_associativity_changes_mask() > 0)
1682 topology_schedule_update();
1683 reset_topology_timer();
1684 }
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001685}
1686static struct timer_list topology_timer =
1687 TIMER_INITIALIZER(topology_timer_fn, 0, 0);
1688
Jesse Larrew5d88aa82013-04-24 06:00:35 +00001689static void reset_topology_timer(void)
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001690{
1691 topology_timer.data = 0;
1692 topology_timer.expires = jiffies + 60 * HZ;
Jesse Larrew5d88aa82013-04-24 06:00:35 +00001693 mod_timer(&topology_timer, topology_timer.expires);
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001694}
1695
Nathan Fontenot601abdc32013-04-29 03:45:36 +00001696#ifdef CONFIG_SMP
1697
Jesse Larrew5d88aa82013-04-24 06:00:35 +00001698static void stage_topology_update(int core_id)
1699{
1700 cpumask_or(&cpu_associativity_changes_mask,
1701 &cpu_associativity_changes_mask, cpu_sibling_mask(core_id));
1702 reset_topology_timer();
1703}
1704
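/*
 * PRRN: firmware reports affinity changes by updating the
 * "ibm,associativity" property of a cpu device-tree node. When the OF
 * reconfig notifier sees such an update, read the core id from the node's
 * "reg" property and stage a topology update for that core.
 */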
1705static int dt_update_callback(struct notifier_block *nb,
1706 unsigned long action, void *data)
1707{
1708 struct of_prop_reconfig *update;
1709 int rc = NOTIFY_DONE;
1710
1711 switch (action) {
Jesse Larrew5d88aa82013-04-24 06:00:35 +00001712 case OF_RECONFIG_UPDATE_PROPERTY:
1713 update = (struct of_prop_reconfig *)data;
Nathan Fontenot30c05352013-04-24 06:02:13 +00001714 if (!of_prop_cmp(update->dn->type, "cpu") &&
1715 !of_prop_cmp(update->prop->name, "ibm,associativity")) {
Jesse Larrew5d88aa82013-04-24 06:00:35 +00001716 u32 core_id;
1717 of_property_read_u32(update->dn, "reg", &core_id);
1718 stage_topology_update(core_id);
1719 rc = NOTIFY_OK;
1720 }
1721 break;
1722 }
1723
1724 return rc;
1725}
1726
1727static struct notifier_block dt_update_nb = {
1728 .notifier_call = dt_update_callback,
1729};
1730
Nathan Fontenot601abdc32013-04-29 03:45:36 +00001731#endif
1732
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001733/*
Jesse Larrew5d88aa82013-04-24 06:00:35 +00001734 * Start polling for associativity changes.
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001735 */
1736int start_topology_update(void)
1737{
1738 int rc = 0;
1739
Jesse Larrew5d88aa82013-04-24 06:00:35 +00001740 if (firmware_has_feature(FW_FEATURE_PRRN)) {
1741 if (!prrn_enabled) {
1742 prrn_enabled = 1;
1743 vphn_enabled = 0;
Nathan Fontenot601abdc32013-04-29 03:45:36 +00001744#ifdef CONFIG_SMP
Jesse Larrew5d88aa82013-04-24 06:00:35 +00001745 rc = of_reconfig_notifier_register(&dt_update_nb);
Nathan Fontenot601abdc32013-04-29 03:45:36 +00001746#endif
Jesse Larrew5d88aa82013-04-24 06:00:35 +00001747 }
Jesse Larrewb7abef02013-04-24 06:05:22 +00001748 } else if (firmware_has_feature(FW_FEATURE_VPHN) &&
Anton Blanchardf13c13a2013-08-07 02:01:26 +10001749 lppaca_shared_proc(get_lppaca())) {
Jesse Larrew5d88aa82013-04-24 06:00:35 +00001750 if (!vphn_enabled) {
1751 prrn_enabled = 0;
1752 vphn_enabled = 1;
1753 setup_cpu_associativity_change_counters();
1754 init_timer_deferrable(&topology_timer);
1755 reset_topology_timer();
1756 }
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001757 }
1758
1759 return rc;
1760}
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001761
1762/*
1763 * Disable polling for VPHN associativity changes.
1764 */
1765int stop_topology_update(void)
1766{
Jesse Larrew5d88aa82013-04-24 06:00:35 +00001767 int rc = 0;
1768
1769 if (prrn_enabled) {
1770 prrn_enabled = 0;
Nathan Fontenot601abdc32013-04-29 03:45:36 +00001771#ifdef CONFIG_SMP
Jesse Larrew5d88aa82013-04-24 06:00:35 +00001772 rc = of_reconfig_notifier_unregister(&dt_update_nb);
Nathan Fontenot601abdc32013-04-29 03:45:36 +00001773#endif
Jesse Larrew5d88aa82013-04-24 06:00:35 +00001774 } else if (vphn_enabled) {
1775 vphn_enabled = 0;
1776 rc = del_timer_sync(&topology_timer);
1777 }
1778
1779 return rc;
Jesse Larrew9eff1a32010-12-01 12:31:15 +00001780}
Nathan Fontenote04fa612013-04-24 06:07:39 +00001781
1782int prrn_is_enabled(void)
1783{
1784 return prrn_enabled;
1785}
1786
1787static int topology_read(struct seq_file *file, void *v)
1788{
1789 if (vphn_enabled || prrn_enabled)
1790 seq_puts(file, "on\n");
1791 else
1792 seq_puts(file, "off\n");
1793
1794 return 0;
1795}
1796
1797static int topology_open(struct inode *inode, struct file *file)
1798{
1799 return single_open(file, topology_read, NULL);
1800}
1801
1802static ssize_t topology_write(struct file *file, const char __user *buf,
1803 size_t count, loff_t *off)
1804{
1805 char kbuf[4]; /* "on" or "off" plus null. */
1806 int read_len;
1807
1808 read_len = count < 3 ? count : 3;
1809 if (copy_from_user(kbuf, buf, read_len))
1810 return -EINVAL;
1811
1812 kbuf[read_len] = '\0';
1813
1814 if (!strncmp(kbuf, "on", 2))
1815 start_topology_update();
1816 else if (!strncmp(kbuf, "off", 3))
1817 stop_topology_update();
1818 else
1819 return -EINVAL;
1820
1821 return count;
1822}
1823
1824static const struct file_operations topology_ops = {
1825 .read = seq_read,
1826 .write = topology_write,
1827 .open = topology_open,
1828 .release = single_release
1829};
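/*
 * The proc file registered below lets userspace control the polling, e.g.
 * (assuming procfs is mounted at /proc):
 *
 *   echo on  > /proc/powerpc/topology_updates
 *   echo off > /proc/powerpc/topology_updates
 *   cat /proc/powerpc/topology_updates
 */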
1830
1831static int topology_update_init(void)
1832{
Nishanth Aravamudan2d73bae2014-10-10 09:04:49 -07001833 /* Do not poll for changes if disabled at boot */
1834 if (topology_updates_enabled)
1835 start_topology_update();
1836
Nishanth Aravamudan2d15b9b2014-10-09 16:41:28 -07001837 if (!proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops))
1838 return -ENOMEM;
Nathan Fontenote04fa612013-04-24 06:07:39 +00001839
1840 return 0;
1841}
1842device_initcall(topology_update_init);
Jesse Larrew39bf9902010-12-17 22:07:47 +00001843#endif /* CONFIG_PPC_SPLPAR */