blob: b52518c54efe85f77a146e3a02549fcb8d4d0eae [file] [log] [blame]
Peng Taod7e09d02013-05-02 16:46:55 +08001/*
2 * GPL HEADER START
3 *
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
15 *
Peng Taod7e09d02013-05-02 16:46:55 +080016 * GPL HEADER END
17 */
18/*
19 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
Andreas Dilger1dc563a2015-11-08 18:09:37 -050020 *
21 * Copyright (c) 2012, 2015 Intel Corporation.
Peng Taod7e09d02013-05-02 16:46:55 +080022 */
23/*
24 * This file is part of Lustre, http://www.lustre.org/
25 * Lustre is a trademark of Sun Microsystems, Inc.
26 *
27 * Author: liang@whamcloud.com
28 */
29
30#define DEBUG_SUBSYSTEM S_LNET
31
32#include <linux/cpu.h>
33#include <linux/sched.h>
Greg Kroah-Hartman9fdaf8c2014-07-11 20:51:16 -070034#include "../../../include/linux/libcfs/libcfs.h"
Peng Taod7e09d02013-05-02 16:46:55 +080035
36#ifdef CONFIG_SMP
37
38/**
39 * modparam for setting number of partitions
40 *
41 * 0 : estimate best value based on cores or NUMA nodes
42 * 1 : disable multiple partitions
43 * >1 : specify number of partitions
44 */
45static int cpu_npartitions;
Peng Tao8cc7b4b2013-11-21 22:28:30 +080046module_param(cpu_npartitions, int, 0444);
47MODULE_PARM_DESC(cpu_npartitions, "# of CPU partitions");
Peng Taod7e09d02013-05-02 16:46:55 +080048
49/**
50 * modparam for setting CPU partitions patterns:
51 *
52 * i.e: "0[0,1,2,3] 1[4,5,6,7]", number before bracket is CPU partition ID,
53 * number in bracket is processor ID (core or HT)
54 *
55 * i.e: "N 0[0,1] 1[2,3]" the first character 'N' means numbers in bracket
56 * are NUMA node ID, number before bracket is CPU partition ID.
57 *
58 * NB: If user specified cpu_pattern, cpu_npartitions will be ignored
59 */
60static char *cpu_pattern = "";
Peng Tao8cc7b4b2013-11-21 22:28:30 +080061module_param(cpu_pattern, charp, 0444);
62MODULE_PARM_DESC(cpu_pattern, "CPU partitions pattern");
Peng Taod7e09d02013-05-02 16:46:55 +080063
64struct cfs_cpt_data {
65 /* serialize hotplug etc */
66 spinlock_t cpt_lock;
67 /* reserved for hotplug */
68 unsigned long cpt_version;
69 /* mutex to protect cpt_cpumask */
Dmitry Eremin6246dab2014-04-27 13:06:59 -040070 struct mutex cpt_mutex;
Peng Taod7e09d02013-05-02 16:46:55 +080071 /* scratch buffer for set/unset_node */
72 cpumask_t *cpt_cpumask;
73};
74
75static struct cfs_cpt_data cpt_data;
76
Peng Taod7e09d02013-05-02 16:46:55 +080077void
78cfs_cpt_table_free(struct cfs_cpt_table *cptab)
79{
80 int i;
81
Oleg Drokin15d9f522016-02-16 00:46:44 -050082 if (cptab->ctb_cpu2cpt) {
Peng Taod7e09d02013-05-02 16:46:55 +080083 LIBCFS_FREE(cptab->ctb_cpu2cpt,
84 num_possible_cpus() *
85 sizeof(cptab->ctb_cpu2cpt[0]));
86 }
87
Oleg Drokin15d9f522016-02-16 00:46:44 -050088 for (i = 0; cptab->ctb_parts && i < cptab->ctb_nparts; i++) {
Peng Taod7e09d02013-05-02 16:46:55 +080089 struct cfs_cpu_partition *part = &cptab->ctb_parts[i];
90
Oleg Drokin15d9f522016-02-16 00:46:44 -050091 if (part->cpt_nodemask) {
Peng Taod7e09d02013-05-02 16:46:55 +080092 LIBCFS_FREE(part->cpt_nodemask,
93 sizeof(*part->cpt_nodemask));
94 }
95
Oleg Drokin15d9f522016-02-16 00:46:44 -050096 if (part->cpt_cpumask)
Peng Taod7e09d02013-05-02 16:46:55 +080097 LIBCFS_FREE(part->cpt_cpumask, cpumask_size());
98 }
99
Oleg Drokin15d9f522016-02-16 00:46:44 -0500100 if (cptab->ctb_parts) {
Peng Taod7e09d02013-05-02 16:46:55 +0800101 LIBCFS_FREE(cptab->ctb_parts,
102 cptab->ctb_nparts * sizeof(cptab->ctb_parts[0]));
103 }
104
Oleg Drokin15d9f522016-02-16 00:46:44 -0500105 if (cptab->ctb_nodemask)
Peng Taod7e09d02013-05-02 16:46:55 +0800106 LIBCFS_FREE(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
Oleg Drokin15d9f522016-02-16 00:46:44 -0500107 if (cptab->ctb_cpumask)
Peng Taod7e09d02013-05-02 16:46:55 +0800108 LIBCFS_FREE(cptab->ctb_cpumask, cpumask_size());
109
110 LIBCFS_FREE(cptab, sizeof(*cptab));
111}
112EXPORT_SYMBOL(cfs_cpt_table_free);
113
114struct cfs_cpt_table *
115cfs_cpt_table_alloc(unsigned int ncpt)
116{
117 struct cfs_cpt_table *cptab;
118 int i;
119
120 LIBCFS_ALLOC(cptab, sizeof(*cptab));
Oleg Drokin15d9f522016-02-16 00:46:44 -0500121 if (!cptab)
Peng Taod7e09d02013-05-02 16:46:55 +0800122 return NULL;
123
124 cptab->ctb_nparts = ncpt;
125
126 LIBCFS_ALLOC(cptab->ctb_cpumask, cpumask_size());
127 LIBCFS_ALLOC(cptab->ctb_nodemask, sizeof(*cptab->ctb_nodemask));
128
Oleg Drokin15d9f522016-02-16 00:46:44 -0500129 if (!cptab->ctb_cpumask || !cptab->ctb_nodemask)
Peng Taod7e09d02013-05-02 16:46:55 +0800130 goto failed;
131
132 LIBCFS_ALLOC(cptab->ctb_cpu2cpt,
133 num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));
Oleg Drokin15d9f522016-02-16 00:46:44 -0500134 if (!cptab->ctb_cpu2cpt)
Peng Taod7e09d02013-05-02 16:46:55 +0800135 goto failed;
136
137 memset(cptab->ctb_cpu2cpt, -1,
138 num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0]));
139
140 LIBCFS_ALLOC(cptab->ctb_parts, ncpt * sizeof(cptab->ctb_parts[0]));
Oleg Drokin15d9f522016-02-16 00:46:44 -0500141 if (!cptab->ctb_parts)
Peng Taod7e09d02013-05-02 16:46:55 +0800142 goto failed;
143
144 for (i = 0; i < ncpt; i++) {
145 struct cfs_cpu_partition *part = &cptab->ctb_parts[i];
146
147 LIBCFS_ALLOC(part->cpt_cpumask, cpumask_size());
148 LIBCFS_ALLOC(part->cpt_nodemask, sizeof(*part->cpt_nodemask));
Oleg Drokin15d9f522016-02-16 00:46:44 -0500149 if (!part->cpt_cpumask || !part->cpt_nodemask)
Peng Taod7e09d02013-05-02 16:46:55 +0800150 goto failed;
151 }
152
153 spin_lock(&cpt_data.cpt_lock);
154 /* Reserved for hotplug */
155 cptab->ctb_version = cpt_data.cpt_version;
156 spin_unlock(&cpt_data.cpt_lock);
157
158 return cptab;
159
160 failed:
161 cfs_cpt_table_free(cptab);
162 return NULL;
163}
164EXPORT_SYMBOL(cfs_cpt_table_alloc);
165
166int
167cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len)
168{
169 char *tmp = buf;
170 int rc = 0;
171 int i;
172 int j;
173
174 for (i = 0; i < cptab->ctb_nparts; i++) {
175 if (len > 0) {
176 rc = snprintf(tmp, len, "%d\t: ", i);
177 len -= rc;
178 }
179
180 if (len <= 0) {
181 rc = -EFBIG;
182 goto out;
183 }
184
185 tmp += rc;
Oleg Drokin84177732015-03-07 19:24:26 -0500186 for_each_cpu(j, cptab->ctb_parts[i].cpt_cpumask) {
Peng Taod7e09d02013-05-02 16:46:55 +0800187 rc = snprintf(tmp, len, "%d ", j);
188 len -= rc;
189 if (len <= 0) {
190 rc = -EFBIG;
191 goto out;
192 }
193 tmp += rc;
194 }
195
196 *tmp = '\n';
197 tmp++;
198 len--;
199 }
200
201 out:
202 if (rc < 0)
203 return rc;
204
205 return tmp - buf;
206}
207EXPORT_SYMBOL(cfs_cpt_table_print);
208
209int
210cfs_cpt_number(struct cfs_cpt_table *cptab)
211{
212 return cptab->ctb_nparts;
213}
214EXPORT_SYMBOL(cfs_cpt_number);
215
216int
217cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt)
218{
219 LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
220
221 return cpt == CFS_CPT_ANY ?
Oleg Drokinc96d2362015-03-02 01:01:47 -0500222 cpumask_weight(cptab->ctb_cpumask) :
223 cpumask_weight(cptab->ctb_parts[cpt].cpt_cpumask);
Peng Taod7e09d02013-05-02 16:46:55 +0800224}
225EXPORT_SYMBOL(cfs_cpt_weight);
226
227int
228cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt)
229{
230 LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
231
232 return cpt == CFS_CPT_ANY ?
Oleg Drokin84177732015-03-07 19:24:26 -0500233 cpumask_any_and(cptab->ctb_cpumask,
234 cpu_online_mask) < nr_cpu_ids :
235 cpumask_any_and(cptab->ctb_parts[cpt].cpt_cpumask,
236 cpu_online_mask) < nr_cpu_ids;
Peng Taod7e09d02013-05-02 16:46:55 +0800237}
238EXPORT_SYMBOL(cfs_cpt_online);
239
240cpumask_t *
241cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt)
242{
243 LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
244
245 return cpt == CFS_CPT_ANY ?
246 cptab->ctb_cpumask : cptab->ctb_parts[cpt].cpt_cpumask;
247}
248EXPORT_SYMBOL(cfs_cpt_cpumask);
249
250nodemask_t *
251cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt)
252{
253 LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
254
255 return cpt == CFS_CPT_ANY ?
256 cptab->ctb_nodemask : cptab->ctb_parts[cpt].cpt_nodemask;
257}
258EXPORT_SYMBOL(cfs_cpt_nodemask);
259
260int
261cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
262{
263 int node;
264
265 LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts);
266
Oleg Drokinc96d2362015-03-02 01:01:47 -0500267 if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu)) {
Peng Taod7e09d02013-05-02 16:46:55 +0800268 CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu);
269 return 0;
270 }
271
272 if (cptab->ctb_cpu2cpt[cpu] != -1) {
273 CDEBUG(D_INFO, "CPU %d is already in partition %d\n",
274 cpu, cptab->ctb_cpu2cpt[cpu]);
275 return 0;
276 }
277
278 cptab->ctb_cpu2cpt[cpu] = cpt;
279
Oleg Drokinc96d2362015-03-02 01:01:47 -0500280 LASSERT(!cpumask_test_cpu(cpu, cptab->ctb_cpumask));
281 LASSERT(!cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask));
Peng Taod7e09d02013-05-02 16:46:55 +0800282
Oleg Drokinc96d2362015-03-02 01:01:47 -0500283 cpumask_set_cpu(cpu, cptab->ctb_cpumask);
284 cpumask_set_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);
Peng Taod7e09d02013-05-02 16:46:55 +0800285
286 node = cpu_to_node(cpu);
287
288 /* first CPU of @node in this CPT table */
289 if (!node_isset(node, *cptab->ctb_nodemask))
290 node_set(node, *cptab->ctb_nodemask);
291
292 /* first CPU of @node in this partition */
293 if (!node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask))
294 node_set(node, *cptab->ctb_parts[cpt].cpt_nodemask);
295
296 return 1;
297}
298EXPORT_SYMBOL(cfs_cpt_set_cpu);
299
300void
301cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu)
302{
303 int node;
304 int i;
305
306 LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
307
Oleg Drokinc96d2362015-03-02 01:01:47 -0500308 if (cpu < 0 || cpu >= nr_cpu_ids) {
Peng Taod7e09d02013-05-02 16:46:55 +0800309 CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu);
310 return;
311 }
312
313 if (cpt == CFS_CPT_ANY) {
314 /* caller doesn't know the partition ID */
315 cpt = cptab->ctb_cpu2cpt[cpu];
316 if (cpt < 0) { /* not set in this CPT-table */
Joe Perches2d00bd12014-11-23 11:28:50 -0800317 CDEBUG(D_INFO, "Try to unset cpu %d which is not in CPT-table %p\n",
318 cpt, cptab);
Peng Taod7e09d02013-05-02 16:46:55 +0800319 return;
320 }
321
322 } else if (cpt != cptab->ctb_cpu2cpt[cpu]) {
323 CDEBUG(D_INFO,
324 "CPU %d is not in cpu-partition %d\n", cpu, cpt);
325 return;
326 }
327
Oleg Drokinc96d2362015-03-02 01:01:47 -0500328 LASSERT(cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask));
329 LASSERT(cpumask_test_cpu(cpu, cptab->ctb_cpumask));
Peng Taod7e09d02013-05-02 16:46:55 +0800330
Oleg Drokinc96d2362015-03-02 01:01:47 -0500331 cpumask_clear_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask);
332 cpumask_clear_cpu(cpu, cptab->ctb_cpumask);
Peng Taod7e09d02013-05-02 16:46:55 +0800333 cptab->ctb_cpu2cpt[cpu] = -1;
334
335 node = cpu_to_node(cpu);
336
337 LASSERT(node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask));
338 LASSERT(node_isset(node, *cptab->ctb_nodemask));
339
Oleg Drokin84177732015-03-07 19:24:26 -0500340 for_each_cpu(i, cptab->ctb_parts[cpt].cpt_cpumask) {
Peng Taod7e09d02013-05-02 16:46:55 +0800341 /* this CPT has other CPU belonging to this node? */
342 if (cpu_to_node(i) == node)
343 break;
344 }
345
Oleg Drokin84177732015-03-07 19:24:26 -0500346 if (i >= nr_cpu_ids)
Peng Taod7e09d02013-05-02 16:46:55 +0800347 node_clear(node, *cptab->ctb_parts[cpt].cpt_nodemask);
348
Oleg Drokin84177732015-03-07 19:24:26 -0500349 for_each_cpu(i, cptab->ctb_cpumask) {
Peng Taod7e09d02013-05-02 16:46:55 +0800350 /* this CPT-table has other CPU belonging to this node? */
351 if (cpu_to_node(i) == node)
352 break;
353 }
354
Oleg Drokin84177732015-03-07 19:24:26 -0500355 if (i >= nr_cpu_ids)
Peng Taod7e09d02013-05-02 16:46:55 +0800356 node_clear(node, *cptab->ctb_nodemask);
Peng Taod7e09d02013-05-02 16:46:55 +0800357}
358EXPORT_SYMBOL(cfs_cpt_unset_cpu);
359
360int
361cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
362{
363 int i;
364
Oleg Drokin84177732015-03-07 19:24:26 -0500365 if (cpumask_weight(mask) == 0 ||
366 cpumask_any_and(mask, cpu_online_mask) >= nr_cpu_ids) {
Joe Perches2d00bd12014-11-23 11:28:50 -0800367 CDEBUG(D_INFO, "No online CPU is found in the CPU mask for CPU partition %d\n",
368 cpt);
Peng Taod7e09d02013-05-02 16:46:55 +0800369 return 0;
370 }
371
Oleg Drokin84177732015-03-07 19:24:26 -0500372 for_each_cpu(i, mask) {
Peng Taod7e09d02013-05-02 16:46:55 +0800373 if (!cfs_cpt_set_cpu(cptab, cpt, i))
374 return 0;
375 }
376
377 return 1;
378}
379EXPORT_SYMBOL(cfs_cpt_set_cpumask);
380
381void
382cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask)
383{
384 int i;
385
Oleg Drokin84177732015-03-07 19:24:26 -0500386 for_each_cpu(i, mask)
Peng Taod7e09d02013-05-02 16:46:55 +0800387 cfs_cpt_unset_cpu(cptab, cpt, i);
388}
389EXPORT_SYMBOL(cfs_cpt_unset_cpumask);
390
391int
392cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node)
393{
394 cpumask_t *mask;
395 int rc;
396
397 if (node < 0 || node >= MAX_NUMNODES) {
398 CDEBUG(D_INFO,
399 "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
400 return 0;
401 }
402
Dmitry Eremin6246dab2014-04-27 13:06:59 -0400403 mutex_lock(&cpt_data.cpt_mutex);
Peng Taod7e09d02013-05-02 16:46:55 +0800404
405 mask = cpt_data.cpt_cpumask;
Shivani Bhardwaj26da3232015-11-02 23:19:55 +0530406 cpumask_copy(mask, cpumask_of_node(node));
Peng Taod7e09d02013-05-02 16:46:55 +0800407
408 rc = cfs_cpt_set_cpumask(cptab, cpt, mask);
409
Dmitry Eremin6246dab2014-04-27 13:06:59 -0400410 mutex_unlock(&cpt_data.cpt_mutex);
Peng Taod7e09d02013-05-02 16:46:55 +0800411
412 return rc;
413}
414EXPORT_SYMBOL(cfs_cpt_set_node);
415
416void
417cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node)
418{
419 cpumask_t *mask;
420
421 if (node < 0 || node >= MAX_NUMNODES) {
422 CDEBUG(D_INFO,
423 "Invalid NUMA id %d for CPU partition %d\n", node, cpt);
424 return;
425 }
426
Dmitry Eremin6246dab2014-04-27 13:06:59 -0400427 mutex_lock(&cpt_data.cpt_mutex);
Peng Taod7e09d02013-05-02 16:46:55 +0800428
429 mask = cpt_data.cpt_cpumask;
Shivani Bhardwaj26da3232015-11-02 23:19:55 +0530430 cpumask_copy(mask, cpumask_of_node(node));
Peng Taod7e09d02013-05-02 16:46:55 +0800431
432 cfs_cpt_unset_cpumask(cptab, cpt, mask);
433
Dmitry Eremin6246dab2014-04-27 13:06:59 -0400434 mutex_unlock(&cpt_data.cpt_mutex);
Peng Taod7e09d02013-05-02 16:46:55 +0800435}
436EXPORT_SYMBOL(cfs_cpt_unset_node);
437
438int
439cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
440{
441 int i;
442
443 for_each_node_mask(i, *mask) {
444 if (!cfs_cpt_set_node(cptab, cpt, i))
445 return 0;
446 }
447
448 return 1;
449}
450EXPORT_SYMBOL(cfs_cpt_set_nodemask);
451
452void
453cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask)
454{
455 int i;
456
457 for_each_node_mask(i, *mask)
458 cfs_cpt_unset_node(cptab, cpt, i);
459}
460EXPORT_SYMBOL(cfs_cpt_unset_nodemask);
461
462void
463cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt)
464{
465 int last;
466 int i;
467
468 if (cpt == CFS_CPT_ANY) {
469 last = cptab->ctb_nparts - 1;
470 cpt = 0;
471 } else {
472 last = cpt;
473 }
474
475 for (; cpt <= last; cpt++) {
Oleg Drokin84177732015-03-07 19:24:26 -0500476 for_each_cpu(i, cptab->ctb_parts[cpt].cpt_cpumask)
Peng Taod7e09d02013-05-02 16:46:55 +0800477 cfs_cpt_unset_cpu(cptab, cpt, i);
478 }
479}
480EXPORT_SYMBOL(cfs_cpt_clear);
481
482int
483cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt)
484{
485 nodemask_t *mask;
486 int weight;
487 int rotor;
488 int node;
489
490 /* convert CPU partition ID to HW node id */
491
492 if (cpt < 0 || cpt >= cptab->ctb_nparts) {
493 mask = cptab->ctb_nodemask;
494 rotor = cptab->ctb_spread_rotor++;
495 } else {
496 mask = cptab->ctb_parts[cpt].cpt_nodemask;
497 rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++;
498 }
499
500 weight = nodes_weight(*mask);
501 LASSERT(weight > 0);
502
503 rotor %= weight;
504
505 for_each_node_mask(node, *mask) {
506 if (rotor-- == 0)
507 return node;
508 }
509
510 LBUG();
511 return 0;
512}
513EXPORT_SYMBOL(cfs_cpt_spread_node);
514
515int
516cfs_cpt_current(struct cfs_cpt_table *cptab, int remap)
517{
518 int cpu = smp_processor_id();
519 int cpt = cptab->ctb_cpu2cpt[cpu];
520
521 if (cpt < 0) {
522 if (!remap)
523 return cpt;
524
525 /* don't return negative value for safety of upper layer,
Oleg Drokina3fbcb3c2016-02-16 00:47:08 -0500526 * instead we shadow the unknown cpu to a valid partition ID
527 */
Peng Taod7e09d02013-05-02 16:46:55 +0800528 cpt = cpu % cptab->ctb_nparts;
529 }
530
531 return cpt;
532}
533EXPORT_SYMBOL(cfs_cpt_current);
534
535int
536cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu)
537{
Oleg Drokinc96d2362015-03-02 01:01:47 -0500538 LASSERT(cpu >= 0 && cpu < nr_cpu_ids);
Peng Taod7e09d02013-05-02 16:46:55 +0800539
540 return cptab->ctb_cpu2cpt[cpu];
541}
542EXPORT_SYMBOL(cfs_cpt_of_cpu);
543
544int
545cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt)
546{
547 cpumask_t *cpumask;
548 nodemask_t *nodemask;
549 int rc;
550 int i;
551
552 LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts));
553
554 if (cpt == CFS_CPT_ANY) {
555 cpumask = cptab->ctb_cpumask;
556 nodemask = cptab->ctb_nodemask;
557 } else {
558 cpumask = cptab->ctb_parts[cpt].cpt_cpumask;
559 nodemask = cptab->ctb_parts[cpt].cpt_nodemask;
560 }
561
Oleg Drokin84177732015-03-07 19:24:26 -0500562 if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids) {
Joe Perches2d00bd12014-11-23 11:28:50 -0800563 CERROR("No online CPU found in CPU partition %d, did someone do CPU hotplug on system? You might need to reload Lustre modules to keep system working well.\n",
564 cpt);
Peng Taod7e09d02013-05-02 16:46:55 +0800565 return -EINVAL;
566 }
567
568 for_each_online_cpu(i) {
Oleg Drokinc96d2362015-03-02 01:01:47 -0500569 if (cpumask_test_cpu(i, cpumask))
Peng Taod7e09d02013-05-02 16:46:55 +0800570 continue;
571
Peng Tao32654b62013-06-06 22:59:09 +0800572 rc = set_cpus_allowed_ptr(current, cpumask);
Peng Taod7e09d02013-05-02 16:46:55 +0800573 set_mems_allowed(*nodemask);
574 if (rc == 0)
575 schedule(); /* switch to allowed CPU */
576
577 return rc;
578 }
579
580 /* don't need to set affinity because all online CPUs are covered */
581 return 0;
582}
583EXPORT_SYMBOL(cfs_cpt_bind);
584
585/**
586 * Choose max to \a number CPUs from \a node and set them in \a cpt.
587 * We always prefer to choose CPU in the same core/socket.
588 */
589static int
590cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt,
591 cpumask_t *node, int number)
592{
593 cpumask_t *socket = NULL;
594 cpumask_t *core = NULL;
595 int rc = 0;
596 int cpu;
597
598 LASSERT(number > 0);
599
Oleg Drokinc96d2362015-03-02 01:01:47 -0500600 if (number >= cpumask_weight(node)) {
601 while (!cpumask_empty(node)) {
602 cpu = cpumask_first(node);
Peng Taod7e09d02013-05-02 16:46:55 +0800603
604 rc = cfs_cpt_set_cpu(cptab, cpt, cpu);
605 if (!rc)
606 return -EINVAL;
Oleg Drokinc96d2362015-03-02 01:01:47 -0500607 cpumask_clear_cpu(cpu, node);
Peng Taod7e09d02013-05-02 16:46:55 +0800608 }
609 return 0;
610 }
611
612 /* allocate scratch buffer */
613 LIBCFS_ALLOC(socket, cpumask_size());
614 LIBCFS_ALLOC(core, cpumask_size());
Oleg Drokin15d9f522016-02-16 00:46:44 -0500615 if (!socket || !core) {
Peng Taod7e09d02013-05-02 16:46:55 +0800616 rc = -ENOMEM;
617 goto out;
618 }
619
Oleg Drokinc96d2362015-03-02 01:01:47 -0500620 while (!cpumask_empty(node)) {
621 cpu = cpumask_first(node);
Peng Taod7e09d02013-05-02 16:46:55 +0800622
623 /* get cpumask for cores in the same socket */
Shivani Bhardwaj7d6e3982015-11-02 23:19:13 +0530624 cpumask_copy(socket, topology_core_cpumask(cpu));
Oleg Drokinc96d2362015-03-02 01:01:47 -0500625 cpumask_and(socket, socket, node);
Peng Taod7e09d02013-05-02 16:46:55 +0800626
Oleg Drokinc96d2362015-03-02 01:01:47 -0500627 LASSERT(!cpumask_empty(socket));
Peng Taod7e09d02013-05-02 16:46:55 +0800628
Oleg Drokinc96d2362015-03-02 01:01:47 -0500629 while (!cpumask_empty(socket)) {
Peng Taod7e09d02013-05-02 16:46:55 +0800630 int i;
631
632 /* get cpumask for hts in the same core */
Shivani Bhardwaj9561c252015-11-02 23:19:34 +0530633 cpumask_copy(core, topology_sibling_cpumask(cpu));
Oleg Drokinc96d2362015-03-02 01:01:47 -0500634 cpumask_and(core, core, node);
Peng Taod7e09d02013-05-02 16:46:55 +0800635
Oleg Drokinc96d2362015-03-02 01:01:47 -0500636 LASSERT(!cpumask_empty(core));
Peng Taod7e09d02013-05-02 16:46:55 +0800637
Oleg Drokin84177732015-03-07 19:24:26 -0500638 for_each_cpu(i, core) {
Oleg Drokinc96d2362015-03-02 01:01:47 -0500639 cpumask_clear_cpu(i, socket);
640 cpumask_clear_cpu(i, node);
Peng Taod7e09d02013-05-02 16:46:55 +0800641
642 rc = cfs_cpt_set_cpu(cptab, cpt, i);
643 if (!rc) {
644 rc = -EINVAL;
645 goto out;
646 }
647
648 if (--number == 0)
649 goto out;
650 }
Oleg Drokinc96d2362015-03-02 01:01:47 -0500651 cpu = cpumask_first(socket);
Peng Taod7e09d02013-05-02 16:46:55 +0800652 }
653 }
654
655 out:
Oleg Drokin15d9f522016-02-16 00:46:44 -0500656 if (socket)
Peng Taod7e09d02013-05-02 16:46:55 +0800657 LIBCFS_FREE(socket, cpumask_size());
Oleg Drokin15d9f522016-02-16 00:46:44 -0500658 if (core)
Peng Taod7e09d02013-05-02 16:46:55 +0800659 LIBCFS_FREE(core, cpumask_size());
660 return rc;
661}
662
#define CPT_WEIGHT_MIN	4u

/**
 * Estimate a reasonable number of CPU partitions from the number of online
 * CPUs and online NUMA nodes.
 *
 * With at most CPT_WEIGHT_MIN CPUs a single partition is used. Otherwise a
 * power-of-two N with 2 * (N - 1)^2 < NCPUS <= 2 * N^2 is preferred and
 * then adjusted towards the node count by halving/doubling it. On 32-bit
 * builds the result is capped at 2. Finally the value is decreased until
 * it divides the number of online CPUs.
 */
static unsigned int
cfs_cpt_num_estimate(void)
{
	unsigned nodes = num_online_nodes();
	unsigned cpus = num_online_cpus();
	unsigned ncpt;

	if (cpus <= CPT_WEIGHT_MIN) {
		ncpt = 1;
	} else {
		/* generate reasonable number of CPU partitions based on
		 * total number of CPUs, Preferred N should be power2 and
		 * match this condition:
		 * 2 * (N - 1)^2 < NCPUS <= 2 * N^2
		 */
		ncpt = 2;
		while (cpus > 2 * ncpt * ncpt)
			ncpt <<= 1;

		if (ncpt <= nodes) { /* fat numa system */
			while (nodes > ncpt)
				nodes >>= 1;
		} else { /* ncpt > nodes */
			while ((nodes << 1) <= ncpt)
				nodes <<= 1;
		}

		ncpt = nodes;
	}

#if (BITS_PER_LONG == 32)
	/* config many CPU partitions on 32-bit system could consume
	 * too much memory
	 */
	ncpt = min(2U, ncpt);
#endif
	/* worst case is 1 */
	while (cpus % ncpt != 0)
		ncpt--;

	return ncpt;
}
707
708static struct cfs_cpt_table *
709cfs_cpt_table_create(int ncpt)
710{
711 struct cfs_cpt_table *cptab = NULL;
712 cpumask_t *mask = NULL;
713 int cpt = 0;
714 int num;
715 int rc;
716 int i;
717
718 rc = cfs_cpt_num_estimate();
719 if (ncpt <= 0)
720 ncpt = rc;
721
722 if (ncpt > num_online_cpus() || ncpt > 4 * rc) {
Joe Perches2d00bd12014-11-23 11:28:50 -0800723 CWARN("CPU partition number %d is larger than suggested value (%d), your system may have performance issue or run out of memory while under pressure\n",
Peng Taod7e09d02013-05-02 16:46:55 +0800724 ncpt, rc);
725 }
726
727 if (num_online_cpus() % ncpt != 0) {
Joe Perches2d00bd12014-11-23 11:28:50 -0800728 CERROR("CPU number %d is not multiple of cpu_npartition %d, please try different cpu_npartitions value or set pattern string by cpu_pattern=STRING\n",
Peng Taod7e09d02013-05-02 16:46:55 +0800729 (int)num_online_cpus(), ncpt);
730 goto failed;
731 }
732
733 cptab = cfs_cpt_table_alloc(ncpt);
Oleg Drokin15d9f522016-02-16 00:46:44 -0500734 if (!cptab) {
Peng Taod7e09d02013-05-02 16:46:55 +0800735 CERROR("Failed to allocate CPU map(%d)\n", ncpt);
736 goto failed;
737 }
738
739 num = num_online_cpus() / ncpt;
740 if (num == 0) {
741 CERROR("CPU changed while setting CPU partition\n");
742 goto failed;
743 }
744
745 LIBCFS_ALLOC(mask, cpumask_size());
Oleg Drokin15d9f522016-02-16 00:46:44 -0500746 if (!mask) {
Peng Taod7e09d02013-05-02 16:46:55 +0800747 CERROR("Failed to allocate scratch cpumask\n");
748 goto failed;
749 }
750
751 for_each_online_node(i) {
Shivani Bhardwaj26da3232015-11-02 23:19:55 +0530752 cpumask_copy(mask, cpumask_of_node(i));
Peng Taod7e09d02013-05-02 16:46:55 +0800753
Oleg Drokinc96d2362015-03-02 01:01:47 -0500754 while (!cpumask_empty(mask)) {
Peng Taod7e09d02013-05-02 16:46:55 +0800755 struct cfs_cpu_partition *part;
756 int n;
757
Andriy Skulysh9ebd4892016-04-27 21:37:16 -0400758 /*
759 * Each emulated NUMA node has all allowed CPUs in
760 * the mask.
761 * End loop when all partitions have assigned CPUs.
762 */
763 if (cpt == ncpt)
764 break;
Peng Taod7e09d02013-05-02 16:46:55 +0800765
766 part = &cptab->ctb_parts[cpt];
767
Oleg Drokinc96d2362015-03-02 01:01:47 -0500768 n = num - cpumask_weight(part->cpt_cpumask);
Peng Taod7e09d02013-05-02 16:46:55 +0800769 LASSERT(n > 0);
770
771 rc = cfs_cpt_choose_ncpus(cptab, cpt, mask, n);
772 if (rc < 0)
773 goto failed;
774
Oleg Drokinc96d2362015-03-02 01:01:47 -0500775 LASSERT(num >= cpumask_weight(part->cpt_cpumask));
776 if (num == cpumask_weight(part->cpt_cpumask))
Peng Taod7e09d02013-05-02 16:46:55 +0800777 cpt++;
778 }
779 }
780
781 if (cpt != ncpt ||
Oleg Drokinc96d2362015-03-02 01:01:47 -0500782 num != cpumask_weight(cptab->ctb_parts[ncpt - 1].cpt_cpumask)) {
Joe Perches2d00bd12014-11-23 11:28:50 -0800783 CERROR("Expect %d(%d) CPU partitions but got %d(%d), CPU hotplug/unplug while setting?\n",
Peng Taod7e09d02013-05-02 16:46:55 +0800784 cptab->ctb_nparts, num, cpt,
Oleg Drokinc96d2362015-03-02 01:01:47 -0500785 cpumask_weight(cptab->ctb_parts[ncpt - 1].cpt_cpumask));
Peng Taod7e09d02013-05-02 16:46:55 +0800786 goto failed;
787 }
788
789 LIBCFS_FREE(mask, cpumask_size());
790
791 return cptab;
792
793 failed:
Joe Perches2d00bd12014-11-23 11:28:50 -0800794 CERROR("Failed to setup CPU-partition-table with %d CPU-partitions, online HW nodes: %d, HW cpus: %d.\n",
Peng Taod7e09d02013-05-02 16:46:55 +0800795 ncpt, num_online_nodes(), num_online_cpus());
796
Oleg Drokin15d9f522016-02-16 00:46:44 -0500797 if (mask)
Peng Taod7e09d02013-05-02 16:46:55 +0800798 LIBCFS_FREE(mask, cpumask_size());
799
Oleg Drokin15d9f522016-02-16 00:46:44 -0500800 if (cptab)
Peng Taod7e09d02013-05-02 16:46:55 +0800801 cfs_cpt_table_free(cptab);
802
803 return NULL;
804}
805
806static struct cfs_cpt_table *
807cfs_cpt_table_create_pattern(char *pattern)
808{
809 struct cfs_cpt_table *cptab;
810 char *str = pattern;
811 int node = 0;
812 int high;
813 int ncpt;
814 int c;
815
816 for (ncpt = 0;; ncpt++) { /* quick scan bracket */
817 str = strchr(str, '[');
Oleg Drokin15d9f522016-02-16 00:46:44 -0500818 if (!str)
Peng Taod7e09d02013-05-02 16:46:55 +0800819 break;
820 str++;
821 }
822
823 str = cfs_trimwhite(pattern);
824 if (*str == 'n' || *str == 'N') {
825 pattern = str + 1;
826 node = 1;
827 }
828
829 if (ncpt == 0 ||
830 (node && ncpt > num_online_nodes()) ||
831 (!node && ncpt > num_online_cpus())) {
832 CERROR("Invalid pattern %s, or too many partitions %d\n",
833 pattern, ncpt);
834 return NULL;
835 }
836
Oleg Drokinc96d2362015-03-02 01:01:47 -0500837 high = node ? MAX_NUMNODES - 1 : nr_cpu_ids - 1;
Peng Taod7e09d02013-05-02 16:46:55 +0800838
839 cptab = cfs_cpt_table_alloc(ncpt);
Oleg Drokin15d9f522016-02-16 00:46:44 -0500840 if (!cptab) {
Peng Taod7e09d02013-05-02 16:46:55 +0800841 CERROR("Failed to allocate cpu partition table\n");
842 return NULL;
843 }
844
845 for (str = cfs_trimwhite(pattern), c = 0;; c++) {
846 struct cfs_range_expr *range;
847 struct cfs_expr_list *el;
848 char *bracket = strchr(str, '[');
849 int cpt;
850 int rc;
851 int i;
852 int n;
853
Oleg Drokin15d9f522016-02-16 00:46:44 -0500854 if (!bracket) {
Peng Taod7e09d02013-05-02 16:46:55 +0800855 if (*str != 0) {
856 CERROR("Invalid pattern %s\n", str);
857 goto failed;
Janani Ravichandran6c441b92016-02-18 17:15:48 -0500858 }
859 if (c != ncpt) {
Peng Taod7e09d02013-05-02 16:46:55 +0800860 CERROR("expect %d partitions but found %d\n",
861 ncpt, c);
862 goto failed;
863 }
864 break;
865 }
866
Dmitry Eremin16e9f6d2014-04-27 13:06:55 -0400867 if (sscanf(str, "%d%n", &cpt, &n) < 1) {
Peng Taod7e09d02013-05-02 16:46:55 +0800868 CERROR("Invalid cpu pattern %s\n", str);
869 goto failed;
870 }
871
872 if (cpt < 0 || cpt >= ncpt) {
873 CERROR("Invalid partition id %d, total partitions %d\n",
874 cpt, ncpt);
875 goto failed;
876 }
877
878 if (cfs_cpt_weight(cptab, cpt) != 0) {
879 CERROR("Partition %d has already been set.\n", cpt);
880 goto failed;
881 }
882
883 str = cfs_trimwhite(str + n);
884 if (str != bracket) {
885 CERROR("Invalid pattern %s\n", str);
886 goto failed;
887 }
888
889 bracket = strchr(str, ']');
Oleg Drokin15d9f522016-02-16 00:46:44 -0500890 if (!bracket) {
Peng Taod7e09d02013-05-02 16:46:55 +0800891 CERROR("missing right bracket for cpt %d, %s\n",
892 cpt, str);
893 goto failed;
894 }
895
896 if (cfs_expr_list_parse(str, (bracket - str) + 1,
897 0, high, &el) != 0) {
898 CERROR("Can't parse number range: %s\n", str);
899 goto failed;
900 }
901
902 list_for_each_entry(range, &el->el_exprs, re_link) {
903 for (i = range->re_lo; i <= range->re_hi; i++) {
904 if ((i - range->re_lo) % range->re_stride != 0)
905 continue;
906
907 rc = node ? cfs_cpt_set_node(cptab, cpt, i) :
908 cfs_cpt_set_cpu(cptab, cpt, i);
909 if (!rc) {
910 cfs_expr_list_free(el);
911 goto failed;
912 }
913 }
914 }
915
916 cfs_expr_list_free(el);
917
918 if (!cfs_cpt_online(cptab, cpt)) {
919 CERROR("No online CPU is found on partition %d\n", cpt);
920 goto failed;
921 }
922
923 str = cfs_trimwhite(bracket + 1);
924 }
925
926 return cptab;
927
928 failed:
929 cfs_cpt_table_free(cptab);
930 return NULL;
931}
932
933#ifdef CONFIG_HOTPLUG_CPU
934static int
935cfs_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
936{
937 unsigned int cpu = (unsigned long)hcpu;
Oleg Drokin6fd67d82014-02-28 21:16:46 -0500938 bool warn;
Peng Taod7e09d02013-05-02 16:46:55 +0800939
940 switch (action) {
941 case CPU_DEAD:
942 case CPU_DEAD_FROZEN:
943 case CPU_ONLINE:
944 case CPU_ONLINE_FROZEN:
945 spin_lock(&cpt_data.cpt_lock);
946 cpt_data.cpt_version++;
947 spin_unlock(&cpt_data.cpt_lock);
Oleg Drokin4d8efec2016-02-16 00:47:11 -0500948 /* Fall through */
Peng Taod7e09d02013-05-02 16:46:55 +0800949 default:
Oleg Drokin6fd67d82014-02-28 21:16:46 -0500950 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) {
951 CDEBUG(D_INFO, "CPU changed [cpu %u action %lx]\n",
952 cpu, action);
953 break;
954 }
955
Dmitry Eremin6246dab2014-04-27 13:06:59 -0400956 mutex_lock(&cpt_data.cpt_mutex);
Oleg Drokin6fd67d82014-02-28 21:16:46 -0500957 /* if all HTs in a core are offline, it may break affinity */
Shivani Bhardwaj9561c252015-11-02 23:19:34 +0530958 cpumask_copy(cpt_data.cpt_cpumask,
959 topology_sibling_cpumask(cpu));
Oleg Drokin84177732015-03-07 19:24:26 -0500960 warn = cpumask_any_and(cpt_data.cpt_cpumask,
961 cpu_online_mask) >= nr_cpu_ids;
Dmitry Eremin6246dab2014-04-27 13:06:59 -0400962 mutex_unlock(&cpt_data.cpt_mutex);
Oleg Drokin6fd67d82014-02-28 21:16:46 -0500963 CDEBUG(warn ? D_WARNING : D_INFO,
Joe Perches2d00bd12014-11-23 11:28:50 -0800964 "Lustre: can't support CPU plug-out well now, performance and stability could be impacted [CPU %u action: %lx]\n",
965 cpu, action);
Peng Taod7e09d02013-05-02 16:46:55 +0800966 }
967
968 return NOTIFY_OK;
969}
970
971static struct notifier_block cfs_cpu_notifier = {
972 .notifier_call = cfs_cpu_notify,
973 .priority = 0
974};
975
976#endif
977
978void
979cfs_cpu_fini(void)
980{
Oleg Drokin15d9f522016-02-16 00:46:44 -0500981 if (cfs_cpt_table)
Peng Taod7e09d02013-05-02 16:46:55 +0800982 cfs_cpt_table_free(cfs_cpt_table);
983
984#ifdef CONFIG_HOTPLUG_CPU
985 unregister_hotcpu_notifier(&cfs_cpu_notifier);
986#endif
Oleg Drokin15d9f522016-02-16 00:46:44 -0500987 if (cpt_data.cpt_cpumask)
Peng Taod7e09d02013-05-02 16:46:55 +0800988 LIBCFS_FREE(cpt_data.cpt_cpumask, cpumask_size());
989}
990
991int
992cfs_cpu_init(void)
993{
Oleg Drokin15d9f522016-02-16 00:46:44 -0500994 LASSERT(!cfs_cpt_table);
Peng Taod7e09d02013-05-02 16:46:55 +0800995
996 memset(&cpt_data, 0, sizeof(cpt_data));
997
998 LIBCFS_ALLOC(cpt_data.cpt_cpumask, cpumask_size());
Oleg Drokin15d9f522016-02-16 00:46:44 -0500999 if (!cpt_data.cpt_cpumask) {
Peng Taod7e09d02013-05-02 16:46:55 +08001000 CERROR("Failed to allocate scratch buffer\n");
1001 return -1;
1002 }
1003
1004 spin_lock_init(&cpt_data.cpt_lock);
Dmitry Eremin6246dab2014-04-27 13:06:59 -04001005 mutex_init(&cpt_data.cpt_mutex);
Peng Taod7e09d02013-05-02 16:46:55 +08001006
1007#ifdef CONFIG_HOTPLUG_CPU
1008 register_hotcpu_notifier(&cfs_cpu_notifier);
1009#endif
1010
1011 if (*cpu_pattern != 0) {
1012 cfs_cpt_table = cfs_cpt_table_create_pattern(cpu_pattern);
Oleg Drokin15d9f522016-02-16 00:46:44 -05001013 if (!cfs_cpt_table) {
Peng Taod7e09d02013-05-02 16:46:55 +08001014 CERROR("Failed to create cptab from pattern %s\n",
1015 cpu_pattern);
1016 goto failed;
1017 }
1018
1019 } else {
1020 cfs_cpt_table = cfs_cpt_table_create(cpu_npartitions);
Oleg Drokin15d9f522016-02-16 00:46:44 -05001021 if (!cfs_cpt_table) {
Peng Taod7e09d02013-05-02 16:46:55 +08001022 CERROR("Failed to create ptable with npartitions %d\n",
1023 cpu_npartitions);
1024 goto failed;
1025 }
1026 }
1027
1028 spin_lock(&cpt_data.cpt_lock);
1029 if (cfs_cpt_table->ctb_version != cpt_data.cpt_version) {
1030 spin_unlock(&cpt_data.cpt_lock);
1031 CERROR("CPU hotplug/unplug during setup\n");
1032 goto failed;
1033 }
1034 spin_unlock(&cpt_data.cpt_lock);
1035
1036 LCONSOLE(0, "HW CPU cores: %d, npartitions: %d\n",
1037 num_online_cpus(), cfs_cpt_number(cfs_cpt_table));
1038 return 0;
1039
1040 failed:
1041 cfs_cpu_fini();
1042 return -1;
1043}
1044
1045#endif