#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/printk.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

#include <trace/events/sched.h>

#include "sched.h"
#include "tune.h"

#ifdef CONFIG_CGROUP_SCHEDTUNE
bool schedtune_initialized = false;
#endif

unsigned int sysctl_sched_cfs_boost __read_mostly;

extern struct reciprocal_value schedtune_spc_rdiv;
struct target_nrg schedtune_target_nrg;

/* Performance Boost region (B) threshold params */
static int perf_boost_idx;

/* Performance Constraint region (C) threshold params */
static int perf_constrain_idx;

/*
 * Performance-Energy (P-E) Space threshold constants
 */
struct threshold_params {
        int nrg_gain;
        int cap_gain;
};

/*
 * System specific P-E space threshold constants
 */
static struct threshold_params
threshold_gains[] = {
        { 0, 5 }, /*   < 10% */
        { 1, 5 }, /*   < 20% */
        { 2, 5 }, /*   < 30% */
        { 3, 5 }, /*   < 40% */
        { 4, 5 }, /*   < 50% */
        { 5, 4 }, /*   < 60% */
        { 5, 3 }, /*   < 70% */
        { 5, 2 }, /*   < 80% */
        { 5, 1 }, /*   < 90% */
        { 5, 0 }  /* <= 100% */
};
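
/*
 * Note: boost_write() and sysctl_sched_cfs_boost_handler() below select an
 * entry of this table with clamp(boost_pct, 0, 99) / 10. For example
 * (illustrative value only), a 25% boost maps to index 2, i.e.
 * { .nrg_gain = 2, .cap_gain = 5 }.
 */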

static int
__schedtune_accept_deltas(int nrg_delta, int cap_delta,
                          int perf_boost_idx, int perf_constrain_idx)
{
        int payoff = -INT_MAX;
        int gain_idx = -1;

        /* Performance Boost (B) region */
        if (nrg_delta >= 0 && cap_delta > 0)
                gain_idx = perf_boost_idx;
        /* Performance Constraint (C) region */
        else if (nrg_delta < 0 && cap_delta <= 0)
                gain_idx = perf_constrain_idx;

        /* Default: reject schedule candidate */
        if (gain_idx == -1)
                return payoff;

        /*
         * Evaluate "Performance Boost" vs "Energy Increase"
         *
         * - Performance Boost (B) region
         *
         *   Condition: nrg_delta >= 0 && cap_delta > 0
         *   Payoff criteria:
         *      cap_gain / nrg_gain  < cap_delta / nrg_delta, i.e.
         *      cap_gain * nrg_delta < cap_delta * nrg_gain
         *   Note that since both nrg_gain and nrg_delta are positive, the
         *   inequality does not change. Thus:
         *
         *      payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta)
         *
         * - Performance Constraint (C) region
         *
         *   Condition: nrg_delta < 0 && cap_delta <= 0
         *   Payoff criteria:
         *      cap_gain / nrg_gain  > cap_delta / nrg_delta, i.e.
         *      cap_gain * nrg_delta < cap_delta * nrg_gain
         *   Note that since nrg_gain > 0 while nrg_delta < 0, the
         *   inequality is reversed. Thus:
         *
         *      payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta)
         *
         * This means that, given the same positive {cap,nrg}_gain values
         * for both the B and C regions, we can use the same payoff formula,
         * where a positive value represents the accept condition.
         */
        payoff  = cap_delta * threshold_gains[gain_idx].nrg_gain;
        payoff -= nrg_delta * threshold_gains[gain_idx].cap_gain;

        return payoff;
}
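
/*
 * Worked example (illustrative values, not taken from any specific
 * platform): with a 25% boost, threshold_gains[2] = { .nrg_gain = 2,
 * .cap_gain = 5 } is selected. A candidate in the B region with
 * cap_delta = +30 and nrg_delta = +10 then gives:
 *
 *      payoff = 30 * 2 - 10 * 5 = 10 > 0    => accepted
 *
 * while cap_delta = +10 and nrg_delta = +30 gives:
 *
 *      payoff = 10 * 2 - 30 * 5 = -130 < 0  => rejected
 */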

#ifdef CONFIG_CGROUP_SCHEDTUNE

/*
 * EAS scheduler tunables for task groups.
 *
 * When CGroup support is enabled, we have to synchronize two different
 * paths:
 *  - slow path: where CGroups are created/updated/removed
 *  - fast path: where tasks in a CGroup are accounted
 *
 * The slow path tracks (a limited number of) CGroups and maps each on a
 * "boost_group" index. The fast path accounts tasks currently RUNNABLE on
 * each "boost_group".
 *
 * Once a new CGroup is created, a boost group idx is assigned and the
 * corresponding "boost_group" is marked as valid on each CPU.
 * Once a CGroup is released, the corresponding "boost_group" is marked as
 * invalid on each CPU. The CPU boost value (boost_max) is aggregated by
 * considering only valid boost_groups with a non-null tasks counter.
 *
 * .:: Locking strategy
 *
 * The fast path uses a spin lock for each CPU boost_group which protects
 * the tasks counter.
 *
 * The "valid" and "boost" values of each CPU boost_group are instead
 * protected by the RCU lock provided by the CGroups callbacks. Thus, only
 * the slow path can access and modify the boost_group attributes of each
 * CPU. The fast path will catch up with the most updated values at the
 * next scheduling event (i.e. enqueue/dequeue).
 *
 *                                                      |
 *              SLOW PATH                               |    FAST PATH
 *        CGroup add/update/remove                      |    Scheduler enqueue/dequeue events
 *                                                      |
 *                                                      |
 *                                                      |    DEFINE_PER_CPU(struct boost_groups)
 *                                                      |    +--------------+----+---+----+----+
 *                                                      |    | idle         |    |   |    |    |
 *                                                      |    | boost_max    |    |   |    |    |
 *                                                      | +---->lock        |    |   |    |    |
 *  struct schedtune            allocated_groups        | |  | group[    ]  |    |   |    |    |
 *  +------------------------------+    +-------+       | |  +--+---------+-+----+---+----+----+
 *  | idx                          |    |       |       | |     | valid   |
 *  | boost / prefer_idle          |    |       |       | |     | boost   |
 *  | perf_{boost,constrain}_idx   | <----+(*)  |       | |     | tasks   | <------------+
 *  | css                          |    +-------+       | |     +---------+              |
 *  +-+----------------------------+    |       |       | |     |         |              |
 *    ^                                 |       |       | |     |         |              |
 *    |                                 +-------+       | |     +---------+              |
 *    |                                 |       |       | |     |         |              |
 *    |                                 |       |       | |     |         |              |
 *    |                                 +-------+       | |     +---------+              |
 *    | zmalloc                         |       |       | |     |         |              |
 *    |                                 |       |       | |     |         |              |
 *    |                                 +-------+       | |     +---------+              |
 *    +                         BOOSTGROUPS_COUNT       | |     BOOSTGROUPS_COUNT        |
 *    schedtune_boostgroup_init()                       | +                              |
 *                                                      |   schedtune_{en,de}queue_task()|
 *                                                      |                                +
 *                                                      |   schedtune_tasks_update()
 *                                                      |
 */

/* SchedTune tunables for a group of tasks */
struct schedtune {
        /* SchedTune CGroup subsystem */
        struct cgroup_subsys_state css;

        /* Boost group allocated ID */
        int idx;

        /* Boost value for tasks on that SchedTune CGroup */
        int boost;

        /* Performance Boost (B) region threshold params */
        int perf_boost_idx;

        /* Performance Constraint (C) region threshold params */
        int perf_constrain_idx;

        /*
         * Hint to bias scheduling of tasks on that SchedTune CGroup
         * towards idle CPUs.
         */
        int prefer_idle;
};

static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
{
        return css ? container_of(css, struct schedtune, css) : NULL;
}

static inline struct schedtune *task_schedtune(struct task_struct *tsk)
{
        return css_st(task_css(tsk, schedtune_cgrp_id));
}

static inline struct schedtune *parent_st(struct schedtune *st)
{
        return css_st(st->css.parent);
}

/*
 * SchedTune root control group
 * The root control group is used to define a system-wide boost,
 * which is applied to all tasks in the system.
 * Task specific boost tuning could be specified by creating and
 * configuring a child control group under the root one.
 * By default, system-wide boosting is disabled, i.e. no boosting is applied
 * to tasks which are not in a child control group.
 */
static struct schedtune
root_schedtune = {
        .boost = 0,
        .perf_boost_idx = 0,
        .perf_constrain_idx = 0,
        .prefer_idle = 0,
};

int
schedtune_accept_deltas(int nrg_delta, int cap_delta,
                        struct task_struct *task)
{
        struct schedtune *ct;
        int perf_boost_idx;
        int perf_constrain_idx;

        /* Optimal (O) region */
        if (nrg_delta < 0 && cap_delta > 0) {
                trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0);
                return INT_MAX;
        }

        /* Suboptimal (S) region */
        if (nrg_delta > 0 && cap_delta < 0) {
                trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5);
                return -INT_MAX;
        }

        /* Get task specific perf Boost/Constraints indexes */
        rcu_read_lock();
        ct = task_schedtune(task);
        perf_boost_idx = ct->perf_boost_idx;
        perf_constrain_idx = ct->perf_constrain_idx;
        rcu_read_unlock();

        return __schedtune_accept_deltas(nrg_delta, cap_delta,
                                         perf_boost_idx, perf_constrain_idx);
}

/*
 * Maximum number of boost groups to support.
 * When per-task boosting is used we still allow only a limited number of
 * boost groups, for two main reasons:
 * 1. on a real system we usually have only a few classes of workloads which
 *    make sense to boost with different values (e.g. background vs
 *    foreground tasks, interactive vs low-priority tasks)
 * 2. a limited number allows for a simpler and more memory/time efficient
 *    implementation, especially for the computation of the per-CPU boost
 *    value
 */
#define BOOSTGROUPS_COUNT 5
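
/*
 * Note: index 0 of allocated_group[] below is reserved for the root group,
 * so with BOOSTGROUPS_COUNT == 5 at most four additional boost groups can
 * be created by userspace (see schedtune_css_alloc(), which scans indexes
 * starting from 1).
 */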

/* Array of configured boostgroups */
static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = {
        &root_schedtune,
        NULL,
};

/*
 * SchedTune boost groups
 * Keep track of all the boost groups which impact a CPU, for example when
 * a CPU has two RUNNABLE tasks belonging to two different boost groups and
 * thus likely with different boost values.
 * Since on each system we expect only a limited number of boost groups, here
 * we use a simple array to keep track of the metrics required to compute the
 * maximum per-CPU boosting value.
 */
struct boost_groups {
        /* Maximum boost value for all RUNNABLE tasks on a CPU */
        int boost_max;
        struct {
                /* True when this boost group maps an actual cgroup */
                bool valid;
                /* The boost for tasks on that boost group */
                int boost;
                /* Count of RUNNABLE tasks on that boost group */
                unsigned tasks;
        } group[BOOSTGROUPS_COUNT];
        /* CPU's boost group locking */
        raw_spinlock_t lock;
};

/* Boost groups affecting each CPU in the system */
DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);

static void
schedtune_cpu_update(int cpu)
{
        struct boost_groups *bg;
        int boost_max;
        int idx;

        bg = &per_cpu(cpu_boost_groups, cpu);

        /* The root boost group is always active */
        boost_max = bg->group[0].boost;
        for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {

                /* Ignore boost groups not mapping an actual cgroup */
                if (!bg->group[idx].valid)
                        continue;

                /*
                 * A boost group affects a CPU only if it has
                 * RUNNABLE tasks on that CPU
                 */
                if (bg->group[idx].tasks == 0)
                        continue;

                boost_max = max(boost_max, bg->group[idx].boost);
        }

        /*
         * Ensure boost_max is non-negative when all cgroup boost values
         * are negative. This avoids under-accounting of CPU capacity, which
         * may cause task stacking and frequency spikes.
         */
        boost_max = max(boost_max, 0);
        bg->boost_max = boost_max;
}
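
/*
 * Example (illustrative values): if the root group boost is -10 and a
 * second group with boost = 20 has RUNNABLE tasks on this CPU, boost_max
 * ends up as 20; if no other group has RUNNABLE tasks, the final clamp
 * above raises boost_max from -10 to 0.
 */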

static int
schedtune_boostgroup_update(int idx, int boost)
{
        struct boost_groups *bg;
        int cur_boost_max;
        int old_boost;
        int cpu;

        /* Update per CPU boost groups */
        for_each_possible_cpu(cpu) {
                bg = &per_cpu(cpu_boost_groups, cpu);

                /* A boost group being updated must map an active cgroup */
                BUG_ON(!bg->group[idx].valid);

                /*
                 * Keep track of current boost values to compute the per CPU
                 * maximum only when it has been affected by the new value of
                 * the updated boost group
                 */
                cur_boost_max = bg->boost_max;
                old_boost = bg->group[idx].boost;

                /* Update the boost value of this boost group */
                bg->group[idx].boost = boost;

                /* Check if this update increases the current max */
                if (boost > cur_boost_max && bg->group[idx].tasks) {
                        bg->boost_max = boost;
                        trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max);
                        continue;
                }

                /* Check if this update has decreased the current max */
                if (cur_boost_max == old_boost && old_boost > boost) {
                        schedtune_cpu_update(cpu);
                        trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max);
                        continue;
                }

                trace_sched_tune_boostgroup_update(cpu, 0, bg->boost_max);
        }

        return 0;
}

#define ENQUEUE_TASK  1
#define DEQUEUE_TASK -1

static inline void
schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
{
        struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
        int tasks = bg->group[idx].tasks + task_count;

        /* Update boosted tasks count, preventing it from going negative */
        bg->group[idx].tasks = max(0, tasks);

        trace_sched_tune_tasks_update(p, cpu, tasks, idx,
                                      bg->group[idx].boost, bg->boost_max);

        /* Boost group activation or deactivation on that RQ */
        if (tasks == 1 || tasks == 0)
                schedtune_cpu_update(cpu);
}

/*
 * NOTE: This function must be called while holding the lock on the CPU RQ
 */
void schedtune_enqueue_task(struct task_struct *p, int cpu)
{
        struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
        unsigned long irq_flags;
        struct schedtune *st;
        int idx;

        if (unlikely(!schedtune_initialized))
                return;

        /*
         * When a task is marked PF_EXITING by do_exit() it's going to be
         * dequeued and enqueued multiple times in the exit path.
         * Thus we avoid any further update, since we do not want to change
         * CPU boosting while the task is exiting.
         */
        if (p->flags & PF_EXITING)
                return;

        /*
         * Boost group accounting is protected by a per-cpu lock and requires
         * interrupts to be disabled to avoid race conditions, for example on
         * do_exit()::cgroup_exit() and task migration.
         */
        raw_spin_lock_irqsave(&bg->lock, irq_flags);
        rcu_read_lock();

        st = task_schedtune(p);
        idx = st->idx;

        schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK);

        rcu_read_unlock();
        raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
}

int schedtune_can_attach(struct cgroup_taskset *tset)
{
        struct task_struct *task;
        struct cgroup_subsys_state *css;
        struct boost_groups *bg;
        struct rq_flags irq_flags;
        unsigned int cpu;
        struct rq *rq;
        int src_bg; /* Source boost group index */
        int dst_bg; /* Destination boost group index */
        int tasks;

        if (unlikely(!schedtune_initialized))
                return 0;

        cgroup_taskset_for_each(task, css, tset) {

                /*
                 * Lock the CPU's RQ the task is enqueued on to avoid race
                 * conditions with migration code while the task is being
                 * accounted
                 */
                rq = lock_rq_of(task, &irq_flags);

                if (!task->on_rq) {
                        unlock_rq_of(rq, task, &irq_flags);
                        continue;
                }

                /*
                 * Boost group accounting is protected by a per-cpu lock and
                 * requires interrupts to be disabled to avoid race conditions,
                 * for example on do_exit()::cgroup_exit() and task migration.
                 */
                cpu = cpu_of(rq);
                bg = &per_cpu(cpu_boost_groups, cpu);
                raw_spin_lock(&bg->lock);

                dst_bg = css_st(css)->idx;
                src_bg = task_schedtune(task)->idx;

                /*
                 * Current task is not changing boostgroup, which can
                 * happen when the new hierarchy is in use.
                 */
                if (unlikely(dst_bg == src_bg)) {
                        raw_spin_unlock(&bg->lock);
                        unlock_rq_of(rq, task, &irq_flags);
                        continue;
                }

                /*
                 * This is the case of a RUNNABLE task which is switching its
                 * current boost group.
                 */

                /* Move task from src to dst boost group */
                tasks = bg->group[src_bg].tasks - 1;
                bg->group[src_bg].tasks = max(0, tasks);
                bg->group[dst_bg].tasks += 1;

                raw_spin_unlock(&bg->lock);
                unlock_rq_of(rq, task, &irq_flags);

                /* Update CPU boost group */
                if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1)
                        schedtune_cpu_update(task_cpu(task));

        }

        return 0;
}

void schedtune_cancel_attach(struct cgroup_taskset *tset)
{
        /*
         * This can happen only if the SchedTune controller is mounted with
         * other hierarchies and one of them fails. Since SchedTune is usually
         * mounted on its own hierarchy, for the time being we do not implement
         * a proper rollback mechanism.
         */
        WARN(1, "SchedTune cancel attach not implemented");
}

/*
 * NOTE: This function must be called while holding the lock on the CPU RQ
 */
void schedtune_dequeue_task(struct task_struct *p, int cpu)
{
        struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
        unsigned long irq_flags;
        struct schedtune *st;
        int idx;

        if (unlikely(!schedtune_initialized))
                return;

        /*
         * When a task is marked PF_EXITING by do_exit() it's going to be
         * dequeued and enqueued multiple times in the exit path.
         * Thus we avoid any further update, since we do not want to change
         * CPU boosting while the task is exiting.
         * The last dequeue is already enforced by the do_exit() code path
         * via schedtune_exit_task().
         */
        if (p->flags & PF_EXITING)
                return;

        /*
         * Boost group accounting is protected by a per-cpu lock and requires
         * interrupts to be disabled to avoid race conditions, for example on
         * do_exit()::cgroup_exit() and task migration.
         */
        raw_spin_lock_irqsave(&bg->lock, irq_flags);
        rcu_read_lock();

        st = task_schedtune(p);
        idx = st->idx;

        schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK);

        rcu_read_unlock();
        raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
}

void schedtune_exit_task(struct task_struct *tsk)
{
        struct schedtune *st;
        struct rq_flags irq_flags;
        unsigned int cpu;
        struct rq *rq;
        int idx;

        if (unlikely(!schedtune_initialized))
                return;

        rq = lock_rq_of(tsk, &irq_flags);
        rcu_read_lock();

        cpu = cpu_of(rq);
        st = task_schedtune(tsk);
        idx = st->idx;
        schedtune_tasks_update(tsk, cpu, idx, DEQUEUE_TASK);

        rcu_read_unlock();
        unlock_rq_of(rq, tsk, &irq_flags);
}

int schedtune_cpu_boost(int cpu)
{
        struct boost_groups *bg;

        bg = &per_cpu(cpu_boost_groups, cpu);
        return bg->boost_max;
}

int schedtune_task_boost(struct task_struct *p)
{
        struct schedtune *st;
        int task_boost;

        if (unlikely(!schedtune_initialized))
                return 0;

        /* Get task boost value */
        rcu_read_lock();
        st = task_schedtune(p);
        task_boost = st->boost;
        rcu_read_unlock();

        return task_boost;
}

int schedtune_prefer_idle(struct task_struct *p)
{
        struct schedtune *st;
        int prefer_idle;

        if (unlikely(!schedtune_initialized))
                return 0;

        /* Get prefer_idle value */
        rcu_read_lock();
        st = task_schedtune(p);
        prefer_idle = st->prefer_idle;
        rcu_read_unlock();

        return prefer_idle;
}

static u64
prefer_idle_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
        struct schedtune *st = css_st(css);

        return st->prefer_idle;
}

static int
prefer_idle_write(struct cgroup_subsys_state *css, struct cftype *cft,
                  u64 prefer_idle)
{
        struct schedtune *st = css_st(css);
        st->prefer_idle = prefer_idle;

        return 0;
}

static s64
boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
        struct schedtune *st = css_st(css);

        return st->boost;
}

static int
boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
            s64 boost)
{
        struct schedtune *st = css_st(css);
        unsigned threshold_idx;
        int boost_pct;

        if (boost < -100 || boost > 100)
                return -EINVAL;
        boost_pct = boost;

        /*
         * Update threshold params for Performance Boost (B)
         * and Performance Constraint (C) regions.
         * The current implementation uses the same cuts for both
         * B and C regions.
         */
        threshold_idx = clamp(boost_pct, 0, 99) / 10;
        st->perf_boost_idx = threshold_idx;
        st->perf_constrain_idx = threshold_idx;

        st->boost = boost;
        if (css == &root_schedtune.css) {
                sysctl_sched_cfs_boost = boost;
                perf_boost_idx = threshold_idx;
                perf_constrain_idx = threshold_idx;
        }

        /* Update CPU boost */
        schedtune_boostgroup_update(st->idx, st->boost);

        trace_sched_tune_config(st->boost);

        return 0;
}

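/*
 * Example usage of the attributes defined below (a sketch; it assumes the
 * controller is registered as "schedtune" and mounted as a cgroup v1
 * hierarchy, e.g. under /dev/stune as commonly done on Android):
 *
 *   mkdir /dev/stune/foreground
 *   echo 10    > /dev/stune/foreground/schedtune.boost
 *   echo 1     > /dev/stune/foreground/schedtune.prefer_idle
 *   echo <pid> > /dev/stune/foreground/tasks
 */
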
static struct cftype files[] = {
        {
                .name = "boost",
                .read_s64 = boost_read,
                .write_s64 = boost_write,
        },
        {
                .name = "prefer_idle",
                .read_u64 = prefer_idle_read,
                .write_u64 = prefer_idle_write,
        },
        { }     /* terminate */
};

static void
schedtune_boostgroup_init(struct schedtune *st, int idx)
{
        struct boost_groups *bg;
        int cpu;

        /* Initialize per CPUs boost group support */
        for_each_possible_cpu(cpu) {
                bg = &per_cpu(cpu_boost_groups, cpu);
                bg->group[idx].boost = 0;
                bg->group[idx].valid = true;
        }

        /* Keep track of allocated boost groups */
        allocated_group[idx] = st;
        st->idx = idx;
}

static struct cgroup_subsys_state *
schedtune_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct schedtune *st;
        int idx;

        if (!parent_css)
                return &root_schedtune.css;

        /* Allow only single level hierarchies */
        if (parent_css != &root_schedtune.css) {
                pr_err("Nested SchedTune boosting groups not allowed\n");
                return ERR_PTR(-ENOMEM);
        }

        /* Allow only a limited number of boosting groups */
        for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx)
                if (!allocated_group[idx])
                        break;
        if (idx == BOOSTGROUPS_COUNT) {
                pr_err("Trying to create more than %d SchedTune boosting groups\n",
                       BOOSTGROUPS_COUNT);
                return ERR_PTR(-ENOSPC);
        }

        st = kzalloc(sizeof(*st), GFP_KERNEL);
        if (!st)
                goto out;

        /* Initialize per CPUs boost group support */
        schedtune_boostgroup_init(st, idx);

        return &st->css;

out:
        return ERR_PTR(-ENOMEM);
}

static void
schedtune_boostgroup_release(struct schedtune *st)
{
        struct boost_groups *bg;
        int cpu;

        /* Reset per CPUs boost group support */
        for_each_possible_cpu(cpu) {
                bg = &per_cpu(cpu_boost_groups, cpu);
                bg->group[st->idx].valid = false;
                bg->group[st->idx].boost = 0;
        }

        /* Keep track of allocated boost groups */
        allocated_group[st->idx] = NULL;
}

static void
schedtune_css_free(struct cgroup_subsys_state *css)
{
        struct schedtune *st = css_st(css);

        /* Release per CPUs boost group support */
        schedtune_boostgroup_release(st);
        kfree(st);
}

struct cgroup_subsys schedtune_cgrp_subsys = {
        .css_alloc = schedtune_css_alloc,
        .css_free = schedtune_css_free,
        .can_attach = schedtune_can_attach,
        .cancel_attach = schedtune_cancel_attach,
        .legacy_cftypes = files,
        .early_init = 1,
};

static inline void
schedtune_init_cgroups(void)
{
        struct boost_groups *bg;
        int cpu;

        /* Initialize the per CPU boost groups */
        for_each_possible_cpu(cpu) {
                bg = &per_cpu(cpu_boost_groups, cpu);
                memset(bg, 0, sizeof(struct boost_groups));
                bg->group[0].valid = true;
                raw_spin_lock_init(&bg->lock);
        }

        pr_info("schedtune: configured to support %d boost groups\n",
                BOOSTGROUPS_COUNT);

        schedtune_initialized = true;
}

#else /* CONFIG_CGROUP_SCHEDTUNE */

int
schedtune_accept_deltas(int nrg_delta, int cap_delta,
                        struct task_struct *task)
{
        /* Optimal (O) region */
        if (nrg_delta < 0 && cap_delta > 0) {
                trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0);
                return INT_MAX;
        }

        /* Suboptimal (S) region */
        if (nrg_delta > 0 && cap_delta < 0) {
                trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5);
                return -INT_MAX;
        }

        return __schedtune_accept_deltas(nrg_delta, cap_delta,
                                         perf_boost_idx, perf_constrain_idx);
}

#endif /* CONFIG_CGROUP_SCHEDTUNE */

int
sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
                               void __user *buffer, size_t *lenp,
                               loff_t *ppos)
{
        int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        unsigned threshold_idx;
        int boost_pct;

        if (ret || !write)
                return ret;

        if (sysctl_sched_cfs_boost < -100 || sysctl_sched_cfs_boost > 100)
                return -EINVAL;
        boost_pct = sysctl_sched_cfs_boost;

        /*
         * Update threshold params for Performance Boost (B)
         * and Performance Constraint (C) regions.
         * The current implementation uses the same cuts for both
         * B and C regions.
         */
        threshold_idx = clamp(boost_pct, 0, 99) / 10;
        perf_boost_idx = threshold_idx;
        perf_constrain_idx = threshold_idx;

        return 0;
}
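
/*
 * For example (illustrative value only), writing 45 to the
 * sysctl_sched_cfs_boost knob selects clamp(45, 0, 99) / 10 = 4, so both
 * perf_boost_idx and perf_constrain_idx point at threshold_gains[4],
 * i.e. { .nrg_gain = 4, .cap_gain = 5 }.
 */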

#ifdef CONFIG_SCHED_DEBUG
static void
schedtune_test_nrg(unsigned long delta_pwr)
{
        unsigned long test_delta_pwr;
        unsigned long test_norm_pwr;
        int idx;

        /*
         * Check normalization constants using some constant system
         * energy values
         */
        pr_info("schedtune: verify normalization constants...\n");
        for (idx = 0; idx < 6; ++idx) {
                test_delta_pwr = delta_pwr >> idx;

                /* Normalize on max energy for target platform */
                test_norm_pwr = reciprocal_divide(
                                        test_delta_pwr << SCHED_CAPACITY_SHIFT,
                                        schedtune_target_nrg.rdiv);

                pr_info("schedtune: max_pwr/2^%d: %4lu => norm_pwr: %5lu\n",
                        idx, test_delta_pwr, test_norm_pwr);
        }
}
#else
#define schedtune_test_nrg(delta_pwr)
#endif

/*
 * Compute the min/max power consumption of a cluster and all its CPUs
 */
static void
schedtune_add_cluster_nrg(
                struct sched_domain *sd,
                struct sched_group *sg,
                struct target_nrg *ste)
{
        struct sched_domain *sd2;
        struct sched_group *sg2;

        struct cpumask *cluster_cpus;
        char str[32];

        unsigned long min_pwr;
        unsigned long max_pwr;
        int cpu;

        /* Get Cluster energy using EM data for the first CPU */
        cluster_cpus = sched_group_cpus(sg);
        snprintf(str, 32, "CLUSTER[%*pbl]",
                 cpumask_pr_args(cluster_cpus));

        min_pwr = sg->sge->idle_states[sg->sge->nr_idle_states - 1].power;
        max_pwr = sg->sge->cap_states[sg->sge->nr_cap_states - 1].power;
        pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
                str, min_pwr, max_pwr);

        /*
         * Keep track of this cluster's energy in the computation of the
         * overall system energy
         */
        ste->min_power += min_pwr;
        ste->max_power += max_pwr;

        /* Get CPU energy using EM data for each CPU in the group */
        for_each_cpu(cpu, cluster_cpus) {
                /* Get a SD view for the specific CPU */
                for_each_domain(cpu, sd2) {
                        /* Get the CPU group */
                        sg2 = sd2->groups;
                        min_pwr = sg2->sge->idle_states[sg2->sge->nr_idle_states - 1].power;
                        max_pwr = sg2->sge->cap_states[sg2->sge->nr_cap_states - 1].power;

                        ste->min_power += min_pwr;
                        ste->max_power += max_pwr;

                        snprintf(str, 32, "CPU[%d]", cpu);
                        pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
                                str, min_pwr, max_pwr);

                        /*
                         * Assume we have EM data only at the CPU and
                         * the upper CLUSTER level
                         */
                        BUG_ON(!cpumask_equal(
                                sched_group_cpus(sg),
                                sched_group_cpus(sd2->parent->groups)
                                ));
                        break;
                }
        }
}

/*
 * Initialize the constants required to compute normalized energy.
 * The values of these constants depend on the EM data for the specific
 * target system and topology.
 * Thus, this function is expected to be called by the code
 * that binds the EM to the topology information.
 */
static int
schedtune_init(void)
{
        struct target_nrg *ste = &schedtune_target_nrg;
        unsigned long delta_pwr = 0;
        struct sched_domain *sd;
        struct sched_group *sg;

        pr_info("schedtune: init normalization constants...\n");
        ste->max_power = 0;
        ste->min_power = 0;

        rcu_read_lock();

        /*
         * When EAS is in use, we always have a pointer to the highest SD
         * which provides EM data.
         */
        sd = rcu_dereference(per_cpu(sd_ea, cpumask_first(cpu_online_mask)));
        if (!sd) {
                pr_info("schedtune: no energy model data\n");
                goto nodata;
        }

        sg = sd->groups;
        do {
                schedtune_add_cluster_nrg(sd, sg, ste);
        } while (sg = sg->next, sg != sd->groups);

        rcu_read_unlock();

        pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
                "SYSTEM", ste->min_power, ste->max_power);

        /* Compute normalization constants */
        delta_pwr = ste->max_power - ste->min_power;
        ste->rdiv = reciprocal_value(delta_pwr);
        pr_info("schedtune: using normalization constants mul: %u sh1: %u sh2: %u\n",
                ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2);

        schedtune_test_nrg(delta_pwr);

#ifdef CONFIG_CGROUP_SCHEDTUNE
        schedtune_init_cgroups();
#else
        pr_info("schedtune: configured to support global boosting only\n");
#endif

        schedtune_spc_rdiv = reciprocal_value(100);

        return 0;

nodata:
        pr_warning("schedtune: disabled!\n");
        rcu_read_unlock();
        return -EINVAL;
}
postcore_initcall(schedtune_init);
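
/*
 * Worked example of the normalization set up in schedtune_init()
 * (illustrative numbers only): if the system-wide energy range is
 * delta_pwr = max_power - min_power = 1000, then
 * ste->rdiv = reciprocal_value(1000) and an energy delta of 500 is
 * normalized as reciprocal_divide(500 << SCHED_CAPACITY_SHIFT, rdiv),
 * i.e. roughly (500 * 1024) / 1000 = 512 on the SCHED_CAPACITY_SCALE range.
 */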