#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/printk.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

#include <trace/events/sched.h>

#include "sched.h"
#include "tune.h"

#ifdef CONFIG_CGROUP_SCHEDTUNE
static bool schedtune_initialized = false;
#endif

unsigned int sysctl_sched_cfs_boost __read_mostly;

extern struct target_nrg schedtune_target_nrg;

/* Performance Boost region (B) threshold params */
static int perf_boost_idx;

/* Performance Constraint region (C) threshold params */
static int perf_constrain_idx;

/*
 * Performance-Energy (P-E) Space threshold constants
 */
struct threshold_params {
	int nrg_gain;
	int cap_gain;
};

/*
 * System specific P-E space threshold constants
 */
static struct threshold_params
threshold_gains[] = {
	{ 0, 5 }, /*   < 10% */
	{ 1, 5 }, /*   < 20% */
	{ 2, 5 }, /*   < 30% */
	{ 3, 5 }, /*   < 40% */
	{ 4, 5 }, /*   < 50% */
	{ 5, 4 }, /*   < 60% */
	{ 5, 3 }, /*   < 70% */
	{ 5, 2 }, /*   < 80% */
	{ 5, 1 }, /*   < 90% */
	{ 5, 0 }  /* <= 100% */
};
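
/*
 * Example mapping from a boost percentage to a threshold_gains entry,
 * following the clamp(boost_pct, 0, 99) / 10 computation used by
 * boost_write() and sysctl_sched_cfs_boost_handler() below:
 *   boost =  25% -> index 2 -> { nrg_gain = 2, cap_gain = 5 }
 *   boost =  60% -> index 6 -> { nrg_gain = 5, cap_gain = 3 }
 *   boost = 100% -> index 9 -> { nrg_gain = 5, cap_gain = 0 }
 * Negative boost values are clamped to index 0.
 */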

static int
__schedtune_accept_deltas(int nrg_delta, int cap_delta,
			  int perf_boost_idx, int perf_constrain_idx)
{
	int payoff = -INT_MAX;
	int gain_idx = -1;

	/* Performance Boost (B) region */
	if (nrg_delta >= 0 && cap_delta > 0)
		gain_idx = perf_boost_idx;
	/* Performance Constraint (C) region */
	else if (nrg_delta < 0 && cap_delta <= 0)
		gain_idx = perf_constrain_idx;

	/* Default: reject schedule candidate */
	if (gain_idx == -1)
		return payoff;

	/*
	 * Evaluate "Performance Boost" vs "Energy Increase"
	 *
	 * - Performance Boost (B) region
	 *
	 *   Condition: nrg_delta >= 0 && cap_delta > 0
	 *   Payoff criteria:
	 *     cap_gain / nrg_gain  < cap_delta / nrg_delta =
	 *     cap_gain * nrg_delta < cap_delta * nrg_gain
	 *   Note that since nrg_gain is positive and nrg_delta is
	 *   non-negative, the inequality does not change. Thus:
	 *
	 *     payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta)
	 *
	 * - Performance Constraint (C) region
	 *
	 *   Condition: nrg_delta < 0 && cap_delta <= 0
	 *   Payoff criteria:
	 *     cap_gain / nrg_gain  > cap_delta / nrg_delta =
	 *     cap_gain * nrg_delta < cap_delta * nrg_gain
	 *   Note that since nrg_gain > 0 while nrg_delta < 0, the
	 *   inequality changes. Thus:
	 *
	 *     payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta)
	 *
	 * This means that, in case of the same positively defined
	 * {cap,nrg}_gain for both the B and C regions, we can use the same
	 * payoff formula, where a positive value represents the accept
	 * condition.
	 */
	payoff = cap_delta * threshold_gains[gain_idx].nrg_gain;
	payoff -= nrg_delta * threshold_gains[gain_idx].cap_gain;

	return payoff;
}
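
/*
 * Worked example (illustrative numbers, derived from the table above):
 * with a 20% boost both indexes select threshold_gains[2] = { 2, 5 }.
 * A candidate with cap_delta = 30 and nrg_delta = 10 lands in the B
 * region and yields:
 *
 *   payoff = (30 * 2) - (10 * 5) = 60 - 50 = 10 > 0  -> accepted
 *
 * whereas cap_delta = 10 and nrg_delta = 10 gives 20 - 50 = -30 and is
 * rejected: the higher the boost, the more energy we accept to spend for
 * a given capacity gain.
 */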
105
Patrick Bellasiae710302015-06-23 09:17:54 +0100106#ifdef CONFIG_CGROUP_SCHEDTUNE
107
108/*
109 * EAS scheduler tunables for task groups.
110 */
111
112/* SchdTune tunables for a group of tasks */
113struct schedtune {
114 /* SchedTune CGroup subsystem */
115 struct cgroup_subsys_state css;
116
117 /* Boost group allocated ID */
118 int idx;
119
120 /* Boost value for tasks on that SchedTune CGroup */
121 int boost;
122
Patrick Bellasi2f369bb2016-01-12 18:12:13 +0000123 /* Performance Boost (B) region threshold params */
124 int perf_boost_idx;
125
126 /* Performance Constraint (C) region threshold params */
127 int perf_constrain_idx;
Srinath Sridharan42503db2016-07-14 13:09:03 -0700128
129 /* Hint to bias scheduling of tasks on that SchedTune CGroup
130 * towards idle CPUs */
131 int prefer_idle;
Patrick Bellasiae710302015-06-23 09:17:54 +0100132};

static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
{
	return css ? container_of(css, struct schedtune, css) : NULL;
}

static inline struct schedtune *task_schedtune(struct task_struct *tsk)
{
	return css_st(task_css(tsk, schedtune_cgrp_id));
}

static inline struct schedtune *parent_st(struct schedtune *st)
{
	return css_st(st->css.parent);
}

/*
 * SchedTune root control group
 * The root control group is used to define a system-wide boosting tuning,
 * which is applied to all tasks in the system.
 * Task specific boost tuning could be specified by creating and
 * configuring a child control group under the root one.
 * By default, system-wide boosting is disabled, i.e. no boosting is applied
 * to tasks which are not in a child control group.
 */
static struct schedtune
root_schedtune = {
	.boost	= 0,
	.perf_boost_idx = 0,
	.perf_constrain_idx = 0,
	.prefer_idle = 0,
};

int
schedtune_accept_deltas(int nrg_delta, int cap_delta,
			struct task_struct *task)
{
	struct schedtune *ct;
	int perf_boost_idx;
	int perf_constrain_idx;

	/* Optimal (O) region */
	if (nrg_delta < 0 && cap_delta > 0) {
		trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0);
		return INT_MAX;
	}

	/* Suboptimal (S) region */
	if (nrg_delta > 0 && cap_delta < 0) {
		trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5);
		return -INT_MAX;
	}

	/* Get task specific perf Boost/Constraints indexes */
	rcu_read_lock();
	ct = task_schedtune(task);
	perf_boost_idx = ct->perf_boost_idx;
	perf_constrain_idx = ct->perf_constrain_idx;
	rcu_read_unlock();

	return __schedtune_accept_deltas(nrg_delta, cap_delta,
			perf_boost_idx, perf_constrain_idx);
}

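/*
 * Summary of the P-E space filtering above: candidates that save energy
 * while increasing capacity (O region) are always accepted, candidates
 * that cost energy while reducing capacity (S region) are always rejected,
 * and the two remaining quadrants (B and C) are arbitrated by the payoff
 * computed in __schedtune_accept_deltas().
 */
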
/*
 * Maximum number of boost groups to support
 * When per-task boosting is used we still allow only a limited number of
 * boost groups for two main reasons:
 * 1. on a real system we usually have only a few classes of workloads which
 *    make sense to boost with different values (e.g. background vs foreground
 *    tasks, interactive vs low-priority tasks)
 * 2. a limited number allows for a simpler and more memory/time efficient
 *    implementation especially for the computation of the per-CPU boost
 *    value
 */
#define BOOSTGROUPS_COUNT 4

/* Array of configured boostgroups */
static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = {
	&root_schedtune,
	NULL,
};

/* SchedTune boost groups
 * Keep track of all the boost groups which impact a CPU, for example when a
 * CPU has two RUNNABLE tasks belonging to two different boost groups and thus
 * likely with different boost values.
 * Since on each system we expect only a limited number of boost groups, here
 * we use a simple array to keep track of the metrics required to compute the
 * maximum per-CPU boosting value.
 */
struct boost_groups {
	bool idle;
	/* Maximum boost value for all RUNNABLE tasks on a CPU */
	int boost_max;
	struct {
		/* The boost for tasks on that boost group */
		int boost;
		/* Count of RUNNABLE tasks on that boost group */
		unsigned tasks;
	} group[BOOSTGROUPS_COUNT];
	/* CPU's boost group locking */
	raw_spinlock_t lock;
};

/* Boost groups affecting each CPU in the system */
DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);

static void
schedtune_cpu_update(int cpu)
{
	struct boost_groups *bg;
	int boost_max;
	int idx;

	bg = &per_cpu(cpu_boost_groups, cpu);

	/* The root boost group is always active */
	boost_max = bg->group[0].boost;
	for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
		/*
		 * A boost group affects a CPU only if it has
		 * RUNNABLE tasks on that CPU
		 */
		if (bg->group[idx].tasks == 0)
			continue;

		boost_max = max(boost_max, bg->group[idx].boost);
	}
	/*
	 * Ensure boost_max is non-negative when all cgroup boost values
	 * are negative. This avoids under-accounting of CPU capacity,
	 * which may cause task stacking and frequency spikes.
	 */
	boost_max = max(boost_max, 0);
	bg->boost_max = boost_max;
}
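
/*
 * For example, on a CPU with RUNNABLE tasks in the root group (boost 0)
 * and in a group boosted to 30, boost_max is 30; once the boosted tasks
 * go to sleep only the root group is accounted and boost_max drops back
 * to 0.
 */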

static int
schedtune_boostgroup_update(int idx, int boost)
{
	struct boost_groups *bg;
	int cur_boost_max;
	int old_boost;
	int cpu;

	/* Update per CPU boost groups */
	for_each_possible_cpu(cpu) {
		bg = &per_cpu(cpu_boost_groups, cpu);

		/*
		 * Keep track of current boost values to compute the per CPU
		 * maximum only when it has been affected by the new value of
		 * the updated boost group
		 */
		cur_boost_max = bg->boost_max;
		old_boost = bg->group[idx].boost;

		/* Update the boost value of this boost group */
		bg->group[idx].boost = boost;

		/* Check if this update increases the current max */
		if (boost > cur_boost_max && bg->group[idx].tasks) {
			bg->boost_max = boost;
			trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max);
			continue;
		}

		/* Check if this update has decreased the current max */
		if (cur_boost_max == old_boost && old_boost > boost) {
			schedtune_cpu_update(cpu);
			trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max);
			continue;
		}

		trace_sched_tune_boostgroup_update(cpu, 0, bg->boost_max);
	}

	return 0;
}
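
/*
 * For example, raising a group's boost from 10 to 50 lifts boost_max to 50
 * on every CPU where that group has RUNNABLE tasks and no other group
 * already imposes a higher boost, while lowering the boost of the group
 * that currently defines boost_max triggers a full recomputation via
 * schedtune_cpu_update().
 */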

#define ENQUEUE_TASK  1
#define DEQUEUE_TASK -1

static inline void
schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
{
	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
	int tasks = bg->group[idx].tasks + task_count;

	/* Update boosted tasks count while avoiding making it negative */
	bg->group[idx].tasks = max(0, tasks);

	trace_sched_tune_tasks_update(p, cpu, tasks, idx,
			bg->group[idx].boost, bg->boost_max);

	/* Boost group activation or deactivation on that RQ */
	if (tasks == 1 || tasks == 0)
		schedtune_cpu_update(cpu);
}

/*
 * NOTE: This function must be called while holding the lock on the CPU RQ
 */
void schedtune_enqueue_task(struct task_struct *p, int cpu)
{
	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
	unsigned long irq_flags;
	struct schedtune *st;
	int idx;

	if (!unlikely(schedtune_initialized))
		return;

	/*
	 * When a task is marked PF_EXITING by do_exit() it's going to be
	 * dequeued and enqueued multiple times in the exit path.
	 * Thus we avoid any further update, since we do not want to change
	 * CPU boosting while the task is exiting.
	 */
	if (p->flags & PF_EXITING)
		return;

	/*
	 * Boost group accounting is protected by a per-cpu lock and requires
	 * interrupts to be disabled to avoid race conditions, for example on
	 * do_exit()::cgroup_exit() and task migration.
	 */
	raw_spin_lock_irqsave(&bg->lock, irq_flags);
	rcu_read_lock();

	st = task_schedtune(p);
	idx = st->idx;

	schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK);

	rcu_read_unlock();
	raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
}

int schedtune_allow_attach(struct cgroup_taskset *tset)
{
	/* We always allow tasks to be moved between existing CGroups */
	return 0;
}

int schedtune_can_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *css;
	struct boost_groups *bg;
	struct rq_flags irq_flags;
	unsigned int cpu;
	struct rq *rq;
	int src_bg; /* Source boost group index */
	int dst_bg; /* Destination boost group index */
	int tasks;

	if (!unlikely(schedtune_initialized))
		return 0;

	cgroup_taskset_for_each(task, css, tset) {

		/*
		 * Lock the RQ of the CPU the task is enqueued on, to avoid
		 * race conditions with migration code while the task is
		 * being accounted
		 */
		rq = lock_rq_of(task, &irq_flags);

		if (!task->on_rq) {
			unlock_rq_of(rq, task, &irq_flags);
			continue;
		}

		/*
		 * Boost group accounting is protected by a per-cpu lock and
		 * requires interrupts to be disabled to avoid race
		 * conditions on...
		 */
		cpu = cpu_of(rq);
		bg = &per_cpu(cpu_boost_groups, cpu);
		raw_spin_lock(&bg->lock);

		dst_bg = css_st(css)->idx;
		src_bg = task_schedtune(task)->idx;

		/*
		 * Current task is not changing boostgroup, which can
		 * happen when the new hierarchy is in use.
		 */
		if (unlikely(dst_bg == src_bg)) {
			raw_spin_unlock(&bg->lock);
			unlock_rq_of(rq, task, &irq_flags);
			continue;
		}

		/*
		 * This is the case of a RUNNABLE task which is switching its
		 * current boost group.
		 */

		/* Move task from src to dst boost group */
		tasks = bg->group[src_bg].tasks - 1;
		bg->group[src_bg].tasks = max(0, tasks);
		bg->group[dst_bg].tasks += 1;

		raw_spin_unlock(&bg->lock);
		unlock_rq_of(rq, task, &irq_flags);

		/* Update CPU boost group */
		if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1)
			schedtune_cpu_update(task_cpu(task));

	}

	return 0;
}

void schedtune_cancel_attach(struct cgroup_taskset *tset)
{
	/*
	 * This can happen only if the SchedTune controller is mounted with
	 * other hierarchies and one of them fails. Since usually SchedTune is
	 * mounted on its own hierarchy, for the time being we do not implement
	 * a proper rollback mechanism.
	 */
	WARN(1, "SchedTune cancel attach not implemented");
}

/*
 * NOTE: This function must be called while holding the lock on the CPU RQ
 */
void schedtune_dequeue_task(struct task_struct *p, int cpu)
{
	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
	unsigned long irq_flags;
	struct schedtune *st;
	int idx;

	if (!unlikely(schedtune_initialized))
		return;

	/*
	 * When a task is marked PF_EXITING by do_exit() it's going to be
	 * dequeued and enqueued multiple times in the exit path.
	 * Thus we avoid any further update, since we do not want to change
	 * CPU boosting while the task is exiting.
	 * The last dequeue is already enforced by the do_exit() code path
	 * via schedtune_exit_task().
	 */
	if (p->flags & PF_EXITING)
		return;

	/*
	 * Boost group accounting is protected by a per-cpu lock and requires
	 * interrupts to be disabled to avoid race conditions on...
	 */
	raw_spin_lock_irqsave(&bg->lock, irq_flags);
	rcu_read_lock();

	st = task_schedtune(p);
	idx = st->idx;

	schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK);

	rcu_read_unlock();
	raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
}

void schedtune_exit_task(struct task_struct *tsk)
{
	struct schedtune *st;
	struct rq_flags irq_flags;
	unsigned int cpu;
	struct rq *rq;
	int idx;

	if (!unlikely(schedtune_initialized))
		return;

	rq = lock_rq_of(tsk, &irq_flags);
	rcu_read_lock();

	cpu = cpu_of(rq);
	st = task_schedtune(tsk);
	idx = st->idx;
	schedtune_tasks_update(tsk, cpu, idx, DEQUEUE_TASK);

	rcu_read_unlock();
	unlock_rq_of(rq, tsk, &irq_flags);
}

int schedtune_cpu_boost(int cpu)
{
	struct boost_groups *bg;

	bg = &per_cpu(cpu_boost_groups, cpu);
	return bg->boost_max;
}

int schedtune_task_boost(struct task_struct *p)
{
	struct schedtune *st;
	int task_boost;

	/* Get task boost value */
	rcu_read_lock();
	st = task_schedtune(p);
	task_boost = st->boost;
	rcu_read_unlock();

	return task_boost;
}

int schedtune_prefer_idle(struct task_struct *p)
{
	struct schedtune *st;
	int prefer_idle;

	/* Get prefer_idle value */
	rcu_read_lock();
	st = task_schedtune(p);
	prefer_idle = st->prefer_idle;
	rcu_read_unlock();

	return prefer_idle;
}

static u64
prefer_idle_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct schedtune *st = css_st(css);

	return st->prefer_idle;
}

static int
prefer_idle_write(struct cgroup_subsys_state *css, struct cftype *cft,
		  u64 prefer_idle)
{
	struct schedtune *st = css_st(css);
	st->prefer_idle = prefer_idle;

	return 0;
}

static s64
boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct schedtune *st = css_st(css);

	return st->boost;
}

static int
boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
	    s64 boost)
{
	struct schedtune *st = css_st(css);
	unsigned threshold_idx;
	int boost_pct;

	if (boost < -100 || boost > 100)
		return -EINVAL;
	boost_pct = boost;

	/*
	 * Update threshold params for Performance Boost (B)
	 * and Performance Constraint (C) regions.
	 * The current implementation uses the same cuts for both
	 * B and C regions.
	 */
	threshold_idx = clamp(boost_pct, 0, 99) / 10;
	st->perf_boost_idx = threshold_idx;
	st->perf_constrain_idx = threshold_idx;

	st->boost = boost;
	if (css == &root_schedtune.css) {
		sysctl_sched_cfs_boost = boost;
		perf_boost_idx = threshold_idx;
		perf_constrain_idx = threshold_idx;
	}

	/* Update CPU boost */
	schedtune_boostgroup_update(st->idx, st->boost);

	trace_sched_tune_config(st->boost);

	return 0;
}

static struct cftype files[] = {
	{
		.name = "boost",
		.read_s64 = boost_read,
		.write_s64 = boost_write,
	},
	{
		.name = "prefer_idle",
		.read_u64 = prefer_idle_read,
		.write_u64 = prefer_idle_write,
	},
	{ }	/* terminate */
};
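
/*
 * The attributes above are exposed by the cgroup filesystem as
 * schedtune.boost and schedtune.prefer_idle. As an illustration only
 * (the mount point and the "foreground" group below are assumptions,
 * not defined in this file), a 10% boost for a group of tasks could be
 * configured with:
 *
 *   mount -t cgroup -o schedtune none /dev/stune
 *   mkdir /dev/stune/foreground
 *   echo 10 > /dev/stune/foreground/schedtune.boost
 *   echo 1 > /dev/stune/foreground/schedtune.prefer_idle
 *   echo $TASK_PID > /dev/stune/foreground/tasks
 */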

static int
schedtune_boostgroup_init(struct schedtune *st)
{
	struct boost_groups *bg;
	int cpu;

	/* Keep track of allocated boost groups */
	allocated_group[st->idx] = st;

	/* Initialize the per CPU boost groups */
	for_each_possible_cpu(cpu) {
		bg = &per_cpu(cpu_boost_groups, cpu);
		bg->group[st->idx].boost = 0;
		bg->group[st->idx].tasks = 0;
	}

	return 0;
}

static struct cgroup_subsys_state *
schedtune_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct schedtune *st;
	int idx;

	if (!parent_css)
		return &root_schedtune.css;

	/* Allow only single level hierarchies */
	if (parent_css != &root_schedtune.css) {
		pr_err("Nested SchedTune boosting groups not allowed\n");
		return ERR_PTR(-ENOMEM);
	}

	/* Allow only a limited number of boosting groups */
	for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx)
		if (!allocated_group[idx])
			break;
	if (idx == BOOSTGROUPS_COUNT) {
		pr_err("Trying to create more than %d SchedTune boosting groups\n",
		       BOOSTGROUPS_COUNT);
		return ERR_PTR(-ENOSPC);
	}

	st = kzalloc(sizeof(*st), GFP_KERNEL);
	if (!st)
		goto out;

	/* Initialize per CPU boost group support */
	st->idx = idx;
	if (schedtune_boostgroup_init(st))
		goto release;

	return &st->css;

release:
	kfree(st);
out:
	return ERR_PTR(-ENOMEM);
}

static void
schedtune_boostgroup_release(struct schedtune *st)
{
	/* Reset this boost group */
	schedtune_boostgroup_update(st->idx, 0);

	/* Keep track of allocated boost groups */
	allocated_group[st->idx] = NULL;
}

static void
schedtune_css_free(struct cgroup_subsys_state *css)
{
	struct schedtune *st = css_st(css);

	schedtune_boostgroup_release(st);
	kfree(st);
}

struct cgroup_subsys schedtune_cgrp_subsys = {
	.css_alloc	= schedtune_css_alloc,
	.css_free	= schedtune_css_free,
/*	.allow_attach	= schedtune_allow_attach, */
	.can_attach	= schedtune_can_attach,
	.cancel_attach	= schedtune_cancel_attach,
	.legacy_cftypes	= files,
	.early_init	= 1,
};

static inline void
schedtune_init_cgroups(void)
{
	struct boost_groups *bg;
	int cpu;

	/* Initialize the per CPU boost groups */
	for_each_possible_cpu(cpu) {
		bg = &per_cpu(cpu_boost_groups, cpu);
		memset(bg, 0, sizeof(struct boost_groups));
	}

	pr_info("schedtune: configured to support %d boost groups\n",
		BOOSTGROUPS_COUNT);
}

#else /* CONFIG_CGROUP_SCHEDTUNE */

int
schedtune_accept_deltas(int nrg_delta, int cap_delta,
			struct task_struct *task)
{
	/* Optimal (O) region */
	if (nrg_delta < 0 && cap_delta > 0) {
		trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0);
		return INT_MAX;
	}

	/* Suboptimal (S) region */
	if (nrg_delta > 0 && cap_delta < 0) {
		trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5);
		return -INT_MAX;
	}

	return __schedtune_accept_deltas(nrg_delta, cap_delta,
			perf_boost_idx, perf_constrain_idx);
}

#endif /* CONFIG_CGROUP_SCHEDTUNE */

int
sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
			       void __user *buffer, size_t *lenp,
			       loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	unsigned threshold_idx;
	int boost_pct;

	if (ret || !write)
		return ret;

	if (sysctl_sched_cfs_boost < -100 || sysctl_sched_cfs_boost > 100)
		return -EINVAL;
	boost_pct = sysctl_sched_cfs_boost;

	/*
	 * Update threshold params for Performance Boost (B)
	 * and Performance Constraint (C) regions.
	 * The current implementation uses the same cuts for both
	 * B and C regions.
	 */
	threshold_idx = clamp(boost_pct, 0, 99) / 10;
	perf_boost_idx = threshold_idx;
	perf_constrain_idx = threshold_idx;

	return 0;
}

#ifdef CONFIG_SCHED_DEBUG
static void
schedtune_test_nrg(unsigned long delta_pwr)
{
	unsigned long test_delta_pwr;
	unsigned long test_norm_pwr;
	int idx;

	/*
	 * Check normalization constants using some constant system
	 * energy values
	 */
	pr_info("schedtune: verify normalization constants...\n");
	for (idx = 0; idx < 6; ++idx) {
		test_delta_pwr = delta_pwr >> idx;

		/* Normalize on max energy for target platform */
		test_norm_pwr = reciprocal_divide(
					test_delta_pwr << SCHED_CAPACITY_SHIFT,
					schedtune_target_nrg.rdiv);

		pr_info("schedtune: max_pwr/2^%d: %4lu => norm_pwr: %5lu\n",
			idx, test_delta_pwr, test_norm_pwr);
	}
}
#else
#define schedtune_test_nrg(delta_pwr)
#endif

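/*
 * The normalization used above and set up in schedtune_init() computes
 *   norm_pwr = (delta_pwr << SCHED_CAPACITY_SHIFT) / (max_power - min_power)
 * via a precomputed reciprocal divide. For example (illustrative numbers),
 * with min_power = 50 and max_power = 1074 the divisor is 1024, so a delta
 * of 256 normalizes to 256 * 1024 / 1024 = 256 on the SCHED_CAPACITY scale.
 */
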
/*
 * Compute the min/max power consumption of a cluster and all its CPUs
 */
static void
schedtune_add_cluster_nrg(
		struct sched_domain *sd,
		struct sched_group *sg,
		struct target_nrg *ste)
{
	struct sched_domain *sd2;
	struct sched_group *sg2;

	struct cpumask *cluster_cpus;
	char str[32];

	unsigned long min_pwr;
	unsigned long max_pwr;
	int cpu;

	/* Get Cluster energy using EM data for the first CPU */
	cluster_cpus = sched_group_cpus(sg);
	snprintf(str, 32, "CLUSTER[%*pbl]",
		 cpumask_pr_args(cluster_cpus));

	min_pwr = sg->sge->idle_states[sg->sge->nr_idle_states - 1].power;
	max_pwr = sg->sge->cap_states[sg->sge->nr_cap_states - 1].power;
	pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
		str, min_pwr, max_pwr);

	/*
	 * Keep track of this cluster's energy in the computation of the
	 * overall system energy
	 */
	ste->min_power += min_pwr;
	ste->max_power += max_pwr;

	/* Get CPU energy using EM data for each CPU in the group */
	for_each_cpu(cpu, cluster_cpus) {
		/* Get a SD view for the specific CPU */
		for_each_domain(cpu, sd2) {
			/* Get the CPU group */
			sg2 = sd2->groups;
			min_pwr = sg2->sge->idle_states[sg2->sge->nr_idle_states - 1].power;
			max_pwr = sg2->sge->cap_states[sg2->sge->nr_cap_states - 1].power;

			ste->min_power += min_pwr;
			ste->max_power += max_pwr;

			snprintf(str, 32, "CPU[%d]", cpu);
			pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
				str, min_pwr, max_pwr);

			/*
			 * Assume we have EM data only at the CPU and
			 * the upper CLUSTER level
			 */
			BUG_ON(!cpumask_equal(
				sched_group_cpus(sg),
				sched_group_cpus(sd2->parent->groups)
				));
			break;
		}
	}
}

/*
 * Initialize the constants required to compute normalized energy.
 * The values of these constants depend on the EM data for the specific
 * target system and topology.
 * Thus, this function is expected to be called by the code
 * that binds the EM to the topology information.
 */
static int
schedtune_init(void)
{
	struct target_nrg *ste = &schedtune_target_nrg;
	unsigned long delta_pwr = 0;
	struct sched_domain *sd;
	struct sched_group *sg;

	pr_info("schedtune: init normalization constants...\n");
	ste->max_power = 0;
	ste->min_power = 0;

	rcu_read_lock();

	/*
	 * When EAS is in use, we always have a pointer to the highest SD
	 * which provides EM data.
	 */
	sd = rcu_dereference(per_cpu(sd_ea, cpumask_first(cpu_online_mask)));
	if (!sd) {
		pr_info("schedtune: no energy model data\n");
		goto nodata;
	}

	sg = sd->groups;
	do {
		schedtune_add_cluster_nrg(sd, sg, ste);
	} while (sg = sg->next, sg != sd->groups);

	rcu_read_unlock();

	pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
		"SYSTEM", ste->min_power, ste->max_power);

	/* Compute normalization constants */
	delta_pwr = ste->max_power - ste->min_power;
	ste->rdiv = reciprocal_value(delta_pwr);
	pr_info("schedtune: using normalization constants mul: %u sh1: %u sh2: %u\n",
		ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2);

	schedtune_test_nrg(delta_pwr);

#ifdef CONFIG_CGROUP_SCHEDTUNE
	schedtune_init_cgroups();
#else
	pr_info("schedtune: configured to support global boosting only\n");
#endif

	return 0;

nodata:
	rcu_read_unlock();
	return -EINVAL;
}
postcore_initcall(schedtune_init);