/* Copyright (c) 2014-2016, The Linux Foundation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 and
 * only version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */

#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpufreq.h>
#include <linux/timer.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>

#include <trace/events/power.h>

#define MAX_CPUS_PER_GROUP 4

struct cpu_data {
	/* Per CPU data. */
	bool inited;
	bool online;
	bool rejected;
	bool is_busy;
	bool not_preferred;
	unsigned int busy;
	unsigned int cpu;
	struct list_head sib;
	unsigned int first_cpu;

	/* Per cluster data set only on first CPU */
	unsigned int min_cpus;
	unsigned int max_cpus;
	unsigned int offline_delay_ms;
	unsigned int busy_up_thres[MAX_CPUS_PER_GROUP];
	unsigned int busy_down_thres[MAX_CPUS_PER_GROUP];
	unsigned int online_cpus;
	unsigned int avail_cpus;
	unsigned int num_cpus;
	unsigned int need_cpus;
	unsigned int task_thres;
	s64 need_ts;
	struct list_head lru;
	bool pending;
	spinlock_t pending_lock;
	bool is_big_cluster;
	int nrrun;
	bool nrrun_changed;
	struct timer_list timer;
	struct task_struct *hotplug_thread;
	struct kobject kobj;
};

static DEFINE_PER_CPU(struct cpu_data, cpu_state);
static DEFINE_SPINLOCK(state_lock);
static void apply_need(struct cpu_data *f);
static void wake_up_hotplug_thread(struct cpu_data *state);

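/*
 * Each CPU group (cluster) is managed through the cpu_state of its first
 * CPU, which acts as the group leader; the other CPUs in the group point
 * back to it via first_cpu. Per-CPU load reported by cpufreq feeds
 * eval_need(), and when the needed core count changes, a per-group
 * hotplug thread onlines or offlines cores to match.
 */
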
/* ========================= sysfs interface =========================== */

static ssize_t store_min_cpus(struct cpu_data *state,
			      const char *buf, size_t count)
{
	unsigned int val;

	if (sscanf(buf, "%u\n", &val) != 1)
		return -EINVAL;

	state->min_cpus = min(val, state->max_cpus);
	wake_up_hotplug_thread(state);

	return count;
}

static ssize_t show_min_cpus(struct cpu_data *state, char *buf)
{
	return snprintf(buf, PAGE_SIZE, "%u\n", state->min_cpus);
}

static ssize_t store_max_cpus(struct cpu_data *state,
			      const char *buf, size_t count)
{
	unsigned int val;

	if (sscanf(buf, "%u\n", &val) != 1)
		return -EINVAL;

	val = min(val, state->num_cpus);
	state->max_cpus = val;
	state->min_cpus = min(state->min_cpus, state->max_cpus);
	wake_up_hotplug_thread(state);

	return count;
}

static ssize_t show_max_cpus(struct cpu_data *state, char *buf)
{
	return snprintf(buf, PAGE_SIZE, "%u\n", state->max_cpus);
}

static ssize_t store_offline_delay_ms(struct cpu_data *state,
				      const char *buf, size_t count)
{
	unsigned int val;

	if (sscanf(buf, "%u\n", &val) != 1)
		return -EINVAL;

	state->offline_delay_ms = val;
	apply_need(state);

	return count;
}

static ssize_t show_task_thres(struct cpu_data *state, char *buf)
{
	return snprintf(buf, PAGE_SIZE, "%u\n", state->task_thres);
}

static ssize_t store_task_thres(struct cpu_data *state,
				const char *buf, size_t count)
{
	unsigned int val;

	if (sscanf(buf, "%u\n", &val) != 1)
		return -EINVAL;

	if (val < state->num_cpus)
		return -EINVAL;

	state->task_thres = val;
	apply_need(state);

	return count;
}

static ssize_t show_offline_delay_ms(struct cpu_data *state, char *buf)
{
	return snprintf(buf, PAGE_SIZE, "%u\n", state->offline_delay_ms);
}

static ssize_t store_busy_up_thres(struct cpu_data *state,
				   const char *buf, size_t count)
{
	unsigned int val[MAX_CPUS_PER_GROUP];
	int ret, i;

	ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]);
	if (ret != 1 && ret != state->num_cpus)
		return -EINVAL;

	if (ret == 1) {
		for (i = 0; i < state->num_cpus; i++)
			state->busy_up_thres[i] = val[0];
	} else {
		for (i = 0; i < state->num_cpus; i++)
			state->busy_up_thres[i] = val[i];
	}
	apply_need(state);
	return count;
}

static ssize_t show_busy_up_thres(struct cpu_data *state, char *buf)
{
	int i, count = 0;

	for (i = 0; i < state->num_cpus; i++)
		count += snprintf(buf + count, PAGE_SIZE - count, "%u ",
				  state->busy_up_thres[i]);
	count += snprintf(buf + count, PAGE_SIZE - count, "\n");
	return count;
}

static ssize_t store_busy_down_thres(struct cpu_data *state,
				     const char *buf, size_t count)
{
	unsigned int val[MAX_CPUS_PER_GROUP];
	int ret, i;

	ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]);
	if (ret != 1 && ret != state->num_cpus)
		return -EINVAL;

	if (ret == 1) {
		for (i = 0; i < state->num_cpus; i++)
			state->busy_down_thres[i] = val[0];
	} else {
		for (i = 0; i < state->num_cpus; i++)
			state->busy_down_thres[i] = val[i];
	}
	apply_need(state);
	return count;
}

static ssize_t show_busy_down_thres(struct cpu_data *state, char *buf)
{
	int i, count = 0;

	for (i = 0; i < state->num_cpus; i++)
		count += snprintf(buf + count, PAGE_SIZE - count, "%u ",
				  state->busy_down_thres[i]);
	count += snprintf(buf + count, PAGE_SIZE - count, "\n");
	return count;
}

static ssize_t store_is_big_cluster(struct cpu_data *state,
				    const char *buf, size_t count)
{
	unsigned int val;

	if (sscanf(buf, "%u\n", &val) != 1)
		return -EINVAL;

	state->is_big_cluster = val ? 1 : 0;
	return count;
}

static ssize_t show_is_big_cluster(struct cpu_data *state, char *buf)
{
	return snprintf(buf, PAGE_SIZE, "%u\n", state->is_big_cluster);
}

static ssize_t show_cpus(struct cpu_data *state, char *buf)
{
	struct cpu_data *c;
	ssize_t count = 0;
	unsigned long flags;

	spin_lock_irqsave(&state_lock, flags);
	list_for_each_entry(c, &state->lru, sib) {
		count += snprintf(buf + count, PAGE_SIZE - count,
				  "CPU%u (%s)\n", c->cpu,
				  c->online ? "Online" : "Offline");
	}
	spin_unlock_irqrestore(&state_lock, flags);
	return count;
}

static ssize_t show_need_cpus(struct cpu_data *state, char *buf)
{
	return snprintf(buf, PAGE_SIZE, "%u\n", state->need_cpus);
}

static ssize_t show_online_cpus(struct cpu_data *state, char *buf)
{
	return snprintf(buf, PAGE_SIZE, "%u\n", state->online_cpus);
}

static ssize_t show_global_state(struct cpu_data *state, char *buf)
{
	struct cpu_data *c;
	ssize_t count = 0;
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		count += snprintf(buf + count, PAGE_SIZE - count,
				  "CPU%u\n", cpu);
		c = &per_cpu(cpu_state, cpu);
		if (!c->inited)
			continue;
		count += snprintf(buf + count, PAGE_SIZE - count,
				  "\tCPU: %u\n", c->cpu);
		count += snprintf(buf + count, PAGE_SIZE - count,
				  "\tOnline: %u\n", c->online);
		count += snprintf(buf + count, PAGE_SIZE - count,
				  "\tRejected: %u\n", c->rejected);
		count += snprintf(buf + count, PAGE_SIZE - count,
				  "\tFirst CPU: %u\n", c->first_cpu);
		count += snprintf(buf + count, PAGE_SIZE - count,
				  "\tBusy%%: %u\n", c->busy);
		count += snprintf(buf + count, PAGE_SIZE - count,
				  "\tIs busy: %u\n", c->is_busy);
		if (c->cpu != c->first_cpu)
			continue;
		count += snprintf(buf + count, PAGE_SIZE - count,
				  "\tNr running: %u\n", c->nrrun);
		count += snprintf(buf + count, PAGE_SIZE - count,
				  "\tAvail CPUs: %u\n", c->avail_cpus);
		count += snprintf(buf + count, PAGE_SIZE - count,
				  "\tNeed CPUs: %u\n", c->need_cpus);
	}

	return count;
}

static ssize_t store_not_preferred(struct cpu_data *state,
				   const char *buf, size_t count)
{
	struct cpu_data *c;
	unsigned int i, first_cpu;
	unsigned int val[MAX_CPUS_PER_GROUP];
	int ret;

	ret = sscanf(buf, "%u %u %u %u\n", &val[0], &val[1], &val[2], &val[3]);
	if (ret != 1 && ret != state->num_cpus)
		return -EINVAL;

	first_cpu = state->first_cpu;

	for (i = 0; i < state->num_cpus; i++) {
		c = &per_cpu(cpu_state, first_cpu);
		/* A single value applies to every CPU in the group. */
		c->not_preferred = ret == 1 ? val[0] : val[i];
		first_cpu++;
	}

	return count;
}

static ssize_t show_not_preferred(struct cpu_data *state, char *buf)
{
	struct cpu_data *c;
	ssize_t count = 0;
	unsigned int i, first_cpu;

	first_cpu = state->first_cpu;

	for (i = 0; i < state->num_cpus; i++) {
		c = &per_cpu(cpu_state, first_cpu);
		count += snprintf(buf + count, PAGE_SIZE - count,
				  "\tCPU:%d %u\n", first_cpu, c->not_preferred);
		first_cpu++;
	}

	return count;
}

struct core_ctl_attr {
	struct attribute attr;
	ssize_t (*show)(struct cpu_data *, char *);
	ssize_t (*store)(struct cpu_data *, const char *, size_t count);
};

#define core_ctl_attr_ro(_name)		\
static struct core_ctl_attr _name =	\
__ATTR(_name, 0444, show_##_name, NULL)

#define core_ctl_attr_rw(_name)			\
static struct core_ctl_attr _name =		\
__ATTR(_name, 0644, show_##_name, store_##_name)

core_ctl_attr_rw(min_cpus);
core_ctl_attr_rw(max_cpus);
core_ctl_attr_rw(offline_delay_ms);
core_ctl_attr_rw(busy_up_thres);
core_ctl_attr_rw(busy_down_thres);
core_ctl_attr_rw(task_thres);
core_ctl_attr_rw(is_big_cluster);
core_ctl_attr_ro(cpus);
core_ctl_attr_ro(need_cpus);
core_ctl_attr_ro(online_cpus);
core_ctl_attr_ro(global_state);
core_ctl_attr_rw(not_preferred);

static struct attribute *default_attrs[] = {
	&min_cpus.attr,
	&max_cpus.attr,
	&offline_delay_ms.attr,
	&busy_up_thres.attr,
	&busy_down_thres.attr,
	&task_thres.attr,
	&is_big_cluster.attr,
	&cpus.attr,
	&need_cpus.attr,
	&online_cpus.attr,
	&global_state.attr,
	&not_preferred.attr,
	NULL
};

#define to_cpu_data(k) container_of(k, struct cpu_data, kobj)
#define to_attr(a) container_of(a, struct core_ctl_attr, attr)
static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
{
	struct cpu_data *data = to_cpu_data(kobj);
	struct core_ctl_attr *cattr = to_attr(attr);
	ssize_t ret = -EIO;

	if (cattr->show)
		ret = cattr->show(data, buf);

	return ret;
}

static ssize_t store(struct kobject *kobj, struct attribute *attr,
		     const char *buf, size_t count)
{
	struct cpu_data *data = to_cpu_data(kobj);
	struct core_ctl_attr *cattr = to_attr(attr);
	ssize_t ret = -EIO;

	if (cattr->store)
		ret = cattr->store(data, buf, count);

	return ret;
}

static const struct sysfs_ops sysfs_ops = {
	.show = show,
	.store = store,
};

static struct kobj_type ktype_core_ctl = {
	.sysfs_ops = &sysfs_ops,
	.default_attrs = default_attrs,
};
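
/*
 * Example usage from the shell (illustrative values; the core_ctl node is
 * created under the group's first CPU, e.g.
 * /sys/devices/system/cpu/cpu0/core_ctl/):
 *
 *   echo 2 > /sys/devices/system/cpu/cpu0/core_ctl/min_cpus
 *   echo "60 70 80 90" > /sys/devices/system/cpu/cpu0/core_ctl/busy_up_thres
 *   cat /sys/devices/system/cpu/cpu0/core_ctl/global_state
 */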

/* ==================== runqueue based core count =================== */

#define RQ_AVG_TOLERANCE 2
#define RQ_AVG_DEFAULT_MS 20
#define NR_RUNNING_TOLERANCE 5
static unsigned int rq_avg_period_ms = RQ_AVG_DEFAULT_MS;

static s64 rq_avg_timestamp_ms;
static struct timer_list rq_avg_timer;

static void update_running_avg(bool trigger_update)
{
	int cpu;
	struct cpu_data *pcpu;
	int avg, iowait_avg, big_avg, old_nrrun;
	s64 now;
	unsigned long flags;

	spin_lock_irqsave(&state_lock, flags);

	now = ktime_to_ms(ktime_get());
	if (now - rq_avg_timestamp_ms < rq_avg_period_ms - RQ_AVG_TOLERANCE) {
		spin_unlock_irqrestore(&state_lock, flags);
		return;
	}
	rq_avg_timestamp_ms = now;
	sched_get_nr_running_avg(&avg, &iowait_avg, &big_avg);

	spin_unlock_irqrestore(&state_lock, flags);

	/*
	 * Round up to the next integer only if the average nr running
	 * tasks is within NR_RUNNING_TOLERANCE/100 of the next integer.
	 * If normal rounding up were used, a transient task could trigger
	 * an online event, and by the time the core is onlined the task
	 * would have finished.
	 * Rounding to closest suffers the same problem because the
	 * scheduler might only provide running stats per jiffy, and a
	 * transient task could skew the number for one jiffy. If core
	 * control samples every 2 jiffies, it will observe 0.5 additional
	 * running average, which rounds up to 1 task.
	 */
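	/*
	 * For example (illustrative numbers, stats scaled by 100): an
	 * average of 1.96 running tasks arrives as 196 and becomes
	 * (196 + 5) / 100 = 2, while 1.50 arrives as 150 and becomes
	 * (150 + 5) / 100 = 1.
	 */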
	avg = (avg + NR_RUNNING_TOLERANCE) / 100;
	big_avg = (big_avg + NR_RUNNING_TOLERANCE) / 100;

	for_each_possible_cpu(cpu) {
		pcpu = &per_cpu(cpu_state, cpu);
		if (!pcpu->inited || pcpu->first_cpu != cpu)
			continue;
		old_nrrun = pcpu->nrrun;
		/*
		 * The big cluster only needs to take care of big tasks,
		 * but if there are not enough big cores, big tasks need
		 * to run on little cores as well. Thus the little
		 * cluster's runqueue stat has to use the overall runqueue
		 * average, or derive which big tasks would have to run on
		 * little. The latter approach is not easy to get right
		 * given that core control reacts much more slowly than
		 * the scheduler and can't predict the scheduler's
		 * behavior.
		 */
		pcpu->nrrun = pcpu->is_big_cluster ? big_avg : avg;
		if (pcpu->nrrun != old_nrrun) {
			if (trigger_update)
				apply_need(pcpu);
			else
				pcpu->nrrun_changed = true;
		}
	}
}

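/*
 * apply_task_need() below bumps the busy-based need when there are more
 * runnable tasks than cores requested. Illustration (hypothetical
 * numbers): with task_thres = 4, nrrun = 2 and a busy-based need of 1,
 * it returns 2 so one extra core comes up for the runnable task; once
 * nrrun reaches task_thres, all num_cpus cores are requested.
 */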
/* adjust needed CPUs based on current runqueue information */
static unsigned int apply_task_need(struct cpu_data *f, unsigned int new_need)
{
	/* Online all cores if there are enough tasks */
	if (f->nrrun >= f->task_thres)
		return f->num_cpus;

	/* only online more cores if there are tasks to run */
	if (f->nrrun > new_need)
		return new_need + 1;

	return new_need;
}

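/*
 * Align the next sample to a fixed grid of rq_avg_period_ms windows.
 * Example (assuming HZ = 100): step is 2 jiffies, so a jiffies count of
 * 1003 rounds up to 1004, the start of the next 20 ms window.
 */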
static u64 round_to_nw_start(void)
{
	unsigned long step = msecs_to_jiffies(rq_avg_period_ms);
	u64 jif = get_jiffies_64();

	do_div(jif, step);
	return (jif + 1) * step;
}

static void rq_avg_timer_func(unsigned long not_used)
{
	update_running_avg(true);
	mod_timer(&rq_avg_timer, round_to_nw_start());
}

/* ======================= load based core count ====================== */

static unsigned int apply_limits(struct cpu_data *f, unsigned int need_cpus)
{
	return min(max(f->min_cpus, need_cpus), f->max_cpus);
}

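/*
 * Re-evaluate how many cores this group needs. The busy thresholds are
 * indexed by the number of currently online cores; an increase in need
 * takes effect immediately, while a decrease is deferred until it has
 * persisted for offline_delay_ms, re-checked via the group timer.
 */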
static bool eval_need(struct cpu_data *f)
{
	unsigned long flags;
	struct cpu_data *c;
	unsigned int need_cpus = 0, last_need, thres_idx;
	int ret = 0;
	bool need_flag = false;
	s64 now;

	if (unlikely(!f->inited))
		return false;

	spin_lock_irqsave(&state_lock, flags);
	thres_idx = f->online_cpus ? f->online_cpus - 1 : 0;
	list_for_each_entry(c, &f->lru, sib) {
		if (c->busy >= f->busy_up_thres[thres_idx])
			c->is_busy = true;
		else if (c->busy < f->busy_down_thres[thres_idx])
			c->is_busy = false;
		need_cpus += c->is_busy;
	}
	need_cpus = apply_task_need(f, need_cpus);
	need_flag = apply_limits(f, need_cpus) != apply_limits(f, f->need_cpus);
	last_need = f->need_cpus;

	now = ktime_to_ms(ktime_get());

	if (need_cpus == last_need) {
		f->need_ts = now;
		spin_unlock_irqrestore(&state_lock, flags);
		return false;
	}

	if (need_cpus > last_need) {
		ret = 1;
	} else if (need_cpus < last_need) {
		s64 elapsed = now - f->need_ts;

		if (elapsed >= f->offline_delay_ms) {
			ret = 1;
		} else {
			mod_timer(&f->timer, jiffies +
				  msecs_to_jiffies(f->offline_delay_ms));
		}
	}

	if (ret) {
		f->need_ts = now;
		f->need_cpus = need_cpus;
	}

	trace_core_ctl_eval_need(f->cpu, last_need, need_cpus,
				 ret && need_flag);
	spin_unlock_irqrestore(&state_lock, flags);

	return ret && need_flag;
}

static void apply_need(struct cpu_data *f)
{
	if (eval_need(f))
		wake_up_hotplug_thread(f);
}

static int core_ctl_set_busy(unsigned int cpu, unsigned int busy)
{
	struct cpu_data *c = &per_cpu(cpu_state, cpu);
	struct cpu_data *f;
	unsigned int old_is_busy = c->is_busy;

	if (!c->inited)
		return 0;
	f = &per_cpu(cpu_state, c->first_cpu);

	update_running_avg(false);
	if (c->busy == busy && !f->nrrun_changed)
		return 0;
	c->busy = busy;
	f->nrrun_changed = false;

	apply_need(f);
	trace_core_ctl_set_busy(cpu, busy, old_is_busy, c->is_busy);
	return 0;
}

/* ========================= core count enforcement ==================== */

/*
 * If the current thread is a hotplug thread, don't attempt to wake up
 * itself or other hotplug threads because it would deadlock. Instead,
 * schedule a timer to fire on the next timer tick and wake up the thread.
 */
static void wake_up_hotplug_thread(struct cpu_data *state)
{
	unsigned long flags;
	int cpu;
	struct cpu_data *pcpu;
	bool no_wakeup = false;

	for_each_possible_cpu(cpu) {
		pcpu = &per_cpu(cpu_state, cpu);
		if (cpu != pcpu->first_cpu)
			continue;
		if (pcpu->hotplug_thread == current) {
			no_wakeup = true;
			break;
		}
	}

	spin_lock_irqsave(&state->pending_lock, flags);
	state->pending = true;
	spin_unlock_irqrestore(&state->pending_lock, flags);

	if (no_wakeup) {
		spin_lock_irqsave(&state_lock, flags);
		mod_timer(&state->timer, jiffies);
		spin_unlock_irqrestore(&state_lock, flags);
	} else {
		wake_up_process(state->hotplug_thread);
	}
}

static void core_ctl_timer_func(unsigned long cpu)
{
	struct cpu_data *state = &per_cpu(cpu_state, cpu);
	unsigned long flags;

	if (eval_need(state)) {
		spin_lock_irqsave(&state->pending_lock, flags);
		state->pending = true;
		spin_unlock_irqrestore(&state->pending_lock, flags);
		wake_up_process(state->hotplug_thread);
	}
}

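/*
 * Onlining and offlining go through device_online()/device_offline()
 * rather than cpu_up()/cpu_down(), which keeps the CPU device's sysfs
 * "online" attribute consistent with what core_ctl does.
 */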
static int core_ctl_online_core(unsigned int cpu)
{
	int ret;
	struct device *dev;

	lock_device_hotplug();
	dev = get_cpu_device(cpu);
	if (!dev) {
		pr_err("%s: failed to get cpu%d device\n", __func__, cpu);
		ret = -ENODEV;
	} else {
		ret = device_online(dev);
	}
	unlock_device_hotplug();
	return ret;
}

static int core_ctl_offline_core(unsigned int cpu)
{
	int ret;
	struct device *dev;

	lock_device_hotplug();
	dev = get_cpu_device(cpu);
	if (!dev) {
		pr_err("%s: failed to get cpu%d device\n", __func__, cpu);
		ret = -ENODEV;
	} else {
		ret = device_offline(dev);
	}
	unlock_device_hotplug();
	return ret;
}

static void __ref do_hotplug(struct cpu_data *f)
{
	unsigned int need;
	struct cpu_data *c, *tmp;

	need = apply_limits(f, f->need_cpus);
	pr_debug("Trying to adjust group %u to %u\n", f->first_cpu, need);

	if (f->online_cpus > need) {
		list_for_each_entry_safe(c, tmp, &f->lru, sib) {
			if (!c->online)
				continue;

			if (f->online_cpus == need)
				break;

			/* Don't offline busy CPUs. */
			if (c->is_busy)
				continue;

			pr_debug("Trying to Offline CPU%u\n", c->cpu);
			if (core_ctl_offline_core(c->cpu))
				pr_debug("Unable to Offline CPU%u\n", c->cpu);
		}

		/*
		 * If the number of online CPUs is within the limits, then
		 * don't force any busy CPUs offline.
		 */
		if (f->online_cpus <= f->max_cpus)
			return;

		list_for_each_entry_safe(c, tmp, &f->lru, sib) {
			if (!c->online)
				continue;

			if (f->online_cpus <= f->max_cpus)
				break;

			pr_debug("Trying to Offline CPU%u\n", c->cpu);
			if (core_ctl_offline_core(c->cpu))
				pr_debug("Unable to Offline CPU%u\n", c->cpu);
		}
	} else if (f->online_cpus < need) {
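		/* First pass: bring up only CPUs not marked not_preferred. */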
		list_for_each_entry_safe(c, tmp, &f->lru, sib) {
			if (c->online || c->rejected || c->not_preferred)
				continue;
			if (f->online_cpus == need)
				break;

			pr_debug("Trying to Online CPU%u\n", c->cpu);
			if (core_ctl_online_core(c->cpu))
				pr_debug("Unable to Online CPU%u\n", c->cpu);
		}

		if (f->online_cpus == need)
			return;

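		/*
		 * Second pass: if the group is still short, fall back to
		 * CPUs marked not_preferred.
		 */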
		list_for_each_entry_safe(c, tmp, &f->lru, sib) {
			if (c->online || c->rejected || !c->not_preferred)
				continue;
			if (f->online_cpus == need)
				break;

			pr_debug("Trying to Online CPU%u\n", c->cpu);
			if (core_ctl_online_core(c->cpu))
				pr_debug("Unable to Online CPU%u\n", c->cpu);
		}
	}
}

static int __ref try_hotplug(void *data)
{
	struct cpu_data *f = data;
	unsigned long flags;

	while (1) {
		set_current_state(TASK_INTERRUPTIBLE);
		spin_lock_irqsave(&f->pending_lock, flags);
		if (!f->pending) {
			spin_unlock_irqrestore(&f->pending_lock, flags);
			schedule();
			if (kthread_should_stop())
				break;
			spin_lock_irqsave(&f->pending_lock, flags);
		}
		set_current_state(TASK_RUNNING);
		f->pending = false;
		spin_unlock_irqrestore(&f->pending_lock, flags);

		do_hotplug(f);
	}

	return 0;
}

static int __ref cpu_callback(struct notifier_block *nfb,
			      unsigned long action, void *hcpu)
{
	uint32_t cpu = (uintptr_t)hcpu;
	struct cpu_data *state = &per_cpu(cpu_state, cpu);
	struct cpu_data *f;
	int ret = NOTIFY_OK;
	unsigned long flags;

	/* Don't affect suspend/resume. */
	if (action & CPU_TASKS_FROZEN)
		return NOTIFY_OK;

	if (unlikely(!state->inited))
		return NOTIFY_OK;

	f = &per_cpu(cpu_state, state->first_cpu);

	switch (action) {
	case CPU_UP_PREPARE:

		/* If the CPU's online state somehow got out of sync, fix it. */
		if (state->online) {
			f->online_cpus--;
			state->online = false;
			pr_warn("CPU%d offline when state is online\n", cpu);
		}

		if (state->rejected) {
			state->rejected = false;
			f->avail_cpus++;
		}

		/*
		 * If a CPU is in the process of coming up, mark it as online
		 * so that there's no race with the hotplug thread bringing
		 * up more CPUs than necessary.
		 */
		if (apply_limits(f, f->need_cpus) <= f->online_cpus) {
			pr_debug("Prevent CPU%d onlining\n", cpu);
			ret = NOTIFY_BAD;
		} else {
			state->online = true;
			f->online_cpus++;
		}
		break;

	case CPU_ONLINE:
		/*
		 * Moving to the end of the list should only happen on
		 * CPU_ONLINE and not on CPU_UP_PREPARE, to prevent an
		 * infinite list traversal when thermal (or other entities)
		 * reject trying to online CPUs.
		 */
		spin_lock_irqsave(&state_lock, flags);
		list_del(&state->sib);
		list_add_tail(&state->sib, &f->lru);
		spin_unlock_irqrestore(&state_lock, flags);
		break;

	case CPU_DEAD:
		/* Move a CPU to the end of the LRU when it goes offline. */
		spin_lock_irqsave(&state_lock, flags);
		list_del(&state->sib);
		list_add_tail(&state->sib, &f->lru);
		spin_unlock_irqrestore(&state_lock, flags);

		/* Fall through */

	case CPU_UP_CANCELED:

		/* If the CPU's online state somehow got out of sync, fix it. */
		if (!state->online) {
			f->online_cpus++;
			pr_warn("CPU%d online when state is offline\n", cpu);
		}

		if (!state->rejected && action == CPU_UP_CANCELED) {
			state->rejected = true;
			f->avail_cpus--;
		}

		state->online = false;
		state->busy = 0;
		f->online_cpus--;
		break;
	}

	if (f->online_cpus < apply_limits(f, f->need_cpus)
	    && f->online_cpus < f->avail_cpus
	    && action == CPU_DEAD)
		wake_up_hotplug_thread(f);

	return ret;
}

static struct notifier_block __refdata cpu_notifier = {
	.notifier_call = cpu_callback,
};

/* ============================ init code ============================== */

static int group_init(struct cpumask *mask)
{
	struct device *dev;
	unsigned int first_cpu = cpumask_first(mask);
	struct cpu_data *f = &per_cpu(cpu_state, first_cpu);
	struct cpu_data *state;
	unsigned int cpu;
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };

	if (likely(f->inited))
		return 0;

	dev = get_cpu_device(first_cpu);
	if (!dev)
		return -ENODEV;

	pr_info("Creating CPU group %d\n", first_cpu);

	f->num_cpus = cpumask_weight(mask);
	if (f->num_cpus > MAX_CPUS_PER_GROUP) {
		pr_err("HW configuration not supported\n");
		return -EINVAL;
	}
	f->min_cpus = 1;
	f->max_cpus = f->num_cpus;
	f->need_cpus = f->num_cpus;
	f->avail_cpus = f->num_cpus;
	f->offline_delay_ms = 100;
	f->task_thres = UINT_MAX;
	f->nrrun = f->num_cpus;
	INIT_LIST_HEAD(&f->lru);
	init_timer(&f->timer);
	spin_lock_init(&f->pending_lock);
	f->timer.function = core_ctl_timer_func;
	f->timer.data = first_cpu;

	for_each_cpu(cpu, mask) {
		pr_info("Init CPU%u state\n", cpu);

		state = &per_cpu(cpu_state, cpu);
		state->cpu = cpu;
		state->first_cpu = first_cpu;

		if (cpu_online(cpu)) {
			f->online_cpus++;
			state->online = true;
		}

		list_add_tail(&state->sib, &f->lru);
	}

	f->hotplug_thread = kthread_run(try_hotplug, (void *) f,
					"core_ctl/%d", first_cpu);
	if (IS_ERR(f->hotplug_thread))
		return PTR_ERR(f->hotplug_thread);
	sched_setscheduler_nocheck(f->hotplug_thread, SCHED_FIFO, &param);

	for_each_cpu(cpu, mask) {
		state = &per_cpu(cpu_state, cpu);
		state->inited = true;
	}

	kobject_init(&f->kobj, &ktype_core_ctl);
	return kobject_add(&f->kobj, &dev->kobj, "core_ctl");
}

static int cpufreq_policy_cb(struct notifier_block *nb, unsigned long val,
			     void *data)
{
	struct cpufreq_policy *policy = data;

	switch (val) {
	case CPUFREQ_CREATE_POLICY:
		group_init(policy->related_cpus);
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block cpufreq_pol_nb = {
	.notifier_call = cpufreq_policy_cb,
};

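/*
 * CPUFREQ_GOVINFO_NOTIFIER appears to be a vendor extension to cpufreq
 * rather than a mainline interface; the governor reports per-CPU load
 * through it, which drives core_ctl_set_busy().
 */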
static int cpufreq_gov_cb(struct notifier_block *nb, unsigned long val,
			  void *data)
{
	struct cpufreq_govinfo *info = data;

	switch (val) {
	case CPUFREQ_LOAD_CHANGE:
		core_ctl_set_busy(info->cpu, info->load);
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block cpufreq_gov_nb = {
	.notifier_call = cpufreq_gov_cb,
};

static int __init core_ctl_init(void)
{
	struct cpufreq_policy *policy;
	unsigned int cpu;

	register_cpu_notifier(&cpu_notifier);
	cpufreq_register_notifier(&cpufreq_pol_nb, CPUFREQ_POLICY_NOTIFIER);
	cpufreq_register_notifier(&cpufreq_gov_nb, CPUFREQ_GOVINFO_NOTIFIER);
	init_timer_deferrable(&rq_avg_timer);
	rq_avg_timer.function = rq_avg_timer_func;

	get_online_cpus();
	for_each_online_cpu(cpu) {
		policy = cpufreq_cpu_get(cpu);
		if (policy) {
			group_init(policy->related_cpus);
			cpufreq_cpu_put(policy);
		}
	}
	put_online_cpus();
	mod_timer(&rq_avg_timer, round_to_nw_start());
	return 0;
}

late_initcall(core_ctl_init);