/*
* Copyright (c) 2016-2020, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
* only version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*
* Window Assisted Load Tracking (WALT) implementation credits:
* Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park,
* Pavan Kumar Kondeti, Olav Haugan
*
* 2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla
* and Todd Kjos
*/
#include <linux/syscore_ops.h>
#include <linux/cpufreq.h>
#include <linux/list_sort.h>
#include <linux/jiffies.h>
#include <linux/sched/core_ctl.h>
#include <trace/events/sched.h>
#include "sched.h"
#include "walt.h"
const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK",
"TASK_WAKE", "TASK_MIGRATE", "TASK_UPDATE",
"IRQ_UPDATE"};
const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP",
"RQ_TO_RQ", "GROUP_TO_GROUP"};
#define SCHED_FREQ_ACCOUNT_WAIT_TIME 0
#define SCHED_ACCOUNT_WAIT_TIME 1
#define EARLY_DETECTION_DURATION 9500000
static ktime_t ktime_last;
static bool sched_ktime_suspended;
static struct cpu_cycle_counter_cb cpu_cycle_counter_cb;
static bool use_cycle_counter;
DEFINE_MUTEX(cluster_lock);
static atomic64_t walt_irq_work_lastq_ws;
u64 walt_load_reported_window;
static struct irq_work walt_cpufreq_irq_work;
static struct irq_work walt_migration_irq_work;
void
walt_fixup_cumulative_runnable_avg(struct rq *rq,
struct task_struct *p, u64 new_task_load)
{
s64 task_load_delta = (s64)new_task_load - task_load(p);
struct walt_sched_stats *stats = &rq->walt_stats;
stats->cumulative_runnable_avg += task_load_delta;
if ((s64)stats->cumulative_runnable_avg < 0)
panic("cra less than zero: tld: %lld, task_load(p) = %u\n",
task_load_delta, task_load(p));
walt_fixup_cum_window_demand(rq, task_load_delta);
}
u64 sched_ktime_clock(void)
{
if (unlikely(sched_ktime_suspended))
return ktime_to_ns(ktime_last);
return ktime_get_ns();
}
static void sched_resume(void)
{
sched_ktime_suspended = false;
}
static int sched_suspend(void)
{
ktime_last = ktime_get();
sched_ktime_suspended = true;
return 0;
}
static struct syscore_ops sched_syscore_ops = {
.resume = sched_resume,
.suspend = sched_suspend
};
static int __init sched_init_ops(void)
{
register_syscore_ops(&sched_syscore_ops);
return 0;
}
late_initcall(sched_init_ops);
static void acquire_rq_locks_irqsave(const cpumask_t *cpus,
unsigned long *flags)
{
int cpu, level = 0;
local_irq_save(*flags);
for_each_cpu(cpu, cpus) {
if (level == 0)
raw_spin_lock(&cpu_rq(cpu)->lock);
else
raw_spin_lock_nested(&cpu_rq(cpu)->lock, level);
level++;
}
}
static void release_rq_locks_irqrestore(const cpumask_t *cpus,
unsigned long *flags)
{
int cpu;
for_each_cpu(cpu, cpus)
raw_spin_unlock(&cpu_rq(cpu)->lock);
local_irq_restore(*flags);
}
#ifdef CONFIG_HZ_300
/*
 * Tick interval becomes 3333333ns due to
 * integer rounding when HZ=300.
*/
#define MIN_SCHED_RAVG_WINDOW (3333333 * 6)
#else
/* Min window size (in ns) = 20ms */
#define MIN_SCHED_RAVG_WINDOW 20000000
#endif
/* Max window size (in ns) = 1s */
#define MAX_SCHED_RAVG_WINDOW 1000000000
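/*
 * Worked example for the HZ=300 case above: integer division gives a
 * tick period of 1000000000 / 300 = 3333333ns, so six ticks make up a
 * 19999998ns (~20ms) minimum window that stays aligned to tick
 * boundaries instead of drifting against them.
 */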
/* 1 -> use PELT based load stats, 0 -> use window-based load stats */
unsigned int __read_mostly walt_disabled = 0;
__read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC);
unsigned int sysctl_sched_walt_rotate_big_tasks;
unsigned int walt_rotation_enabled;
/*
* sched_window_stats_policy and sched_ravg_hist_size have a 'sysctl' copy
* associated with them. This is required for atomic update of those variables
 * when being modified via the sysctl interface.
*
* IMPORTANT: Initialize both copies to same value!!
*/
__read_mostly unsigned int sched_ravg_hist_size = 5;
__read_mostly unsigned int sysctl_sched_ravg_hist_size = 5;
static __read_mostly unsigned int sched_io_is_busy = 1;
__read_mostly unsigned int sched_window_stats_policy =
WINDOW_STATS_MAX_RECENT_AVG;
__read_mostly unsigned int sysctl_sched_window_stats_policy =
WINDOW_STATS_MAX_RECENT_AVG;
/* Window size (in ns) */
__read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW;
/*
 * An after-boot constant divisor for cpu_util_freq_walt() to apply the load
* boost.
*/
__read_mostly unsigned int walt_cpu_util_freq_divisor;
/* Initial task load. Newly created tasks are assigned this load. */
unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15;
/*
* Maximum possible frequency across all cpus. Task demand and cpu
* capacity (cpu_power) metrics are scaled in reference to it.
*/
unsigned int max_possible_freq = 1;
/*
* Minimum possible max_freq across all cpus. This will be same as
* max_possible_freq on homogeneous systems and could be different from
 * max_possible_freq on heterogeneous systems. min_max_freq is used to derive
* capacity (cpu_power) of cpus.
*/
unsigned int min_max_freq = 1;
unsigned int max_capacity = 1024; /* max(rq->capacity) */
unsigned int min_capacity = 1024; /* min(rq->capacity) */
unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */
unsigned int
min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */
/* Temporarily disable window-stats activity on all cpus */
unsigned int __read_mostly sched_disable_window_stats;
/*
* Task load is categorized into buckets for the purpose of top task tracking.
* The entire range of load from 0 to sched_ravg_window needs to be covered
* in NUM_LOAD_INDICES number of buckets. Therefore the size of each bucket
* is given by sched_ravg_window / NUM_LOAD_INDICES. Since the default value
* of sched_ravg_window is MIN_SCHED_RAVG_WINDOW, use that to compute
* sched_load_granule.
*/
__read_mostly unsigned int sched_load_granule =
MIN_SCHED_RAVG_WINDOW / NUM_LOAD_INDICES;
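/*
 * For example, with the default 20ms window and assuming
 * NUM_LOAD_INDICES is 1000, each top-task bucket covers
 * 20000000 / 1000 = 20000ns of window busy time.
 */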
/* Size of bitmaps maintained to track top tasks */
static const unsigned int top_tasks_bitmap_size =
BITS_TO_LONGS(NUM_LOAD_INDICES + 1) * sizeof(unsigned long);
/*
* This governs what load needs to be used when reporting CPU busy time
* to the cpufreq governor.
*/
__read_mostly unsigned int sysctl_sched_freq_reporting_policy;
static int __init set_sched_ravg_window(char *str)
{
unsigned int window_size;
get_option(&str, &window_size);
if (window_size < MIN_SCHED_RAVG_WINDOW ||
window_size > MAX_SCHED_RAVG_WINDOW) {
WARN_ON(1);
return -EINVAL;
}
sched_ravg_window = window_size;
return 0;
}
early_param("sched_ravg_window", set_sched_ravg_window);
static int __init set_sched_predl(char *str)
{
unsigned int predl;
get_option(&str, &predl);
sched_predl = !!predl;
return 0;
}
early_param("sched_predl", set_sched_predl);
void inc_rq_walt_stats(struct rq *rq, struct task_struct *p)
{
inc_nr_big_task(&rq->walt_stats, p);
walt_inc_cumulative_runnable_avg(rq, p);
}
void dec_rq_walt_stats(struct rq *rq, struct task_struct *p)
{
dec_nr_big_task(&rq->walt_stats, p);
walt_dec_cumulative_runnable_avg(rq, p);
}
void fixup_walt_sched_stats_common(struct rq *rq, struct task_struct *p,
u32 new_task_load, u32 new_pred_demand)
{
s64 task_load_delta = (s64)new_task_load - task_load(p);
s64 pred_demand_delta = PRED_DEMAND_DELTA;
fixup_cumulative_runnable_avg(&rq->walt_stats, task_load_delta,
pred_demand_delta);
walt_fixup_cum_window_demand(rq, task_load_delta);
}
/*
* Demand aggregation for frequency purpose:
*
* CPU demand of tasks from various related groups is aggregated per-cluster and
* added to the "max_busy_cpu" in that cluster, where max_busy_cpu is determined
* by just rq->prev_runnable_sum.
*
* Some examples follow, which assume:
* Cluster0 = CPU0-3, Cluster1 = CPU4-7
* One related thread group A that has tasks A0, A1, A2
*
* A->cpu_time[X].curr/prev_sum = counters in which cpu execution stats of
* tasks belonging to group A are accumulated when they run on cpu X.
*
* CX->curr/prev_sum = counters in which cpu execution stats of all tasks
* not belonging to group A are accumulated when they run on cpu X
*
 * Let's say the stats for window M were as below:
*
* C0->prev_sum = 1ms, A->cpu_time[0].prev_sum = 5ms
* Task A0 ran 5ms on CPU0
* Task B0 ran 1ms on CPU0
*
* C1->prev_sum = 5ms, A->cpu_time[1].prev_sum = 6ms
* Task A1 ran 4ms on CPU1
* Task A2 ran 2ms on CPU1
* Task B1 ran 5ms on CPU1
*
* C2->prev_sum = 0ms, A->cpu_time[2].prev_sum = 0
* CPU2 idle
*
* C3->prev_sum = 0ms, A->cpu_time[3].prev_sum = 0
* CPU3 idle
*
 * In this case, CPU1 was most busy going by just its prev_sum counter. Demand
 * from all group A tasks is added to CPU1. IOW, at the end of window M, the
 * cpu busy time reported to the governor will be:
 *
* C0 busy time = 1ms
* C1 busy time = 5 + 5 + 6 = 16ms
*
*/
__read_mostly int sched_freq_aggregate_threshold;
static u64
update_window_start(struct rq *rq, u64 wallclock, int event)
{
s64 delta;
int nr_windows;
u64 old_window_start = rq->window_start;
delta = wallclock - rq->window_start;
BUG_ON(delta < 0);
if (delta < sched_ravg_window)
return old_window_start;
nr_windows = div64_u64(delta, sched_ravg_window);
rq->window_start += (u64)nr_windows * (u64)sched_ravg_window;
rq->cum_window_demand = rq->walt_stats.cumulative_runnable_avg;
return old_window_start;
}
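/*
 * Illustrative arithmetic, assuming a 20ms window: if wallclock is 45ms
 * past rq->window_start, then delta >= sched_ravg_window, nr_windows is
 * 2 and window_start advances by 40ms; the remaining 5ms lies inside
 * the new current window.
 */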
int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb)
{
unsigned long flags;
mutex_lock(&cluster_lock);
if (!cb->get_cpu_cycle_counter) {
mutex_unlock(&cluster_lock);
return -EINVAL;
}
acquire_rq_locks_irqsave(cpu_possible_mask, &flags);
cpu_cycle_counter_cb = *cb;
use_cycle_counter = true;
release_rq_locks_irqrestore(cpu_possible_mask, &flags);
mutex_unlock(&cluster_lock);
return 0;
}
/*
* Assumes rq_lock is held and wallclock was recorded in the same critical
* section as this function's invocation.
*/
static inline u64 read_cycle_counter(int cpu, u64 wallclock)
{
struct rq *rq = cpu_rq(cpu);
if (rq->last_cc_update != wallclock) {
rq->cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu);
rq->last_cc_update = wallclock;
}
return rq->cycles;
}
static void update_task_cpu_cycles(struct task_struct *p, int cpu,
u64 wallclock)
{
if (use_cycle_counter)
p->cpu_cycles = read_cycle_counter(cpu, wallclock);
}
void clear_ed_task(struct task_struct *p, struct rq *rq)
{
if (p == rq->ed_task)
rq->ed_task = NULL;
}
bool early_detection_notify(struct rq *rq, u64 wallclock)
{
struct task_struct *p;
int loop_max = 10;
rq->ed_task = NULL;
if ((!walt_rotation_enabled && sched_boost_policy() ==
SCHED_BOOST_NONE) || !rq->cfs.h_nr_running)
return 0;
list_for_each_entry(p, &rq->cfs_tasks, se.group_node) {
if (!loop_max)
break;
if (wallclock - p->last_wake_ts >= EARLY_DETECTION_DURATION) {
rq->ed_task = p;
return 1;
}
loop_max--;
}
return 0;
}
void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock)
{
struct rq *rq = cpu_rq(cpu);
if (!rq->window_start || sched_disable_window_stats)
return;
/*
 * We don't have to note down an irqstart event when the cycle
 * counter is not used.
*/
if (!use_cycle_counter)
return;
if (is_idle_task(curr)) {
/* We're here without rq->lock held, IRQ disabled */
raw_spin_lock(&rq->lock);
update_task_cpu_cycles(curr, cpu, sched_ktime_clock());
raw_spin_unlock(&rq->lock);
}
}
/*
* Return total number of tasks "eligible" to run on highest capacity cpu
*
* This is simply nr_big_tasks for cpus which are not of max_capacity and
* nr_running for cpus of max_capacity
*/
unsigned int nr_eligible_big_tasks(int cpu)
{
struct rq *rq = cpu_rq(cpu);
if (!is_max_capacity_cpu(cpu))
return rq->walt_stats.nr_big_tasks;
return rq->nr_running;
}
void clear_walt_request(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
clear_boost_kick(cpu);
clear_reserved(cpu);
if (rq->push_task) {
struct task_struct *push_task = NULL;
raw_spin_lock_irqsave(&rq->lock, flags);
if (rq->push_task) {
clear_reserved(rq->push_cpu);
push_task = rq->push_task;
rq->push_task = NULL;
}
rq->active_balance = 0;
raw_spin_unlock_irqrestore(&rq->lock, flags);
if (push_task)
put_task_struct(push_task);
}
}
void sched_account_irqtime(int cpu, struct task_struct *curr,
u64 delta, u64 wallclock)
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags, nr_windows;
u64 cur_jiffies_ts;
raw_spin_lock_irqsave(&rq->lock, flags);
/*
* cputime (wallclock) uses sched_clock so use the same here for
* consistency.
*/
delta += sched_clock() - wallclock;
cur_jiffies_ts = get_jiffies_64();
if (is_idle_task(curr))
update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(),
delta);
nr_windows = cur_jiffies_ts - rq->irqload_ts;
if (nr_windows) {
if (nr_windows < 10) {
/* Decay CPU's irqload by 3/4 for each window. */
rq->avg_irqload *= (3 * nr_windows);
rq->avg_irqload = div64_u64(rq->avg_irqload,
4 * nr_windows);
} else {
rq->avg_irqload = 0;
}
rq->avg_irqload += rq->cur_irqload;
rq->cur_irqload = 0;
}
rq->cur_irqload += delta;
rq->irqload_ts = cur_jiffies_ts;
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
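/*
 * Decay arithmetic for the 1..9 window range above: with nr_windows = 2
 * the scaling is avg_irqload * 6 / 8; the nr_windows factor cancels, so
 * each update applies a single 3/4 decay before adding cur_irqload,
 * while 10 or more quiet windows zero the average outright.
 */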
/*
* Special case the last index and provide a fast path for index = 0.
* Note that sched_load_granule can change underneath us if we are not
* holding any runqueue locks while calling the two functions below.
*/
static u32 top_task_load(struct rq *rq)
{
int index = rq->prev_top;
u8 prev = 1 - rq->curr_table;
if (!index) {
int msb = NUM_LOAD_INDICES - 1;
if (!test_bit(msb, rq->top_tasks_bitmap[prev]))
return 0;
else
return sched_load_granule;
} else if (index == NUM_LOAD_INDICES - 1) {
return sched_ravg_window;
} else {
return (index + 1) * sched_load_granule;
}
}
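/*
 * Mapping sketch, assuming a 20ms window and NUM_LOAD_INDICES of 1000:
 * index 0 reports one granule (20000ns) when its bitmap bit is set,
 * index 999 reports the full 20ms window, and an intermediate index
 * such as 249 reports (249 + 1) * 20000ns = 5ms.
 */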
u64 freq_policy_load(struct rq *rq)
{
unsigned int reporting_policy = sysctl_sched_freq_reporting_policy;
int freq_aggr_thresh = sched_freq_aggregate_threshold;
struct sched_cluster *cluster = rq->cluster;
u64 aggr_grp_load = cluster->aggr_grp_load;
u64 load, tt_load = 0;
u64 coloc_boost_load = cluster->coloc_boost_load;
if (rq->ed_task != NULL) {
load = sched_ravg_window;
goto done;
}
if (aggr_grp_load > freq_aggr_thresh)
load = rq->prev_runnable_sum + aggr_grp_load;
else
load = rq->prev_runnable_sum + rq->grp_time.prev_runnable_sum;
if (coloc_boost_load)
load = max_t(u64, load, coloc_boost_load);
tt_load = top_task_load(rq);
switch (reporting_policy) {
case FREQ_REPORT_MAX_CPU_LOAD_TOP_TASK:
load = max_t(u64, load, tt_load);
break;
case FREQ_REPORT_TOP_TASK:
load = tt_load;
break;
case FREQ_REPORT_CPU_LOAD:
break;
default:
break;
}
done:
trace_sched_load_to_gov(rq, aggr_grp_load, tt_load, freq_aggr_thresh,
load, reporting_policy, walt_rotation_enabled,
sysctl_sched_little_cluster_coloc_fmin_khz,
coloc_boost_load);
return load;
}
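/*
 * Policy example: with load = 16ms and tt_load = 6ms,
 * FREQ_REPORT_MAX_CPU_LOAD_TOP_TASK reports max(16ms, 6ms) = 16ms,
 * FREQ_REPORT_TOP_TASK reports only the 6ms top task, and
 * FREQ_REPORT_CPU_LOAD reports the 16ms CPU load unchanged.
 */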
/*
* In this function we match the accumulated subtractions with the current
* and previous windows we are operating with. Ignore any entries where
* the window start in the load_subtraction struct does not match either
 * the current or the previous window. This could happen whenever CPUs
* become idle or busy with interrupts disabled for an extended period.
*/
static inline void account_load_subtractions(struct rq *rq)
{
u64 ws = rq->window_start;
u64 prev_ws = ws - sched_ravg_window;
struct load_subtractions *ls = rq->load_subs;
int i;
for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
if (ls[i].window_start == ws) {
rq->curr_runnable_sum -= ls[i].subs;
rq->nt_curr_runnable_sum -= ls[i].new_subs;
} else if (ls[i].window_start == prev_ws) {
rq->prev_runnable_sum -= ls[i].subs;
rq->nt_prev_runnable_sum -= ls[i].new_subs;
}
ls[i].subs = 0;
ls[i].new_subs = 0;
}
BUG_ON((s64)rq->prev_runnable_sum < 0);
BUG_ON((s64)rq->curr_runnable_sum < 0);
BUG_ON((s64)rq->nt_prev_runnable_sum < 0);
BUG_ON((s64)rq->nt_curr_runnable_sum < 0);
}
static inline void create_subtraction_entry(struct rq *rq, u64 ws, int index)
{
rq->load_subs[index].window_start = ws;
rq->load_subs[index].subs = 0;
rq->load_subs[index].new_subs = 0;
}
static int get_top_index(unsigned long *bitmap, unsigned long old_top)
{
int index = find_next_bit(bitmap, NUM_LOAD_INDICES, old_top);
if (index == NUM_LOAD_INDICES)
return 0;
return NUM_LOAD_INDICES - 1 - index;
}
static int get_subtraction_index(struct rq *rq, u64 ws)
{
int i;
u64 oldest = ULLONG_MAX;
int oldest_index = 0;
for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
u64 entry_ws = rq->load_subs[i].window_start;
if (ws == entry_ws)
return i;
if (entry_ws < oldest) {
oldest = entry_ws;
oldest_index = i;
}
}
create_subtraction_entry(rq, ws, oldest_index);
return oldest_index;
}
static void update_rq_load_subtractions(int index, struct rq *rq,
u32 sub_load, bool new_task)
{
rq->load_subs[index].subs += sub_load;
if (new_task)
rq->load_subs[index].new_subs += sub_load;
}
void update_cluster_load_subtractions(struct task_struct *p,
int cpu, u64 ws, bool new_task)
{
struct sched_cluster *cluster = cpu_cluster(cpu);
struct cpumask cluster_cpus = cluster->cpus;
u64 prev_ws = ws - sched_ravg_window;
int i;
cpumask_clear_cpu(cpu, &cluster_cpus);
raw_spin_lock(&cluster->load_lock);
for_each_cpu(i, &cluster_cpus) {
struct rq *rq = cpu_rq(i);
int index;
if (p->ravg.curr_window_cpu[i]) {
index = get_subtraction_index(rq, ws);
update_rq_load_subtractions(index, rq,
p->ravg.curr_window_cpu[i], new_task);
p->ravg.curr_window_cpu[i] = 0;
}
if (p->ravg.prev_window_cpu[i]) {
index = get_subtraction_index(rq, prev_ws);
update_rq_load_subtractions(index, rq,
p->ravg.prev_window_cpu[i], new_task);
p->ravg.prev_window_cpu[i] = 0;
}
}
raw_spin_unlock(&cluster->load_lock);
}
static inline void inter_cluster_migration_fixup
(struct task_struct *p, int new_cpu, int task_cpu, bool new_task)
{
struct rq *dest_rq = cpu_rq(new_cpu);
struct rq *src_rq = cpu_rq(task_cpu);
if (same_freq_domain(new_cpu, task_cpu))
return;
p->ravg.curr_window_cpu[new_cpu] = p->ravg.curr_window;
p->ravg.prev_window_cpu[new_cpu] = p->ravg.prev_window;
dest_rq->curr_runnable_sum += p->ravg.curr_window;
dest_rq->prev_runnable_sum += p->ravg.prev_window;
src_rq->curr_runnable_sum -= p->ravg.curr_window_cpu[task_cpu];
src_rq->prev_runnable_sum -= p->ravg.prev_window_cpu[task_cpu];
if (new_task) {
dest_rq->nt_curr_runnable_sum += p->ravg.curr_window;
dest_rq->nt_prev_runnable_sum += p->ravg.prev_window;
src_rq->nt_curr_runnable_sum -=
p->ravg.curr_window_cpu[task_cpu];
src_rq->nt_prev_runnable_sum -=
p->ravg.prev_window_cpu[task_cpu];
}
p->ravg.curr_window_cpu[task_cpu] = 0;
p->ravg.prev_window_cpu[task_cpu] = 0;
update_cluster_load_subtractions(p, task_cpu,
src_rq->window_start, new_task);
BUG_ON((s64)src_rq->prev_runnable_sum < 0);
BUG_ON((s64)src_rq->curr_runnable_sum < 0);
BUG_ON((s64)src_rq->nt_prev_runnable_sum < 0);
BUG_ON((s64)src_rq->nt_curr_runnable_sum < 0);
}
static u32 load_to_index(u32 load)
{
u32 index = load / sched_load_granule;
return min(index, (u32)(NUM_LOAD_INDICES - 1));
}
static void
migrate_top_tasks(struct task_struct *p, struct rq *src_rq, struct rq *dst_rq)
{
int index;
int top_index;
u32 curr_window = p->ravg.curr_window;
u32 prev_window = p->ravg.prev_window;
u8 src = src_rq->curr_table;
u8 dst = dst_rq->curr_table;
u8 *src_table;
u8 *dst_table;
if (curr_window) {
src_table = src_rq->top_tasks[src];
dst_table = dst_rq->top_tasks[dst];
index = load_to_index(curr_window);
src_table[index] -= 1;
dst_table[index] += 1;
if (!src_table[index])
__clear_bit(NUM_LOAD_INDICES - index - 1,
src_rq->top_tasks_bitmap[src]);
if (dst_table[index] == 1)
__set_bit(NUM_LOAD_INDICES - index - 1,
dst_rq->top_tasks_bitmap[dst]);
if (index > dst_rq->curr_top)
dst_rq->curr_top = index;
top_index = src_rq->curr_top;
if (index == top_index && !src_table[index])
src_rq->curr_top = get_top_index(
src_rq->top_tasks_bitmap[src], top_index);
}
if (prev_window) {
src = 1 - src;
dst = 1 - dst;
src_table = src_rq->top_tasks[src];
dst_table = dst_rq->top_tasks[dst];
index = load_to_index(prev_window);
src_table[index] -= 1;
dst_table[index] += 1;
if (!src_table[index])
__clear_bit(NUM_LOAD_INDICES - index - 1,
src_rq->top_tasks_bitmap[src]);
if (dst_table[index] == 1)
__set_bit(NUM_LOAD_INDICES - index - 1,
dst_rq->top_tasks_bitmap[dst]);
if (index > dst_rq->prev_top)
dst_rq->prev_top = index;
top_index = src_rq->prev_top;
if (index == top_index && !src_table[index])
src_rq->prev_top = get_top_index(
src_rq->top_tasks_bitmap[src], top_index);
}
}
void fixup_busy_time(struct task_struct *p, int new_cpu)
{
struct rq *src_rq = task_rq(p);
struct rq *dest_rq = cpu_rq(new_cpu);
u64 wallclock;
u64 *src_curr_runnable_sum, *dst_curr_runnable_sum;
u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
bool new_task;
struct related_thread_group *grp;
if (!p->on_rq && p->state != TASK_WAKING)
return;
if (exiting_task(p)) {
clear_ed_task(p, src_rq);
return;
}
if (p->state == TASK_WAKING)
double_rq_lock(src_rq, dest_rq);
if (sched_disable_window_stats)
goto done;
wallclock = sched_ktime_clock();
update_task_ravg(task_rq(p)->curr, task_rq(p),
TASK_UPDATE,
wallclock, 0);
update_task_ravg(dest_rq->curr, dest_rq,
TASK_UPDATE, wallclock, 0);
update_task_ravg(p, task_rq(p), TASK_MIGRATE,
wallclock, 0);
update_task_cpu_cycles(p, new_cpu, wallclock);
/*
* When a task is migrating during the wakeup, adjust
* the task's contribution towards cumulative window
* demand.
*/
if (p->state == TASK_WAKING && p->last_sleep_ts >=
src_rq->window_start) {
walt_fixup_cum_window_demand(src_rq, -(s64)p->ravg.demand);
walt_fixup_cum_window_demand(dest_rq, p->ravg.demand);
}
new_task = is_new_task(p);
/* Protected by rq_lock */
grp = p->grp;
/*
* For frequency aggregation, we continue to do migration fixups
 * even for intra-cluster migrations. This is because the aggregated
 * load has to be reported on a single CPU regardless.
*/
if (grp) {
struct group_cpu_time *cpu_time;
cpu_time = &src_rq->grp_time;
src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
cpu_time = &dest_rq->grp_time;
dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
if (p->ravg.curr_window) {
*src_curr_runnable_sum -= p->ravg.curr_window;
*dst_curr_runnable_sum += p->ravg.curr_window;
if (new_task) {
*src_nt_curr_runnable_sum -=
p->ravg.curr_window;
*dst_nt_curr_runnable_sum +=
p->ravg.curr_window;
}
}
if (p->ravg.prev_window) {
*src_prev_runnable_sum -= p->ravg.prev_window;
*dst_prev_runnable_sum += p->ravg.prev_window;
if (new_task) {
*src_nt_prev_runnable_sum -=
p->ravg.prev_window;
*dst_nt_prev_runnable_sum +=
p->ravg.prev_window;
}
}
} else {
inter_cluster_migration_fixup(p, new_cpu,
task_cpu(p), new_task);
}
migrate_top_tasks(p, src_rq, dest_rq);
if (!same_freq_domain(new_cpu, task_cpu(p))) {
src_rq->notif_pending = true;
dest_rq->notif_pending = true;
sched_irq_work_queue(&walt_migration_irq_work);
}
if (p == src_rq->ed_task) {
src_rq->ed_task = NULL;
dest_rq->ed_task = p;
}
done:
if (p->state == TASK_WAKING)
double_rq_unlock(src_rq, dest_rq);
}
void set_window_start(struct rq *rq)
{
static int sync_cpu_available;
if (likely(rq->window_start))
return;
if (!sync_cpu_available) {
rq->window_start = 1;
sync_cpu_available = 1;
atomic64_set(&walt_irq_work_lastq_ws, rq->window_start);
walt_load_reported_window =
atomic64_read(&walt_irq_work_lastq_ws);
} else {
struct rq *sync_rq = cpu_rq(cpumask_any(cpu_online_mask));
raw_spin_unlock(&rq->lock);
double_rq_lock(rq, sync_rq);
rq->window_start = sync_rq->window_start;
rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
raw_spin_unlock(&sync_rq->lock);
}
rq->curr->ravg.mark_start = rq->window_start;
}
unsigned int max_possible_efficiency = 1;
unsigned int min_possible_efficiency = UINT_MAX;
#define INC_STEP 8
#define DEC_STEP 2
#define CONSISTENT_THRES 16
#define INC_STEP_BIG 16
/*
* bucket_increase - update the count of all buckets
*
* @buckets: array of buckets tracking busy time of a task
* @idx: the index of bucket to be incremented
*
 * Each time a complete window finishes, the count of the bucket that the
 * runtime falls in (@idx) is incremented. Counts of all other buckets are
 * decayed. The rate of increase and decay can differ based on the current
 * count in the bucket.
*/
static inline void bucket_increase(u8 *buckets, int idx)
{
int i, step;
for (i = 0; i < NUM_BUSY_BUCKETS; i++) {
if (idx != i) {
if (buckets[i] > DEC_STEP)
buckets[i] -= DEC_STEP;
else
buckets[i] = 0;
} else {
step = buckets[i] >= CONSISTENT_THRES ?
INC_STEP_BIG : INC_STEP;
if (buckets[i] > U8_MAX - step)
buckets[i] = U8_MAX;
else
buckets[i] += step;
}
}
}
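/*
 * Example: a hit bucket holding 20 (>= CONSISTENT_THRES) grows by
 * INC_STEP_BIG to 36 and saturates at U8_MAX; a missed bucket holding 5
 * decays by DEC_STEP to 3, and one holding 2 or less drops to 0.
 */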
static inline int busy_to_bucket(u32 normalized_rt)
{
int bidx;
bidx = mult_frac(normalized_rt, NUM_BUSY_BUCKETS, max_task_load());
bidx = min(bidx, NUM_BUSY_BUCKETS - 1);
/*
 * Combine the lowest two buckets. Demand at the lowest frequency
 * falls into the 2nd bucket, so continuing to predict the lowest
 * bucket is not useful.
 */
if (!bidx)
bidx++;
return bidx;
}
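/*
 * Example, assuming NUM_BUSY_BUCKETS is 10: a normalized runtime of 35%
 * of max_task_load() maps to bucket 3, while anything in the lowest
 * tenth maps to bucket 0 and is then bumped to bucket 1 by the
 * combining rule above.
 */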
/*
* get_pred_busy - calculate predicted demand for a task on runqueue
*
* @rq: runqueue of task p
* @p: task whose prediction is being updated
* @start: starting bucket. returned prediction should not be lower than
* this bucket.
* @runtime: runtime of the task. returned prediction should not be lower
* than this runtime.
* Note: @start can be derived from @runtime. It's passed in only to
* avoid duplicated calculation in some cases.
*
 * A new predicted busy time is returned for task @p based on @runtime
 * passed in. The function searches through buckets that represent busy
 * time equal to or bigger than @runtime and attempts to find the bucket
 * to use for prediction. Once found, it searches through historical busy
 * time and returns the latest sample that falls into the bucket. If no
 * such busy time exists, it returns the midpoint of that bucket.
*/
static u32 get_pred_busy(struct rq *rq, struct task_struct *p,
int start, u32 runtime)
{
int i;
u8 *buckets = p->ravg.busy_buckets;
u32 *hist = p->ravg.sum_history;
u32 dmin, dmax;
u64 cur_freq_runtime = 0;
int first = NUM_BUSY_BUCKETS, final;
u32 ret = runtime;
/* skip prediction for new tasks due to lack of history */
if (unlikely(is_new_task(p)))
goto out;
/* find minimal bucket index to pick */
for (i = start; i < NUM_BUSY_BUCKETS; i++) {
if (buckets[i]) {
first = i;
break;
}
}
/* if no higher buckets are filled, predict runtime */
if (first >= NUM_BUSY_BUCKETS)
goto out;
/* compute the bucket for prediction */
final = first;
/* determine demand range for the predicted bucket */
if (final < 2) {
/* lowest two buckets are combined */
dmin = 0;
final = 1;
} else {
dmin = mult_frac(final, max_task_load(), NUM_BUSY_BUCKETS);
}
dmax = mult_frac(final + 1, max_task_load(), NUM_BUSY_BUCKETS);
/*
* search through runtime history and return first runtime that falls
* into the range of predicted bucket.
*/
for (i = 0; i < sched_ravg_hist_size; i++) {
if (hist[i] >= dmin && hist[i] < dmax) {
ret = hist[i];
break;
}
}
/* no historical runtime within bucket found, use the bucket midpoint */
if (ret < dmin)
ret = (dmin + dmax) / 2;
/*
 * When updating in the middle of a window, runtime could be higher
 * than all recorded history. Always predict at least runtime.
 */
ret = max(runtime, ret);
out:
trace_sched_update_pred_demand(rq, p, runtime,
mult_frac((unsigned int)cur_freq_runtime, 100,
sched_ravg_window), ret);
return ret;
}
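/*
 * Range sketch, assuming NUM_BUSY_BUCKETS is 10 and a max_task_load()
 * of 20ms: a predicted bucket of final = 3 gives dmin = 6ms and
 * dmax = 8ms, so the newest history sample in [6ms, 8ms) is returned,
 * else the midpoint (6ms + 8ms) / 2 = 7ms.
 */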
static inline u32 calc_pred_demand(struct rq *rq, struct task_struct *p)
{
if (p->ravg.pred_demand >= p->ravg.curr_window)
return p->ravg.pred_demand;
return get_pred_busy(rq, p, busy_to_bucket(p->ravg.curr_window),
p->ravg.curr_window);
}
/*
 * Predictive demand of a task is calculated at window roll-over.
 * If the task's current window busy time exceeds the predicted
 * demand, update it here to reflect the task's needs.
 */
void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event)
{
u32 new, old;
if (!sched_predl)
return;
if (is_idle_task(p) || exiting_task(p))
return;
if (event != PUT_PREV_TASK && event != TASK_UPDATE &&
(!SCHED_FREQ_ACCOUNT_WAIT_TIME ||
(event != TASK_MIGRATE &&
event != PICK_NEXT_TASK)))
return;
/*
 * TASK_UPDATE can be called on a sleeping task, when it is moved
 * between related groups.
 */
if (event == TASK_UPDATE) {
if (!p->on_rq && !SCHED_FREQ_ACCOUNT_WAIT_TIME)
return;
}
new = calc_pred_demand(rq, p);
old = p->ravg.pred_demand;
if (old >= new)
return;
if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
!p->dl.dl_throttled))
p->sched_class->fixup_walt_sched_stats(rq, p,
p->ravg.demand,
new);
p->ravg.pred_demand = new;
}
void clear_top_tasks_bitmap(unsigned long *bitmap)
{
memset(bitmap, 0, top_tasks_bitmap_size);
__set_bit(NUM_LOAD_INDICES, bitmap);
}
static void update_top_tasks(struct task_struct *p, struct rq *rq,
u32 old_curr_window, int new_window, bool full_window)
{
u8 curr = rq->curr_table;
u8 prev = 1 - curr;
u8 *curr_table = rq->top_tasks[curr];
u8 *prev_table = rq->top_tasks[prev];
int old_index, new_index, update_index;
u32 curr_window = p->ravg.curr_window;
u32 prev_window = p->ravg.prev_window;
bool zero_index_update;
if (old_curr_window == curr_window && !new_window)
return;
old_index = load_to_index(old_curr_window);
new_index = load_to_index(curr_window);
if (!new_window) {
zero_index_update = !old_curr_window && curr_window;
if (old_index != new_index || zero_index_update) {
if (old_curr_window)
curr_table[old_index] -= 1;
if (curr_window)
curr_table[new_index] += 1;
if (new_index > rq->curr_top)
rq->curr_top = new_index;
}
if (!curr_table[old_index])
__clear_bit(NUM_LOAD_INDICES - old_index - 1,
rq->top_tasks_bitmap[curr]);
if (curr_table[new_index] == 1)
__set_bit(NUM_LOAD_INDICES - new_index - 1,
rq->top_tasks_bitmap[curr]);
return;
}
/*
 * The window has rolled over for this task. By the time we get
 * here, the curr/prev swaps would have already occurred. So we need
 * to use prev_window for the new index.
 */
update_index = load_to_index(prev_window);
if (full_window) {
/*
 * Two cases here. Either 'p' ran for the entire window or
 * it didn't run at all. In either case there is no entry
 * in the prev table. If 'p' ran the entire window, we just
 * need to create a new entry in the prev table. In this case
 * update_index will correspond to sched_ravg_window, so we
 * can unconditionally update the top index.
 */
if (prev_window) {
prev_table[update_index] += 1;
rq->prev_top = update_index;
}
if (prev_table[update_index] == 1)
__set_bit(NUM_LOAD_INDICES - update_index - 1,
rq->top_tasks_bitmap[prev]);
} else {
zero_index_update = !old_curr_window && prev_window;
if (old_index != update_index || zero_index_update) {
if (old_curr_window)
prev_table[old_index] -= 1;
prev_table[update_index] += 1;
if (update_index > rq->prev_top)
rq->prev_top = update_index;
if (!prev_table[old_index])
__clear_bit(NUM_LOAD_INDICES - old_index - 1,
rq->top_tasks_bitmap[prev]);
if (prev_table[update_index] == 1)
__set_bit(NUM_LOAD_INDICES - update_index - 1,
rq->top_tasks_bitmap[prev]);
}
}
if (curr_window) {
curr_table[new_index] += 1;
if (new_index > rq->curr_top)
rq->curr_top = new_index;
if (curr_table[new_index] == 1)
__set_bit(NUM_LOAD_INDICES - new_index - 1,
rq->top_tasks_bitmap[curr]);
}
}
static void rollover_top_tasks(struct rq *rq, bool full_window)
{
u8 curr_table = rq->curr_table;
u8 prev_table = 1 - curr_table;
int curr_top = rq->curr_top;
clear_top_tasks_table(rq->top_tasks[prev_table]);
clear_top_tasks_bitmap(rq->top_tasks_bitmap[prev_table]);
if (full_window) {
curr_top = 0;
clear_top_tasks_table(rq->top_tasks[curr_table]);
clear_top_tasks_bitmap(
rq->top_tasks_bitmap[curr_table]);
}
rq->curr_table = prev_table;
rq->prev_top = curr_top;
rq->curr_top = 0;
}
static u32 empty_windows[NR_CPUS];
static void rollover_task_window(struct task_struct *p, bool full_window)
{
u32 *curr_cpu_windows = empty_windows;
u32 curr_window;
int i;
/* Rollover the sum */
curr_window = 0;
if (!full_window) {
curr_window = p->ravg.curr_window;
curr_cpu_windows = p->ravg.curr_window_cpu;
}
p->ravg.prev_window = curr_window;
p->ravg.curr_window = 0;
/* Roll over individual CPU contributions */
for (i = 0; i < nr_cpu_ids; i++) {
p->ravg.prev_window_cpu[i] = curr_cpu_windows[i];
p->ravg.curr_window_cpu[i] = 0;
}
}
void sched_set_io_is_busy(int val)
{
sched_io_is_busy = val;
}
static inline int cpu_is_waiting_on_io(struct rq *rq)
{
if (!sched_io_is_busy)
return 0;
return atomic_read(&rq->nr_iowait);
}
static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
u64 irqtime, int event)
{
if (is_idle_task(p)) {
/* TASK_WAKE and TASK_MIGRATE are not possible on the idle task! */
if (event == PICK_NEXT_TASK)
return 0;
/* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */
return irqtime || cpu_is_waiting_on_io(rq);
}
if (event == TASK_WAKE)
return 0;
if (event == PUT_PREV_TASK || event == IRQ_UPDATE)
return 1;
/*
 * TASK_UPDATE can be called on a sleeping task, when it is moved
 * between related groups.
 */
if (event == TASK_UPDATE) {
if (rq->curr == p)
return 1;
return p->on_rq ? SCHED_FREQ_ACCOUNT_WAIT_TIME : 0;
}
/* TASK_MIGRATE, PICK_NEXT_TASK left */
return SCHED_FREQ_ACCOUNT_WAIT_TIME;
}
#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y)
static inline u64 scale_exec_time(u64 delta, struct rq *rq)
{
u32 freq;
freq = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time);
delta = DIV64_U64_ROUNDUP(delta * freq, max_possible_freq);
delta *= rq->cluster->exec_scale_factor;
delta >>= 10;
return delta;
}
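/*
 * Scaling sketch: 1ms of raw runtime at half of max_possible_freq first
 * scales to ~0.5ms; if the cluster's exec_scale_factor is 512 (an
 * efficiency half that of the best cluster), the >> 10 step halves it
 * again to ~0.25ms of frequency- and efficiency-invariant demand.
 */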
/*
 * Convert busy time to frequency equivalent.
 * Assumes load is scaled to 1024.
 */
static inline unsigned int load_to_freq(struct rq *rq, unsigned int load)
{
return mult_frac(cpu_max_possible_freq(cpu_of(rq)), load,
(unsigned int) capacity_orig_of(cpu_of(rq)));
}
bool do_pl_notif(struct rq *rq)
{
u64 prev = rq->old_busy_time;
u64 pl = rq->walt_stats.pred_demands_sum;
int cpu = cpu_of(rq);
/* If already at max freq, bail out */
if (capacity_orig_of(cpu) == capacity_curr_of(cpu))
return false;
prev = max(prev, rq->old_estimated_time);
pl = div64_u64(pl, sched_ravg_window >> SCHED_CAPACITY_SHIFT);
/* 400 MHz filter. */
return (pl > prev) && (load_to_freq(rq, pl - prev) > 400000);
}
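/*
 * Filter arithmetic: dividing pred_demands_sum by
 * (sched_ravg_window >> SCHED_CAPACITY_SHIFT) rescales it to the 0..1024
 * capacity range, and load_to_freq() converts the increase over the
 * previous busy time into kHz; only a jump worth more than 400000 kHz
 * (400 MHz) raises a notification.
 */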
static void rollover_cpu_window(struct rq *rq, bool full_window)
{
u64 curr_sum = rq->curr_runnable_sum;
u64 nt_curr_sum = rq->nt_curr_runnable_sum;
u64 grp_curr_sum = rq->grp_time.curr_runnable_sum;
u64 grp_nt_curr_sum = rq->grp_time.nt_curr_runnable_sum;
if (unlikely(full_window)) {
curr_sum = 0;
nt_curr_sum = 0;
grp_curr_sum = 0;
grp_nt_curr_sum = 0;
}
rq->prev_runnable_sum = curr_sum;
rq->nt_prev_runnable_sum = nt_curr_sum;
rq->grp_time.prev_runnable_sum = grp_curr_sum;
rq->grp_time.nt_prev_runnable_sum = grp_nt_curr_sum;
rq->curr_runnable_sum = 0;
rq->nt_curr_runnable_sum = 0;
rq->grp_time.curr_runnable_sum = 0;
rq->grp_time.nt_curr_runnable_sum = 0;
}
/*
* Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
*/
static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
int event, u64 wallclock, u64 irqtime)
{
int new_window, full_window = 0;
int p_is_curr_task = (p == rq->curr);
u64 mark_start = p->ravg.mark_start;
u64 window_start = rq->window_start;
u32 window_size = sched_ravg_window;
u64 delta;
u64 *curr_runnable_sum = &rq->curr_runnable_sum;
u64 *prev_runnable_sum = &rq->prev_runnable_sum;
u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
bool new_task;
struct related_thread_group *grp;
int cpu = rq->cpu;
u32 old_curr_window = p->ravg.curr_window;
new_window = mark_start < window_start;
if (new_window) {
full_window = (window_start - mark_start) >= window_size;
if (p->ravg.active_windows < USHRT_MAX)
p->ravg.active_windows++;
}
new_task = is_new_task(p);
/*
* Handle per-task window rollover. We don't care about the idle
* task or exiting tasks.
*/
if (!is_idle_task(p) && !exiting_task(p)) {
if (new_window)
rollover_task_window(p, full_window);
}
if (p_is_curr_task && new_window) {
rollover_cpu_window(rq, full_window);
rollover_top_tasks(rq, full_window);
}
if (!account_busy_for_cpu_time(rq, p, irqtime, event))
goto done;
grp = p->grp;
if (grp) {
struct group_cpu_time *cpu_time = &rq->grp_time;
curr_runnable_sum = &cpu_time->curr_runnable_sum;
prev_runnable_sum = &cpu_time->prev_runnable_sum;
nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
}
if (!new_window) {
/*
* account_busy_for_cpu_time() = 1 so busy time needs
* to be accounted to the current window. No rollover
* since we didn't start a new window. An example of this is
* when a task starts execution and then sleeps within the
* same window.
*/
if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))
delta = wallclock - mark_start;
else
delta = irqtime;
delta = scale_exec_time(delta, rq);
*curr_runnable_sum += delta;
if (new_task)
*nt_curr_runnable_sum += delta;
if (!is_idle_task(p) && !exiting_task(p)) {
p->ravg.curr_window += delta;
p->ravg.curr_window_cpu[cpu] += delta;
}
goto done;
}
if (!p_is_curr_task) {
/*
* account_busy_for_cpu_time() = 1 so busy time needs
* to be accounted to the current window. A new window
* has also started, but p is not the current task, so the
* window is not rolled over - just split up and account
* as necessary into curr and prev. The window is only
* rolled over when a new window is processed for the current
* task.
*
* Irqtime can't be accounted by a task that isn't the
* currently running task.
*/
if (!full_window) {
/*
* A full window hasn't elapsed, account partial
* contribution to previous completed window.
*/
delta = scale_exec_time(window_start - mark_start, rq);
if (!exiting_task(p)) {
p->ravg.prev_window += delta;
p->ravg.prev_window_cpu[cpu] += delta;
}
} else {
/*
* Since at least one full window has elapsed,
* the contribution to the previous window is the
* full window (window_size).
*/
delta = scale_exec_time(window_size, rq);
if (!exiting_task(p)) {
p->ravg.prev_window = delta;
p->ravg.prev_window_cpu[cpu] = delta;
}
}
*prev_runnable_sum += delta;
if (new_task)
*nt_prev_runnable_sum += delta;
/* Account piece of busy time in the current window. */
delta = scale_exec_time(wallclock - window_start, rq);
*curr_runnable_sum += delta;
if (new_task)
*nt_curr_runnable_sum += delta;
if (!exiting_task(p)) {
p->ravg.curr_window = delta;
p->ravg.curr_window_cpu[cpu] = delta;
}
goto done;
}
if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
/*
* account_busy_for_cpu_time() = 1 so busy time needs
* to be accounted to the current window. A new window
* has started and p is the current task so rollover is
* needed. If any of these three above conditions are true
* then this busy time can't be accounted as irqtime.
*
* Busy time for the idle task or exiting tasks need not
* be accounted.
*
* An example of this would be a task that starts execution
* and then sleeps once a new window has begun.
*/
if (!full_window) {
/*
* A full window hasn't elapsed, account partial
* contribution to previous completed window.
*/
delta = scale_exec_time(window_start - mark_start, rq);
if (!is_idle_task(p) && !exiting_task(p)) {
p->ravg.prev_window += delta;
p->ravg.prev_window_cpu[cpu] += delta;
}
} else {
/*
* Since at least one full window has elapsed,
* the contribution to the previous window is the
* full window (window_size).
*/
delta = scale_exec_time(window_size, rq);
if (!is_idle_task(p) && !exiting_task(p)) {
p->ravg.prev_window = delta;
p->ravg.prev_window_cpu[cpu] = delta;
}
}
/*
* Rollover is done here by overwriting the values in
* prev_runnable_sum and curr_runnable_sum.
*/
*prev_runnable_sum += delta;
if (new_task)
*nt_prev_runnable_sum += delta;
/* Account piece of busy time in the current window. */
delta = scale_exec_time(wallclock - window_start, rq);
*curr_runnable_sum += delta;
if (new_task)
*nt_curr_runnable_sum += delta;
if (!is_idle_task(p) && !exiting_task(p)) {
p->ravg.curr_window = delta;
p->ravg.curr_window_cpu[cpu] = delta;
}
goto done;
}
if (irqtime) {
/*
* account_busy_for_cpu_time() = 1 so busy time needs
* to be accounted to the current window. A new window
* has started and p is the current task so rollover is
* needed. The current task must be the idle task because
* irqtime is not accounted for any other task.
*
* Irqtime will be accounted each time we process IRQ activity
* after a period of idleness, so we know the IRQ busy time
* started at wallclock - irqtime.
*/
BUG_ON(!is_idle_task(p));
mark_start = wallclock - irqtime;
/*
* Roll window over. If IRQ busy time was just in the current
* window then that is all that need be accounted.
*/
if (mark_start > window_start) {
*curr_runnable_sum = scale_exec_time(irqtime, rq);
return;
}
/*
* The IRQ busy time spanned multiple windows. Process the
* busy time preceding the current window start first.
*/
delta = window_start - mark_start;
if (delta > window_size)
delta = window_size;
delta = scale_exec_time(delta, rq);
*prev_runnable_sum += delta;
/* Process the remaining IRQ busy time in the current window. */
delta = wallclock - window_start;
rq->curr_runnable_sum = scale_exec_time(delta, rq);
return;
}
done:
if (!is_idle_task(p) && !exiting_task(p))
update_top_tasks(p, rq, old_curr_window,
new_window, full_window);
}
static inline u32 predict_and_update_buckets(struct rq *rq,
struct task_struct *p, u32 runtime) {
int bidx;
u32 pred_demand;
if (!sched_predl)
return 0;
bidx = busy_to_bucket(runtime);
pred_demand = get_pred_busy(rq, p, bidx, runtime);
bucket_increase(p->ravg.busy_buckets, bidx);
return pred_demand;
}
static int
account_busy_for_task_demand(struct rq *rq, struct task_struct *p, int event)
{
/*
* No need to bother updating task demand for exiting tasks
* or the idle task.
*/
if (exiting_task(p) || is_idle_task(p))
return 0;
/*
* When a task is waking up it is completing a segment of non-busy
* time. Likewise, if wait time is not treated as busy time, then
* when a task begins to run or is migrated, it is not running and
* is completing a segment of non-busy time.
*/
if (event == TASK_WAKE || (!SCHED_ACCOUNT_WAIT_TIME &&
(event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
return 0;
/*
 * TASK_UPDATE can be called on a sleeping task, when it is moved
 * between related groups.
 */
if (event == TASK_UPDATE) {
if (rq->curr == p)
return 1;
return p->on_rq ? SCHED_ACCOUNT_WAIT_TIME : 0;
}
return 1;
}
/*
* Called when new window is starting for a task, to record cpu usage over
* recently concluded window(s). Normally 'samples' should be 1. It can be > 1
* when, say, a real-time task runs without preemption for several windows at a
* stretch.
*/
static void update_history(struct rq *rq, struct task_struct *p,
u32 runtime, int samples, int event)
{
u32 *hist = &p->ravg.sum_history[0];
int ridx, widx;
u32 max = 0, avg, demand, pred_demand;
u64 sum = 0;
u64 prev_demand;
/* Ignore windows where task had no activity */
if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)
goto done;
prev_demand = p->ravg.demand;
/* Push new 'runtime' value onto stack */
widx = sched_ravg_hist_size - 1;
ridx = widx - samples;
for (; ridx >= 0; --widx, --ridx) {
hist[widx] = hist[ridx];
sum += hist[widx];
if (hist[widx] > max)
max = hist[widx];
}
for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) {
hist[widx] = runtime;
sum += hist[widx];
if (hist[widx] > max)
max = hist[widx];
}
p->ravg.sum = 0;
if (sched_window_stats_policy == WINDOW_STATS_RECENT) {
demand = runtime;
} else if (sched_window_stats_policy == WINDOW_STATS_MAX) {
demand = max;
} else {
avg = div64_u64(sum, sched_ravg_hist_size);
if (sched_window_stats_policy == WINDOW_STATS_AVG)
demand = avg;
else
demand = max(avg, runtime);
}
pred_demand = predict_and_update_buckets(rq, p, runtime);
/*
 * A throttled deadline sched class task gets dequeued without
 * changing p->on_rq. Since the dequeue decrements walt stats,
 * avoid decrementing them here again.
 *
 * When the window is rolled over, the cumulative window demand
 * is reset to the cumulative runnable average (contribution from
 * the tasks on the runqueue). If the current task is dequeued
 * already, its demand is not included in the cumulative runnable
 * average. So add the task demand separately to the cumulative
 * window demand.
 */
if (!task_has_dl_policy(p) || !p->dl.dl_throttled) {
if (task_on_rq_queued(p))
p->sched_class->fixup_walt_sched_stats(rq, p, demand,
pred_demand);
else if (rq->curr == p)
walt_fixup_cum_window_demand(rq, demand);
}
p->ravg.demand = demand;
p->ravg.coloc_demand = div64_u64(sum, sched_ravg_hist_size);
p->ravg.pred_demand = pred_demand;
done:
trace_sched_update_history(rq, p, runtime, samples, event);
}
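/*
 * Worked example with sched_ravg_hist_size = 5 and one new 6ms sample:
 * hist {4, 3, 5, 2, 1}ms shifts to {6, 4, 3, 5, 2}ms, giving sum = 20ms
 * and avg = 4ms; under WINDOW_STATS_MAX_RECENT_AVG the new demand is
 * max(4ms, 6ms) = 6ms.
 */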
static u64 add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta)
{
delta = scale_exec_time(delta, rq);
p->ravg.sum += delta;
if (unlikely(p->ravg.sum > sched_ravg_window))
p->ravg.sum = sched_ravg_window;
return delta;
}
/*
* Account cpu demand of task and/or update task's cpu demand history
*
* ms = p->ravg.mark_start;
* wc = wallclock
* ws = rq->window_start
*
* Three possibilities:
*
* a) Task event is contained within one window.
* window_start < mark_start < wallclock
*
* ws ms wc
* | | |
* V V V
* |---------------|
*
* In this case, p->ravg.sum is updated *iff* event is appropriate
* (ex: event == PUT_PREV_TASK)
*
* b) Task event spans two windows.
* mark_start < window_start < wallclock
*
* ms ws wc
* | | |
* V V V
* -----|-------------------
*
* In this case, p->ravg.sum is updated with (ws - ms) *iff* event
* is appropriate, then a new window sample is recorded followed
* by p->ravg.sum being set to (wc - ws) *iff* event is appropriate.
*
* c) Task event spans more than two windows.
*
* ms ws_tmp ws wc
* | | | |
* V V V V
* ---|-------|-------|-------|-------|------
* | |
* |<------ nr_full_windows ------>|
*
* In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff*
* event is appropriate, window sample of p->ravg.sum is recorded,
* 'nr_full_window' samples of window_size is also recorded *iff*
* event is appropriate and finally p->ravg.sum is set to (wc - ws)
* *iff* event is appropriate.
*
* IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
* depends on it!
*/
static u64 update_task_demand(struct task_struct *p, struct rq *rq,
int event, u64 wallclock)
{
u64 mark_start = p->ravg.mark_start;
u64 delta, window_start = rq->window_start;
int new_window, nr_full_windows;
u32 window_size = sched_ravg_window;
u64 runtime;
new_window = mark_start < window_start;
if (!account_busy_for_task_demand(rq, p, event)) {
if (new_window)
/*
 * If the time isn't being accounted as busy time and a new
 * window has started, only the previous window need be closed
 * out with the pre-existing demand. Multiple windows may have
 * elapsed, but since empty windows are dropped, it is not
 * necessary to account for those.
 */
update_history(rq, p, p->ravg.sum, 1, event);
return 0;
}
if (!new_window) {
/*
* The simple case - busy time contained within the existing
* window.
*/
return add_to_task_demand(rq, p, wallclock - mark_start);
}
/*
* Busy time spans at least two windows. Temporarily rewind
* window_start to first window boundary after mark_start.
*/
delta = window_start - mark_start;
nr_full_windows = div64_u64(delta, window_size);
window_start -= (u64)nr_full_windows * (u64)window_size;
/* Process (window_start - mark_start) first */
runtime = add_to_task_demand(rq, p, window_start - mark_start);
/* Push new sample(s) into task's demand history */
update_history(rq, p, p->ravg.sum, 1, event);
if (nr_full_windows) {
u64 scaled_window = scale_exec_time(window_size, rq);
update_history(rq, p, scaled_window, nr_full_windows, event);
runtime += nr_full_windows * scaled_window;
}
/*
* Roll window_start back to current to process any remainder
* in current window.
*/
window_start += (u64)nr_full_windows * (u64)window_size;
/* Process (wallclock - window_start) next */
mark_start = window_start;
runtime += add_to_task_demand(rq, p, wallclock - mark_start);
return runtime;
}
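/*
 * Worked case (c) example, assuming a 20ms window: mark_start = 15ms,
 * wallclock = 67ms, window_start = 60ms. delta = 45ms yields
 * nr_full_windows = 2 and window_start is rewound to 20ms; the first
 * 5ms tops off the old window's p->ravg.sum, two full (scaled) 20ms
 * samples go into the history, and the trailing 7ms seeds p->ravg.sum
 * for the still-open current window.
 */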
static void
update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event,
u64 wallclock, u64 irqtime)
{
u64 cur_cycles;
int cpu = cpu_of(rq);
lockdep_assert_held(&rq->lock);
if (!use_cycle_counter) {
rq->cc.cycles = cpu_cur_freq(cpu);
rq->cc.time = 1;
return;
}
cur_cycles = read_cycle_counter(cpu, wallclock);
/*
 * If the current task is the idle task and irqtime == 0, the CPU
 * was indeed idle and probably its cycle counter was not
 * increasing. We still need an estimated CPU frequency for IO
 * wait time accounting. Use the previously calculated frequency
 * in such a case.
 */
if (!is_idle_task(rq->curr) || irqtime) {
if (unlikely(cur_cycles < p->cpu_cycles))
rq->cc.cycles = cur_cycles + (U64_MAX - p->cpu_cycles);
else
rq->cc.cycles = cur_cycles - p->cpu_cycles;
rq->cc.cycles = rq->cc.cycles * NSEC_PER_MSEC;
if (event == IRQ_UPDATE && is_idle_task(p))
/*
* Time between mark_start of idle task and IRQ handler
* entry time is CPU cycle counter stall period.
* Upon IRQ handler entry sched_account_irqstart()
* replenishes idle task's cpu cycle counter so
* rq->cc.cycles now represents increased cycles during
* IRQ handler rather than time between idle entry and
* IRQ exit. Thus use irqtime as time delta.
*/
rq->cc.time = irqtime;
else
rq->cc.time = wallclock - p->ravg.mark_start;
BUG_ON((s64)rq->cc.time < 0);
}
p->cpu_cycles = cur_cycles;
trace_sched_get_task_cpu_cycles(cpu, event, rq->cc.cycles, rq->cc.time, p);
}
static inline void run_walt_irq_work(u64 old_window_start, struct rq *rq)
{
u64 result;
if (old_window_start == rq->window_start)
return;
result = atomic64_cmpxchg(&walt_irq_work_lastq_ws, old_window_start,
rq->window_start);
if (result == old_window_start)
sched_irq_work_queue(&walt_cpufreq_irq_work);
}
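/*
 * The cmpxchg above publishes each new window start exactly once: of
 * all CPUs observing the same rollover, only the first to swap
 * walt_irq_work_lastq_ws reads back the old value and queues the
 * cpufreq irq work, so one window boundary yields a single report.
 */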
/* Reflect task activity on its demand and cpu's busy time statistics */
void update_task_ravg(struct task_struct *p, struct rq *rq, int event,
u64 wallclock, u64 irqtime)
{
u64 old_window_start;
if (!rq->window_start || sched_disable_window_stats ||
p->ravg.mark_start == wallclock)
return;
lockdep_assert_held(&rq->lock);
old_window_start = update_window_start(rq, wallclock, event);
if (!p->ravg.mark_start) {
update_task_cpu_cycles(p, cpu_of(rq), wallclock);
goto done;
}
update_task_rq_cpu_cycles(p, rq, event, wallclock, irqtime);
update_task_demand(p, rq, event, wallclock);
update_cpu_busy_time(p, rq, event, wallclock, irqtime);
update_task_pred_demand(rq, p, event);
if (exiting_task(p))
goto done;
trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime,
rq->cc.cycles, rq->cc.time, &rq->grp_time);
trace_sched_update_task_ravg_mini(p, rq, event, wallclock, irqtime,
rq->cc.cycles, rq->cc.time, &rq->grp_time);
done:
p->ravg.mark_start = wallclock;
run_walt_irq_work(old_window_start, rq);
}
u32 sched_get_init_task_load(struct task_struct *p)
{
return p->init_load_pct;
}
int sched_set_init_task_load(struct task_struct *p, int init_load_pct)
{
if (init_load_pct < 0 || init_load_pct > 100)
return -EINVAL;
p->init_load_pct = init_load_pct;
return 0;
}
void init_new_task_load(struct task_struct *p)
{
int i;
u32 init_load_windows;
u32 init_load_pct;
p->init_load_pct = 0;
rcu_assign_pointer(p->grp, NULL);
INIT_LIST_HEAD(&p->grp_list);
memset(&p->ravg, 0, sizeof(struct ravg));
p->cpu_cycles = 0;
p->ravg.curr_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL);
p->ravg.prev_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL);
/* Don't have much choice. CPU frequency would be bogus */
BUG_ON(!p->ravg.curr_window_cpu || !p->ravg.prev_window_cpu);
if (current->init_load_pct)
init_load_pct = current->init_load_pct;
else
init_load_pct = sysctl_sched_init_task_load_pct;
init_load_windows = div64_u64((u64)init_load_pct *
(u64)sched_ravg_window, 100);
p->ravg.demand = init_load_windows;
p->ravg.coloc_demand = init_load_windows;
p->ravg.pred_demand = 0;
for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
p->ravg.sum_history[i] = init_load_windows;
p->misfit = false;
}
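/*
 * For example, with the default sysctl_sched_init_task_load_pct of 15
 * and a 20ms window, a new task starts with demand = 15 * 20000000 /
 * 100 = 3000000ns, i.e. an assumed 3ms of busy time per window.
 */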
/*
* kfree() may wakeup kswapd. So this function should NOT be called
* with any CPU's rq->lock acquired.
*/
void free_task_load_ptrs(struct task_struct *p)
{
kfree(p->ravg.curr_window_cpu);
kfree(p->ravg.prev_window_cpu);
/*
* update_task_ravg() can be called for exiting tasks. While the
* function itself ensures correct behavior, the corresponding
* trace event requires that these pointers be NULL.
*/
p->ravg.curr_window_cpu = NULL;
p->ravg.prev_window_cpu = NULL;
}
void reset_task_stats(struct task_struct *p)
{
u32 sum = 0;
u32 *curr_window_ptr = NULL;
u32 *prev_window_ptr = NULL;
if (exiting_task(p)) {
sum = EXITING_TASK_MARKER;
} else {
curr_window_ptr = p->ravg.curr_window_cpu;
prev_window_ptr = p->ravg.prev_window_cpu;
memset(curr_window_ptr, 0, sizeof(u32) * nr_cpu_ids);
memset(prev_window_ptr, 0, sizeof(u32) * nr_cpu_ids);
}
memset(&p->ravg, 0, sizeof(struct ravg));
p->ravg.curr_window_cpu = curr_window_ptr;
p->ravg.prev_window_cpu = prev_window_ptr;
/* Retain EXITING_TASK marker */
p->ravg.sum_history[0] = sum;
}
void mark_task_starting(struct task_struct *p)
{
u64 wallclock;
struct rq *rq = task_rq(p);
if (!rq->window_start || sched_disable_window_stats) {
reset_task_stats(p);
return;
}
wallclock = sched_ktime_clock();
p->ravg.mark_start = p->last_wake_ts = wallclock;
p->last_enqueued_ts = wallclock;
p->last_switch_out_ts = 0;
update_task_cpu_cycles(p, cpu_of(rq), wallclock);
}
static cpumask_t all_cluster_cpus = CPU_MASK_NONE;
DECLARE_BITMAP(all_cluster_ids, NR_CPUS);
struct sched_cluster *sched_cluster[NR_CPUS];
int num_clusters;
struct list_head cluster_head;
static void
insert_cluster(struct sched_cluster *cluster, struct list_head *head)
{
struct sched_cluster *tmp;
struct list_head *iter = head;
list_for_each_entry(tmp, head, list) {
if (cluster->max_power_cost < tmp->max_power_cost)
break;
iter = &tmp->list;
}
list_add(&cluster->list, iter);
}
static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus)
{
struct sched_cluster *cluster = NULL;
cluster = kzalloc(sizeof(struct sched_cluster), GFP_ATOMIC);
if (!cluster) {
__WARN_printf("Cluster allocation failed. Possible bad scheduling\n");
return NULL;
}
INIT_LIST_HEAD(&cluster->list);
cluster->max_power_cost = 1;
cluster->min_power_cost = 1;
cluster->capacity = 1024;
cluster->max_possible_capacity = 1024;
cluster->efficiency = 1;
cluster->load_scale_factor = 1024;
cluster->cur_freq = 1;
cluster->max_freq = 1;
cluster->max_mitigated_freq = UINT_MAX;
cluster->min_freq = 1;
cluster->max_possible_freq = 1;
cluster->dstate = 0;
cluster->dstate_wakeup_energy = 0;
cluster->dstate_wakeup_latency = 0;
cluster->freq_init_done = false;
raw_spin_lock_init(&cluster->load_lock);
cluster->cpus = *cpus;
cluster->efficiency = arch_get_cpu_efficiency(cpumask_first(cpus));
if (cluster->efficiency > max_possible_efficiency)
max_possible_efficiency = cluster->efficiency;
if (cluster->efficiency < min_possible_efficiency)
min_possible_efficiency = cluster->efficiency;
cluster->notifier_sent = 0;
return cluster;
}
static void add_cluster(const struct cpumask *cpus, struct list_head *head)
{
struct sched_cluster *cluster = alloc_new_cluster(cpus);
int i;
if (!cluster)
return;
for_each_cpu(i, cpus)
cpu_rq(i)->cluster = cluster;
insert_cluster(cluster, head);
set_bit(num_clusters, all_cluster_ids);
num_clusters++;
}
static int compute_max_possible_capacity(struct sched_cluster *cluster)
{
int capacity = 1024;
capacity *= capacity_scale_cpu_efficiency(cluster);
capacity >>= 10;
capacity *= (1024 * cluster->max_possible_freq) / min_max_freq;
capacity >>= 10;
return capacity;
}
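/*
 * Capacity sketch, assuming capacity_scale_cpu_efficiency() yields 1024
 * for this cluster: the efficiency step leaves capacity at 1024, and a
 * max_possible_freq of twice min_max_freq doubles it to 2048, i.e.
 * twice the 1024 reference capacity.
 */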
void walt_update_min_max_capacity(void)
{
unsigned long flags;
acquire_rq_locks_irqsave(cpu_possible_mask, &flags);
__update_min_max_capacity();
release_rq_locks_irqrestore(cpu_possible_mask, &flags);
}
unsigned int max_power_cost = 1;
static int
compare_clusters(void *priv, struct list_head *a, struct list_head *b)
{
struct sched_cluster *cluster1, *cluster2;
int ret;
cluster1 = container_of(a, struct sched_cluster, list);
cluster2 = container_of(b, struct sched_cluster, list);
/*
* Don't assume higher capacity means higher power. If the
 * power cost is the same, sort the higher capacity cluster before
* the lower capacity cluster to start placing the tasks
* on the higher capacity cluster.
*/
ret = cluster1->max_power_cost > cluster2->max_power_cost ||
(cluster1->max_power_cost == cluster2->max_power_cost &&
cluster1->max_possible_capacity <
cluster2->max_possible_capacity);
return ret;
}
static void sort_clusters(void)
{
struct sched_cluster *cluster;
struct list_head new_head;
unsigned int tmp_max = 1;
INIT_LIST_HEAD(&new_head);
for_each_sched_cluster(cluster) {
cluster->max_power_cost = power_cost(cluster_first_cpu(cluster),
true);
cluster->min_power_cost = power_cost(cluster_first_cpu(cluster),
false);
if (cluster->max_power_cost > tmp_max)
tmp_max = cluster->max_power_cost;
}
max_power_cost = tmp_max;
move_list(&new_head, &cluster_head, true);
list_sort(NULL, &new_head, compare_clusters);
assign_cluster_ids(&new_head);
/*
* Ensure cluster ids are visible to all CPUs before making
* cluster_head visible.
*/
move_list(&cluster_head, &new_head, false);
}
int __read_mostly min_power_cpu;
void walt_sched_energy_populated_callback(void)
{
struct sched_cluster *cluster;
int prev_max = 0, next_min = 0;
mutex_lock(&cluster_lock);
if (num_clusters == 1) {
sysctl_sched_is_big_little = 0;
mutex_unlock(&cluster_lock);
return;
}
sort_clusters();
for_each_sched_cluster(cluster) {
if (cluster->min_power_cost > prev_max) {
prev_max = cluster->max_power_cost;
continue;
}
/*
* We assume no overlap in the power curves of
* clusters on a big.LITTLE system.
*/
sysctl_sched_is_big_little = 0;
next_min = cluster->min_power_cost;
}
/*
* Find the OPP at which the lower-power cluster's power
* starts overlapping with the next cluster.
*/
if (!sysctl_sched_is_big_little) {
int cpu = cluster_first_cpu(sched_cluster[0]);
struct sched_group_energy *sge = sge_array[cpu][SD_LEVEL1];
int i;
for (i = 1; i < sge->nr_cap_states; i++) {
if (sge->cap_states[i].power >= next_min) {
sched_smp_overlap_capacity =
sge->cap_states[i-1].cap;
break;
}
}
min_power_cpu = cpu;
}
mutex_unlock(&cluster_lock);
}
static void update_all_clusters_stats(void)
{
struct sched_cluster *cluster;
u64 highest_mpc = 0, lowest_mpc = U64_MAX;
unsigned long flags;
acquire_rq_locks_irqsave(cpu_possible_mask, &flags);
for_each_sched_cluster(cluster) {
u64 mpc;
cluster->capacity = compute_capacity(cluster);
mpc = cluster->max_possible_capacity =
compute_max_possible_capacity(cluster);
cluster->load_scale_factor = compute_load_scale_factor(cluster);
cluster->exec_scale_factor =
DIV_ROUND_UP(cluster->efficiency * 1024,
max_possible_efficiency);
if (mpc > highest_mpc)
highest_mpc = mpc;
if (mpc < lowest_mpc)
lowest_mpc = mpc;
}
max_possible_capacity = highest_mpc;
min_max_possible_capacity = lowest_mpc;
__update_min_max_capacity();
sched_update_freq_max_load(cpu_possible_mask);
release_rq_locks_irqrestore(cpu_possible_mask, &flags);
}
void update_cluster_topology(void)
{
struct cpumask cpus = *cpu_possible_mask;
const struct cpumask *cluster_cpus;
struct list_head new_head;
int i;
INIT_LIST_HEAD(&new_head);
for_each_cpu(i, &cpus) {
cluster_cpus = cpu_coregroup_mask(i);
cpumask_or(&all_cluster_cpus, &all_cluster_cpus, cluster_cpus);
cpumask_andnot(&cpus, &cpus, cluster_cpus);
add_cluster(cluster_cpus, &new_head);
}
assign_cluster_ids(&new_head);
/*
* Ensure cluster ids are visible to all CPUs before making
* cluster_head visible.
*/
move_list(&cluster_head, &new_head, false);
update_all_clusters_stats();
}
struct sched_cluster init_cluster = {
.list = LIST_HEAD_INIT(init_cluster.list),
.id = 0,
.max_power_cost = 1,
.min_power_cost = 1,
.capacity = 1024,
.max_possible_capacity = 1024,
.efficiency = 1,
.load_scale_factor = 1024,
.cur_freq = 1,
.max_freq = 1,
.max_mitigated_freq = UINT_MAX,
.min_freq = 1,
.max_possible_freq = 1,
.dstate = 0,
.dstate_wakeup_energy = 0,
.dstate_wakeup_latency = 0,
.exec_scale_factor = 1024,
.notifier_sent = 0,
.wake_up_idle = 0,
.aggr_grp_load = 0,
.coloc_boost_load = 0,
};
void init_clusters(void)
{
bitmap_clear(all_cluster_ids, 0, NR_CPUS);
init_cluster.cpus = *cpu_possible_mask;
raw_spin_lock_init(&init_cluster.load_lock);
INIT_LIST_HEAD(&cluster_head);
}
static unsigned long cpu_max_table_freq[NR_CPUS];
static int cpufreq_notifier_policy(struct notifier_block *nb,
unsigned long val, void *data)
{
struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
struct sched_cluster *cluster = NULL;
struct cpumask policy_cluster = *policy->related_cpus;
unsigned int orig_max_freq = 0;
int i, j, update_capacity = 0;
if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY &&
val != CPUFREQ_CREATE_POLICY)
return 0;
if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) {
walt_update_min_max_capacity();
return 0;
}
max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq);
if (min_max_freq == 1)
min_max_freq = UINT_MAX;
min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq);
BUG_ON(!min_max_freq);
BUG_ON(!policy->max);
for_each_cpu(i, &policy_cluster)
cpu_max_table_freq[i] = policy->cpuinfo.max_freq;
for_each_cpu(i, &policy_cluster) {
cluster = cpu_rq(i)->cluster;
cpumask_andnot(&policy_cluster, &policy_cluster,
&cluster->cpus);
orig_max_freq = cluster->max_freq;
cluster->min_freq = policy->min;
cluster->max_freq = policy->max;
cluster->cur_freq = policy->cur;
if (!cluster->freq_init_done) {
mutex_lock(&cluster_lock);
for_each_cpu(j, &cluster->cpus)
cpumask_copy(&cpu_rq(j)->freq_domain_cpumask,
policy->related_cpus);
cluster->max_possible_freq = policy->cpuinfo.max_freq;
cluster->max_possible_capacity =
compute_max_possible_capacity(cluster);
cluster->freq_init_done = true;
sort_clusters();
update_all_clusters_stats();
mutex_unlock(&cluster_lock);
continue;
}
update_capacity += (orig_max_freq != cluster->max_freq);
}
if (update_capacity)
update_cpu_cluster_capacity(policy->related_cpus);
return 0;
}
static struct notifier_block notifier_policy_block = {
.notifier_call = cpufreq_notifier_policy
};
static int cpufreq_notifier_trans(struct notifier_block *nb,
unsigned long val, void *data)
{
struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data;
unsigned int cpu = freq->cpu, new_freq = freq->new;
unsigned long flags;
struct sched_cluster *cluster;
struct cpumask policy_cpus = cpu_rq(cpu)->freq_domain_cpumask;
int i, j;
if (val != CPUFREQ_POSTCHANGE)
return NOTIFY_DONE;
if (cpu_cur_freq(cpu) == new_freq)
return NOTIFY_OK;
for_each_cpu(i, &policy_cpus) {
cluster = cpu_rq(i)->cluster;
if (!use_cycle_counter) {
for_each_cpu(j, &cluster->cpus) {
struct rq *rq = cpu_rq(j);
raw_spin_lock_irqsave(&rq->lock, flags);
update_task_ravg(rq->curr, rq, TASK_UPDATE,
sched_ktime_clock(), 0);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
}
cluster->cur_freq = new_freq;
cpumask_andnot(&policy_cpus, &policy_cpus, &cluster->cpus);
}
return NOTIFY_OK;
}
static struct notifier_block notifier_trans_block = {
.notifier_call = cpufreq_notifier_trans
};
static int register_walt_callback(void)
{
int ret;
ret = cpufreq_register_notifier(&notifier_policy_block,
CPUFREQ_POLICY_NOTIFIER);
if (!ret)
ret = cpufreq_register_notifier(&notifier_trans_block,
CPUFREQ_TRANSITION_NOTIFIER);
return ret;
}
/*
* cpufreq callbacks can be registered at core_initcall time or later.
* Any registration done prior to that is "forgotten" by cpufreq. See
* the initialization of the variable
* init_cpufreq_transition_notifier_list_called for further information.
*/
core_initcall(register_walt_callback);
static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp,
struct task_struct *p, int event);
/*
* Enable colocation and frequency aggregation for all threads in a
* process. Children inherit the group id from their parent.
*/
/* Maximum allowed threshold before freq aggregation must be enabled */
#define MAX_FREQ_AGGR_THRESH 1000
struct related_thread_group *related_thread_groups[MAX_NUM_CGROUP_COLOC_ID];
static LIST_HEAD(active_related_thread_groups);
DEFINE_RWLOCK(related_thread_group_lock);
unsigned int __read_mostly sysctl_sched_freq_aggregate_threshold_pct;
/*
* Task groups whose aggregate demand on a cpu is more than
* sched_group_upmigrate need to be up-migrated if possible.
*/
unsigned int __read_mostly sched_group_upmigrate = 20000000;
unsigned int __read_mostly sysctl_sched_group_upmigrate_pct = 100;
/*
* Task groups, once up-migrated, will need to drop their aggregate
* demand to less than sched_group_downmigrate before they are "down"
* migrated.
*/
unsigned int __read_mostly sched_group_downmigrate = 19000000;
unsigned int __read_mostly sysctl_sched_group_downmigrate_pct = 95;
static int
group_will_fit(struct sched_cluster *cluster, struct related_thread_group *grp,
u64 demand, bool group_boost)
{
int cpu = cluster_first_cpu(cluster);
int prev_capacity = 0;
unsigned int threshold = sched_group_upmigrate;
u64 load;
if (cluster->capacity == max_capacity)
return 1;
if (group_boost)
return 0;
if (!demand)
return 1;
if (grp->preferred_cluster)
prev_capacity = grp->preferred_cluster->capacity;
if (cluster->capacity < prev_capacity)
threshold = sched_group_downmigrate;
load = scale_load_to_cpu(demand, cpu);
if (load < threshold)
return 1;
return 0;
}
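/*
* Worked example with the defaults above (illustrative): with
* sched_group_upmigrate at 20ms and sched_group_downmigrate at 19ms of
* window demand, a group whose scaled load grows past 20ms no longer
* fits on a smaller cluster and gets up-migrated. Once up-migrated,
* any cluster smaller than the preferred one is judged against the
* 19ms downmigrate threshold instead, so the scaled demand must fall
* below 19ms before the group fits back down. The 1ms gap provides
* hysteresis against bouncing between clusters.
*/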
unsigned long __weak arch_get_cpu_efficiency(int cpu)
{
return SCHED_CAPACITY_SCALE;
}
/* Return cluster which can offer required capacity for group */
static struct sched_cluster *best_cluster(struct related_thread_group *grp,
u64 total_demand, bool group_boost)
{
struct sched_cluster *cluster = NULL;
for_each_sched_cluster(cluster) {
if (group_will_fit(cluster, grp, total_demand, group_boost))
return cluster;
}
return sched_cluster[0];
}
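/*
* Since cluster_head is kept sorted by ascending max_power_cost, the
* first-fit walk above returns the cheapest cluster that can absorb
* the group's demand. A boosted group only "fits" the max-capacity
* cluster (see group_will_fit()), so boosted groups always land on the
* biggest cluster; the sched_cluster[0] fallback is effectively
* defensive, since the max-capacity cluster always fits.
*/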
int preferred_cluster(struct sched_cluster *cluster, struct task_struct *p)
{
struct related_thread_group *grp;
int rc = 1;
rcu_read_lock();
grp = task_related_thread_group(p);
if (grp)
rc = (grp->preferred_cluster == cluster);
rcu_read_unlock();
return rc;
}
static void _set_preferred_cluster(struct related_thread_group *grp)
{
struct task_struct *p;
u64 combined_demand = 0;
bool group_boost = false;
u64 wallclock;
if (list_empty(&grp->tasks))
return;
if (!sysctl_sched_is_big_little) {
grp->preferred_cluster = sched_cluster[0];
return;
}
wallclock = sched_ktime_clock();
/*
* Wakeups of two or more related tasks could race with each other
* and result in multiple calls to _set_preferred_cluster being
* issued at the same time. Avoid the overhead of rechecking the
* preferred cluster in such cases.
*/
if (wallclock - grp->last_update < sched_ravg_window / 10)
return;
list_for_each_entry(p, &grp->tasks, grp_list) {
if (task_boost_policy(p) == SCHED_BOOST_ON_BIG) {
group_boost = true;
break;
}
if (p->ravg.mark_start < wallclock -
(sched_ravg_window * sched_ravg_hist_size))
continue;
combined_demand += p->ravg.coloc_demand;
}
grp->preferred_cluster = best_cluster(grp,
combined_demand, group_boost);
grp->last_update = sched_ktime_clock();
trace_sched_set_preferred_cluster(grp, combined_demand);
}
void set_preferred_cluster(struct related_thread_group *grp)
{
raw_spin_lock(&grp->lock);
_set_preferred_cluster(grp);
raw_spin_unlock(&grp->lock);
}
int update_preferred_cluster(struct related_thread_group *grp,
struct task_struct *p, u32 old_load)
{
u32 new_load = task_load(p);
if (!grp)
return 0;
/*
* Update if the task's load has changed significantly or a complete
* window has passed since we last updated the preference.
*/
if (abs(new_load - old_load) > sched_ravg_window / 4 ||
sched_ktime_clock() - grp->last_update > sched_ravg_window)
return 1;
return 0;
}
DEFINE_MUTEX(policy_mutex);
#define pct_to_real(tunable) \
(div64_u64((u64)tunable * (u64)max_task_load(), 100))
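/*
* Example (illustrative, assuming max_task_load() equals the default
* 20ms ravg window): pct_to_real(100) = 100 * 20000000 / 100 =
* 20000000 ns, which is how the 100% / 95% tunables above map onto the
* 20ms / 19ms sched_group_upmigrate / sched_group_downmigrate
* defaults.
*/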
unsigned int update_freq_aggregate_threshold(unsigned int threshold)
{
unsigned int old_threshold;
mutex_lock(&policy_mutex);
old_threshold = sysctl_sched_freq_aggregate_threshold_pct;
sysctl_sched_freq_aggregate_threshold_pct = threshold;
sched_freq_aggregate_threshold =
pct_to_real(sysctl_sched_freq_aggregate_threshold_pct);
mutex_unlock(&policy_mutex);
return old_threshold;
}
#define ADD_TASK 0
#define REM_TASK 1
#define DEFAULT_CGROUP_COLOC_ID 1
static inline struct related_thread_group*
lookup_related_thread_group(unsigned int group_id)
{
return related_thread_groups[group_id];
}
int alloc_related_thread_groups(void)
{
int i, ret;
struct related_thread_group *grp;
/* group_id = 0 is invalid as it's a special id used to remove a group. */
for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) {
grp = kzalloc(sizeof(*grp), GFP_NOWAIT);
if (!grp) {
ret = -ENOMEM;
goto err;
}
grp->id = i;
INIT_LIST_HEAD(&grp->tasks);
INIT_LIST_HEAD(&grp->list);
raw_spin_lock_init(&grp->lock);
related_thread_groups[i] = grp;
}
return 0;
err:
for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) {
grp = lookup_related_thread_group(i);
if (grp) {
kfree(grp);
related_thread_groups[i] = NULL;
} else {
break;
}
}
return ret;
}
static void remove_task_from_group(struct task_struct *p)
{
struct related_thread_group *grp = p->grp;
struct rq *rq;
int empty_group = 1;
struct rq_flags rf;
raw_spin_lock(&grp->lock);
rq = __task_rq_lock(p, &rf);
transfer_busy_time(rq, p->grp, p, REM_TASK);
list_del_init(&p->grp_list);
rcu_assign_pointer(p->grp, NULL);
__task_rq_unlock(rq, &rf);
if (!list_empty(&grp->tasks)) {
empty_group = 0;
_set_preferred_cluster(grp);
}
raw_spin_unlock(&grp->lock);
/* Reserved groups cannot be destroyed */
if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID)
/*
* We test whether grp->list is attached with list_empty(),
* hence re-initialize the list after deletion.
*/
list_del_init(&grp->list);
}
static int
add_task_to_group(struct task_struct *p, struct related_thread_group *grp)
{
struct rq *rq;
struct rq_flags rf;
raw_spin_lock(&grp->lock);
/*
* Change p->grp under rq->lock. This prevents races with read-side
* references to p->grp in various hot paths.
*/
rq = __task_rq_lock(p, &rf);
transfer_busy_time(rq, grp, p, ADD_TASK);
list_add(&p->grp_list, &grp->tasks);
rcu_assign_pointer(p->grp, grp);
__task_rq_unlock(rq, &rf);
_set_preferred_cluster(grp);
raw_spin_unlock(&grp->lock);
return 0;
}
void add_new_task_to_grp(struct task_struct *new)
{
unsigned long flags;
struct related_thread_group *grp;
/*
* If the task does not belong to the colocated schedtune
* cgroup, there is nothing to do. We check this without the
* lock; even if there is a race, the task will be added to
* the colocated cgroup via cgroup attach.
*/
if (!schedtune_task_colocated(new))
return;
grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
write_lock_irqsave(&related_thread_group_lock, flags);
/*
* It's possible that someone already added the new task to the
* group, or that it has been taken out of the colocated schedtune
* cgroup. Check these conditions under the lock.
*/
if (!schedtune_task_colocated(new) || new->grp) {
write_unlock_irqrestore(&related_thread_group_lock, flags);
return;
}
raw_spin_lock(&grp->lock);
rcu_assign_pointer(new->grp, grp);
list_add(&new->grp_list, &grp->tasks);
raw_spin_unlock(&grp->lock);
write_unlock_irqrestore(&related_thread_group_lock, flags);
}
static int __sched_set_group_id(struct task_struct *p, unsigned int group_id)
{
int rc = 0;
unsigned long flags;
struct related_thread_group *grp = NULL;
if (group_id >= MAX_NUM_CGROUP_COLOC_ID)
return -EINVAL;
raw_spin_lock_irqsave(&p->pi_lock, flags);
write_lock(&related_thread_group_lock);
/* Switching from one group to another directly is not permitted */
if ((current != p && p->flags & PF_EXITING) ||
(!p->grp && !group_id) ||
(p->grp && group_id))
goto done;
if (!group_id) {
remove_task_from_group(p);
goto done;
}
grp = lookup_related_thread_group(group_id);
if (list_empty(&grp->list))
list_add(&grp->list, &active_related_thread_groups);
rc = add_task_to_group(p, grp);
done:
write_unlock(&related_thread_group_lock);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
return rc;
}
int sched_set_group_id(struct task_struct *p, unsigned int group_id)
{
/* DEFAULT_CGROUP_COLOC_ID is a reserved id */
if (group_id == DEFAULT_CGROUP_COLOC_ID)
return -EINVAL;
return __sched_set_group_id(p, group_id);
}
unsigned int sched_get_group_id(struct task_struct *p)
{
unsigned int group_id;
struct related_thread_group *grp;
rcu_read_lock();
grp = task_related_thread_group(p);
group_id = grp ? grp->id : 0;
rcu_read_unlock();
return group_id;
}
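/*
* Usage sketch (hypothetical caller, not from the original source):
* to colocate two tasks in group 2 and later remove one of them:
*
*   sched_set_group_id(p1, 2);
*   sched_set_group_id(p2, 2);
*   ...
*   sched_set_group_id(p1, 0);   // group id 0 removes p1 from its group
*
* Note that moving a task directly from one non-zero group to another
* is not permitted: __sched_set_group_id() silently ignores it
* (returns 0), so callers must pass through group 0 first.
*/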
#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE)
/*
* We create a default colocation group at boot. There is no need to
* synchronize tasks between cgroups at creation time because the
* correct cgroup hierarchy is not available at boot. Therefore cgroup
* colocation is turned off by default even though the colocation group
* itself has been allocated. Furthermore, this colocation group cannot
* be destroyed once it has been created. All of this is done as part
* of runtime optimizations.
*
* The job of synchronizing tasks to the colocation group is done when
* the colocation flag in the cgroup is turned on.
*/
static int __init create_default_coloc_group(void)
{
struct related_thread_group *grp = NULL;
unsigned long flags;
grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
write_lock_irqsave(&related_thread_group_lock, flags);
list_add(&grp->list, &active_related_thread_groups);
write_unlock_irqrestore(&related_thread_group_lock, flags);
update_freq_aggregate_threshold(MAX_FREQ_AGGR_THRESH);
return 0;
}
late_initcall(create_default_coloc_group);
int sync_cgroup_colocation(struct task_struct *p, bool insert)
{
unsigned int grp_id = insert ? DEFAULT_CGROUP_COLOC_ID : 0;
return __sched_set_group_id(p, grp_id);
}
#endif
void update_cpu_cluster_capacity(const cpumask_t *cpus)
{
int i;
struct sched_cluster *cluster;
struct cpumask cpumask;
unsigned long flags;
cpumask_copy(&cpumask, cpus);
acquire_rq_locks_irqsave(cpu_possible_mask, &flags);
for_each_cpu(i, &cpumask) {
cluster = cpu_rq(i)->cluster;
cpumask_andnot(&cpumask, &cpumask, &cluster->cpus);
cluster->capacity = compute_capacity(cluster);
cluster->load_scale_factor = compute_load_scale_factor(cluster);
}
__update_min_max_capacity();
release_rq_locks_irqrestore(cpu_possible_mask, &flags);
}
static unsigned long max_cap[NR_CPUS];
static unsigned long thermal_cap_cpu[NR_CPUS];
unsigned long thermal_cap(int cpu)
{
return thermal_cap_cpu[cpu] ?: SCHED_CAPACITY_SCALE;
}
unsigned long do_thermal_cap(int cpu, unsigned long thermal_max_freq)
{
struct sched_domain *sd;
struct sched_group *sg;
struct rq *rq = cpu_rq(cpu);
int nr_cap_states;
if (!max_cap[cpu]) {
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_ea, cpu));
if (!sd || !sd->groups || !sd->groups->sge ||
!sd->groups->sge->cap_states) {
rcu_read_unlock();
return rq->cpu_capacity_orig;
}
sg = sd->groups;
nr_cap_states = sg->sge->nr_cap_states;
max_cap[cpu] = sg->sge->cap_states[nr_cap_states - 1].cap;
rcu_read_unlock();
}
if (cpu_max_table_freq[cpu])
return div64_ul(thermal_max_freq * max_cap[cpu],
cpu_max_table_freq[cpu]);
else
return rq->cpu_capacity_orig;
}
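/*
* Illustrative numbers (hypothetical platform): if a CPU's highest OPP
* capacity (max_cap) is 1024 at a table max of 2400000 kHz, a thermal
* limit of 1800000 kHz caps it at
*
*   1800000 * 1024 / 2400000 = 768 capacity units.
*
* When the frequency table is not yet known, the uncapped
* cpu_capacity_orig is returned instead.
*/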
static DEFINE_SPINLOCK(cpu_freq_min_max_lock);
void sched_update_cpu_freq_min_max(const cpumask_t *cpus, u32 fmin, u32 fmax)
{
struct cpumask cpumask;
struct sched_cluster *cluster;
int i, update_capacity = 0;
unsigned long flags;
spin_lock_irqsave(&cpu_freq_min_max_lock, flags);
cpumask_copy(&cpumask, cpus);
for_each_cpu(i, &cpumask)
thermal_cap_cpu[i] = do_thermal_cap(i, fmax);
for_each_cpu(i, &cpumask) {
cluster = cpu_rq(i)->cluster;
cpumask_andnot(&cpumask, &cpumask, &cluster->cpus);
update_capacity += (cluster->max_mitigated_freq != fmax);
cluster->max_mitigated_freq = fmax;
}
spin_unlock_irqrestore(&cpu_freq_min_max_lock, flags);
if (update_capacity)
update_cpu_cluster_capacity(cpus);
}
void note_task_waking(struct task_struct *p, u64 wallclock)
{
p->last_wake_ts = wallclock;
}
/*
* A task's CPU usage is accounted in:
* rq->curr/prev_runnable_sum, when its ->grp is NULL
* grp->cpu_time[cpu]->curr/prev_runnable_sum, when its ->grp is !NULL
*
* Transfer the task's CPU usage between those counters when it
* transitions between groups.
*/
static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp,
struct task_struct *p, int event)
{
u64 wallclock;
struct group_cpu_time *cpu_time;
u64 *src_curr_runnable_sum, *dst_curr_runnable_sum;
u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
int migrate_type;
int cpu = cpu_of(rq);
bool new_task;
int i;
wallclock = sched_ktime_clock();
update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0);
new_task = is_new_task(p);
cpu_time = &rq->grp_time;
if (event == ADD_TASK) {
migrate_type = RQ_TO_GROUP;
src_curr_runnable_sum = &rq->curr_runnable_sum;
dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
src_prev_runnable_sum = &rq->prev_runnable_sum;
dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
src_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
src_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
*src_curr_runnable_sum -= p->ravg.curr_window_cpu[cpu];
*src_prev_runnable_sum -= p->ravg.prev_window_cpu[cpu];
if (new_task) {
*src_nt_curr_runnable_sum -=
p->ravg.curr_window_cpu[cpu];
*src_nt_prev_runnable_sum -=
p->ravg.prev_window_cpu[cpu];
}
update_cluster_load_subtractions(p, cpu,
rq->window_start, new_task);
} else {
migrate_type = GROUP_TO_RQ;
src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
dst_curr_runnable_sum = &rq->curr_runnable_sum;
src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
dst_prev_runnable_sum = &rq->prev_runnable_sum;
src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
dst_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
dst_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
*src_curr_runnable_sum -= p->ravg.curr_window;
*src_prev_runnable_sum -= p->ravg.prev_window;
if (new_task) {
*src_nt_curr_runnable_sum -= p->ravg.curr_window;
*src_nt_prev_runnable_sum -= p->ravg.prev_window;
}
/*
* Need to reset curr/prev windows for all CPUs, not just the
* ones in the same cluster. Since inter-cluster migrations
* did not result in the appropriate bookkeeping, the per-CPU
* values would be inaccurate.
*/
for_each_possible_cpu(i) {
p->ravg.curr_window_cpu[i] = 0;
p->ravg.prev_window_cpu[i] = 0;
}
}
*dst_curr_runnable_sum += p->ravg.curr_window;
*dst_prev_runnable_sum += p->ravg.prev_window;
if (new_task) {
*dst_nt_curr_runnable_sum += p->ravg.curr_window;
*dst_nt_prev_runnable_sum += p->ravg.prev_window;
}
/*
* When a task enters or exits a group, its curr and prev windows are
* moved to a single CPU. This behavior might be sub-optimal in the
* exit case; however, it saves us the overhead of handling
* inter-cluster migration fixups while the task is part of a related
* group.
*/
p->ravg.curr_window_cpu[cpu] = p->ravg.curr_window;
p->ravg.prev_window_cpu[cpu] = p->ravg.prev_window;
trace_sched_migration_update_sum(p, migrate_type, rq);
BUG_ON((s64)*src_curr_runnable_sum < 0);
BUG_ON((s64)*src_prev_runnable_sum < 0);
BUG_ON((s64)*src_nt_curr_runnable_sum < 0);
BUG_ON((s64)*src_nt_prev_runnable_sum < 0);
}
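/*
* Illustration (hypothetical numbers): a task with 2ms in
* ravg.curr_window and 5ms in ravg.prev_window that joins a group
* (ADD_TASK) has those amounts subtracted from
* rq->curr/prev_runnable_sum and added to
* rq->grp_time.curr/prev_runnable_sum; leaving the group (GROUP_TO_RQ)
* reverses the transfer. The BUG_ON()s above catch any case where more
* time is subtracted from a counter than was ever accounted to it.
*/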
unsigned int sysctl_sched_little_cluster_coloc_fmin_khz;
static u64 coloc_boost_load;
void walt_map_freq_to_load(void)
{
struct sched_cluster *cluster;
for_each_sched_cluster(cluster) {
if (is_min_capacity_cluster(cluster)) {
int fcpu = cluster_first_cpu(cluster);
coloc_boost_load = div64_u64(
((u64)sched_ravg_window *
arch_scale_cpu_capacity(NULL, fcpu) *
sysctl_sched_little_cluster_coloc_fmin_khz),
(u64)1024 * cpu_max_possible_freq(fcpu));
coloc_boost_load = div64_u64(coloc_boost_load << 2, 5);
break;
}
}
}
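/*
* Worked example (hypothetical platform numbers): with a 20ms window,
* a little-cluster capacity of 512, fmin set to 900000 kHz and a max
* possible frequency of 1800000 kHz:
*
*   coloc_boost_load = 20000000 * 512 * 900000 / (1024 * 1800000)
*                    = 5000000 ns
*
* and the final (x << 2) / 5 step scales that by 4/5 to 4000000 ns,
* i.e. the boost floor corresponds to ~80% of the window load that
* would keep the little cluster at fmin.
*/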
static void walt_update_coloc_boost_load(void)
{
struct related_thread_group *grp;
struct sched_cluster *cluster;
if (!sysctl_sched_little_cluster_coloc_fmin_khz ||
sysctl_sched_boost == CONSERVATIVE_BOOST)
return;
grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
if (!grp || !grp->preferred_cluster ||
is_min_capacity_cluster(grp->preferred_cluster))
return;
for_each_sched_cluster(cluster) {
if (is_min_capacity_cluster(cluster)) {
cluster->coloc_boost_load = coloc_boost_load;
break;
}
}
}
int sched_little_cluster_coloc_fmin_khz_handler(struct ctl_table *table,
int write, void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
static DEFINE_MUTEX(mutex);
mutex_lock(&mutex);
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
goto done;
walt_map_freq_to_load();
done:
mutex_unlock(&mutex);
return ret;
}
/*
* Runs in hard-irq context. This should ideally run just after the latest
* window roll-over.
*/
void walt_irq_work(struct irq_work *irq_work)
{
struct sched_cluster *cluster;
struct rq *rq;
int cpu;
u64 wc, total_grp_load = 0;
int flag = SCHED_CPUFREQ_WALT;
bool is_migration = false;
int level = 0;
/* Am I the window rollover work or the migration work? */
if (irq_work == &walt_migration_irq_work)
is_migration = true;
for_each_cpu(cpu, cpu_possible_mask) {
if (level == 0)
raw_spin_lock(&cpu_rq(cpu)->lock);
else
raw_spin_lock_nested(&cpu_rq(cpu)->lock, level);
level++;
}
wc = sched_ktime_clock();
walt_load_reported_window = atomic64_read(&walt_irq_work_lastq_ws);
for_each_sched_cluster(cluster) {
u64 aggr_grp_load = 0;
raw_spin_lock(&cluster->load_lock);
for_each_cpu(cpu, &cluster->cpus) {
rq = cpu_rq(cpu);
if (rq->curr) {
update_task_ravg(rq->curr, rq,
TASK_UPDATE, wc, 0);
account_load_subtractions(rq);
aggr_grp_load += rq->grp_time.prev_runnable_sum;
}
}
cluster->aggr_grp_load = aggr_grp_load;
total_grp_load += aggr_grp_load;
cluster->coloc_boost_load = 0;
raw_spin_unlock(&cluster->load_lock);
}
if (total_grp_load)
walt_update_coloc_boost_load();
for_each_sched_cluster(cluster) {
for_each_cpu(cpu, &cluster->cpus) {
int nflag = flag;
rq = cpu_rq(cpu);
if (is_migration) {
if (rq->notif_pending) {
nflag |= SCHED_CPUFREQ_INTERCLUSTER_MIG;
rq->notif_pending = false;
} else {
nflag |= SCHED_CPUFREQ_FORCE_UPDATE;
}
}
cpufreq_update_util(rq, nflag);
}
}
for_each_cpu(cpu, cpu_possible_mask)
raw_spin_unlock(&cpu_rq(cpu)->lock);
if (!is_migration)
core_ctl_check(this_rq()->window_start);
}
void walt_rotation_checkpoint(int nr_big)
{
if (!hmp_capable())
return;
if (!sysctl_sched_walt_rotate_big_tasks || sched_boost() != NO_BOOST) {
walt_rotation_enabled = 0;
return;
}
walt_rotation_enabled = nr_big >= num_possible_cpus();
}
int walt_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
unsigned int *data = (unsigned int *)table->data;
static DEFINE_MUTEX(mutex);
mutex_lock(&mutex);
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write) {
mutex_unlock(&mutex);
return ret;
}
if (data == &sysctl_sched_group_upmigrate_pct)
sched_group_upmigrate =
pct_to_real(sysctl_sched_group_upmigrate_pct);
else if (data == &sysctl_sched_group_downmigrate_pct)
sched_group_downmigrate =
pct_to_real(sysctl_sched_group_downmigrate_pct);
else
ret = -EINVAL;
mutex_unlock(&mutex);
return ret;
}
void walt_sched_init(struct rq *rq)
{
int j;
cpumask_set_cpu(cpu_of(rq), &rq->freq_domain_cpumask);
init_irq_work(&walt_migration_irq_work, walt_irq_work);
init_irq_work(&walt_cpufreq_irq_work, walt_irq_work);
walt_rotate_work_init();
rq->walt_stats.cumulative_runnable_avg = 0;
rq->window_start = 0;
rq->cum_window_start = 0;
rq->walt_stats.nr_big_tasks = 0;
rq->walt_flags = 0;
rq->cur_irqload = 0;
rq->avg_irqload = 0;
rq->irqload_ts = 0;
rq->static_cpu_pwr_cost = 0;
rq->cc.cycles = 1;
rq->cc.time = 1;
rq->cstate = 0;
rq->wakeup_latency = 0;
rq->wakeup_energy = 0;
/*
* All CPUs are part of the same cluster by default. This avoids
* the need to check rq->cluster for NULL in hot paths like
* select_best_cpu().
*/
rq->cluster = &init_cluster;
rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
memset(&rq->grp_time, 0, sizeof(struct group_cpu_time));
rq->old_busy_time = 0;
rq->old_estimated_time = 0;
rq->old_busy_time_group = 0;
rq->walt_stats.pred_demands_sum = 0;
rq->ed_task = NULL;
rq->curr_table = 0;
rq->prev_top = 0;
rq->curr_top = 0;
rq->last_cc_update = 0;
rq->cycles = 0;
for (j = 0; j < NUM_TRACKED_WINDOWS; j++) {
memset(&rq->load_subs[j], 0,
sizeof(struct load_subtractions));
rq->top_tasks[j] = kcalloc(NUM_LOAD_INDICES,
sizeof(u8), GFP_NOWAIT);
/* No other choice */
BUG_ON(!rq->top_tasks[j]);
clear_top_tasks_bitmap(rq->top_tasks_bitmap[j]);
}
rq->cum_window_demand = 0;
rq->notif_pending = false;
walt_cpu_util_freq_divisor =
(sched_ravg_window >> SCHED_CAPACITY_SHIFT) * 100;
}
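/*
* With the default 20ms window, the divisor above evaluates to
* (20000000 >> 10) * 100 = 1953100. A window load multiplied by a
* percentage (e.g. 100 plus a boost percent) and divided by this
* yields utilization in SCHED_CAPACITY_SCALE units: a fully busy,
* unboosted window maps to roughly 1024.
*/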