Merge branch 'pm-cpuidle'
* pm-cpuidle:
cpuidle: Add a kerneldoc comment to cpuidle_use_deepest_state()
cpuidle: fix improper return value on error
intel_idle: Convert to hotplug state machine
intel_idle: Remove superfluous SMP function call
MAINTAINERS: Add Jacob Pan as a new intel_idle maintainer
MAINTAINERS: Add bug tracking system location entries for cpuidle
x86/intel_idle: Add Knights Mill CPUID
x86/intel_idle: Add CPU model 0x4a (Atom Z34xx series)
thermal/intel_powerclamp: stop sched tick in forced idle
thermal/intel_powerclamp: Convert to CPU hotplug state
thermal/intel_powerclamp: Convert the kthread to kthread worker API
thermal/intel_powerclamp: Remove duplicated code that starts the kthread
sched/idle: Add support for tasks that inject idle
cpuidle: Allow enforcing deepest idle state selection
cpuidle/powernv: staticise powernv_idle_driver
cpuidle: dt: assign ->enter_freeze to same as ->enter callback function
cpuidle: governors: Remove remaining old module code
diff --git a/MAINTAINERS b/MAINTAINERS
index d951d67..e89a188 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3389,6 +3389,7 @@
L: linux-pm@vger.kernel.org
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git
+B: https://bugzilla.kernel.org
F: drivers/cpuidle/*
F: include/linux/cpuidle.h
@@ -6298,9 +6299,11 @@
F: drivers/platform/x86/intel-vbtn.c
INTEL IDLE DRIVER
+M: Jacob Pan <jacob.jun.pan@linux.intel.com>
M: Len Brown <lenb@kernel.org>
L: linux-pm@vger.kernel.org
T: git git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux.git
+B: https://bugzilla.kernel.org
S: Supported
F: drivers/idle/intel_idle.c
diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c
index 7fe442c..0835a37 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -22,7 +22,7 @@
#define POWERNV_THRESHOLD_LATENCY_NS 200000
-struct cpuidle_driver powernv_idle_driver = {
+static struct cpuidle_driver powernv_idle_driver = {
.name = "powernv_idle",
.owner = THIS_MODULE,
};
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index c73207a..62810ff 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -97,7 +97,23 @@
return ret;
}
-#ifdef CONFIG_SUSPEND
+/**
+ * cpuidle_use_deepest_state - Set/clear governor override flag.
+ * @enable: New value of the flag.
+ *
+ * Tell the current CPU whether to use the deepest available idle state,
+ * overriding governor-based state selection while the flag is set.
+ */
+void cpuidle_use_deepest_state(bool enable)
+{
+ struct cpuidle_device *dev;
+
+ preempt_disable();
+ dev = cpuidle_get_device();
+ dev->use_deepest_state = enable;
+ preempt_enable();
+}
+
/**
* cpuidle_find_deepest_state - Find the deepest available idle state.
* @drv: cpuidle driver for the given CPU.
@@ -109,6 +125,7 @@
return find_deepest_state(drv, dev, UINT_MAX, 0, false);
}
+#ifdef CONFIG_SUSPEND
static void enter_freeze_proper(struct cpuidle_driver *drv,
struct cpuidle_device *dev, int index)
{
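The new cpuidle_use_deepest_state() helper is meant to bracket a forced-idle
section on the current CPU. A minimal sketch of the intended calling pattern
(illustrative only; the in-tree caller added by this series is play_idle() in
kernel/sched/idle.c, shown further below):

	preempt_disable();			/* stay on this CPU */
	cpuidle_use_deepest_state(true);	/* override governors from here on */
	/* ... run the idle loop, e.g. do_idle() as play_idle() does ... */
	cpuidle_use_deepest_state(false);	/* back to governor-based selection */
	preempt_enable();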
diff --git a/drivers/cpuidle/dt_idle_states.c b/drivers/cpuidle/dt_idle_states.c
index a5c111b..ffca4fc 100644
--- a/drivers/cpuidle/dt_idle_states.c
+++ b/drivers/cpuidle/dt_idle_states.c
@@ -38,6 +38,12 @@
* state enter function.
*/
idle_state->enter = match_id->data;
+ /*
+ * Since this is not a "coupled" state, it's safe to assume interrupts
+ * won't be enabled when it exits, allowing the tick to be frozen
+ * safely. So enter() can also serve as the enter_freeze() callback.
+ */
+ idle_state->enter_freeze = match_id->data;
err = of_property_read_u32(state_node, "wakeup-latency-us",
&idle_state->exit_latency);
diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c
index fb9f511..4e78263 100644
--- a/drivers/cpuidle/governor.c
+++ b/drivers/cpuidle/governor.c
@@ -9,7 +9,6 @@
*/
#include <linux/mutex.h>
-#include <linux/module.h>
#include <linux/cpuidle.h>
#include "cpuidle.h"
@@ -53,14 +52,11 @@
if (cpuidle_curr_governor) {
list_for_each_entry(dev, &cpuidle_detected_devices, device_list)
cpuidle_disable_device(dev);
- module_put(cpuidle_curr_governor->owner);
}
cpuidle_curr_governor = gov;
if (gov) {
- if (!try_module_get(cpuidle_curr_governor->owner))
- return -EINVAL;
list_for_each_entry(dev, &cpuidle_detected_devices, device_list)
cpuidle_enable_device(dev);
cpuidle_install_idle_handler();
diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c
index 63bd5a4..fe8f089 100644
--- a/drivers/cpuidle/governors/ladder.c
+++ b/drivers/cpuidle/governors/ladder.c
@@ -15,7 +15,6 @@
#include <linux/kernel.h>
#include <linux/cpuidle.h>
#include <linux/pm_qos.h>
-#include <linux/module.h>
#include <linux/jiffies.h>
#include <linux/tick.h>
@@ -177,7 +176,6 @@
.enable = ladder_enable_device,
.select = ladder_select_state,
.reflect = ladder_reflect,
- .owner = THIS_MODULE,
};
/**
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 03d38c2..d9b5b93 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -19,7 +19,6 @@
#include <linux/tick.h>
#include <linux/sched.h>
#include <linux/math64.h>
-#include <linux/module.h>
/*
* Please note when changing the tuning values:
@@ -484,7 +483,6 @@
.enable = menu_enable_device,
.select = menu_select,
.reflect = menu_reflect,
- .owner = THIS_MODULE,
};
/**
diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index 832a2c3..c5adc8c 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c
@@ -403,8 +403,10 @@
/* state statistics */
for (i = 0; i < drv->state_count; i++) {
kobj = kzalloc(sizeof(struct cpuidle_state_kobj), GFP_KERNEL);
- if (!kobj)
+ if (!kobj) {
+ ret = -ENOMEM;
goto error_state;
+ }
kobj->state = &drv->states[i];
kobj->state_usage = &device->states_usage[i];
init_completion(&kobj->kobj_unregister);
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 4466a2f..7d8ea3d 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -98,8 +98,6 @@
struct cpuidle_driver *drv, int index);
static void intel_idle_freeze(struct cpuidle_device *dev,
struct cpuidle_driver *drv, int index);
-static int intel_idle_cpu_init(int cpu);
-
static struct cpuidle_state *cpuidle_state_table;
/*
@@ -724,6 +722,50 @@
{
.enter = NULL }
};
+static struct cpuidle_state tangier_cstates[] = {
+ {
+ .name = "C1-TNG",
+ .desc = "MWAIT 0x00",
+ .flags = MWAIT2flg(0x00),
+ .exit_latency = 1,
+ .target_residency = 4,
+ .enter = &intel_idle,
+ .enter_freeze = intel_idle_freeze, },
+ {
+ .name = "C4-TNG",
+ .desc = "MWAIT 0x30",
+ .flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TLB_FLUSHED,
+ .exit_latency = 100,
+ .target_residency = 400,
+ .enter = &intel_idle,
+ .enter_freeze = intel_idle_freeze, },
+ {
+ .name = "C6-TNG",
+ .desc = "MWAIT 0x52",
+ .flags = MWAIT2flg(0x52) | CPUIDLE_FLAG_TLB_FLUSHED,
+ .exit_latency = 140,
+ .target_residency = 560,
+ .enter = &intel_idle,
+ .enter_freeze = intel_idle_freeze, },
+ {
+ .name = "C7-TNG",
+ .desc = "MWAIT 0x60",
+ .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
+ .exit_latency = 1200,
+ .target_residency = 4000,
+ .enter = &intel_idle,
+ .enter_freeze = intel_idle_freeze, },
+ {
+ .name = "C9-TNG",
+ .desc = "MWAIT 0x64",
+ .flags = MWAIT2flg(0x64) | CPUIDLE_FLAG_TLB_FLUSHED,
+ .exit_latency = 10000,
+ .target_residency = 20000,
+ .enter = &intel_idle,
+ .enter_freeze = intel_idle_freeze, },
+ {
+ .enter = NULL }
+};
static struct cpuidle_state avn_cstates[] = {
{
.name = "C1-AVN",
@@ -907,51 +949,15 @@
mwait_idle_with_hints(eax, ecx);
}
-static void __setup_broadcast_timer(void *arg)
+static void __setup_broadcast_timer(bool on)
{
- unsigned long on = (unsigned long)arg;
-
if (on)
tick_broadcast_enable();
else
tick_broadcast_disable();
}
-static int cpu_hotplug_notify(struct notifier_block *n,
- unsigned long action, void *hcpu)
-{
- int hotcpu = (unsigned long)hcpu;
- struct cpuidle_device *dev;
-
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_ONLINE:
-
- if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE)
- smp_call_function_single(hotcpu, __setup_broadcast_timer,
- (void *)true, 1);
-
- /*
- * Some systems can hotplug a cpu at runtime after
- * the kernel has booted, we have to initialize the
- * driver in this case
- */
- dev = per_cpu_ptr(intel_idle_cpuidle_devices, hotcpu);
- if (dev->registered)
- break;
-
- if (intel_idle_cpu_init(hotcpu))
- return NOTIFY_BAD;
-
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block cpu_hotplug_notifier = {
- .notifier_call = cpu_hotplug_notify,
-};
-
-static void auto_demotion_disable(void *dummy)
+static void auto_demotion_disable(void)
{
unsigned long long msr_bits;
@@ -959,7 +965,7 @@
msr_bits &= ~(icpu->auto_demotion_disable_flags);
wrmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits);
}
-static void c1e_promotion_disable(void *dummy)
+static void c1e_promotion_disable(void)
{
unsigned long long msr_bits;
@@ -978,6 +984,10 @@
.state_table = atom_cstates,
};
+static const struct idle_cpu idle_cpu_tangier = {
+ .state_table = tangier_cstates,
+};
+
static const struct idle_cpu idle_cpu_lincroft = {
.state_table = atom_cstates,
.auto_demotion_disable_flags = ATM_LNC_C6_AUTO_DEMOTE,
@@ -1066,6 +1076,7 @@
ICPU(INTEL_FAM6_SANDYBRIDGE_X, idle_cpu_snb),
ICPU(INTEL_FAM6_ATOM_CEDARVIEW, idle_cpu_atom),
ICPU(INTEL_FAM6_ATOM_SILVERMONT1, idle_cpu_byt),
+ ICPU(INTEL_FAM6_ATOM_MERRIFIELD, idle_cpu_tangier),
ICPU(INTEL_FAM6_ATOM_AIRMONT, idle_cpu_cht),
ICPU(INTEL_FAM6_IVYBRIDGE, idle_cpu_ivb),
ICPU(INTEL_FAM6_IVYBRIDGE_X, idle_cpu_ivt),
@@ -1084,6 +1095,7 @@
ICPU(INTEL_FAM6_KABYLAKE_DESKTOP, idle_cpu_skl),
ICPU(INTEL_FAM6_SKYLAKE_X, idle_cpu_skx),
ICPU(INTEL_FAM6_XEON_PHI_KNL, idle_cpu_knl),
+ ICPU(INTEL_FAM6_XEON_PHI_KNM, idle_cpu_knl),
ICPU(INTEL_FAM6_ATOM_GOLDMONT, idle_cpu_bxt),
ICPU(INTEL_FAM6_ATOM_DENVERTON, idle_cpu_dnv),
{}
@@ -1373,12 +1385,11 @@
* allocate, initialize, register cpuidle_devices
* @cpu: cpu/core to initialize
*/
-static int intel_idle_cpu_init(int cpu)
+static int intel_idle_cpu_init(unsigned int cpu)
{
struct cpuidle_device *dev;
dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu);
-
dev->cpu = cpu;
if (cpuidle_register_device(dev)) {
@@ -1387,17 +1398,36 @@
}
if (icpu->auto_demotion_disable_flags)
- smp_call_function_single(cpu, auto_demotion_disable, NULL, 1);
+ auto_demotion_disable();
if (icpu->disable_promotion_to_c1e)
- smp_call_function_single(cpu, c1e_promotion_disable, NULL, 1);
+ c1e_promotion_disable();
+
+ return 0;
+}
+
+static int intel_idle_cpu_online(unsigned int cpu)
+{
+ struct cpuidle_device *dev;
+
+ if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE)
+ __setup_broadcast_timer(true);
+
+ /*
+ * Some systems can hotplug a CPU at runtime after
+ * the kernel has booted, so we have to initialize
+ * the driver in that case.
+ */
+ dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu);
+ if (!dev->registered)
+ return intel_idle_cpu_init(cpu);
return 0;
}
static int __init intel_idle_init(void)
{
- int retval, i;
+ int retval;
/* Do not load intel_idle at all for now if idle= is passed */
if (boot_option_idle_override != IDLE_NO_OVERRIDE)
@@ -1417,35 +1447,29 @@
struct cpuidle_driver *drv = cpuidle_get_driver();
printk(KERN_DEBUG PREFIX "intel_idle yielding to %s",
drv ? drv->name : "none");
- free_percpu(intel_idle_cpuidle_devices);
- return retval;
+ goto init_driver_fail;
}
- cpu_notifier_register_begin();
-
- for_each_online_cpu(i) {
- retval = intel_idle_cpu_init(i);
- if (retval) {
- intel_idle_cpuidle_devices_uninit();
- cpu_notifier_register_done();
- cpuidle_unregister_driver(&intel_idle_driver);
- free_percpu(intel_idle_cpuidle_devices);
- return retval;
- }
- }
- __register_cpu_notifier(&cpu_hotplug_notifier);
-
if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */
lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE;
- else
- on_each_cpu(__setup_broadcast_timer, (void *)true, 1);
- cpu_notifier_register_done();
+ retval = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "idle/intel:online",
+ intel_idle_cpu_online, NULL);
+ if (retval < 0)
+ goto hp_setup_fail;
pr_debug(PREFIX "lapic_timer_reliable_states 0x%x\n",
lapic_timer_reliable_states);
return 0;
+
+hp_setup_fail:
+ intel_idle_cpuidle_devices_uninit();
+ cpuidle_unregister_driver(&intel_idle_driver);
+init_driver_fail:
+ free_percpu(intel_idle_cpuidle_devices);
+ return retval;
}
device_initcall(intel_idle_init);
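The hotplug conversion above follows the dynamic-state pattern:
cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, ...) invokes the online callback on
every CPU that is already up and, on success, returns the dynamically
allocated state number (intel_idle discards it, since the driver cannot be
unloaded). A condensed sketch of the general pattern, with hypothetical names:

	static enum cpuhp_state my_online_state;	/* hypothetical */

	static int my_cpu_online(unsigned int cpu)
	{
		/* per-CPU setup; also called for CPUs already online */
		return 0;
	}

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "my/driver:online",
				my_cpu_online, NULL);
	if (ret < 0)
		return ret;			/* callbacks already rolled back */
	my_online_state = ret;			/* > 0: dynamic state id */
	...
	cpuhp_remove_state(my_online_state);	/* teardown on driver exit */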
diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c
index afada65..83e6971 100644
--- a/drivers/thermal/intel_powerclamp.c
+++ b/drivers/thermal/intel_powerclamp.c
@@ -43,7 +43,6 @@
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
-#include <linux/freezer.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
@@ -86,11 +85,26 @@
*/
static bool clamping;
+static const struct sched_param sparam = {
+ .sched_priority = MAX_USER_RT_PRIO / 2,
+};
+struct powerclamp_worker_data {
+ struct kthread_worker *worker;
+ struct kthread_work balancing_work;
+ struct kthread_delayed_work idle_injection_work;
+ unsigned int cpu;
+ unsigned int count;
+ unsigned int guard;
+ unsigned int window_size_now;
+ unsigned int target_ratio;
+ unsigned int duration_jiffies;
+ bool clamping;
+};
-static struct task_struct * __percpu *powerclamp_thread;
+static struct powerclamp_worker_data * __percpu worker_data;
static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask; /* bit map for tracking per cpu
- * clamping thread
+ * clamping kthread worker
*/
static unsigned int duration;
@@ -262,11 +276,6 @@
return count;
}
-static void noop_timer(unsigned long foo)
-{
- /* empty... just the fact that we get the interrupt wakes us up */
-}
-
static unsigned int get_compensation(int ratio)
{
unsigned int comp = 0;
@@ -368,103 +377,79 @@
return set_target_ratio + guard <= current_ratio;
}
-static int clamp_thread(void *arg)
+static void clamp_balancing_func(struct kthread_work *work)
{
- int cpunr = (unsigned long)arg;
- DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0);
- static const struct sched_param param = {
- .sched_priority = MAX_USER_RT_PRIO/2,
- };
- unsigned int count = 0;
- unsigned int target_ratio;
+ struct powerclamp_worker_data *w_data;
+ int sleeptime;
+ unsigned long target_jiffies;
+ unsigned int compensated_ratio;
+ int interval; /* jiffies to sleep for each attempt */
- set_bit(cpunr, cpu_clamping_mask);
- set_freezable();
- init_timer_on_stack(&wakeup_timer);
- sched_setscheduler(current, SCHED_FIFO, ¶m);
+ w_data = container_of(work, struct powerclamp_worker_data,
+ balancing_work);
- while (true == clamping && !kthread_should_stop() &&
- cpu_online(cpunr)) {
- int sleeptime;
- unsigned long target_jiffies;
- unsigned int guard;
- unsigned int compensated_ratio;
- int interval; /* jiffies to sleep for each attempt */
- unsigned int duration_jiffies = msecs_to_jiffies(duration);
- unsigned int window_size_now;
+ /*
+ * Make sure a user-selected ratio does not take effect until
+ * the next round. Adjust target_ratio if the user has changed
+ * the target, so that we can converge quickly.
+ */
+ w_data->target_ratio = READ_ONCE(set_target_ratio);
+ w_data->guard = 1 + w_data->target_ratio / 20;
+ w_data->window_size_now = window_size;
+ w_data->duration_jiffies = msecs_to_jiffies(duration);
+ w_data->count++;
- try_to_freeze();
- /*
- * make sure user selected ratio does not take effect until
- * the next round. adjust target_ratio if user has changed
- * target such that we can converge quickly.
- */
- target_ratio = set_target_ratio;
- guard = 1 + target_ratio/20;
- window_size_now = window_size;
- count++;
+ /*
+ * Systems may differ in their ability to enter package-level
+ * C-states, so we need to compensate the injected idle ratio
+ * to achieve the actual target reported by the HW.
+ */
+ compensated_ratio = w_data->target_ratio +
+ get_compensation(w_data->target_ratio);
+ if (compensated_ratio <= 0)
+ compensated_ratio = 1;
+ interval = w_data->duration_jiffies * 100 / compensated_ratio;
- /*
- * systems may have different ability to enter package level
- * c-states, thus we need to compensate the injected idle ratio
- * to achieve the actual target reported by the HW.
- */
- compensated_ratio = target_ratio +
- get_compensation(target_ratio);
- if (compensated_ratio <= 0)
- compensated_ratio = 1;
- interval = duration_jiffies * 100 / compensated_ratio;
+ /* align idle time */
+ target_jiffies = roundup(jiffies, interval);
+ sleeptime = target_jiffies - jiffies;
+ if (sleeptime <= 0)
+ sleeptime = 1;
- /* align idle time */
- target_jiffies = roundup(jiffies, interval);
- sleeptime = target_jiffies - jiffies;
- if (sleeptime <= 0)
- sleeptime = 1;
- schedule_timeout_interruptible(sleeptime);
- /*
- * only elected controlling cpu can collect stats and update
- * control parameters.
- */
- if (cpunr == control_cpu && !(count%window_size_now)) {
- should_skip =
- powerclamp_adjust_controls(target_ratio,
- guard, window_size_now);
- smp_mb();
- }
+ if (clamping && w_data->clamping && cpu_online(w_data->cpu))
+ kthread_queue_delayed_work(w_data->worker,
+ &w_data->idle_injection_work,
+ sleeptime);
+}
- if (should_skip)
- continue;
+static void clamp_idle_injection_func(struct kthread_work *work)
+{
+ struct powerclamp_worker_data *w_data;
- target_jiffies = jiffies + duration_jiffies;
- mod_timer(&wakeup_timer, target_jiffies);
- if (unlikely(local_softirq_pending()))
- continue;
- /*
- * stop tick sched during idle time, interrupts are still
- * allowed. thus jiffies are updated properly.
- */
- preempt_disable();
- /* mwait until target jiffies is reached */
- while (time_before(jiffies, target_jiffies)) {
- unsigned long ecx = 1;
- unsigned long eax = target_mwait;
+ w_data = container_of(work, struct powerclamp_worker_data,
+ idle_injection_work.work);
- /*
- * REVISIT: may call enter_idle() to notify drivers who
- * can save power during cpu idle. same for exit_idle()
- */
- local_touch_nmi();
- stop_critical_timings();
- mwait_idle_with_hints(eax, ecx);
- start_critical_timings();
- atomic_inc(&idle_wakeup_counter);
- }
- preempt_enable();
+ /*
+ * Only the elected controlling CPU can collect stats and update
+ * control parameters.
+ */
+ if (w_data->cpu == control_cpu &&
+ !(w_data->count % w_data->window_size_now)) {
+ should_skip =
+ powerclamp_adjust_controls(w_data->target_ratio,
+ w_data->guard,
+ w_data->window_size_now);
+ smp_mb();
}
- del_timer_sync(&wakeup_timer);
- clear_bit(cpunr, cpu_clamping_mask);
- return 0;
+ if (should_skip)
+ goto balance;
+
+ play_idle(jiffies_to_msecs(w_data->duration_jiffies));
+
+balance:
+ if (clamping && w_data->clamping && cpu_online(w_data->cpu))
+ kthread_queue_work(w_data->worker, &w_data->balancing_work);
}
/*
@@ -508,10 +493,60 @@
schedule_delayed_work(&poll_pkg_cstate_work, HZ);
}
+static void start_power_clamp_worker(unsigned long cpu)
+{
+ struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
+ struct kthread_worker *worker;
+
+ worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inject/%ld", cpu);
+ if (IS_ERR(worker))
+ return;
+
+ w_data->worker = worker;
+ w_data->count = 0;
+ w_data->cpu = cpu;
+ w_data->clamping = true;
+ set_bit(cpu, cpu_clamping_mask);
+ sched_setscheduler(worker->task, SCHED_FIFO, &sparam);
+ kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
+ kthread_init_delayed_work(&w_data->idle_injection_work,
+ clamp_idle_injection_func);
+ kthread_queue_work(w_data->worker, &w_data->balancing_work);
+}
+
+static void stop_power_clamp_worker(unsigned long cpu)
+{
+ struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
+
+ if (!w_data->worker)
+ return;
+
+ w_data->clamping = false;
+ /*
+ * Make sure that any work queued after this point sees
+ * clamping disabled. A counterpart barrier is not needed
+ * because there is an implicit memory barrier when the
+ * queued work is processed.
+ */
+ smp_wmb();
+ kthread_cancel_work_sync(&w_data->balancing_work);
+ kthread_cancel_delayed_work_sync(&w_data->idle_injection_work);
+ /*
+ * The balancing work might still be queued here because
+ * the handling of the "clamping" variable, and the cancel and
+ * queue operations, are not synchronized via a lock. But that
+ * is not a big deal: the balancing work is fast, and
+ * kthread_destroy_worker() will wait for it.
+ */
+ clear_bit(w_data->cpu, cpu_clamping_mask);
+ kthread_destroy_worker(w_data->worker);
+
+ w_data->worker = NULL;
+}
+
static int start_power_clamp(void)
{
unsigned long cpu;
- struct task_struct *thread;
set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
/* prevent cpu hotplug */
@@ -525,22 +560,9 @@
clamping = true;
schedule_delayed_work(&poll_pkg_cstate_work, 0);
- /* start one thread per online cpu */
+ /* start one kthread worker per online cpu */
for_each_online_cpu(cpu) {
- struct task_struct **p =
- per_cpu_ptr(powerclamp_thread, cpu);
-
- thread = kthread_create_on_node(clamp_thread,
- (void *) cpu,
- cpu_to_node(cpu),
- "kidle_inject/%ld", cpu);
- /* bind to cpu here */
- if (likely(!IS_ERR(thread))) {
- kthread_bind(thread, cpu);
- wake_up_process(thread);
- *p = thread;
- }
-
+ start_power_clamp_worker(cpu);
}
put_online_cpus();
@@ -550,71 +572,49 @@
static void end_power_clamp(void)
{
int i;
- struct task_struct *thread;
- clamping = false;
/*
- * make clamping visible to other cpus and give per cpu clamping threads
- * sometime to exit, or gets killed later.
+ * Block requeuing in all the kthread workers so that they flush
+ * and stop faster.
*/
- smp_mb();
- msleep(20);
+ clamping = false;
if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
- pr_debug("clamping thread for cpu %d alive, kill\n", i);
- thread = *per_cpu_ptr(powerclamp_thread, i);
- kthread_stop(thread);
+ pr_debug("clamping worker for cpu %d alive, destroy\n",
+ i);
+ stop_power_clamp_worker(i);
}
}
}
-static int powerclamp_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+static int powerclamp_cpu_online(unsigned int cpu)
{
- unsigned long cpu = (unsigned long)hcpu;
- struct task_struct *thread;
- struct task_struct **percpu_thread =
- per_cpu_ptr(powerclamp_thread, cpu);
-
- if (false == clamping)
- goto exit_ok;
-
- switch (action) {
- case CPU_ONLINE:
- thread = kthread_create_on_node(clamp_thread,
- (void *) cpu,
- cpu_to_node(cpu),
- "kidle_inject/%lu", cpu);
- if (likely(!IS_ERR(thread))) {
- kthread_bind(thread, cpu);
- wake_up_process(thread);
- *percpu_thread = thread;
- }
- /* prefer BSP as controlling CPU */
- if (cpu == 0) {
- control_cpu = 0;
- smp_mb();
- }
- break;
- case CPU_DEAD:
- if (test_bit(cpu, cpu_clamping_mask)) {
- pr_err("cpu %lu dead but powerclamping thread is not\n",
- cpu);
- kthread_stop(*percpu_thread);
- }
- if (cpu == control_cpu) {
- control_cpu = smp_processor_id();
- smp_mb();
- }
+ if (!clamping)
+ return 0;
+ start_power_clamp_worker(cpu);
+ /* prefer BSP as controlling CPU */
+ if (cpu == 0) {
+ control_cpu = 0;
+ smp_mb();
}
-
-exit_ok:
- return NOTIFY_OK;
+ return 0;
}
-static struct notifier_block powerclamp_cpu_notifier = {
- .notifier_call = powerclamp_cpu_callback,
-};
+static int powerclamp_cpu_predown(unsigned int cpu)
+{
+ if (!clamping)
+ return 0;
+
+ stop_power_clamp_worker(cpu);
+ if (cpu != control_cpu)
+ return 0;
+
+ control_cpu = cpumask_first(cpu_online_mask);
+ if (control_cpu == cpu)
+ control_cpu = cpumask_next(cpu, cpu_online_mask);
+ smp_mb();
+ return 0;
+}
static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
unsigned long *state)
@@ -742,6 +742,8 @@
debugfs_remove_recursive(debug_dir);
}
+static enum cpuhp_state hp_state;
+
static int __init powerclamp_init(void)
{
int retval;
@@ -759,10 +761,17 @@
/* set default limit, maybe adjusted during runtime based on feedback */
window_size = 2;
- register_hotcpu_notifier(&powerclamp_cpu_notifier);
+ retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+ "thermal/intel_powerclamp:online",
+ powerclamp_cpu_online,
+ powerclamp_cpu_predown);
+ if (retval < 0)
+ goto exit_free;
- powerclamp_thread = alloc_percpu(struct task_struct *);
- if (!powerclamp_thread) {
+ hp_state = retval;
+
+ worker_data = alloc_percpu(struct powerclamp_worker_data);
+ if (!worker_data) {
retval = -ENOMEM;
goto exit_unregister;
}
@@ -782,9 +791,9 @@
return 0;
exit_free_thread:
- free_percpu(powerclamp_thread);
+ free_percpu(worker_data);
exit_unregister:
- unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
+ cpuhp_remove_state_nocalls(hp_state);
exit_free:
kfree(cpu_clamping_mask);
return retval;
@@ -793,9 +802,9 @@
static void __exit powerclamp_exit(void)
{
- unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
end_power_clamp();
- free_percpu(powerclamp_thread);
+ cpuhp_remove_state_nocalls(hp_state);
+ free_percpu(worker_data);
thermal_cooling_device_unregister(cooling_dev);
kfree(cpu_clamping_mask);
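The powerclamp conversion replaces one open-coded kthread per CPU with the
kthread worker API, which runs queued work items sequentially on a dedicated,
CPU-bound kthread. A minimal sketch of the lifecycle that
start_power_clamp_worker()/stop_power_clamp_worker() build on (error handling
trimmed, names hypothetical):

	static struct kthread_worker *worker;	/* hypothetical example */
	static struct kthread_work work;

	static void my_work_fn(struct kthread_work *w)
	{
		/* runs in the worker's kthread, bound to the chosen CPU;
		 * may requeue itself with kthread_queue_work() */
	}

	worker = kthread_create_worker_on_cpu(cpu, 0, "my_worker/%u", cpu);
	if (IS_ERR(worker))
		return PTR_ERR(worker);
	kthread_init_work(&work, my_work_fn);
	kthread_queue_work(worker, &work);
	...
	kthread_cancel_work_sync(&work);	/* stop any self-requeuing first */
	kthread_destroy_worker(worker);		/* flushes pending work, stops kthread */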
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index b886dc1..ac0efae 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -245,6 +245,8 @@
int cpu_report_state(int cpu);
int cpu_check_up_prepare(int cpu);
void cpu_set_state_online(int cpu);
+void play_idle(unsigned long duration_ms);
+
#ifdef CONFIG_HOTPLUG_CPU
bool cpu_wait_death(unsigned int cpu, int seconds);
bool cpu_report_death(void);
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index bb31373..da346f2 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -74,6 +74,7 @@
struct cpuidle_device {
unsigned int registered:1;
unsigned int enabled:1;
+ unsigned int use_deepest_state:1;
unsigned int cpu;
int last_residency;
@@ -192,11 +193,12 @@
static inline struct cpuidle_device *cpuidle_get_device(void) {return NULL; }
#endif
-#if defined(CONFIG_CPU_IDLE) && defined(CONFIG_SUSPEND)
+#ifdef CONFIG_CPU_IDLE
extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
struct cpuidle_device *dev);
extern int cpuidle_enter_freeze(struct cpuidle_driver *drv,
struct cpuidle_device *dev);
+extern void cpuidle_use_deepest_state(bool enable);
#else
static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
struct cpuidle_device *dev)
@@ -204,6 +206,9 @@
static inline int cpuidle_enter_freeze(struct cpuidle_driver *drv,
struct cpuidle_device *dev)
{return -ENODEV; }
+static inline void cpuidle_use_deepest_state(bool enable)
+{
+}
#endif
/* kernel/sched/idle.c */
@@ -235,8 +240,6 @@
int (*select) (struct cpuidle_driver *drv,
struct cpuidle_device *dev);
void (*reflect) (struct cpuidle_device *dev, int index);
-
- struct module *owner;
};
#ifdef CONFIG_CPU_IDLE
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e9c009d..a3d338e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2254,6 +2254,7 @@
/*
* Per process flags
*/
+#define PF_IDLE 0x00000002 /* I am an IDLE thread */
#define PF_EXITING 0x00000004 /* getting shut down */
#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
#define PF_VCPU 0x00000010 /* I'm a virtual CPU */
@@ -2611,7 +2612,7 @@
*/
static inline bool is_idle_task(const struct task_struct *p)
{
- return p->pid == 0;
+ return !!(p->flags & PF_IDLE);
}
extern struct task_struct *curr_task(int cpu);
extern void ia64_set_curr_task(int cpu, struct task_struct *p);
diff --git a/kernel/fork.c b/kernel/fork.c
index 997ac1d..a8eb821 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1540,7 +1540,7 @@
goto bad_fork_cleanup_count;
delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
- p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
+ p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE);
p->flags |= PF_FORKNOEXEC;
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 154fd68..c95fbcd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5279,6 +5279,7 @@
__sched_fork(0, idle);
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
+ idle->flags |= PF_IDLE;
kasan_unpoison_task_stack(idle);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 1d8718d..6a4bae0 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -164,11 +164,14 @@
* timekeeping to prevent timer interrupts from kicking us out of idle
* until a proper wakeup interrupt happens.
*/
- if (idle_should_freeze()) {
- entered_state = cpuidle_enter_freeze(drv, dev);
- if (entered_state > 0) {
- local_irq_enable();
- goto exit_idle;
+
+ if (idle_should_freeze() || dev->use_deepest_state) {
+ if (idle_should_freeze()) {
+ entered_state = cpuidle_enter_freeze(drv, dev);
+ if (entered_state > 0) {
+ local_irq_enable();
+ goto exit_idle;
+ }
}
next_state = cpuidle_find_deepest_state(drv, dev);
@@ -202,76 +205,65 @@
*
* Called with polling cleared.
*/
-static void cpu_idle_loop(void)
+static void do_idle(void)
{
- int cpu = smp_processor_id();
+ /*
+ * If the arch has a polling bit, we maintain an invariant:
+ *
+ * Our polling bit is clear if we're not scheduled (i.e. if rq->curr !=
+ * rq->idle). This means that, if rq->idle has the polling bit set,
+ * then setting need_resched is guaranteed to cause the CPU to
+ * reschedule.
+ */
- while (1) {
- /*
- * If the arch has a polling bit, we maintain an invariant:
- *
- * Our polling bit is clear if we're not scheduled (i.e. if
- * rq->curr != rq->idle). This means that, if rq->idle has
- * the polling bit set, then setting need_resched is
- * guaranteed to cause the cpu to reschedule.
- */
+ __current_set_polling();
+ tick_nohz_idle_enter();
- __current_set_polling();
- quiet_vmstat();
- tick_nohz_idle_enter();
+ while (!need_resched()) {
+ check_pgt_cache();
+ rmb();
- while (!need_resched()) {
- check_pgt_cache();
- rmb();
-
- if (cpu_is_offline(cpu)) {
- cpuhp_report_idle_dead();
- arch_cpu_idle_dead();
- }
-
- local_irq_disable();
- arch_cpu_idle_enter();
-
- /*
- * In poll mode we reenable interrupts and spin.
- *
- * Also if we detected in the wakeup from idle
- * path that the tick broadcast device expired
- * for us, we don't want to go deep idle as we
- * know that the IPI is going to arrive right
- * away
- */
- if (cpu_idle_force_poll || tick_check_broadcast_expired())
- cpu_idle_poll();
- else
- cpuidle_idle_call();
-
- arch_cpu_idle_exit();
+ if (cpu_is_offline(smp_processor_id())) {
+ cpuhp_report_idle_dead();
+ arch_cpu_idle_dead();
}
- /*
- * Since we fell out of the loop above, we know
- * TIF_NEED_RESCHED must be set, propagate it into
- * PREEMPT_NEED_RESCHED.
- *
- * This is required because for polling idle loops we will
- * not have had an IPI to fold the state for us.
- */
- preempt_set_need_resched();
- tick_nohz_idle_exit();
- __current_clr_polling();
+ local_irq_disable();
+ arch_cpu_idle_enter();
/*
- * We promise to call sched_ttwu_pending and reschedule
- * if need_resched is set while polling is set. That
- * means that clearing polling needs to be visible
- * before doing these things.
+ * In poll mode we re-enable interrupts and spin. Also, if we
+ * detected in the wakeup-from-idle path that the tick
+ * broadcast device expired for us, we don't want to go deep
+ * idle, as we know that the IPI is going to arrive right away.
*/
- smp_mb__after_atomic();
-
- sched_ttwu_pending();
- schedule_preempt_disabled();
+ if (cpu_idle_force_poll || tick_check_broadcast_expired())
+ cpu_idle_poll();
+ else
+ cpuidle_idle_call();
+ arch_cpu_idle_exit();
}
+
+ /*
+ * Since we fell out of the loop above, we know TIF_NEED_RESCHED must
+ * be set, propagate it into PREEMPT_NEED_RESCHED.
+ *
+ * This is required because for polling idle loops we will not have had
+ * an IPI to fold the state for us.
+ */
+ preempt_set_need_resched();
+ tick_nohz_idle_exit();
+ __current_clr_polling();
+
+ /*
+ * We promise to call sched_ttwu_pending() and reschedule if
+ * need_resched() is set while polling is set. That means that clearing
+ * polling needs to be visible before doing these things.
+ */
+ smp_mb__after_atomic();
+
+ sched_ttwu_pending();
+ schedule_preempt_disabled();
}
bool cpu_in_idle(unsigned long pc)
@@ -280,6 +272,56 @@
pc < (unsigned long)__cpuidle_text_end;
}
+struct idle_timer {
+ struct hrtimer timer;
+ int done;
+};
+
+static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer)
+{
+ struct idle_timer *it = container_of(timer, struct idle_timer, timer);
+
+ WRITE_ONCE(it->done, 1);
+ set_tsk_need_resched(current);
+
+ return HRTIMER_NORESTART;
+}
+
+void play_idle(unsigned long duration_ms)
+{
+ struct idle_timer it;
+
+ /*
+ * Only FIFO tasks can disable the tick since they don't need the forced
+ * preemption.
+ */
+ WARN_ON_ONCE(current->policy != SCHED_FIFO);
+ WARN_ON_ONCE(current->nr_cpus_allowed != 1);
+ WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
+ WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
+ WARN_ON_ONCE(!duration_ms);
+
+ rcu_sleep_check();
+ preempt_disable();
+ current->flags |= PF_IDLE;
+ cpuidle_use_deepest_state(true);
+
+ it.done = 0;
+ hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ it.timer.function = idle_inject_timer_fn;
+ hrtimer_start(&it.timer, ms_to_ktime(duration_ms), HRTIMER_MODE_REL_PINNED);
+
+ while (!READ_ONCE(it.done))
+ do_idle();
+
+ cpuidle_use_deepest_state(false);
+ current->flags &= ~PF_IDLE;
+
+ preempt_fold_need_resched();
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(play_idle);
+
void cpu_startup_entry(enum cpuhp_state state)
{
/*
@@ -299,5 +341,6 @@
#endif
arch_cpu_idle_prepare();
cpuhp_online_idle(state);
- cpu_idle_loop();
+ while (1)
+ do_idle();
}
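play_idle() is what ties the series together: intel_powerclamp's
idle-injection work item calls it instead of open-coding mwait, and the
WARN_ON_ONCE() checks above define the contract: the caller must be a kernel
thread (PF_KTHREAD, PF_NO_SETAFFINITY) bound to a single CPU and running
SCHED_FIFO. A sketch of a conforming caller, modelled on intel_powerclamp
(names hypothetical):

	/* Queued on a worker created with kthread_create_worker_on_cpu()
	 * whose task was switched to SCHED_FIFO via sched_setscheduler(),
	 * so all of play_idle()'s WARN_ON_ONCE() checks are satisfied. */
	static void my_idle_injection_fn(struct kthread_work *work)
	{
		play_idle(10);	/* inject 10 ms of forced idle on this CPU */
	}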