Merge branch 'pm-sleep' into acpi-battery
diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power
index 64c9276..f455181 100644
--- a/Documentation/ABI/testing/sysfs-power
+++ b/Documentation/ABI/testing/sysfs-power
@@ -7,19 +7,30 @@
subsystem.
What: /sys/power/state
-Date: August 2006
+Date: May 2014
Contact: Rafael J. Wysocki <rjw@rjwysocki.net>
Description:
- The /sys/power/state file controls the system power state.
- Reading from this file returns what states are supported,
- which is hard-coded to 'freeze' (Low-Power Idle), 'standby'
- (Power-On Suspend), 'mem' (Suspend-to-RAM), and 'disk'
- (Suspend-to-Disk).
+ The /sys/power/state file controls system sleep states.
+ Reading from this file returns the available sleep state
+ labels, which may be "mem", "standby", "freeze" and "disk"
+ (hibernation). The meanings of the first three labels depend on
+ the relative_sleep_states command line argument as follows:
+ 1) relative_sleep_states = 1
+ "mem", "standby", "freeze" represent non-hibernation sleep
+ states from the deepest ("mem", always present) to the
+ shallowest ("freeze"). "standby" and "freeze" may or may
+ not be present depending on the capabilities of the
+ platform. "freeze" can only be present if "standby" is
+ present.
+ 2) relative_sleep_states = 0 (default)
+ "mem" - "suspend-to-RAM", present if supported.
+ "standby" - "power-on suspend", present if supported.
+ "freeze" - "suspend-to-idle", always present.
Writing to this file one of these strings causes the system to
- transition into that state. Please see the file
- Documentation/power/states.txt for a description of each of
- these states.
+ transition into the corresponding state, if available. See
+ Documentation/power/states.txt for a description of what
+ "suspend-to-RAM", "power-on suspend" and "suspend-to-idle" mean.
What: /sys/power/disk
Date: September 2006
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 4384217..e19a88b 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2889,6 +2889,13 @@
[KNL, SMP] Set scheduler's default relax_domain_level.
See Documentation/cgroups/cpusets.txt.
+ relative_sleep_states=
+ [SUSPEND] Use sleep state labeling where the deepest
+ state available other than hibernation is always "mem".
+ Format: { "0" | "1" }
+ 0 -- Traditional sleep state labels.
+ 1 -- Relative sleep state labels.
+
reserve= [KNL,BUGS] Force the kernel to ignore some iomem area
reservetop= [X86-32]
diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt
index 47d46df..d172bce 100644
--- a/Documentation/power/devices.txt
+++ b/Documentation/power/devices.txt
@@ -2,6 +2,7 @@
Copyright (c) 2010-2011 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
Copyright (c) 2010 Alan Stern <stern@rowland.harvard.edu>
+Copyright (c) 2014 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Most of the code in Linux is device drivers, so most of the Linux power
@@ -326,6 +327,20 @@
driver in some way for the upcoming system power transition, but it
should not put the device into a low-power state.
+ For devices supporting runtime power management, the return value of the
+ prepare callback can be used to indicate to the PM core that it may
+ safely leave the device in runtime suspend (if runtime-suspended
+ already), provided that all of the device's descendants are also left in
+ runtime suspend. Namely, if the prepare callback returns a positive
+ number and that happens for all of the descendants of the device too,
+ and all of them (including the device itself) are runtime-suspended, the
+ PM core will skip the suspend, suspend_late and suspend_noirq suspend
+ phases as well as the resume_noirq, resume_early and resume phases of
+ the following system resume for all of these devices. In that case,
+ the complete callback will be called directly after the prepare callback
+ and is entirely responsible for bringing the device back to the
+ functional state as appropriate.
+
2. The suspend methods should quiesce the device to stop it from performing
I/O. They also may save the device registers and put it into the
appropriate low-power state, depending on the bus type the device is on,
@@ -400,12 +415,23 @@
the resume callbacks occur; it's not necessary to wait until the
complete phase.
+ Moreover, if the preceding prepare callback returned a positive number,
+ the device may have been left in runtime suspend throughout the whole
+ system suspend and resume (the suspend, suspend_late, suspend_noirq
+ phases of system suspend and the resume_noirq, resume_early, resume
+ phases of system resume may have been skipped for it). In that case,
+ the complete callback is entirely responsible for bringing the device
+ back to the functional state after system suspend if necessary. [For
+ example, it may need to queue up a runtime resume request for the device
+ for this purpose.] To check if that is the case, the complete callback
+ can consult the device's power.direct_complete flag. Namely, if that
+ flag is set when the complete callback is being run, it has been called
+ directly after the preceding prepare and special action may be required
+ to make the device work correctly afterward.
+
At the end of these phases, drivers should be as functional as they were before
suspending: I/O can be performed using DMA and IRQs, and the relevant clocks are
-gated on. Even if the device was in a low-power state before the system sleep
-because of runtime power management, afterwards it should be back in its
-full-power state. There are multiple reasons why it's best to do this; they are
-discussed in more detail in Documentation/power/runtime_pm.txt.
+gated on.
However, the details here may again be platform-specific. For example,
some systems support multiple "run" states, and the mode in effect at
diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt
index 5f96daf..e1bee8a 100644
--- a/Documentation/power/runtime_pm.txt
+++ b/Documentation/power/runtime_pm.txt
@@ -2,6 +2,7 @@
(C) 2009-2011 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
(C) 2010 Alan Stern <stern@rowland.harvard.edu>
+(C) 2014 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
1. Introduction
@@ -444,6 +445,10 @@
bool pm_runtime_status_suspended(struct device *dev);
- return true if the device's runtime PM status is 'suspended'
+ bool pm_runtime_suspended_if_enabled(struct device *dev);
+ - return true if the device's runtime PM status is 'suspended' and its
+ 'power.disable_depth' field is equal to 1
+
void pm_runtime_allow(struct device *dev);
- set the power.runtime_auto flag for the device and decrease its usage
counter (used by the /sys/devices/.../power/control interface to
@@ -644,6 +649,18 @@
be more efficient to leave the devices that had been suspended before the system
suspend began in the suspended state.
+To this end, the PM core provides a mechanism allowing some coordination between
+different levels of device hierarchy. Namely, if a system suspend .prepare()
+callback returns a positive number for a device, that indicates to the PM core
+that the device appears to be runtime-suspended and its state is fine, so it
+may be left in runtime suspend provided that all of its descendants are also
+left in runtime suspend. If that happens, the PM core will not execute any
+system suspend and resume callbacks for all of those devices, except for the
+complete callback, which is then entirely responsible for handling the device
+as appropriate. This only applies to system suspend transitions that are not
+related to hibernation (see Documentation/power/devices.txt for more
+information).
+
The PM core does its best to reduce the probability of race conditions between
the runtime PM and system suspend/resume (and hibernation) callbacks by carrying
out the following operations:
diff --git a/Documentation/power/states.txt b/Documentation/power/states.txt
index 442d43d..50f3ef9 100644
--- a/Documentation/power/states.txt
+++ b/Documentation/power/states.txt
@@ -1,62 +1,87 @@
+System Power Management Sleep States
-System Power Management States
+(C) 2014 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+The kernel supports up to four system sleep states generically, although three
+of them depend on the platform support code to implement the low-level details
+for each state.
-The kernel supports four power management states generically, though
-one is generic and the other three are dependent on platform support
-code to implement the low-level details for each state.
-This file describes each state, what they are
-commonly called, what ACPI state they map to, and what string to write
-to /sys/power/state to enter that state
+The states are represented by strings that can be read or written to the
+/sys/power/state file. Those strings may be "mem", "standby", "freeze" and
+"disk", where the last one always represents hibernation (Suspend-To-Disk) and
+the meaning of the remaining ones depends on the relative_sleep_states command
+line argument.
-state: Freeze / Low-Power Idle
+For relative_sleep_states=1, the strings "mem", "standby" and "freeze" label the
+available non-hibernation sleep states from the deepest to the shallowest,
+respectively. In that case, "mem" is always present in /sys/power/state,
+because there is at least one non-hibernation sleep state in every system. If
+the given system supports two non-hibernation sleep states, "standby" is present
+in /sys/power/state in addition to "mem". If the system supports three
+non-hibernation sleep states, "freeze" will be present in /sys/power/state in
+addition to "mem" and "standby".
+
+For relative_sleep_states=0, which is the default, the following descriptions
+apply.
+
+state: Suspend-To-Idle
ACPI state: S0
-String: "freeze"
+Label: "freeze"
-This state is a generic, pure software, light-weight, low-power state.
-It allows more energy to be saved relative to idle by freezing user
+This state is a generic, pure software, light-weight, system sleep state.
+It allows more energy to be saved relative to runtime idle by freezing user
space and putting all I/O devices into low-power states (possibly
lower-power than available at run time), such that the processors can
spend more time in their idle states.
-This state can be used for platforms without Standby/Suspend-to-RAM
+
+This state can be used for platforms without Power-On Suspend/Suspend-to-RAM
support, or it can be used in addition to Suspend-to-RAM (memory sleep)
-to provide reduced resume latency.
+to provide reduced resume latency. It is always supported.
State: Standby / Power-On Suspend
ACPI State: S1
-String: "standby"
+Label: "standby"
-This state offers minimal, though real, power savings, while providing
-a very low-latency transition back to a working system. No operating
-state is lost (the CPU retains power), so the system easily starts up
+This state, if supported, offers moderate, though real, power savings, while
+providing a relatively low-latency transition back to a working system. No
+operating state is lost (the CPU retains power), so the system easily starts up
again where it left off.
-We try to put devices in a low-power state equivalent to D1, which
-also offers low power savings, but low resume latency. Not all devices
-support D1, and those that don't are left on.
+In addition to freezing user space and putting all I/O devices into low-power
+states, which is done for Suspend-To-Idle too, nonboot CPUs are taken offline
+and all low-level system functions are suspended during transitions into this
+state. For this reason, it should allow more energy to be saved relative to
+Suspend-To-Idle, but the resume latency will generally be greater than for that
+state.
State: Suspend-to-RAM
ACPI State: S3
-String: "mem"
+Label: "mem"
-This state offers significant power savings as everything in the
-system is put into a low-power state, except for memory, which is
-placed in self-refresh mode to retain its contents.
+This state, if supported, offers significant power savings as everything in the
+system is put into a low-power state, except for memory, which should be placed
+into the self-refresh mode to retain its contents. All of the steps carried out
+when entering Power-On Suspend are also carried out during transitions to STR.
+Additional operations may take place depending on the platform capabilities. In
+particular, on ACPI systems the kernel passes control to the BIOS (platform
+firmware) as the last step during STR transitions and that usually results in
+powering down some more low-level components that aren't directly controlled by
+the kernel.
-System and device state is saved and kept in memory. All devices are
-suspended and put into D3. In many cases, all peripheral buses lose
-power when entering STR, so devices must be able to handle the
-transition back to the On state.
+System and device state is saved and kept in memory. All devices are suspended
+and put into low-power states. In many cases, all peripheral buses lose power
+when entering STR, so devices must be able to handle the transition back to the
+"on" state.
-For at least ACPI, STR requires some minimal boot-strapping code to
-resume the system from STR. This may be true on other platforms.
+For at least ACPI, STR requires some minimal boot-strapping code to resume the
+system from it. This may be the case on other platforms too.
State: Suspend-to-disk
ACPI State: S4
-String: "disk"
+Label: "disk"
This state offers the greatest power savings, and can be used even in
the absence of low-level platform support for power management. This
diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt
index 079160e..f732a83 100644
--- a/Documentation/power/swsusp.txt
+++ b/Documentation/power/swsusp.txt
@@ -220,7 +220,10 @@
A: Try running
-cat `cat /proc/[0-9]*/maps | grep / | sed 's:.* /:/:' | sort -u` > /dev/null
+cat /proc/[0-9]*/maps | grep / | sed 's:.* /:/:' | sort -u | while read file
+do
+ test -f "$file" && cat "$file" > /dev/null
+done
after resume. swapoff -a; swapon -a may also be useful.
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 86d5e4f..343ffad 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -479,7 +479,7 @@
TRACE_DEVICE(dev);
TRACE_RESUME(0);
- if (dev->power.syscore)
+ if (dev->power.syscore || dev->power.direct_complete)
goto Out;
if (!dev->power.is_noirq_suspended)
@@ -605,7 +605,7 @@
TRACE_DEVICE(dev);
TRACE_RESUME(0);
- if (dev->power.syscore)
+ if (dev->power.syscore || dev->power.direct_complete)
goto Out;
if (!dev->power.is_late_suspended)
@@ -735,6 +735,12 @@
if (dev->power.syscore)
goto Complete;
+ if (dev->power.direct_complete) {
+ /* Match the pm_runtime_disable() in __device_suspend(). */
+ pm_runtime_enable(dev);
+ goto Complete;
+ }
+
dpm_wait(dev->parent, async);
dpm_watchdog_set(&wd, dev);
device_lock(dev);
@@ -1007,7 +1013,7 @@
goto Complete;
}
- if (dev->power.syscore)
+ if (dev->power.syscore || dev->power.direct_complete)
goto Complete;
dpm_wait_for_children(dev, async);
@@ -1146,7 +1152,7 @@
goto Complete;
}
- if (dev->power.syscore)
+ if (dev->power.syscore || dev->power.direct_complete)
goto Complete;
dpm_wait_for_children(dev, async);
@@ -1332,6 +1338,17 @@
if (dev->power.syscore)
goto Complete;
+ if (dev->power.direct_complete) {
+ if (pm_runtime_status_suspended(dev)) {
+ pm_runtime_disable(dev);
+ if (pm_runtime_suspended_if_enabled(dev))
+ goto Complete;
+
+ pm_runtime_enable(dev);
+ }
+ dev->power.direct_complete = false;
+ }
+
dpm_watchdog_set(&wd, dev);
device_lock(dev);
@@ -1382,10 +1399,19 @@
End:
if (!error) {
+ struct device *parent = dev->parent;
+
dev->power.is_suspended = true;
- if (dev->power.wakeup_path
- && dev->parent && !dev->parent->power.ignore_children)
- dev->parent->power.wakeup_path = true;
+ if (parent) {
+ spin_lock_irq(&parent->power.lock);
+
+ dev->parent->power.direct_complete = false;
+ if (dev->power.wakeup_path
+ && !dev->parent->power.ignore_children)
+ dev->parent->power.wakeup_path = true;
+
+ spin_unlock_irq(&parent->power.lock);
+ }
}
device_unlock(dev);
@@ -1487,7 +1513,7 @@
{
int (*callback)(struct device *) = NULL;
char *info = NULL;
- int error = 0;
+ int ret = 0;
if (dev->power.syscore)
return 0;
@@ -1523,17 +1549,27 @@
callback = dev->driver->pm->prepare;
}
- if (callback) {
- error = callback(dev);
- suspend_report_result(callback, error);
- }
+ if (callback)
+ ret = callback(dev);
device_unlock(dev);
- if (error)
+ if (ret < 0) {
+ suspend_report_result(callback, ret);
pm_runtime_put(dev);
-
- return error;
+ return ret;
+ }
+ /*
+ * A positive return value from ->prepare() means "this device appears
+ * to be runtime-suspended and its state is fine, so if it really is
+ * runtime-suspended, you can leave it in that state provided that you
+ * will do the same thing with all of its descendants". This only
+ * applies to suspend transitions, however.
+ */
+ spin_lock_irq(&dev->power.lock);
+ dev->power.direct_complete = ret > 0 && state.event == PM_EVENT_SUSPEND;
+ spin_unlock_irq(&dev->power.lock);
+ return 0;
}
/**
diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index 2d56f41..eb1bd2e 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -318,10 +318,16 @@
{
int ret = 0;
+ if (!dev)
+ return -EINVAL;
+
if (enable) {
device_set_wakeup_capable(dev, true);
ret = device_wakeup_enable(dev);
} else {
+ if (dev->power.can_wakeup)
+ device_wakeup_disable(dev);
+
device_set_wakeup_capable(dev, false);
}
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 8236746..cb70199 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -32,6 +32,7 @@
static int enabled_devices;
static int off __read_mostly;
static int initialized __read_mostly;
+static bool use_deepest_state __read_mostly;
int cpuidle_disabled(void)
{
@@ -65,23 +66,42 @@
}
/**
- * cpuidle_enabled - check if the cpuidle framework is ready
- * @dev: cpuidle device for this cpu
- * @drv: cpuidle driver for this cpu
+ * cpuidle_use_deepest_state - Enable/disable the "deepest idle" mode.
+ * @enable: Whether enable or disable the feature.
*
- * Return 0 on success, otherwise:
- * -NODEV : the cpuidle framework is not available
- * -EBUSY : the cpuidle framework is not initialized
+ * If the "deepest idle" mode is enabled, cpuidle will ignore the governor and
+ * always use the state with the greatest exit latency (out of the states that
+ * are not disabled).
+ *
+ * This function can only be called after cpuidle_pause() to avoid races.
*/
-int cpuidle_enabled(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+void cpuidle_use_deepest_state(bool enable)
{
- if (off || !initialized)
- return -ENODEV;
+ use_deepest_state = enable;
+}
- if (!drv || !dev || !dev->enabled)
- return -EBUSY;
+/**
+ * cpuidle_find_deepest_state - Find the state of the greatest exit latency.
+ * @drv: cpuidle driver for a given CPU.
+ * @dev: cpuidle device for a given CPU.
+ */
+static int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
+ struct cpuidle_device *dev)
+{
+ unsigned int latency_req = 0;
+ int i, ret = CPUIDLE_DRIVER_STATE_START - 1;
- return 0;
+ for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) {
+ struct cpuidle_state *s = &drv->states[i];
+ struct cpuidle_state_usage *su = &dev->states_usage[i];
+
+ if (s->disabled || su->disable || s->exit_latency <= latency_req)
+ continue;
+
+ latency_req = s->exit_latency;
+ ret = i;
+ }
+ return ret;
}
/**
@@ -138,6 +158,15 @@
*/
int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
{
+ if (off || !initialized)
+ return -ENODEV;
+
+ if (!drv || !dev || !dev->enabled)
+ return -EBUSY;
+
+ if (unlikely(use_deepest_state))
+ return cpuidle_find_deepest_state(drv, dev);
+
return cpuidle_curr_governor->select(drv, dev);
}
@@ -169,7 +198,7 @@
*/
void cpuidle_reflect(struct cpuidle_device *dev, int index)
{
- if (cpuidle_curr_governor->reflect)
+ if (cpuidle_curr_governor->reflect && !unlikely(use_deepest_state))
cpuidle_curr_governor->reflect(dev, index);
}
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 71b5232..c4f80c1 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -296,7 +296,7 @@
data->needs_update = 0;
}
- data->last_state_idx = 0;
+ data->last_state_idx = CPUIDLE_DRIVER_STATE_START - 1;
/* Special case when user has set very strict latency requirement */
if (unlikely(latency_req == 0))
@@ -311,13 +311,6 @@
data->bucket = which_bucket(data->next_timer_us);
/*
- * if the correction factor is 0 (eg first time init or cpu hotplug
- * etc), we actually want to start out with a unity factor.
- */
- if (data->correction_factor[data->bucket] == 0)
- data->correction_factor[data->bucket] = RESOLUTION * DECAY;
-
- /*
* Force the result of multiplication to be 64 bits even if both
* operands are 32 bits.
* Make sure to round up for half microseconds.
@@ -466,9 +459,17 @@
struct cpuidle_device *dev)
{
struct menu_device *data = &per_cpu(menu_devices, dev->cpu);
+ int i;
memset(data, 0, sizeof(struct menu_device));
+ /*
+ * if the correction factor is 0 (eg first time init or cpu hotplug
+ * etc), we actually want to start out with a unity factor.
+ */
+ for(i = 0; i < BUCKETS; i++)
+ data->correction_factor[i] = RESOLUTION * DECAY;
+
return 0;
}
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index b0238cb..c51a436 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -120,8 +120,6 @@
#ifdef CONFIG_CPU_IDLE
extern void disable_cpuidle(void);
-extern int cpuidle_enabled(struct cpuidle_driver *drv,
- struct cpuidle_device *dev);
extern int cpuidle_select(struct cpuidle_driver *drv,
struct cpuidle_device *dev);
extern int cpuidle_enter(struct cpuidle_driver *drv,
@@ -145,13 +143,11 @@
extern int cpuidle_enable_device(struct cpuidle_device *dev);
extern void cpuidle_disable_device(struct cpuidle_device *dev);
extern int cpuidle_play_dead(void);
+extern void cpuidle_use_deepest_state(bool enable);
extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev);
#else
static inline void disable_cpuidle(void) { }
-static inline int cpuidle_enabled(struct cpuidle_driver *drv,
- struct cpuidle_device *dev)
-{return -ENODEV; }
static inline int cpuidle_select(struct cpuidle_driver *drv,
struct cpuidle_device *dev)
{return -ENODEV; }
@@ -180,6 +176,7 @@
{return -ENODEV; }
static inline void cpuidle_disable_device(struct cpuidle_device *dev) { }
static inline int cpuidle_play_dead(void) {return -ENODEV; }
+static inline void cpuidle_use_deepest_state(bool enable) {}
static inline struct cpuidle_driver *cpuidle_get_cpu_driver(
struct cpuidle_device *dev) {return NULL; }
#endif
diff --git a/include/linux/pm.h b/include/linux/pm.h
index d915d03..72c0fe0 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -93,13 +93,23 @@
* been registered) to recover from the race condition.
* This method is executed for all kinds of suspend transitions and is
* followed by one of the suspend callbacks: @suspend(), @freeze(), or
- * @poweroff(). The PM core executes subsystem-level @prepare() for all
- * devices before starting to invoke suspend callbacks for any of them, so
- * generally devices may be assumed to be functional or to respond to
- * runtime resume requests while @prepare() is being executed. However,
- * device drivers may NOT assume anything about the availability of user
- * space at that time and it is NOT valid to request firmware from within
- * @prepare() (it's too late to do that). It also is NOT valid to allocate
+ * @poweroff(). If the transition is a suspend to memory or standby (that
+ * is, not related to hibernation), the return value of @prepare() may be
+ * used to indicate to the PM core to leave the device in runtime suspend
+ * if applicable. Namely, if @prepare() returns a positive number, the PM
+ * core will understand that as a declaration that the device appears to be
+ * runtime-suspended and it may be left in that state during the entire
+ * transition and during the subsequent resume if all of its descendants
+ * are left in runtime suspend too. If that happens, @complete() will be
+ * executed directly after @prepare() and it must ensure the proper
+ * functioning of the device after the system resume.
+ * The PM core executes subsystem-level @prepare() for all devices before
+ * starting to invoke suspend callbacks for any of them, so generally
+ * devices may be assumed to be functional or to respond to runtime resume
+ * requests while @prepare() is being executed. However, device drivers
+ * may NOT assume anything about the availability of user space at that
+ * time and it is NOT valid to request firmware from within @prepare()
+ * (it's too late to do that). It also is NOT valid to allocate
* substantial amounts of memory from @prepare() in the GFP_KERNEL mode.
* [To work around these limitations, drivers may register suspend and
* hibernation notifiers to be executed before the freezing of tasks.]
@@ -112,7 +122,16 @@
* of the other devices that the PM core has unsuccessfully attempted to
* suspend earlier).
* The PM core executes subsystem-level @complete() after it has executed
- * the appropriate resume callbacks for all devices.
+ * the appropriate resume callbacks for all devices. If the corresponding
+ * @prepare() at the beginning of the suspend transition returned a
+ * positive number and the device was left in runtime suspend (without
+ * executing any suspend and resume callbacks for it), @complete() will be
+ * the only callback executed for the device during resume. In that case,
+ * @complete() must be prepared to do whatever is necessary to ensure the
+ * proper functioning of the device after the system resume. To this end,
+ * @complete() can check the power.direct_complete flag of the device to
+ * learn whether (unset) or not (set) the previous suspend and resume
+ * callbacks have been executed for it.
*
* @suspend: Executed before putting the system into a sleep state in which the
* contents of main memory are preserved. The exact action to perform
@@ -546,6 +565,7 @@
bool is_late_suspended:1;
bool ignore_children:1;
bool early_init:1; /* Owned by the PM core */
+ bool direct_complete:1; /* Owned by the PM core */
spinlock_t lock;
#ifdef CONFIG_PM_SLEEP
struct list_head entry;
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index 2a5897a..43fd671 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -101,6 +101,11 @@
return dev->power.runtime_status == RPM_SUSPENDED;
}
+static inline bool pm_runtime_suspended_if_enabled(struct device *dev)
+{
+ return pm_runtime_status_suspended(dev) && dev->power.disable_depth == 1;
+}
+
static inline bool pm_runtime_enabled(struct device *dev)
{
return !dev->power.disable_depth;
@@ -150,6 +155,7 @@
static inline bool pm_runtime_suspended(struct device *dev) { return false; }
static inline bool pm_runtime_active(struct device *dev) { return true; }
static inline bool pm_runtime_status_suspended(struct device *dev) { return false; }
+static inline bool pm_runtime_suspended_if_enabled(struct device *dev) { return false; }
static inline bool pm_runtime_enabled(struct device *dev) { return false; }
static inline void pm_runtime_no_callbacks(struct device *dev) {}
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index f4f2073..df88d55 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -35,7 +35,7 @@
static int nocompress;
static int noresume;
static int resume_wait;
-static int resume_delay;
+static unsigned int resume_delay;
static char resume_file[256] = CONFIG_PM_STD_PARTITION;
dev_t swsusp_resume_device;
sector_t swsusp_resume_block;
@@ -228,19 +228,23 @@
void swsusp_show_speed(struct timeval *start, struct timeval *stop,
unsigned nr_pages, char *msg)
{
- s64 elapsed_centisecs64;
- int centisecs;
- int k;
- int kps;
+ u64 elapsed_centisecs64;
+ unsigned int centisecs;
+ unsigned int k;
+ unsigned int kps;
elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
+ /*
+ * If "(s64)elapsed_centisecs64 < 0", it will print long elapsed time,
+ * it is obvious enough for what went wrong.
+ */
do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
centisecs = elapsed_centisecs64;
if (centisecs == 0)
centisecs = 1; /* avoid div-by-zero */
k = nr_pages * (PAGE_SIZE / 1024);
kps = (k * 100) / centisecs;
- printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n",
+ printk(KERN_INFO "PM: %s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n",
msg, k,
centisecs / 100, centisecs % 100,
kps / 1000, (kps % 1000) / 10);
@@ -595,7 +599,8 @@
case HIBERNATION_PLATFORM:
hibernation_platform_enter();
case HIBERNATION_SHUTDOWN:
- kernel_power_off();
+ if (pm_power_off)
+ kernel_power_off();
break;
#ifdef CONFIG_SUSPEND
case HIBERNATION_SUSPEND:
@@ -623,7 +628,8 @@
* corruption after resume.
*/
printk(KERN_CRIT "PM: Please power down manually\n");
- while(1);
+ while (1)
+ cpu_relax();
}
/**
@@ -1109,7 +1115,10 @@
static int __init resumedelay_setup(char *str)
{
- resume_delay = simple_strtoul(str, NULL, 0);
+ int rc = kstrtouint(str, 0, &resume_delay);
+
+ if (rc)
+ return rc;
return 1;
}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6271bc4..573410d 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -279,26 +279,26 @@
struct kobject *power_kobj;
/**
- * state - control system power state.
+ * state - control system sleep states.
*
- * show() returns what states are supported, which is hard-coded to
- * 'freeze' (Low-Power Idle), 'standby' (Power-On Suspend),
- * 'mem' (Suspend-to-RAM), and 'disk' (Suspend-to-Disk).
+ * show() returns available sleep state labels, which may be "mem", "standby",
+ * "freeze" and "disk" (hibernation). See Documentation/power/states.txt for a
+ * description of what they mean.
*
- * store() accepts one of those strings, translates it into the
- * proper enumerated value, and initiates a suspend transition.
+ * store() accepts one of those strings, translates it into the proper
+ * enumerated value, and initiates a suspend transition.
*/
static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
char *s = buf;
#ifdef CONFIG_SUSPEND
- int i;
+ suspend_state_t i;
- for (i = 0; i < PM_SUSPEND_MAX; i++) {
- if (pm_states[i] && valid_state(i))
- s += sprintf(s,"%s ", pm_states[i]);
- }
+ for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
+ if (pm_states[i].state)
+ s += sprintf(s,"%s ", pm_states[i].label);
+
#endif
#ifdef CONFIG_HIBERNATION
s += sprintf(s, "%s\n", "disk");
@@ -314,7 +314,7 @@
{
#ifdef CONFIG_SUSPEND
suspend_state_t state = PM_SUSPEND_MIN;
- const char * const *s;
+ struct pm_sleep_state *s;
#endif
char *p;
int len;
@@ -328,8 +328,9 @@
#ifdef CONFIG_SUSPEND
for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++)
- if (*s && len == strlen(*s) && !strncmp(buf, *s, len))
- return state;
+ if (s->state && len == strlen(s->label)
+ && !strncmp(buf, s->label, len))
+ return s->state;
#endif
return PM_SUSPEND_ON;
@@ -447,8 +448,8 @@
#ifdef CONFIG_SUSPEND
if (state < PM_SUSPEND_MAX)
- return sprintf(buf, "%s\n", valid_state(state) ?
- pm_states[state] : "error");
+ return sprintf(buf, "%s\n", pm_states[state].state ?
+ pm_states[state].label : "error");
#endif
#ifdef CONFIG_HIBERNATION
return sprintf(buf, "disk\n");
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 15f37ea..c60f13b 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -178,17 +178,20 @@
unsigned int, char *);
#ifdef CONFIG_SUSPEND
-/* kernel/power/suspend.c */
-extern const char *const pm_states[];
+struct pm_sleep_state {
+ const char *label;
+ suspend_state_t state;
+};
-extern bool valid_state(suspend_state_t state);
+/* kernel/power/suspend.c */
+extern struct pm_sleep_state pm_states[];
+
extern int suspend_devices_and_enter(suspend_state_t state);
#else /* !CONFIG_SUSPEND */
static inline int suspend_devices_and_enter(suspend_state_t state)
{
return -ENOSYS;
}
-static inline bool valid_state(suspend_state_t state) { return false; }
#endif /* !CONFIG_SUSPEND */
#ifdef CONFIG_PM_TEST_SUSPEND
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 8233cd4..338a6f1 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -31,10 +31,10 @@
#include "power.h"
-const char *const pm_states[PM_SUSPEND_MAX] = {
- [PM_SUSPEND_FREEZE] = "freeze",
- [PM_SUSPEND_STANDBY] = "standby",
- [PM_SUSPEND_MEM] = "mem",
+struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = {
+ [PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE },
+ [PM_SUSPEND_STANDBY] = { .label = "standby", },
+ [PM_SUSPEND_MEM] = { .label = "mem", },
};
static const struct platform_suspend_ops *suspend_ops;
@@ -54,9 +54,11 @@
static void freeze_enter(void)
{
+ cpuidle_use_deepest_state(true);
cpuidle_resume();
wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
cpuidle_pause();
+ cpuidle_use_deepest_state(false);
}
void freeze_wake(void)
@@ -66,42 +68,62 @@
}
EXPORT_SYMBOL_GPL(freeze_wake);
+static bool valid_state(suspend_state_t state)
+{
+ /*
+ * PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need low level
+ * support and need to be valid to the low level
+ * implementation, no valid callback implies that none are valid.
+ */
+ return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
+}
+
+/*
+ * If this is set, the "mem" label always corresponds to the deepest sleep state
+ * available, the "standby" label corresponds to the second deepest sleep state
+ * available (if any), and the "freeze" label corresponds to the remaining
+ * available sleep state (if there is one).
+ */
+static bool relative_states;
+
+static int __init sleep_states_setup(char *str)
+{
+ relative_states = !strncmp(str, "1", 1);
+ if (relative_states) {
+ pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE;
+ pm_states[PM_SUSPEND_FREEZE].state = 0;
+ }
+ return 1;
+}
+
+__setup("relative_sleep_states=", sleep_states_setup);
+
/**
* suspend_set_ops - Set the global suspend method table.
* @ops: Suspend operations to use.
*/
void suspend_set_ops(const struct platform_suspend_ops *ops)
{
+ suspend_state_t i;
+ int j = PM_SUSPEND_MAX - 1;
+
lock_system_sleep();
+
suspend_ops = ops;
+ for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--)
+ if (valid_state(i))
+ pm_states[j--].state = i;
+ else if (!relative_states)
+ pm_states[j--].state = 0;
+
+ pm_states[j--].state = PM_SUSPEND_FREEZE;
+ while (j >= PM_SUSPEND_MIN)
+ pm_states[j--].state = 0;
+
unlock_system_sleep();
}
EXPORT_SYMBOL_GPL(suspend_set_ops);
-bool valid_state(suspend_state_t state)
-{
- if (state == PM_SUSPEND_FREEZE) {
-#ifdef CONFIG_PM_DEBUG
- if (pm_test_level != TEST_NONE &&
- pm_test_level != TEST_FREEZER &&
- pm_test_level != TEST_DEVICES &&
- pm_test_level != TEST_PLATFORM) {
- printk(KERN_WARNING "Unsupported pm_test mode for "
- "freeze state, please choose "
- "none/freezer/devices/platform.\n");
- return false;
- }
-#endif
- return true;
- }
- /*
- * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel
- * support and need to be valid to the lowlevel
- * implementation, no valid callback implies that none are valid.
- */
- return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
-}
-
/**
* suspend_valid_only_mem - Generic memory-only valid callback.
*
@@ -328,9 +350,17 @@
{
int error;
- if (!valid_state(state))
- return -ENODEV;
-
+ if (state == PM_SUSPEND_FREEZE) {
+#ifdef CONFIG_PM_DEBUG
+ if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) {
+ pr_warning("PM: Unsupported test mode for freeze state,"
+ "please choose none/freezer/devices/platform.\n");
+ return -EAGAIN;
+ }
+#endif
+ } else if (!valid_state(state)) {
+ return -EINVAL;
+ }
if (!mutex_trylock(&pm_mutex))
return -EBUSY;
@@ -341,7 +371,7 @@
sys_sync();
printk("done.\n");
- pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
+ pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label);
error = suspend_prepare(state);
if (error)
goto Unlock;
@@ -349,7 +379,7 @@
if (suspend_test(TEST_FREEZER))
goto Finish;
- pr_debug("PM: Entering %s sleep\n", pm_states[state]);
+ pr_debug("PM: Entering %s sleep\n", pm_states[state].label);
pm_restrict_gfp_mask();
error = suspend_devices_and_enter(state);
pm_restore_gfp_mask();
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 9b2a1d5..269b097 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -92,13 +92,13 @@
}
if (state == PM_SUSPEND_MEM) {
- printk(info_test, pm_states[state]);
+ printk(info_test, pm_states[state].label);
status = pm_suspend(state);
if (status == -ENODEV)
state = PM_SUSPEND_STANDBY;
}
if (state == PM_SUSPEND_STANDBY) {
- printk(info_test, pm_states[state]);
+ printk(info_test, pm_states[state].label);
status = pm_suspend(state);
}
if (status < 0)
@@ -136,18 +136,16 @@
static int __init setup_test_suspend(char *value)
{
- unsigned i;
+ suspend_state_t i;
/* "=mem" ==> "mem" */
value++;
- for (i = 0; i < PM_SUSPEND_MAX; i++) {
- if (!pm_states[i])
- continue;
- if (strcmp(pm_states[i], value) != 0)
- continue;
- test_state = (__force suspend_state_t) i;
- return 0;
- }
+ for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
+ if (!strcmp(pm_states[i].label, value)) {
+ test_state = pm_states[i].state;
+ return 0;
+ }
+
printk(warn_bad_state, value);
return 0;
}
@@ -164,8 +162,8 @@
/* PM is initialized by now; is that state testable? */
if (test_state == PM_SUSPEND_ON)
goto done;
- if (!valid_state(test_state)) {
- printk(warn_bad_state, pm_states[test_state]);
+ if (!pm_states[test_state].state) {
+ printk(warn_bad_state, pm_states[test_state].label);
goto done;
}
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 8f4390a..a8f1224 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -101,19 +101,13 @@
rcu_idle_enter();
/*
- * Check if the cpuidle framework is ready, otherwise fallback
- * to the default arch specific idle method
+ * Ask the cpuidle framework to choose a convenient idle state.
+ * Fall back to the default arch specific idle method on errors.
*/
- ret = cpuidle_enabled(drv, dev);
+ next_state = cpuidle_select(drv, dev);
- if (!ret) {
- /*
- * Ask the governor to choose an idle state it thinks
- * it is convenient to go to. There is *always* a
- * convenient idle state
- */
- next_state = cpuidle_select(drv, dev);
-
+ ret = next_state;
+ if (ret >= 0) {
/*
* The idle task must be scheduled, it is pointless to
* go to idle, just update no idle residency and get
@@ -140,7 +134,7 @@
CLOCK_EVT_NOTIFY_BROADCAST_ENTER,
&dev->cpu);
- if (!ret) {
+ if (ret >= 0) {
trace_cpu_idle_rcuidle(next_state, dev->cpu);
/*
@@ -175,7 +169,7 @@
* We can't use the cpuidle framework, let's use the default
* idle routine
*/
- if (ret)
+ if (ret < 0)
arch_cpu_idle();
__current_set_polling();