msm: SSR: Fix problems with concurrent SSRs If one SSR call comes in and queues a work and the work item starts running and then another SSR call comes in we will end up with a running work item and a pending work item. The pending work item will not run until the running work item completes. With the current code the work item will run to completion and then the pending work item will run and restart the subsystem again. This is wrong since we want to 'short-circuit' the code in this case and do nothing if multiple SSR calls come in while the subsystem is in the 'crashed' state. Add state tracking logic so that we know what part of the restart process a particular subsystem is in and use it to fix this problem. Change-Id: I3592a0f9ed777251081a30cc0ee0af9e49fff733 CRs-Fixed: 397848 Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>

commit: 6b56718679898fef4a0cd94227198e2c42b72b9c [log] [tgz]
author: Stephen Boyd <sboyd@codeaurora.org> Tue Sep 18 11:31:02 2012 -0700
committer: Stephen Boyd <sboyd@codeaurora.org> Tue Sep 18 13:45:37 2012 -0700
tree: 3d83346729f8607408e078e921bdcec4d5365ed5
parent: 769984218fa7cd2dea9a9b525cb168de5643d56b [diff]
diff --git a/arch/arm/mach-msm/subsystem_restart.c b/arch/arm/mach-msm/subsystem_restart.c
index 0318a70..bae1ab0 100644
--- a/arch/arm/mach-msm/subsystem_restart.c
+++ b/arch/arm/mach-msm/subsystem_restart.c

@@ -58,11 +58,13 @@
 enum subsys_state {
 	SUBSYS_OFFLINE,
 	SUBSYS_ONLINE,
+	SUBSYS_CRASHED,
 };
 
 static const char * const subsys_states[] = {
 	[SUBSYS_OFFLINE] = "OFFLINE",
 	[SUBSYS_ONLINE] = "ONLINE",
+	[SUBSYS_CRASHED] = "CRASHED",
 };
 
 struct subsys_device {
@@ -71,7 +73,7 @@
 	char wlname[64];
 	struct work_struct work;
 	spinlock_t restart_lock;
-	int restart_count;
+	bool restarting;
 
 	void *notify;
 	struct device dev;
@@ -107,6 +109,21 @@
 	return snprintf(buf, PAGE_SIZE, "%s\n", subsys_states[state]);
 }
 
+static void subsys_set_state(struct subsys_device *subsys,
+			     enum subsys_state state)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&subsys->restart_lock, flags);
+	if (subsys->state != state) {
+		subsys->state = state;
+		spin_unlock_irqrestore(&subsys->restart_lock, flags);
+		sysfs_notify(&subsys->dev.kobj, NULL, "state");
+		return;
+	}
+	spin_unlock_irqrestore(&subsys->restart_lock, flags);
+}
+
 static struct device_attribute subsys_attrs[] = {
 	__ATTR_RO(name),
 	__ATTR_RO(state),
@@ -333,6 +350,7 @@
 	if (dev->desc->shutdown(dev->desc) < 0)
 		panic("subsys-restart: [%p]: Failed to shutdown %s!",
 			current, name);
+	subsys_set_state(dev, SUBSYS_OFFLINE);
 }
 
 static void subsystem_ramdump(struct subsys_device *dev, void *data)
@@ -351,6 +369,7 @@
 	pr_info("[%p]: Powering up %s\n", current, name);
 	if (dev->desc->powerup(dev->desc) < 0)
 		panic("[%p]: Failed to powerup %s!", current, name);
+	subsys_set_state(dev, SUBSYS_ONLINE);
 }
 
 static int __find_subsys(struct device *dev, void *data)
@@ -418,7 +437,10 @@
 	 * Now that we've acquired the shutdown lock, either we're the first to
 	 * restart these subsystems or some other thread is doing the powerup
 	 * sequence for these subsystems. In the latter case, panic and bail
-	 * out, since a subsystem died in its powerup sequence.
+	 * out, since a subsystem died in its powerup sequence. This catches
+	 * the case where a subsystem in a restart order isn't the one
+	 * who initiated the original restart but has crashed while the restart
+	 * order is being rebooted.
 	 */
 	if (!mutex_trylock(powerup_lock))
 		panic("%s[%p]: Subsystem died during powerup!",
@@ -465,32 +487,36 @@
 
 out:
 	spin_lock_irqsave(&dev->restart_lock, flags);
-	dev->restart_count--;
-	if (!dev->restart_count)
-		wake_unlock(&dev->wake_lock);
+	dev->restarting = false;
+	wake_unlock(&dev->wake_lock);
 	spin_unlock_irqrestore(&dev->restart_lock, flags);
 }
 
 static void __subsystem_restart_dev(struct subsys_device *dev)
 {
 	struct subsys_desc *desc = dev->desc;
+	const char *name = dev->desc->name;
 	unsigned long flags;
 
 	pr_debug("Restarting %s [level=%d]!\n", desc->name, restart_level);
 
+	/*
+	 * We want to allow drivers to call subsystem_restart{_dev}() as many
+	 * times as they want up until the point where the subsystem is
+	 * shutdown.
+	 */
 	spin_lock_irqsave(&dev->restart_lock, flags);
-	if (!dev->restart_count)
-		wake_lock(&dev->wake_lock);
-	dev->restart_count++;
-	spin_unlock_irqrestore(&dev->restart_lock, flags);
-
-	if (!queue_work(ssr_wq, &dev->work)) {
-		spin_lock_irqsave(&dev->restart_lock, flags);
-		dev->restart_count--;
-		if (!dev->restart_count)
-			wake_unlock(&dev->wake_lock);
-		spin_unlock_irqrestore(&dev->restart_lock, flags);
+	if (dev->state != SUBSYS_CRASHED) {
+		if (dev->state == SUBSYS_ONLINE && !dev->restarting) {
+			dev->restarting = true;
+			dev->state = SUBSYS_CRASHED;
+			wake_lock(&dev->wake_lock);
+			queue_work(ssr_wq, &dev->work);
+		} else {
+			panic("Subsystem %s crashed during SSR!", name);
+		}
 	}
+	spin_unlock_irqrestore(&dev->restart_lock, flags);
 }
 
 int subsystem_restart_dev(struct subsys_device *dev)
@@ -644,6 +670,7 @@
 	subsys->dev.parent = desc->dev;
 	subsys->dev.bus = &subsys_bus_type;
 	subsys->dev.release = subsys_device_release;
+	subsys->state = SUBSYS_ONLINE; /* Until proper refcounting appears */
 
 	subsys->notify = subsys_notif_add_subsys(desc->name);
 	subsys->restart_order = update_restart_order(subsys);
commit	6b56718679898fef4a0cd94227198e2c42b72b9c	[log] [tgz]
author	Stephen Boyd <sboyd@codeaurora.org>	Tue Sep 18 11:31:02 2012 -0700
committer	Stephen Boyd <sboyd@codeaurora.org>	Tue Sep 18 13:45:37 2012 -0700
tree	3d83346729f8607408e078e921bdcec4d5365ed5
parent	769984218fa7cd2dea9a9b525cb168de5643d56b [diff]