msm: subsystem_restart: Add support for error services ready indication

Most subsystems are capable of monitoring and reporting their own errors.
But things could go wrong during their boot up before they enable this
capability. To detect this error condition, subsystems will now send a new
"error services ready" indication to the application processor when they
are ready to monitor their own errors. When the "error services ready"
indication is expected from a subsystem, but doesn't come within a
reasonable time, it should be treated as an error.

There are multiple ways to handle this error: 1) Treat the timeout as a SSR
request after the subsystem has booted up. 2) Treat the reception of the
indication as a "boot complete" requirement.

Option (1) will mean that flashing an older image or one that doesn't have
support for this new indication by accident will trigger back-to-back
restarts of the subsystem. After the SSR count/time limit is hit, it will
trigger an entire system reset. Once the system reset is complete, it will
be back to a series of back-to-back restarts of the sybsystem. This results
in continuous resets of the system, making it hard to recover the device
from this invalid configuration.

Option (2) will mean that flashing an incorrect image just results in
failure of boot up of the subsystem. Repeated attempts will only result in
successive boot failures, but will not trigger a subsystem restart or hit
any limits that will trigger a system reset. Recovering from this invalid
configuration should be easier since the system is still stable and allows
manual reflashing of the proper subsystem images. This option also means
that failure to get this indication during a subsystem restart is treated
as a boot failure during SSR. Since the current SSR framework treats boot
failures during SSR as an unrecoverable error and restarts the whole
system, a properly configured system is never left in a state with a
dysfunctional subsystem.

For this reason, option (2), treating the timely reception of the error
services ready indication as a successful boot up requirement is the
implementation chosen by this patch.

Change-Id: Ibe7ea522babfe826e40a950ef3ea4577a463a313
Signed-off-by: Seemanta Dutta <seemanta@codeaurora.org>
diff --git a/arch/arm/mach-msm/subsystem_restart.c b/arch/arm/mach-msm/subsystem_restart.c
index 5fe7a29..fb44e16 100644
--- a/arch/arm/mach-msm/subsystem_restart.c
+++ b/arch/arm/mach-msm/subsystem_restart.c
@@ -31,6 +31,7 @@
 #include <linux/idr.h>
 #include <linux/debugfs.h>
 #include <linux/miscdevice.h>
+#include <linux/interrupt.h>
 
 #include <asm/current.h>
 
@@ -132,6 +133,7 @@
  * @restart_order: order of other devices this devices restarts with
  * @dentry: debugfs directory for this device
  * @do_ramdump_on_put: ramdump on subsystem_put() if true
+ * @err_ready: completion variable to record error ready from subsystem
  */
 struct subsys_device {
 	struct subsys_desc *desc;
@@ -154,6 +156,7 @@
 	bool do_ramdump_on_put;
 	struct miscdevice misc_dev;
 	char miscdevice_name[32];
+	struct completion err_ready;
 };
 
 static struct subsys_device *to_subsys(struct device *d)
@@ -412,6 +415,21 @@
 	}
 }
 
+static int wait_for_err_ready(struct subsys_device *subsys)
+{
+	int ret;
+
+	if (!subsys->desc->err_ready_irq)
+		return 0;
+
+	ret = wait_for_completion_timeout(&subsys->err_ready,
+					  msecs_to_jiffies(10000));
+	if (!ret)
+		return -ETIMEDOUT;
+
+	return 0;
+}
+
 static void subsystem_shutdown(struct subsys_device *dev, void *data)
 {
 	const char *name = dev->desc->name;
@@ -436,10 +454,17 @@
 static void subsystem_powerup(struct subsys_device *dev, void *data)
 {
 	const char *name = dev->desc->name;
+	int ret;
 
 	pr_info("[%p]: Powering up %s\n", current, name);
+	init_completion(&dev->err_ready);
 	if (dev->desc->powerup(dev->desc) < 0)
-		panic("[%p]: Failed to powerup %s!", current, name);
+		panic("[%p]: Powerup error: %s!", current, name);
+
+	ret = wait_for_err_ready(dev);
+	if (ret)
+		panic("[%p]: Timed out waiting for error ready: %s!",
+			current, name);
 	subsys_set_state(dev, SUBSYS_ONLINE);
 }
 
@@ -465,8 +490,18 @@
 {
 	int ret;
 
+	init_completion(&subsys->err_ready);
 	ret = subsys->desc->start(subsys->desc);
-	if (!ret)
+	if (ret)
+		return ret;
+
+	ret = wait_for_err_ready(subsys);
+	if (ret)
+		/* pil-boot succeeded but we need to shutdown
+		 * the device because error ready timed out.
+		 */
+		subsys->desc->stop(subsys->desc);
+	else
 		subsys_set_state(subsys, SUBSYS_ONLINE);
 
 	return ret;
@@ -895,6 +930,14 @@
 	ida_simple_remove(&subsys_ida, subsys->id);
 	kfree(subsys);
 }
+static irqreturn_t subsys_err_ready_intr_handler(int irq, void *subsys)
+{
+	struct subsys_device *subsys_dev = subsys;
+	pr_info("Error ready interrupt occured for %s\n",
+		 subsys_dev->desc->name);
+	complete(&subsys_dev->err_ready);
+	return IRQ_HANDLED;
+}
 
 static int subsys_misc_device_add(struct subsys_device *subsys_dev)
 {
@@ -971,8 +1014,24 @@
 		goto err_register;
 	}
 
+	if (subsys->desc->err_ready_irq) {
+		ret = devm_request_irq(&subsys->dev,
+					subsys->desc->err_ready_irq,
+					subsys_err_ready_intr_handler,
+					IRQF_TRIGGER_RISING,
+					"error_ready_interrupt", subsys);
+		if (ret < 0) {
+			dev_err(&subsys->dev,
+				"[%s]: Unable to register err ready handler\n",
+				subsys->desc->name);
+			goto err_misc_device;
+		}
+	}
+
 	return subsys;
 
+err_misc_device:
+	subsys_misc_device_remove(subsys);
 err_register:
 	subsys_debugfs_remove(subsys);
 err_debugfs: