subsystem-restart: Allow a limited number of restarts in a time interval
Subsystems may crash often enough to render a device unusable or
even unstable. To avoid such a situation, reboot the entire system
if a predefined number of subsystem restart sequences occur in a
predefined interval of time.
Signed-off-by: Vikram Mulukutla <markivx@codeaurora.org>
diff --git a/arch/arm/mach-msm/subsystem_restart.c b/arch/arm/mach-msm/subsystem_restart.c
index 002c9f8..40e6e0c 100644
--- a/arch/arm/mach-msm/subsystem_restart.c
+++ b/arch/arm/mach-msm/subsystem_restart.c
@@ -20,6 +20,7 @@
#include <linux/list.h>
#include <linux/io.h>
#include <linux/kthread.h>
+#include <linux/time.h>
#include <asm/current.h>
@@ -51,12 +52,20 @@
int coupled;
};
+struct restart_log {
+ struct timeval time;
+ struct subsys_data *subsys;
+ struct list_head list;
+};
+
static int restart_level;
static int enable_ramdumps;
+static LIST_HEAD(restart_log_list);
static LIST_HEAD(subsystem_list);
static DEFINE_MUTEX(subsystem_list_lock);
static DEFINE_MUTEX(soc_order_reg_lock);
+static DEFINE_MUTEX(restart_log_mutex);
/* SOC specific restart orders go here */
@@ -224,6 +233,67 @@
restart_list[i]->notif_handle, notif_type);
}
+static int max_restarts;
+module_param(max_restarts, int, 0644);
+
+static long max_history_time = 3600;
+module_param(max_history_time, long, 0644);
+
+static void do_epoch_check(struct subsys_data *subsys)
+{
+ int n = 0;
+ struct timeval *time_first, *curr_time;
+ struct restart_log *r_log, *temp;
+ static int max_restarts_check;
+ static long max_history_time_check;
+
+ mutex_lock(&restart_log_mutex);
+
+ max_restarts_check = max_restarts;
+ max_history_time_check = max_history_time;
+
+ /* Check if epoch checking is enabled */
+ if (!max_restarts_check)
+ return;
+
+ r_log = kmalloc(sizeof(struct restart_log), GFP_KERNEL);
+ r_log->subsys = subsys;
+ do_gettimeofday(&r_log->time);
+ curr_time = &r_log->time;
+ INIT_LIST_HEAD(&r_log->list);
+
+ list_add_tail(&r_log->list, &restart_log_list);
+
+ list_for_each_entry_safe(r_log, temp, &restart_log_list, list) {
+
+ if ((curr_time->tv_sec - r_log->time.tv_sec) >
+ max_history_time_check) {
+
+ pr_debug("Deleted node with restart_time = %ld\n",
+ r_log->time.tv_sec);
+ list_del(&r_log->list);
+ kfree(r_log);
+ continue;
+ }
+ if (!n) {
+ time_first = &r_log->time;
+ pr_debug("time_first: %ld", time_first->tv_sec);
+ }
+ n++;
+ pr_debug("restart_time: %ld\n", r_log->time.tv_sec);
+ }
+
+ if (n >= max_restarts_check) {
+ if ((curr_time->tv_sec - time_first->tv_sec) <
+ max_history_time_check)
+ panic("Subsystems have crashed %d times in less than "
+ "%ld seconds!", max_restarts_check,
+ max_history_time_check);
+ }
+
+ mutex_unlock(&restart_log_mutex);
+}
+
static int subsystem_restart_thread(void *data)
{
struct restart_thread_data *r_work = data;
@@ -278,6 +348,8 @@
if (!mutex_trylock(powerup_lock))
panic("%s: Subsystem died during powerup!", __func__);
+ do_epoch_check(subsys);
+
/* Now it is necessary to take the registration lock. This is because
* the subsystem list in the SoC restart order will be traversed
* and it shouldn't be changed until _this_ restart sequence completes.