ANDROID: hardlockup: detect hard lockups without NMIs using secondary cpus
This patch adds back the hardlockup detection without NMIs from
<aaf78ec6c898b72>("ANDROID: hardlockup: detect hard lockups without NMIs
using secondary cpus") which was reverted after the hard lockup detection
logic was moved to a separate file in <b969a240448bfd8>("kernel/watchdog.c:
move hardlockup detector to separate file").
Change-Id: If6a24b379d29fa582344fe57d26df4bd2b8cc89f
Signed-off-by: Kyle Yan <kyan@codeaurora.org>
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 12b8dd6..5b2c127 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -18,7 +18,11 @@
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+static cpumask_t __read_mostly watchdog_cpus;
+#else
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
+#endif
/* boot commands */
/*
@@ -26,7 +30,10 @@
*/
unsigned int __read_mostly hardlockup_panic =
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
static unsigned long hardlockup_allcpu_dumped;
+#endif
/*
* We may not want to enable hard lockup detection by default in all cases,
* for example when running the kernel as a guest on a hypervisor. In these
@@ -68,6 +75,108 @@
}
EXPORT_SYMBOL(touch_nmi_watchdog);
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+static unsigned int watchdog_next_cpu(unsigned int cpu)
+{
+ cpumask_t cpus = watchdog_cpus;
+ unsigned int next_cpu;
+
+ next_cpu = cpumask_next(cpu, &cpus);
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(&cpus);
+
+ if (next_cpu == cpu)
+ return nr_cpu_ids;
+
+ return next_cpu;
+}
+
+static int is_hardlockup_other_cpu(unsigned int cpu)
+{
+ unsigned long hrint = per_cpu(hrtimer_interrupts, cpu);
+
+ if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint)
+ return 1;
+
+ per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
+ return 0;
+}
+
+void watchdog_check_hardlockup_other_cpu(void)
+{
+ unsigned int next_cpu;
+
+ /*
+ * Test for hardlockups every 3 samples. The sample period is
+ * watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over
+ * watchdog_thresh (over by 20%).
+ */
+ if (__this_cpu_read(hrtimer_interrupts) % 3 != 0)
+ return;
+
+ /* check for a hardlockup on the next cpu */
+ next_cpu = watchdog_next_cpu(smp_processor_id());
+ if (next_cpu >= nr_cpu_ids)
+ return;
+
+ smp_rmb();
+
+ if (per_cpu(watchdog_nmi_touch, next_cpu) == true) {
+ per_cpu(watchdog_nmi_touch, next_cpu) = false;
+ return;
+ }
+
+ if (is_hardlockup_other_cpu(next_cpu)) {
+ /* only warn once */
+ if (per_cpu(hard_watchdog_warn, next_cpu) == true)
+ return;
+
+ if (hardlockup_panic)
+ panic("Watchdog detected hard LOCKUP on cpu %u",
+ next_cpu);
+ else
+ WARN(1, "Watchdog detected hard LOCKUP on cpu %u",
+ next_cpu);
+
+ per_cpu(hard_watchdog_warn, next_cpu) = true;
+ } else {
+ per_cpu(hard_watchdog_warn, next_cpu) = false;
+ }
+}
+
+int watchdog_nmi_enable(unsigned int cpu)
+{
+ /*
+ * The new cpu will be marked online before the first hrtimer interrupt
+ * runs on it. If another cpu tests for a hardlockup on the new cpu
+ * before it has run its first hrtimer, it will get a false positive.
+ * Touch the watchdog on the new cpu to delay the first check for at
+ * least 3 sampling periods to guarantee one hrtimer has run on the new
+ * cpu.
+ */
+ per_cpu(watchdog_nmi_touch, cpu) = true;
+ smp_wmb();
+ cpumask_set_cpu(cpu, &watchdog_cpus);
+ return 0;
+}
+
+void watchdog_nmi_disable(unsigned int cpu)
+{
+ unsigned int next_cpu = watchdog_next_cpu(cpu);
+
+ /*
+ * Offlining this cpu will cause the cpu before this one to start
+ * checking the one after this one. If this cpu just finished checking
+ * the next cpu and updating hrtimer_interrupts_saved, and then the
+ * previous cpu checks it within one sample period, it will trigger a
+ * false positive. Touch the watchdog on the next cpu to prevent it.
+ */
+ if (next_cpu < nr_cpu_ids)
+ per_cpu(watchdog_nmi_touch, next_cpu) = true;
+ smp_wmb();
+ cpumask_clear_cpu(cpu, &watchdog_cpus);
+}
+#else
static struct perf_event_attr wd_hw_attr = {
.type = PERF_TYPE_HARDWARE,
.config = PERF_COUNT_HW_CPU_CYCLES,
@@ -228,3 +337,4 @@
cpu0_err = 0;
}
}
+#endif