msm: Add support for cache error reporting

Add support for reporting L1/L2 cache and TLB errors,
printing a warning in the kernel log (or optionally
panicking the kernel) depending on error severity.

Change-Id: I6aa4de4bbf478091df88d8ca0be840cbeb4d39d4
Signed-off-by: Stepan Moskovchenko <stepanm@codeaurora.org>
diff --git a/arch/arm/mach-msm/cache_erp.c b/arch/arm/mach-msm/cache_erp.c
new file mode 100644
index 0000000..3c317e9
--- /dev/null
+++ b/arch/arm/mach-msm/cache_erp.c
@@ -0,0 +1,475 @@
+/* Copyright (c) 2012, Code Aurora Forum. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/platform_device.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+#include <linux/cpu.h>
+#include <mach/msm-krait-l2-accessors.h>
+
+#define CESR_DCTPE		BIT(0)
+#define CESR_DCDPE		BIT(1)
+#define CESR_ICTPE		BIT(2)
+#define CESR_ICDPE		BIT(3)
+#define CESR_DCTE		(BIT(4) | BIT(5))
+#define CESR_ICTE		(BIT(6) | BIT(7))
+#define CESR_TLBMH		BIT(16)
+#define CESR_I_MASK		0x000000CC
+
+#define L2ESR_IND_ADDR		0x204
+#define L2ESYNR0_IND_ADDR	0x208
+#define L2ESYNR1_IND_ADDR	0x209
+#define L2EAR0_IND_ADDR		0x20C
+#define L2EAR1_IND_ADDR		0x20D
+
+#define L2ESR_MPDCD		BIT(0)
+#define L2ESR_MPSLV		BIT(1)
+#define L2ESR_TSESB		BIT(2)
+#define L2ESR_TSEDB		BIT(3)
+#define L2ESR_DSESB		BIT(4)
+#define L2ESR_DSEDB		BIT(5)
+#define L2ESR_MSE		BIT(6)
+#define L2ESR_MPLDREXNOK	BIT(8)
+
+#define L2ESR_CPU_MASK		0x0F
+#define L2ESR_CPU_SHIFT		16
+
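+/*
+ * Error-handling policy macros: depending on the Kconfig options below,
+ * each error class either panics the kernel or is only logged.
+ */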
+#ifdef CONFIG_MSM_L1_ERR_PANIC
+#define ERP_L1_ERR(a) panic(a)
+#else
+#define ERP_L1_ERR(a) do { } while (0)
+#endif
+
+#ifdef CONFIG_MSM_L2_ERP_PORT_PANIC
+#define ERP_PORT_ERR(a) panic(a)
+#else
+#define ERP_PORT_ERR(a) WARN(1, a)
+#endif
+
+#ifdef CONFIG_MSM_L2_ERP_1BIT_PANIC
+#define ERP_1BIT_ERR(a) panic(a)
+#else
+#define ERP_1BIT_ERR(a) do { } while (0)
+#endif
+
+#ifdef CONFIG_MSM_L2_ERP_2BIT_PANIC
+#define ERP_2BIT_ERR(a) panic(a)
+#else
+#define ERP_2BIT_ERR(a) do { } while (0)
+#endif
+
+#define MODULE_NAME "msm_cache_erp"
+
+struct msm_l1_err_stats {
+	unsigned int dctpe;
+	unsigned int dcdpe;
+	unsigned int ictpe;
+	unsigned int icdpe;
+	unsigned int dcte;
+	unsigned int icte;
+	unsigned int tlbmh;
+};
+
+struct msm_l2_err_stats {
+	unsigned int mpdcd;
+	unsigned int mpslv;
+	unsigned int tsesb;
+	unsigned int tsedb;
+	unsigned int dsesb;
+	unsigned int dsedb;
+	unsigned int mse;
+	unsigned int mplxrexnok;
+};
+
+static DEFINE_PER_CPU(struct msm_l1_err_stats, msm_l1_erp_stats);
+static struct msm_l2_err_stats msm_l2_erp_stats;
+
+static int l1_erp_irq, l2_erp_irq;
+static struct proc_dir_entry *procfs_entry;
+
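+/*
+ * Accessors for the Krait implementation-defined CP15 cache error
+ * status (CESR) and syndrome (CESYNR) registers.
+ */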
+static inline unsigned int read_cesr(void)
+{
+	unsigned int cesr;
+	asm volatile ("mrc p15, 7, %0, c15, c0, 1" : "=r" (cesr));
+	return cesr;
+}
+
+static inline void write_cesr(unsigned int cesr)
+{
+	asm volatile ("mcr p15, 7, %[cesr], c15, c0, 1" : : [cesr]"r" (cesr));
+}
+
+static inline unsigned int read_cesynr(void)
+{
+	unsigned int cesynr;
+	asm volatile ("mrc p15, 7, %0, c15, c0, 3" : "=r" (cesynr));
+	return cesynr;
+}
+
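+/* procfs read handler: dump per-CPU L1/TLB and global L2 error counters */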
+static int proc_read_status(char *page, char **start, off_t off, int count,
+			    int *eof, void *data)
+{
+	struct msm_l1_err_stats *l1_stats;
+	char *p = page;
+	int len, cpu, ret, bytes_left = PAGE_SIZE;
+
+	for_each_present_cpu(cpu) {
+		l1_stats = &per_cpu(msm_l1_erp_stats, cpu);
+
+		ret = snprintf(p, bytes_left,
+			"CPU %d:\n"	\
+			"\tD-cache tag parity errors:\t%u\n"	\
+			"\tD-cache data parity errors:\t%u\n"	\
+			"\tI-cache tag parity errors:\t%u\n"	\
+			"\tI-cache data parity errors:\t%u\n"	\
+			"\tD-cache timing errors:\t\t%u\n"	\
+			"\tI-cache timing errors:\t\t%u\n"	\
+			"\tTLB multi-hit errors:\t\t%u\n\n",	\
+			cpu,
+			l1_stats->dctpe,
+			l1_stats->dcdpe,
+			l1_stats->ictpe,
+			l1_stats->icdpe,
+			l1_stats->dcte,
+			l1_stats->icte,
+			l1_stats->tlbmh);
+		p += ret;
+		bytes_left -= ret;
+	}
+
+	p += snprintf(p, bytes_left,
+			"L2 master port decode errors:\t\t%u\n"	\
+			"L2 master port slave errors:\t\t%u\n"		\
+			"L2 tag soft errors, single-bit:\t\t%u\n"	\
+			"L2 tag soft errors, double-bit:\t\t%u\n"	\
+			"L2 data soft errors, single-bit:\t%u\n"	\
+			"L2 data soft errors, double-bit:\t%u\n"	\
+			"L2 modified soft errors:\t\t%u\n"		\
+			"L2 master port LDREX NOK errors:\t%u\n",
+			msm_l2_erp_stats.mpdcd,
+			msm_l2_erp_stats.mpslv,
+			msm_l2_erp_stats.tsesb,
+			msm_l2_erp_stats.tsedb,
+			msm_l2_erp_stats.dsesb,
+			msm_l2_erp_stats.dsedb,
+			msm_l2_erp_stats.mse,
+			msm_l2_erp_stats.mplxrexnok);
+
+	len = (p - page) - off;
+	if (len < 0)
+		len = 0;
+
+	*eof = (len <= count) ? 1 : 0;
+	*start = page + off;
+
+	return len;
+}
+
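+/* Per-CPU interrupt handler for L1 cache and TLB errors */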
+static irqreturn_t msm_l1_erp_irq(int irq, void *dev_id)
+{
+	struct msm_l1_err_stats *l1_stats = dev_id;
+	unsigned int cesr = read_cesr();
+	unsigned int i_cesynr, d_cesynr;
+
+	pr_alert("L1 Error detected on CPU %d!\n", smp_processor_id());
+	pr_alert("\tCESR    = 0x%08x\n", cesr);
+
+	if (cesr & CESR_DCTPE) {
+		pr_alert("D-cache tag parity error\n");
+		l1_stats->dctpe++;
+	}
+
+	if (cesr & CESR_DCDPE) {
+		pr_alert("D-cache data parity error\n");
+		l1_stats->dcdpe++;
+	}
+
+	if (cesr & CESR_ICTPE) {
+		pr_alert("I-cache tag parity error\n");
+		l1_stats->ictpe++;
+	}
+
+	if (cesr & CESR_ICDPE) {
+		pr_alert("I-cache data parity error\n");
+		l1_stats->icdpe++;
+	}
+
+	if (cesr & CESR_DCTE) {
+		pr_alert("D-cache timing error\n");
+		l1_stats->dcte++;
+	}
+
+	if (cesr & CESR_ICTE) {
+		pr_alert("I-cache timing error\n");
+		l1_stats->icte++;
+	}
+
+	if (cesr & CESR_TLBMH) {
+		pr_alert("TLB multi-hit error\n");
+		l1_stats->tlbmh++;
+	}
+
+	if (cesr & (CESR_ICTPE | CESR_ICDPE | CESR_ICTE)) {
+		i_cesynr = read_cesynr();
+		pr_alert("I-side CESYNR = 0x%08x\n", i_cesynr);
+		write_cesr(CESR_I_MASK);
+
+		/*
+		 * Clear the I-side bits from the captured CESR value so that we
+		 * don't accidentally clear any new I-side errors when we do
+		 * the CESR write-clear operation.
+		 */
+		cesr &= ~CESR_I_MASK;
+	}
+
+	if (cesr & (CESR_DCTPE | CESR_DCDPE | CESR_DCTE)) {
+		d_cesynr = read_cesynr();
+		pr_alert("D-side CESYNR = 0x%08x\n", d_cesynr);
+	}
+
+	/* Clear the interrupt bits we processed */
+	write_cesr(cesr);
+
+	ERP_L1_ERR("L1 cache / TLB error detected");
+
+	return IRQ_HANDLED;
+}
+
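+/* Interrupt handler for L2 cache errors */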
+static irqreturn_t msm_l2_erp_irq(int irq, void *dev_id)
+{
+	unsigned int l2esr;
+	unsigned int l2esynr0;
+	unsigned int l2esynr1;
+	unsigned int l2ear0;
+	unsigned int l2ear1;
+	int soft_error = 0;
+	int port_error = 0;
+	int unrecoverable = 0;
+
+	l2esr = get_l2_indirect_reg(L2ESR_IND_ADDR);
+	l2esynr0 = get_l2_indirect_reg(L2ESYNR0_IND_ADDR);
+	l2esynr1 = get_l2_indirect_reg(L2ESYNR1_IND_ADDR);
+	l2ear0 = get_l2_indirect_reg(L2EAR0_IND_ADDR);
+	l2ear1 = get_l2_indirect_reg(L2EAR1_IND_ADDR);
+
+	pr_alert("L2 Error detected!\n");
+	pr_alert("\tL2ESR    = 0x%08x\n", l2esr);
+	pr_alert("\tL2ESYNR0 = 0x%08x\n", l2esynr0);
+	pr_alert("\tL2ESYNR1 = 0x%08x\n", l2esynr1);
+	pr_alert("\tL2EAR0   = 0x%08x\n", l2ear0);
+	pr_alert("\tL2EAR1   = 0x%08x\n", l2ear1);
+	pr_alert("\tCPU bitmap = 0x%x\n", (l2esr >> L2ESR_CPU_SHIFT) &
+						    L2ESR_CPU_MASK);
+
+	if (l2esr & L2ESR_MPDCD) {
+		pr_alert("L2 master port decode error\n");
+		port_error++;
+		msm_l2_erp_stats.mpdcd++;
+	}
+
+	if (l2esr & L2ESR_MPSLV) {
+		pr_alert("L2 master port slave error\n");
+		port_error++;
+		msm_l2_erp_stats.mpslv++;
+	}
+
+	if (l2esr & L2ESR_TSESB) {
+		pr_alert("L2 tag soft error, single-bit\n");
+		soft_error++;
+		msm_l2_erp_stats.tsesb++;
+	}
+
+	if (l2esr & L2ESR_TSEDB) {
+		pr_alert("L2 tag soft error, double-bit\n");
+		soft_error++;
+		unrecoverable++;
+		msm_l2_erp_stats.tsedb++;
+	}
+
+	if (l2esr & L2ESR_DSESB) {
+		pr_alert("L2 data soft error, single-bit\n");
+		soft_error++;
+		msm_l2_erp_stats.dsesb++;
+	}
+
+	if (l2esr & L2ESR_DSEDB) {
+		pr_alert("L2 data soft error, double-bit\n");
+		soft_error++;
+		unrecoverable++;
+		msm_l2_erp_stats.dsedb++;
+	}
+
+	if (l2esr & L2ESR_MSE) {
+		pr_alert("L2 modified soft error\n");
+		soft_error++;
+		msm_l2_erp_stats.mse++;
+	}
+
+	if (l2esr & L2ESR_MPLDREXNOK) {
+		pr_alert("L2 master port LDREX received Normal OK response\n");
+		port_error++;
+		msm_l2_erp_stats.mplxrexnok++;
+	}
+
+	if (port_error)
+		ERP_PORT_ERR("L2 master port error detected");
+
+	if (soft_error && !unrecoverable)
+		ERP_1BIT_ERR("L2 single-bit error detected");
+
+	if (unrecoverable)
+		ERP_2BIT_ERR("L2 double-bit error detected, trouble ahead");
+
+	set_l2_indirect_reg(L2ESR_IND_ADDR, l2esr);
+	return IRQ_HANDLED;
+}
+
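+/* Enable/disable the per-CPU L1 error interrupt on the calling CPU */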
+static void enable_erp_irq_callback(void *info)
+{
+	enable_percpu_irq(l1_erp_irq, IRQ_TYPE_LEVEL_HIGH);
+}
+
+static void disable_erp_irq_callback(void *info)
+{
+	disable_percpu_irq(l1_erp_irq);
+}
+
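+/* Keep the per-CPU L1 error IRQ enabled/disabled across CPU hotplug */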
+static int cache_erp_cpu_callback(struct notifier_block *nfb,
+					    unsigned long action, void *hcpu)
+{
+	switch (action & (~CPU_TASKS_FROZEN)) {
+	case CPU_STARTING:
+		enable_erp_irq_callback(NULL);
+		break;
+
+	case CPU_DYING:
+		disable_erp_irq_callback(NULL);
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block cache_erp_cpu_notifier = {
+	.notifier_call = cache_erp_cpu_callback,
+};
+
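+/*
+ * Probe: request the per-CPU L1 and the L2 error interrupts, create the
+ * procfs status node, and enable the L1 interrupt on all online CPUs.
+ */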
+static int msm_cache_erp_probe(struct platform_device *pdev)
+{
+	struct resource *r;
+	int ret, cpu;
+
+	r = platform_get_resource_byname(pdev, IORESOURCE_IRQ, "l1_irq");
+
+	if (!r) {
+		pr_err("Could not get L1 resource\n");
+		ret = -ENODEV;
+		goto fail;
+	}
+
+	l1_erp_irq = r->start;
+
+	ret = request_percpu_irq(l1_erp_irq, msm_l1_erp_irq, "MSM_L1",
+				 &msm_l1_erp_stats);
+
+	if (ret) {
+		pr_err("Failed to request the L1 cache error interrupt\n");
+		goto fail;
+	}
+
+	r = platform_get_resource_byname(pdev, IORESOURCE_IRQ, "l2_irq");
+
+	if (!r) {
+		pr_err("Could not get L2 resource\n");
+		ret = -ENODEV;
+		goto fail_l1;
+	}
+
+	l2_erp_irq = r->start;
+	ret = request_irq(l2_erp_irq, msm_l2_erp_irq, 0, "MSM_L2", NULL);
+
+	if (ret) {
+		pr_err("Failed to request the L2 cache error interrupt\n");
+		goto fail_l1;
+	}
+
+	procfs_entry = create_proc_entry("cpu/msm_cache_erp", S_IRUGO, NULL);
+
+	if (!procfs_entry) {
+		pr_err("Failed to create procfs node for cache error reporting\n");
+		ret = -ENODEV;
+		goto fail_l2;
+	}
+
+	get_online_cpus();
+	register_hotcpu_notifier(&cache_erp_cpu_notifier);
+	for_each_cpu(cpu, cpu_online_mask)
+		smp_call_function_single(cpu, enable_erp_irq_callback, NULL, 1);
+	put_online_cpus();
+
+	procfs_entry->read_proc = proc_read_status;
+	return 0;
+
+fail_l2:
+	free_irq(l2_erp_irq, NULL);
+fail_l1:
+	free_percpu_irq(l1_erp_irq, &msm_l1_erp_stats);
+fail:
+	return ret;
+}
+
+static int msm_cache_erp_remove(struct platform_device *pdev)
+{
+	int cpu;
+	if (procfs_entry)
+		remove_proc_entry("cpu/msm_cache_erp", NULL);
+
+	get_online_cpus();
+	unregister_hotcpu_notifier(&cache_erp_cpu_notifier);
+	for_each_cpu(cpu, cpu_online_mask)
+		smp_call_function_single(cpu, disable_erp_irq_callback, NULL,
+					 1);
+	put_online_cpus();
+
+	free_percpu_irq(l1_erp_irq, &msm_l1_erp_stats);
+
+	disable_irq(l2_erp_irq);
+	free_irq(l2_erp_irq, NULL);
+	return 0;
+}
+
+static struct platform_driver msm_cache_erp_driver = {
+	.probe = msm_cache_erp_probe,
+	.remove = msm_cache_erp_remove,
+	.driver = {
+		.name = MODULE_NAME,
+		.owner = THIS_MODULE,
+	},
+};
+
+static int __init msm_cache_erp_init(void)
+{
+	return platform_driver_register(&msm_cache_erp_driver);
+}
+
+static void __exit msm_cache_erp_exit(void)
+{
+	platform_driver_unregister(&msm_cache_erp_driver);
+}
+
+module_init(msm_cache_erp_init);
+module_exit(msm_cache_erp_exit);
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("MSM cache error reporting driver");