drivers: edac: Add Cache Error Reporting driver for Kryo3xx processors

The Cache Error Reporting driver receives error interrupts for Single Bit and
Double Bit Errors, checks the corresponding syndrome registers and takes
action based on configuration options. Polling of the syndrome registers is
optional, and single and double bit errors can each be configured to panic.

Change-Id: I025037da5c5ac6f5520b683af69c462663c1e4f0
Signed-off-by: Kyle Yan <kyan@codeaurora.org>
diff --git a/arch/arm64/boot/dts/qcom/msmskunk.dtsi b/arch/arm64/boot/dts/qcom/msmskunk.dtsi
index 9178b23..da8bbfa 100644
--- a/arch/arm64/boot/dts/qcom/msmskunk.dtsi
+++ b/arch/arm64/boot/dts/qcom/msmskunk.dtsi
@@ -519,6 +519,19 @@
 			reg = <0x10 8>;
 		};
 	};
+
+	kryo3xx-erp {
+		compatible = "arm,arm64-kryo3xx-cpu-erp";
+		interrupts = <1 6 4>,
+			     <1 7 4>,
+			     <0 34 4>,
+			     <0 35 4>;
+		/* Positional names; the driver only requests the two *-faultirq lines. */
+		interrupt-names = "l1-l2-faultirq",
+				  "l1-l2-errirq",
+				  "l3-scu-errirq",
+				  "l3-scu-faultirq";
+	};
 };
 
 &pcie_0_gdsc {
diff --git a/arch/arm64/include/asm/kryo3xx-arm64-edac.h b/arch/arm64/include/asm/kryo3xx-arm64-edac.h
new file mode 100644
index 0000000..cc59dc0
--- /dev/null
+++ b/arch/arm64/include/asm/kryo3xx-arm64-edac.h
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef ASM_KRYO3xx_EDAC_H
+#define ASM_KRYO3xx_EDAC_H
+
+struct edac_device_ctl_info;	/* only used by pointer here */
+#if defined(CONFIG_EDAC_KRYO3XX_ARM64)
+void kryo3xx_poll_cache_errors(struct edac_device_ctl_info *edev_ctl);
+#else
+static inline void kryo3xx_poll_cache_errors(struct edac_device_ctl_info *e) { }
+#endif
+#endif
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 05d2bd7..ed3c7bb 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -38,6 +38,7 @@
 #include <asm/system_misc.h>
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
+#include <asm/kryo3xx-arm64-edac.h>
 
 static const char *fault_name(unsigned int esr);
 
@@ -477,6 +478,7 @@
  */
 static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 {
+	kryo3xx_poll_cache_errors(NULL);
 	return 1;
 }
 
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index dff1a4a..ec91d36 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -413,6 +413,47 @@
 	  Support for error detection and correction on the Synopsys DDR
 	  memory controller.
 
+config EDAC_KRYO3XX_ARM64
+	depends on EDAC_MM_EDAC && ARM64
+	tristate "ARM KRYO3XX Gold and Silver L1/L2/L3/SCU Caches"
+	help
+	   Support for error detection and correction on the
+	   Kryo3xx Gold and Silver CPUs. Reports errors caught by Kryo3xx
+	   ECC mechanism.
+	   For debugging issues having to do with stability and overall system
+	   health, you should probably say 'Y' here.
+
+config EDAC_KRYO3XX_ARM64_POLL
+	depends on EDAC_KRYO3XX_ARM64
+	bool "Poll on kryo3xx ECC registers - kryo3xx"
+	help
+	   This option chooses whether or not you want to poll on the Kryo3xx
+	   ECC registers. When this is enabled, the polling rate can be set as
+	   a module parameter. By default, it will call the polling function
+	   every second.
+	   This option should only be used if the associated interrupt lines
+	   are not enabled.
+
+config EDAC_KRYO3XX_ARM64_PANIC_ON_CE
+	depends on EDAC_KRYO3XX_ARM64
+	bool "Panic on correctable errors - kryo3xx"
+	help
+	   Forcibly cause a kernel panic if a correctable error (CE) is
+	   detected, even though the error is (by definition) correctable and
+	   would otherwise result in no adverse system effects. This can reduce
+	   debugging times on hardware which may be operating at voltages or
+	   frequencies outside normal specification.
+	   For production builds, you should definitely say 'N' here.
+
+config EDAC_KRYO3XX_ARM64_PANIC_ON_UE
+	depends on EDAC_KRYO3XX_ARM64
+	bool "Panic on uncorrectable errors - kryo3xx"
+	help
+	   Forcibly cause a kernel panic if an uncorrectable error (UE) is
+	   detected. This can reduce debugging times on hardware which may be
+	   operating at voltages or frequencies outside normal specification.
+	   For production builds, you should probably say 'N' here.
+
 config EDAC_XGENE
 	tristate "APM X-Gene SoC"
 	depends on EDAC_MM_EDAC && (ARM64 || COMPILE_TEST)
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index 9860499..08f22e9 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -69,5 +69,6 @@
 obj-$(CONFIG_EDAC_OCTEON_PCI)		+= octeon_edac-pci.o
 
 obj-$(CONFIG_EDAC_ALTERA)		+= altera_edac.o
+obj-$(CONFIG_EDAC_KRYO3XX_ARM64)	+= kryo3xx_arm64_edac.o
 obj-$(CONFIG_EDAC_SYNOPSYS)		+= synopsys_edac.o
 obj-$(CONFIG_EDAC_XGENE)		+= xgene_edac.o
diff --git a/drivers/edac/kryo3xx_arm64_edac.c b/drivers/edac/kryo3xx_arm64_edac.c
new file mode 100644
index 0000000..aed269f
--- /dev/null
+++ b/drivers/edac/kryo3xx_arm64_edac.c
@@ -0,0 +1,412 @@
+/* Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/edac.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/smp.h>
+#include <linux/cpu.h>
+#include <linux/interrupt.h>
+#include <linux/of_irq.h>
+
+#include <asm/cputype.h>
+
+#include "edac_core.h"
+
+#ifdef CONFIG_EDAC_KRYO3XX_ARM64_POLL
+static int poll_msec = 1000;
+module_param(poll_msec, int, 0444);
+#endif
+
+#ifdef CONFIG_EDAC_KRYO3XX_ARM64_PANIC_ON_CE
+#define ARM64_ERP_PANIC_ON_CE 1
+#else
+#define ARM64_ERP_PANIC_ON_CE 0
+#endif
+
+#ifdef CONFIG_EDAC_KRYO3XX_ARM64_PANIC_ON_UE
+#define ARM64_ERP_PANIC_ON_UE 1
+#else
+#define ARM64_ERP_PANIC_ON_UE 0
+#endif
+
+#define L1 0x0	/* KRYO3XX_ERRXMISC_LVL() values, also used as the EDAC block */
+#define L2 0x1
+#define L3 0x2
+
+#define EDAC_CPU	"kryo3xx_edac"
+
+#define KRYO3XX_ERRXSTATUS_VALID(a)	(((a) >> 30) & 0x1)
+#define KRYO3XX_ERRXSTATUS_UE(a)	(((a) >> 29) & 0x1)
+#define KRYO3XX_ERRXSTATUS_SERR(a)	((a) & 0xFF)
+/* Fields of the ERRXMISC_EL1 value: cache level and way. */
+#define KRYO3XX_ERRXMISC_LVL(a)		(((a) >> 1) & 0x7)
+#define KRYO3XX_ERRXMISC_WAY(a)		(((a) >> 28) & 0xF)
+/* Mask for the bit that KRYO3XX_ERRXSTATUS_VALID() tests (bit 30). */
+#define ERRXSTATUS_VALID_FLAG		(1ULL << 30)
+
+/* Write the fixed control value 0x8f to system register s3_0_c5_c4_1. */
+static inline void set_errxctlr_el1(void)
+{
+	/* 0x8f is the only value this driver ever programs. */
+	asm volatile("msr s3_0_c5_c4_1, %0" : : "r" ((u64)0x8f));
+}
+
+static inline void write_errselr_el1(u64 val)
+{	/* callers in this file pass 0 for the L1/L2 record and 1 for L3/SCU */
+	asm volatile("msr s3_0_c5_c3_1, %0" : : "r" (val));
+}
+
+static inline u64 read_errxstatus_el1(void)
+{
+	u64 status;
+	/* ERRXSTATUS_EL1 is addressed by its encoding, s3_0_c5_c4_2. */
+	asm volatile("mrs %0, s3_0_c5_c4_2" : "=r" (status));
+	return status;
+}
+
+static inline u64 read_errxmisc_el1(void)
+{
+	u64 misc;
+	/* ERRXMISC_EL1 is addressed by its encoding, s3_0_c5_c5_0. */
+	asm volatile("mrs %0, s3_0_c5_c5_0" : "=r" (misc));
+	return misc;
+}
+
+/* Status bits are write-one-to-clear: write @val back with the V bit set. */
+static inline void clear_errxstatus_valid(u64 val)
+{
+	asm volatile("msr s3_0_c5_c4_2, %0"
+		     : : "r" (val | ERRXSTATUS_VALID_FLAG));
+}
+
+struct errors_edac {	/* one errors[] entry: a message and the EDAC reporter for it */
+	const char * const msg;
+	void (*func)(struct edac_device_ctl_info *edac_dev,
+			int inst_nr, int block_nr, const char *msg);
+};
+
+static const struct errors_edac errors[] = {
+	{"Kryo3xx L1 Correctable Error", edac_device_handle_ce },
+	{"Kryo3xx L1 Uncorrectable Error", edac_device_handle_ue },
+	{"Kryo3xx L2 Correctable Error", edac_device_handle_ce },
+	{"Kryo3xx L2 Uncorrectable Error", edac_device_handle_ue },
+	{"L3 Correctable Error", edac_device_handle_ce },
+	{"L3 Uncorrectable Error", edac_device_handle_ue },
+};
+/* Indices into errors[]; each value is the position of its entry above. */
+#define KRYO3XX_L1_CE 0
+#define KRYO3XX_L1_UE 1
+#define KRYO3XX_L2_CE 2
+#define KRYO3XX_L2_UE 3
+#define KRYO3XX_L3_CE 4
+#define KRYO3XX_L3_UE 5
+
+#define DATA_BUF_ERR		0x2	/* KRYO3XX_ERRXSTATUS_SERR() codes */
+#define CACHE_DATA_ERR		0x6
+#define CACHE_TAG_DIRTY_ERR	0x7
+#define TLB_PARITY_ERR		0x8
+#define BUS_ERROR		0x18
+
+struct erp_drvdata {	/* per-device state; edev_ctl is set up in probe */
+	struct edac_device_ctl_info *edev_ctl;
+};
+
+static struct erp_drvdata *panic_handler_drvdata;
+
+static DEFINE_SPINLOCK(local_handler_lock);
+
+/*
+ * Look up the IRQ named @propname on @pdev and install @handler for it: a
+ * per-CPU IRQ when @percpu is non-zero, else a devm-managed threaded IRQ.
+ * Logs and returns -EINVAL on any failure, 0 on success.
+ * NOTE(review): @ed goes to request_percpu_irq() unchanged - confirm it is OK.
+ */
+static int request_erp_irq(struct platform_device *pdev, const char *propname,
+			const char *desc, irq_handler_t handler,
+			void *ed, int percpu)
+{
+	struct resource *res;
+	int err;
+
+	res = platform_get_resource_byname(pdev, IORESOURCE_IRQ, propname);
+	if (!res) {
+		pr_err("ARM64 CPU ERP: Could not find <%s> IRQ property. Proceeding anyway.\n",
+			propname);
+		return -EINVAL;
+	}
+
+	if (percpu)
+		err = request_percpu_irq(res->start, handler, desc, ed);
+	else
+		err = devm_request_threaded_irq(&pdev->dev, res->start, NULL,
+				handler, IRQF_ONESHOT | IRQF_TRIGGER_HIGH,
+				desc, ed);
+	if (!err)
+		return 0;
+
+	pr_err("ARM64 CPU ERP: Failed to request IRQ %d: %d (%s / %s). Proceeding anyway.\n",
+	       (int) res->start, err, propname, desc);
+	return -EINVAL;
+}
+
+/*
+ * Log the raw syndrome registers, the decoded error source and the way of one
+ * error, then hand it to errors[errorcode].func (current CPU, @level).
+ */
+static void dump_err_reg(int errorcode, int level, u64 errxstatus, u64 errxmisc,
+	struct edac_device_ctl_info *edev_ctl)
+{
+	const struct errors_edac *err = &errors[errorcode];
+	int way = (int) KRYO3XX_ERRXMISC_WAY(errxmisc);
+
+	edac_printk(KERN_CRIT, EDAC_CPU, "ERRXSTATUS_EL1: %llx\n", errxstatus);
+	edac_printk(KERN_CRIT, EDAC_CPU, "ERRXMISC_EL1: %llx\n", errxmisc);
+	edac_printk(KERN_CRIT, EDAC_CPU, "Cache level: L%d\n", level + 1);
+
+	switch (KRYO3XX_ERRXSTATUS_SERR(errxstatus)) {
+	case DATA_BUF_ERR:
+		edac_printk(KERN_CRIT, EDAC_CPU, "ECC Error from internal data buffer\n");
+		break;
+	case CACHE_DATA_ERR:
+		edac_printk(KERN_CRIT, EDAC_CPU, "ECC Error from cache data RAM\n");
+		break;
+	case CACHE_TAG_DIRTY_ERR:
+		edac_printk(KERN_CRIT, EDAC_CPU, "ECC Error from cache tag or dirty RAM\n");
+		break;
+	case TLB_PARITY_ERR:
+		edac_printk(KERN_CRIT, EDAC_CPU, "Parity error on TLB RAM\n");
+		break;
+	case BUS_ERROR:
+		edac_printk(KERN_CRIT, EDAC_CPU, "Bus Error\n");
+		break;
+	}
+
+	/* Outside L3 the way value is printed shifted right by two bits. */
+	if (level != L3)
+		way >>= 2;
+	edac_printk(KERN_CRIT, EDAC_CPU, "Way: %d\n", way);
+	err->func(edev_ctl, smp_processor_id(), level, err->msg);
+}
+
+/*
+ * Turn the cache level in @errxmisc and the UE flag in @errxstatus into an
+ * errors[] index and dump the record through dump_err_reg(), passing
+ * @edev_ctl along unchanged.
+ *
+ * Only L1 and L2 are handled here; any other level value is ignored.
+ */
+static void kryo3xx_parse_l1_l2_cache_error(u64 errxstatus, u64 errxmisc,
+	struct edac_device_ctl_info *edev_ctl)
+{
+	const int level = KRYO3XX_ERRXMISC_LVL(errxmisc);
+	const bool uncorrectable = KRYO3XX_ERRXSTATUS_UE(errxstatus);
+	int code;
+
+	/* The UE flag picks between the CE and UE entry of the level's pair. */
+	if (level == L1)
+		code = uncorrectable ? KRYO3XX_L1_UE : KRYO3XX_L1_CE;
+	else if (level == L2)
+		code = uncorrectable ? KRYO3XX_L2_UE : KRYO3XX_L2_CE;
+	else
+		return;
+
+	dump_err_reg(code, level, errxstatus, errxmisc, edev_ctl);
+}
+
+/*
+ * Report and clear the calling CPU's L1/L2 record if valid; @info: edev_ctl.
+ */
+static void kryo3xx_check_l1_l2_ecc(void *info)
+{
+	unsigned long flags;
+	u64 status, misc;
+
+	spin_lock_irqsave(&local_handler_lock, flags);
+	write_errselr_el1(0);
+	status = read_errxstatus_el1();
+	if (!KRYO3XX_ERRXSTATUS_VALID(status))
+		goto unlock;
+	misc = read_errxmisc_el1();
+	edac_printk(KERN_CRIT, EDAC_CPU,
+	  "Kryo3xx CPU%d detected a L1/L2 cache error\n", smp_processor_id());
+	kryo3xx_parse_l1_l2_cache_error(status, misc, info);
+	clear_errxstatus_valid(status);
+unlock:
+	spin_unlock_irqrestore(&local_handler_lock, flags);
+}
+
+/*
+ * Select error record 1; if it holds a valid L3 error, log, report and clear
+ * it. Valid records whose level is not L3 are left untouched.
+ */
+static void kryo3xx_check_l3_scu_error(struct edac_device_ctl_info *edev_ctl)
+{
+	unsigned long flags;
+	u64 status, misc;
+
+	spin_lock_irqsave(&local_handler_lock, flags);
+	write_errselr_el1(1);
+	status = read_errxstatus_el1();
+	misc = read_errxmisc_el1();
+	if (!KRYO3XX_ERRXSTATUS_VALID(status) ||
+	    KRYO3XX_ERRXMISC_LVL(misc) != L3)
+		goto unlock;
+	if (KRYO3XX_ERRXSTATUS_UE(status)) {
+		edac_printk(KERN_CRIT, EDAC_CPU, "Detected L3 uncorrectable error\n");
+		dump_err_reg(KRYO3XX_L3_UE, L3, status, misc, edev_ctl);
+	} else {
+		edac_printk(KERN_CRIT, EDAC_CPU, "Detected L3 correctable error\n");
+		dump_err_reg(KRYO3XX_L3_CE, L3, status, misc, edev_ctl);
+	}
+	clear_errxstatus_valid(status);
+unlock:
+	spin_unlock_irqrestore(&local_handler_lock, flags);
+}
+/* Poll the L3/SCU record here and the L1/L2 record on each online CPU. */
+void kryo3xx_poll_cache_errors(struct edac_device_ctl_info *edev_ctl)
+{
+	int cpu;
+
+	if (!edev_ctl && panic_handler_drvdata)	/* NULL: fault-path caller */
+		edev_ctl = panic_handler_drvdata->edev_ctl;
+	if (!edev_ctl)	/* nothing probed yet, so nothing to report against */
+		return;
+	kryo3xx_check_l3_scu_error(edev_ctl);
+	for_each_online_cpu(cpu)
+		smp_call_function_single(cpu, kryo3xx_check_l1_l2_ecc, edev_ctl, 0);
+}
+
+/* IRQ handler for "l1-l2-faultirq": check the L1/L2 record of this CPU. */
+static irqreturn_t kryo3xx_l1_l2_handler(int irq, void *drvdata)
+{
+	struct erp_drvdata *drv = drvdata;
+
+	kryo3xx_check_l1_l2_ecc(drv->edev_ctl);
+	return IRQ_HANDLED;
+}
+
+/* IRQ handler for "l3-scu-faultirq": check the shared L3/SCU record. */
+static irqreturn_t kryo3xx_l3_scu_handler(int irq, void *drvdata)
+{
+	struct erp_drvdata *drv = drvdata;
+
+	kryo3xx_check_l3_scu_error(drv->edev_ctl);
+	return IRQ_HANDLED;
+}
+
+/*
+ * Program the error control register on the probing CPU, register the EDAC
+ * device and request both fault IRQs; returns 0 or a negative errno.
+ */
+static int kryo3xx_cpu_erp_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct erp_drvdata *drv;
+	int irqs = 0, rc;
+
+	set_errxctlr_el1();
+	drv = devm_kzalloc(dev, sizeof(*drv), GFP_KERNEL);
+	if (!drv)
+		return -ENOMEM;
+
+	drv->edev_ctl = edac_device_alloc_ctl_info(0, "cpu",
+					num_possible_cpus(), "L", 3, 1, NULL, 0,
+					edac_device_alloc_index());
+	if (!drv->edev_ctl)
+		return -ENOMEM;
+
+	#ifdef CONFIG_EDAC_KRYO3XX_ARM64_POLL
+	drv->edev_ctl->edac_check = kryo3xx_poll_cache_errors;
+	drv->edev_ctl->poll_msec = poll_msec;
+	drv->edev_ctl->defer_work = 1;
+	#endif
+
+	drv->edev_ctl->dev = dev;
+	drv->edev_ctl->mod_name = dev_name(dev);
+	drv->edev_ctl->dev_name = dev_name(dev);
+	drv->edev_ctl->ctl_name = "cache";
+	drv->edev_ctl->panic_on_ce = ARM64_ERP_PANIC_ON_CE;
+	drv->edev_ctl->panic_on_ue = ARM64_ERP_PANIC_ON_UE;
+	platform_set_drvdata(pdev, drv);
+
+	rc = edac_device_add_device(drv->edev_ctl);
+	if (rc)
+		goto out_mem;
+
+	panic_handler_drvdata = drv;
+	if (!request_erp_irq(pdev, "l1-l2-faultirq",
+			"KRYO3XX L1-L2 ECC FAULTIRQ",
+			kryo3xx_l1_l2_handler, drv, 1))
+		irqs++;
+	if (!request_erp_irq(pdev, "l3-scu-faultirq",
+			"KRYO3XX L3-SCU ECC FAULTIRQ",
+			kryo3xx_l3_scu_handler, drv, 0))
+		irqs++;
+
+	/* Count our own requests, not DT entries; polling can work without IRQs. */
+	if (!irqs && !IS_ENABLED(CONFIG_EDAC_KRYO3XX_ARM64_POLL)) {
+		pr_err("KRYO3XX ERP: Could not request any IRQs. Giving up.\n");
+		rc = -ENODEV;
+		goto out_dev;
+	}
+	return 0;
+
+out_dev:
+	panic_handler_drvdata = NULL;	/* do not leave a stale pointer behind */
+	edac_device_del_device(dev);
+out_mem:
+	edac_device_free_ctl_info(drv->edev_ctl);
+	return rc;
+}
+
+/* Unregister the EDAC device and drop the global pointer that refers to it. */
+static int kryo3xx_cpu_erp_remove(struct platform_device *pdev)
+{
+	struct erp_drvdata *drv = dev_get_drvdata(&pdev->dev);
+
+	panic_handler_drvdata = NULL;	/* edev_ctl is freed just below */
+	edac_device_del_device(drv->edev_ctl->dev);
+	edac_device_free_ctl_info(drv->edev_ctl);
+	return 0;
+}
+
+static const struct of_device_id kryo3xx_cpu_erp_match_table[] = {
+	{ .compatible = "arm,arm64-kryo3xx-cpu-erp" },
+	{ }	/* sentinel */
+};
+MODULE_DEVICE_TABLE(of, kryo3xx_cpu_erp_match_table);
+/* .owner is not set here: platform_driver_register() fills it in. */
+static struct platform_driver kryo3xx_cpu_erp_driver = {
+	.probe = kryo3xx_cpu_erp_probe,
+	.remove = kryo3xx_cpu_erp_remove,
+	.driver = {
+		.name = "kryo3xx_cpu_cache_erp",
+		.of_match_table = of_match_ptr(kryo3xx_cpu_erp_match_table),
+	},
+};
+
+static int __init kryo3xx_cpu_erp_init(void)
+{	/* module entry point: registers kryo3xx_cpu_erp_driver */
+	return platform_driver_register(&kryo3xx_cpu_erp_driver);
+}
+module_init(kryo3xx_cpu_erp_init);
+
+static void __exit kryo3xx_cpu_erp_exit(void)
+{	/* module exit point: unregisters kryo3xx_cpu_erp_driver */
+	platform_driver_unregister(&kryo3xx_cpu_erp_driver);
+}
+module_exit(kryo3xx_cpu_erp_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Kryo3xx EDAC driver");