i7core_edac: Add edac_mce glue Adds a glue code to allow i7core to work with mcelog. With the glue, i7core registers itself on edac_mce. At mce, when an error is detected, it calls all registered drivers (in this case, i7core), for EDAC error handling. TODO: It currently just prints the MCE error log using about the same format as mce panic messages. The error message should be enhanced with mcelog userspace info and converted into the proper EDAC format, to feed the EDAC error counts. Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>

commit: d5381642ab01b084787925acdf26b5524d434476 [log] [tgz]
author: Mauro Carvalho Chehab <mchehab@redhat.com> Thu Jul 09 22:06:41 2009 -0300
committer: Mauro Carvalho Chehab <mchehab@redhat.com> Mon May 10 11:44:50 2010 -0300
tree: e8d73b1a3aa701fccb57dbf7f50e60faa75a0ecc
parent: 963c5ba35984c87963480031d1d7e2e556256ad7 [diff] [blame]
diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c
index 9149147..3c7bb5f 100644
--- a/drivers/edac/i7core_edac.c
+++ b/drivers/edac/i7core_edac.c

@@ -27,6 +27,8 @@
 #include <linux/slab.h>
 #include <linux/edac.h>
 #include <linux/mmzone.h>
+#include <linux/edac_mce.h>
+#include <linux/spinlock.h>
 
 #include "edac_core.h"
 
@@ -195,6 +197,11 @@
 	unsigned long	ce_count[MAX_DIMMS];	/* ECC corrected errors counts per dimm */
 	int		last_ce_count[MAX_DIMMS];
 
+	/* mcelog glue */
+	struct edac_mce		edac_mce;
+	struct mce		mce_entry[MCE_LOG_LEN];
+	unsigned		mce_count;
+	spinlock_t		mce_lock;
 };
 
 /* Device name and register DID (Device ID) */
@@ -900,7 +907,7 @@
 	pci_read_config_dword(pvt->pci_ch[pvt->inject.channel][0],
 			       MC_CHANNEL_ADDR_MATCH + 4, &rdmask2);
 
-	debugf0("Inject addr match write 0x%016llx, read: 0x%08x%08x\n",
+	debugf0("Inject addr match write 0x%016llx, read: 0x%08x 0x%08x\n",
 		mask, rdmask1, rdmask2);
 #endif
 #endif
@@ -1162,9 +1169,11 @@
 	new1 = DIMM1_COR_ERR(rcv0);
 	new0 = DIMM0_COR_ERR(rcv0);
 
+#if 0
 	debugf2("%s CE rcv1=0x%08x rcv0=0x%08x, %d %d %d\n",
 		(pvt->ce_count_available ? "UPDATE" : "READ"),
 		rcv1, rcv0, new0, new1, new2);
+#endif
 
 	/* Updates CE counters if it is not the first time here */
 	if (pvt->ce_count_available) {
@@ -1195,16 +1204,97 @@
 	pvt->last_ce_count[0] = new0;
 }
 
+static void i7core_mce_output_error(struct mem_ctl_info *mci,
+				    struct mce *m)
+{
+	debugf0("CPU %d: Machine Check Exception: %16Lx"
+		"Bank %d: %016Lx\n",
+		m->cpu, m->mcgstatus, m->bank, m->status);
+	if (m->ip) {
+		debugf0("RIP%s %02x:<%016Lx>\n",
+			!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
+			m->cs, m->ip);
+	}
+	printk(KERN_EMERG "TSC %llx ", m->tsc);
+	if (m->addr)
+		printk("ADDR %llx ", m->addr);
+	if (m->misc)
+		printk("MISC %llx ", m->misc);
+
+#if 0
+	snprintf(msg, sizeof(msg),
+		"%s (Branch=%d DRAM-Bank=%d Buffer ID = %d RDWR=%s "
+		"RAS=%d CAS=%d %s Err=0x%lx (%s))",
+		type, branch >> 1, bank, buf_id, rdwr_str(rdwr), ras, cas,
+		type, allErrors, error_name[errnum]);
+
+	/* Call the helper to output message */
+	edac_mc_handle_fbd_ue(mci, rank, channel, channel + 1, msg);
+#endif
+}
+
 /*
  *	i7core_check_error	Retrieve and process errors reported by the
  *				hardware. Called by the Core module.
  */
 static void i7core_check_error(struct mem_ctl_info *mci)
 {
+	struct i7core_pvt *pvt = mci->pvt_info;
+	int i;
+	unsigned count = 0;
+	struct mce *m = NULL;
+	unsigned long flags;
+
+	debugf0(__FILE__ ": %s()\n", __func__);
+
+	/* Copy all mce errors into a temporary buffer */
+	spin_lock_irqsave(&pvt->mce_lock, flags);
+	if (pvt->mce_count) {
+		m = kmalloc(sizeof(*m) * pvt->mce_count, GFP_ATOMIC);
+		if (m) {
+			count = pvt->mce_count;
+			memcpy(m, &pvt->mce_entry, sizeof(*m) * count);
+		}
+		pvt->mce_count = 0;
+	}
+	spin_unlock_irqrestore(&pvt->mce_lock, flags);
+
+	/* proccess mcelog errors */
+	for (i = 0; i < count; i++)
+		i7core_mce_output_error(mci, &m[i]);
+
+	kfree(m);
+
+	/* check memory count errors */
 	check_mc_test_err(mci);
 }
 
 /*
+ * i7core_mce_check_error	Replicates mcelog routine to get errors
+ *				This routine simply queues mcelog errors, and
+ *				return. The error itself should be handled later
+ *				by i7core_check_error.
+ */
+static int i7core_mce_check_error(void *priv, struct mce *mce)
+{
+	struct i7core_pvt *pvt = priv;
+	unsigned long flags;
+
+	debugf0(__FILE__ ": %s()\n", __func__);
+
+	spin_lock_irqsave(&pvt->mce_lock, flags);
+	if (pvt->mce_count < MCE_LOG_LEN) {
+		memcpy(&pvt->mce_entry[pvt->mce_count], mce, sizeof(*mce));
+		pvt->mce_count++;
+	}
+	spin_unlock_irqrestore(&pvt->mce_lock, flags);
+
+	/* Advice mcelog that the error were handled */
+//	return 1;
+	return 0; // Let's duplicate the log
+}
+
+/*
  *	i7core_probe	Probe for ONE instance of device to see if it is
  *			present.
  *	return:
@@ -1305,6 +1395,18 @@
 	pvt->inject.page = -1;
 	pvt->inject.col = -1;
 
+	/* Registers on edac_mce in order to receive memory errors */
+	pvt->edac_mce.priv = pvt;
+	pvt->edac_mce.check_error = i7core_mce_check_error;
+	spin_lock_init(&pvt->mce_lock);
+
+	rc = edac_mce_register(&pvt->edac_mce);
+	if (unlikely (rc < 0)) {
+		debugf0("MC: " __FILE__
+			": %s(): failed edac_mce_register()\n", __func__);
+		goto fail1;
+	}
+
 	i7core_printk(KERN_INFO, "Driver loaded.\n");
 
 	return 0;
@@ -1324,17 +1426,22 @@
 static void __devexit i7core_remove(struct pci_dev *pdev)
 {
 	struct mem_ctl_info *mci;
+	struct i7core_pvt *pvt;
 
 	debugf0(__FILE__ ": %s()\n", __func__);
 
 	if (i7core_pci)
 		edac_pci_release_generic_ctl(i7core_pci);
 
-	mci = edac_mc_del_mc(&pdev->dev);
 
+	mci = edac_mc_del_mc(&pdev->dev);
 	if (!mci)
 		return;
 
+	/* Unregisters on edac_mce in order to receive memory errors */
+	pvt = mci->pvt_info;
+	edac_mce_unregister(&pvt->edac_mce);
+
 	/* retrieve references to resources, and free those resources */
 	i7core_put_devices();
commit	d5381642ab01b084787925acdf26b5524d434476	[log] [tgz]
author	Mauro Carvalho Chehab <mchehab@redhat.com>	Thu Jul 09 22:06:41 2009 -0300
committer	Mauro Carvalho Chehab <mchehab@redhat.com>	Mon May 10 11:44:50 2010 -0300
tree	e8d73b1a3aa701fccb57dbf7f50e60faa75a0ecc
parent	963c5ba35984c87963480031d1d7e2e556256ad7 [diff] [blame]