IB/ipath: Workaround problem of errormask register being overwritten
On some system hardware, we are seeing moderately common cases of the
chip errormask register being overwritten due to a chip bug in iba6120
that is triggered by a vendor-specific PCIe broadcast message. This
patch merely checks the register periodically and restores it if
needed (the overwrite can cause us to miss error and hardware error
interrupts). Also, make dd->ipath_errormask the one true canonical
source for kr_errormask, and remove references to ipath_ignorederrs,
as it is currently unused.
Signed-off-by: Dave Olson <dave.olson@qlogic.com>
Signed-off-by: John Gregor <john.gregor@qlogic.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
diff --git a/drivers/infiniband/hw/ipath/ipath_init_chip.c b/drivers/infiniband/hw/ipath/ipath_init_chip.c
index 71e6c9d..9dd0bac 100644
--- a/drivers/infiniband/hw/ipath/ipath_init_chip.c
+++ b/drivers/infiniband/hw/ipath/ipath_init_chip.c
@@ -851,13 +851,14 @@
ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
dd->ipath_hwerrmask);
- dd->ipath_maskederrs = dd->ipath_ignorederrs;
/* clear all */
ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL);
/* enable errors that are masked, at least this first time. */
ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
~dd->ipath_maskederrs);
- /* clear any interrups up to this point (ints still not enabled) */
+ dd->ipath_errormask = ipath_read_kreg64(dd,
+ dd->ipath_kregs->kr_errormask);
+ /* clear any interrupts up to this point (ints still not enabled) */
ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL);
/*
diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c
index 0c075cf..b29fe7e 100644
--- a/drivers/infiniband/hw/ipath/ipath_intr.c
+++ b/drivers/infiniband/hw/ipath/ipath_intr.c
@@ -517,10 +517,7 @@
supp_msgs = handle_frequent_errors(dd, errs, msg, &noprint);
- /*
- * don't report errors that are masked (includes those always
- * ignored)
- */
+ /* don't report errors that are masked */
errs &= ~dd->ipath_maskederrs;
/* do these first, they are most important */
@@ -566,19 +563,19 @@
* ones on this particular interrupt, which also isn't great
*/
dd->ipath_maskederrs |= dd->ipath_lasterror | errs;
+ dd->ipath_errormask &= ~dd->ipath_maskederrs;
ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
- ~dd->ipath_maskederrs);
+ dd->ipath_errormask);
s_iserr = ipath_decode_err(msg, sizeof msg,
- (dd->ipath_maskederrs & ~dd->
- ipath_ignorederrs));
+ dd->ipath_maskederrs);
- if ((dd->ipath_maskederrs & ~dd->ipath_ignorederrs) &
+ if (dd->ipath_maskederrs &
~(INFINIPATH_E_RRCVEGRFULL |
INFINIPATH_E_RRCVHDRFULL | INFINIPATH_E_PKTERRS))
ipath_dev_err(dd, "Temporarily disabling "
"error(s) %llx reporting; too frequent (%s)\n",
- (unsigned long long) (dd->ipath_maskederrs &
- ~dd->ipath_ignorederrs), msg);
+ (unsigned long long)dd->ipath_maskederrs,
+ msg);
else {
/*
* rcvegrfull and rcvhdrqfull are "normal",
@@ -793,6 +790,9 @@
/* disable error interrupts, to avoid confusion */
ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, 0ULL);
+ /* also disable interrupts; errormask is sometimes overwritten */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
+
/*
* clear all sends, because they have may been
* completed by usercode while in freeze mode, and
@@ -817,7 +817,7 @@
for (i = 0; i < dd->ipath_pioavregs; i++) {
/* deal with 6110 chip bug */
im = i > 3 ? ((i&1) ? i-1 : i+1) : i;
- val = ipath_read_kreg64(dd, 0x1000+(im*sizeof(u64)));
+ val = ipath_read_kreg64(dd, (0x1000/sizeof(u64))+im);
dd->ipath_pioavailregs_dma[i] = dd->ipath_pioavailshadow[i]
= le64_to_cpu(val);
}
@@ -832,7 +832,8 @@
ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
E_SPKT_ERRS_IGNORE);
ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
- ~dd->ipath_maskederrs);
+ dd->ipath_errormask);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, -1LL);
ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
}
diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h
index ef77329..7a7966f 100644
--- a/drivers/infiniband/hw/ipath/ipath_kernel.h
+++ b/drivers/infiniband/hw/ipath/ipath_kernel.h
@@ -261,18 +261,10 @@
* limiting of hwerror reporting
*/
ipath_err_t ipath_lasthwerror;
- /*
- * errors masked because they occur too fast, also includes errors
- * that are always ignored (ipath_ignorederrs)
- */
+ /* errors masked because they occur too fast */
ipath_err_t ipath_maskederrs;
/* time in jiffies at which to re-enable maskederrs */
unsigned long ipath_unmasktime;
- /*
- * errors always ignored (masked), at least for a given
- * chip/device, because they are wrong or not useful
- */
- ipath_err_t ipath_ignorederrs;
/* count of egrfull errors, combined for all ports */
u64 ipath_last_tidfull;
/* for ipath_qcheck() */
@@ -436,6 +428,7 @@
u64 ipath_lastibcstat;
/* hwerrmask shadow */
ipath_err_t ipath_hwerrmask;
+ ipath_err_t ipath_errormask; /* errormask shadow */
/* interrupt config reg shadow */
u64 ipath_intconfig;
/* kr_sendpiobufbase value */
diff --git a/drivers/infiniband/hw/ipath/ipath_stats.c b/drivers/infiniband/hw/ipath/ipath_stats.c
index 73ed17d..bae4f56 100644
--- a/drivers/infiniband/hw/ipath/ipath_stats.c
+++ b/drivers/infiniband/hw/ipath/ipath_stats.c
@@ -196,6 +196,45 @@
}
}
+static void ipath_chk_errormask(struct ipath_devdata *dd)
+{
+ static u32 fixed; /* count of mismatches corrected; function-static, so one counter for all devices */
+ u32 ctrl;
+ unsigned long errormask; /* NOTE(review): holds an ipath_read_kreg64() result; would truncate if long is 32 bits - confirm 64-bit only */
+ unsigned long hwerrs;
+ /* Compare the chip's kr_errormask with the dd->ipath_errormask shadow and rewrite the register if they differ. */
+ if (!dd->ipath_errormask || !(dd->ipath_flags & IPATH_INITTED))
+ return; /* shadow is zero or IPATH_INITTED is not set: skip the check */
+
+ errormask = ipath_read_kreg64(dd, dd->ipath_kregs->kr_errormask);
+ /* register still matches the shadow: nothing to correct */
+ if (errormask == dd->ipath_errormask)
+ return;
+ fixed++;
+ /* sample hardware error status and control before rewriting the mask */
+ hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus);
+ ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
+ /* restore the register from the shadow value */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+ dd->ipath_errormask);
+ /* hardware errors enabled in ipath_hwerrmask are pending, or the FREEZEMODE control bit is set */
+ if ((hwerrs & dd->ipath_hwerrmask) ||
+ (ctrl & INFINIPATH_C_FREEZEMODE)) {
+ /* force re-interrupt of pending events, just in case */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, 0ULL);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, 0ULL);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
+ dev_info(&dd->pcidev->dev,
+ "errormask fixed(%u) %lx -> %lx, ctrl %x hwerr %lx\n",
+ fixed, errormask, (unsigned long)dd->ipath_errormask,
+ ctrl, hwerrs);
+ } else /* neither condition holds: only emit a debug message */
+ ipath_dbg("errormask fixed(%u) %lx -> %lx, no freeze\n",
+ fixed, errormask,
+ (unsigned long)dd->ipath_errormask);
+}
+
+
/**
* ipath_get_faststats - get word counters from chip before they overflow
* @opaque - contains a pointer to the infinipath device ipath_devdata
@@ -251,14 +290,13 @@
dd->ipath_lasterror = 0;
if (dd->ipath_lasthwerror)
dd->ipath_lasthwerror = 0;
- if ((dd->ipath_maskederrs & ~dd->ipath_ignorederrs)
+ if (dd->ipath_maskederrs
&& time_after(jiffies, dd->ipath_unmasktime)) {
char ebuf[256];
int iserr;
iserr = ipath_decode_err(ebuf, sizeof ebuf,
- (dd->ipath_maskederrs & ~dd->
- ipath_ignorederrs));
- if ((dd->ipath_maskederrs & ~dd->ipath_ignorederrs) &
+ dd->ipath_maskederrs);
+ if (dd->ipath_maskederrs &
~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
INFINIPATH_E_PKTERRS ))
ipath_dev_err(dd, "Re-enabling masked errors "
@@ -278,9 +316,12 @@
ipath_cdbg(ERRPKT, "Re-enabling packet"
" problem interrupt (%s)\n", ebuf);
}
- dd->ipath_maskederrs = dd->ipath_ignorederrs;
+
+ /* re-enable masked errors */
+ dd->ipath_errormask |= dd->ipath_maskederrs;
ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
- ~dd->ipath_maskederrs);
+ dd->ipath_errormask);
+ dd->ipath_maskederrs = 0;
}
/* limit qfull messages to ~one per minute per port */
@@ -294,6 +335,7 @@
}
}
+ ipath_chk_errormask(dd);
done:
mod_timer(&dd->ipath_stats_timer, jiffies + HZ * 5);
}