[IB] mthca: first pass at catastrophic error reporting

Add some initial support for detecting and reporting catastrophic
errors reported by Mellanox HCAs.  We start a periodic timer which
polls the catastrophic error reporting buffer in device memory.  If an
error is detected, we dump the contents of the buffer for post-mortem
debugging, and report a fatal asynchronous error to higher levels.

In the future we can try to recover from these errors by resetting the
device, but this will require some work in higher-level code as well.
Let's get this in now, so that we at least get catastrophic errors
reported in logs.

Signed-off-by: Roland Dreier <rolandd@cisco.com>
diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h
index f106bac..7e68bd4 100644
--- a/drivers/infiniband/hw/mthca/mthca_dev.h
+++ b/drivers/infiniband/hw/mthca/mthca_dev.h
@@ -258,6 +258,14 @@
 	struct mthca_icm_table *table;
 };
 
+struct mthca_catas_err {	/* state for polling the HCA's catastrophic error buffer */
+	u64			addr;	/* presumably the error buffer's address -- confirm */
+	u32 __iomem	       *map;	/* I/O memory pointer; presumably the mapped error buffer -- confirm */
+	unsigned long		stop;	/* presumably a flag telling the poll timer to stop -- confirm */
+	u32			size;	/* presumably the error buffer size; units not shown here -- confirm */
+	struct timer_list	timer;	/* periodic timer that polls for catastrophic errors */
+};
+
 struct mthca_dev {
 	struct ib_device  ib_dev;
 	struct pci_dev   *pdev;
@@ -318,6 +326,8 @@
 	struct mthca_av_table  av_table;
 	struct mthca_mcg_table mcg_table;
 
+	struct mthca_catas_err catas_err;
+
 	struct mthca_uar       driver_uar;
 	struct mthca_db_table *db_tab;
 	struct mthca_pd        driver_pd;
@@ -405,6 +415,9 @@
 int mthca_register_device(struct mthca_dev *dev);
 void mthca_unregister_device(struct mthca_dev *dev);
 
+void mthca_start_catas_poll(struct mthca_dev *dev);
+void mthca_stop_catas_poll(struct mthca_dev *dev);
+
 int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar);
 void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar);