[IB] mthca: first pass at catastrophic error reporting
Add some initial support for detecting and reporting catastrophic
errors reported by Mellanox HCAs. We start a periodic timer which
polls the catastrophic error reporting buffer in device memory. If an
error is detected, we dump the contents of the buffer for port-mortem
debugging, and report a fatal asynchronous error to higher levels.
In the future we can try to recover from these errors by resetting the
device, but this will require some work in higher-level code as well.
Let's get this in now, so that we at least get catastrophic errors
reported in logs.
Signed-off-by: Roland Dreier <rolandd@cisco.com>
diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h
index f106bac..7e68bd4 100644
--- a/drivers/infiniband/hw/mthca/mthca_dev.h
+++ b/drivers/infiniband/hw/mthca/mthca_dev.h
@@ -258,6 +258,14 @@
struct mthca_icm_table *table;
};
+struct mthca_catas_err {
+ u64 addr;
+ u32 __iomem *map;
+ unsigned long stop;
+ u32 size;
+ struct timer_list timer;
+};
+
struct mthca_dev {
struct ib_device ib_dev;
struct pci_dev *pdev;
@@ -318,6 +326,8 @@
struct mthca_av_table av_table;
struct mthca_mcg_table mcg_table;
+ struct mthca_catas_err catas_err;
+
struct mthca_uar driver_uar;
struct mthca_db_table *db_tab;
struct mthca_pd driver_pd;
@@ -405,6 +415,9 @@
int mthca_register_device(struct mthca_dev *dev);
void mthca_unregister_device(struct mthca_dev *dev);
+void mthca_start_catas_poll(struct mthca_dev *dev);
+void mthca_stop_catas_poll(struct mthca_dev *dev);
+
int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar);
void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar);