net/mlx4_core: Enhance the catas flow to support device reset
This includes:
- resetting the chip when a fatal error is detected (the current code
does not do this).
- exposing the ability to enter error state from outside the catas code
by calling its functionality. (E.g. FW Command timeout, AER error).
- managing a persistent device state. This is needed to sync between
reset flow cases.
Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index a61694c..dc2d910 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -2624,6 +2624,11 @@
}
}
+ /* on load remove any previous indication of internal error,
+ * device is up.
+ */
+ dev->persist->state = MLX4_DEVICE_STATE_UP;
+
slave_start:
err = mlx4_cmd_init(dev);
if (err) {
@@ -3108,6 +3113,7 @@
dev->persist->dev = dev;
pci_set_drvdata(pdev, dev->persist);
priv->pci_dev_data = id->driver_data;
+ mutex_init(&dev->persist->device_state_mutex);
ret = __mlx4_init_one(pdev, id->driver_data, priv);
if (ret) {