net/mlx4_core: Enhance the catas flow to support device reset
This includes:
- resetting the chip when a fatal error is detected (the current code
does not do this).
- exposing the ability to enter the error state from outside the catas code
by calling its functionality (e.g. on FW command timeout or AER error).
- managing a persistent device state. This is needed to sync between
reset flow cases.
Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index da425d2..7d5d317 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -411,6 +411,11 @@
MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK = 1 << 4,
};
+enum { /* device state flags, one bit each; presumably kept in the u8 `state` field added in the next hunk - confirm */
+ MLX4_DEVICE_STATE_UP = 1 << 0, /* bit 0; NOTE(review): presumably "device is operational" - confirm against users */
+ MLX4_DEVICE_STATE_INTERNAL_ERROR = 1 << 1, /* bit 1; NOTE(review): presumably set when the catas/error flow is entered - confirm */
+};
+
#define MSTR_SM_CHANGE_MASK (MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK | \
MLX4_EQ_PORT_INFO_MSTR_SM_LID_CHANGE_MASK)
@@ -753,6 +758,8 @@
enum mlx4_port_type curr_port_poss_type[MLX4_MAX_PORTS + 1];
struct work_struct catas_work;
struct workqueue_struct *catas_wq;
+ struct mutex device_state_mutex; /* protect HW state */
+ u8 state;
};
struct mlx4_dev {