net/mlx5_core: Add pci error handlers to mlx5_core driver This patch implement the pci_error_handlers for mlx5_core which allow the driver to recover from PCI error. Once an error is detected in the PCI, the mlx5_pci_err_detected is called and it: 1) Marks the device to be in 'Internal Error' state. 2) Dispatches an event to the mlx5_ib to flush all the outstanding cqes with error. 3) Returns all the on going commands with error. 4) Unloads the driver. Afterwards, the FW is reset and mlx5_pci_slot_reset is called and it enables the device and restore it's pci state. If the later succeeds, mlx5_pci_resume is called, and it loads the SW stack. Signed-off-by: Majd Dibbiny <majd@mellanox.com> Signed-off-by: Eli Cohen <eli@mellanox.com> Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>

commit: 89d44f0a6c732db23b219be708e2fe1e03ee4842 [log] [tgz]
author: Majd Dibbiny <majd@mellanox.com> Wed Oct 14 17:43:46 2015 +0300
committer: David S. Miller <davem@davemloft.net> Wed Oct 14 19:14:42 2015 -0700
tree: f8a0aa4f622b8a79ae070015e1e73e7ec0b87526
parent: fd76ee4da55abb21babfc69310d321b9cb9a32e0 [diff] [blame]
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index f1eb686..f5deb64 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c

@@ -34,6 +34,7 @@
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/vmalloc.h>
+#include <linux/hardirq.h>
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/cmd.h>
 #include "mlx5_core.h"
@@ -68,6 +69,29 @@
 	return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3;
 }
 
+static void trigger_cmd_completions(struct mlx5_core_dev *dev)
+{
+	unsigned long flags;
+	u64 vector;
+
+	/* wait for pending handlers to complete */
+	synchronize_irq(dev->priv.msix_arr[MLX5_EQ_VEC_CMD].vector);
+	spin_lock_irqsave(&dev->cmd.alloc_lock, flags);
+	vector = ~dev->cmd.bitmask & ((1ul << (1 << dev->cmd.log_sz)) - 1);
+	if (!vector)
+		goto no_trig;
+
+	vector |= MLX5_TRIGGERED_CMD_COMP;
+	spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags);
+
+	mlx5_core_dbg(dev, "vector 0x%llx\n", vector);
+	mlx5_cmd_comp_handler(dev, vector);
+	return;
+
+no_trig:
+	spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags);
+}
+
 static int in_fatal(struct mlx5_core_dev *dev)
 {
 	struct mlx5_core_health *health = &dev->priv.health;
@@ -82,6 +106,43 @@
 	return 0;
 }
 
+void mlx5_enter_error_state(struct mlx5_core_dev *dev)
+{
+	if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
+		return;
+
+	mlx5_core_err(dev, "start\n");
+	if (pci_channel_offline(dev->pdev) || in_fatal(dev))
+		dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
+
+	mlx5_core_event(dev, MLX5_DEV_EVENT_SYS_ERROR, 0);
+	mlx5_core_err(dev, "end\n");
+}
+
+static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
+{
+	u8 nic_interface = get_nic_interface(dev);
+
+	switch (nic_interface) {
+	case MLX5_NIC_IFC_FULL:
+		mlx5_core_warn(dev, "Expected to see disabled NIC but it is full driver\n");
+		break;
+
+	case MLX5_NIC_IFC_DISABLED:
+		mlx5_core_warn(dev, "starting teardown\n");
+		break;
+
+	case MLX5_NIC_IFC_NO_DRAM_NIC:
+		mlx5_core_warn(dev, "Expected to see disabled NIC but it is no dram nic\n");
+		break;
+	default:
+		mlx5_core_warn(dev, "Expected to see disabled NIC but it is has invalid value %d\n",
+			       nic_interface);
+	}
+
+	mlx5_disable_device(dev);
+}
+
 static void health_care(struct work_struct *work)
 {
 	struct mlx5_core_health *health;
@@ -92,6 +153,7 @@
 	priv = container_of(health, struct mlx5_priv, health);
 	dev = container_of(priv, struct mlx5_core_dev, priv);
 	mlx5_core_warn(dev, "handling bad device here\n");
+	mlx5_handle_bad_state(dev);
 }
 
 static const char *hsynd_str(u8 synd)
@@ -147,6 +209,10 @@
 	u32 fw;
 	int i;
 
+	/* If the syndrom is 0, the device is OK and no need to print buffer */
+	if (!ioread8(&h->synd))
+		return;
+
 	for (i = 0; i < ARRAY_SIZE(h->assert_var); i++)
 		dev_err(&dev->pdev->dev, "assert_var[%d] 0x%08x\n", i, ioread32be(h->assert_var + i));
 
@@ -178,6 +244,12 @@
 	struct mlx5_core_health *health = &dev->priv.health;
 	u32 count;
 
+	if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
+		trigger_cmd_completions(dev);
+		mod_timer(&health->timer, get_next_poll_jiffies());
+		return;
+	}
+
 	count = ioread32be(health->health_counter);
 	if (count == health->prev)
 		++health->miss_counter;
commit	89d44f0a6c732db23b219be708e2fe1e03ee4842	[log] [tgz]
author	Majd Dibbiny <majd@mellanox.com>	Wed Oct 14 17:43:46 2015 +0300
committer	David S. Miller <davem@davemloft.net>	Wed Oct 14 19:14:42 2015 -0700
tree	f8a0aa4f622b8a79ae070015e1e73e7ec0b87526
parent	fd76ee4da55abb21babfc69310d321b9cb9a32e0 [diff] [blame]