mlx4: Add support for EEH error recovery

Currently the mlx4 drivers don't have the necessary callbacks to
implement EEH errors detection and recovery, so the PCI layer uses the
probe and remove callbacks to try to recover the device after an error on
the bus. However, these callbacks have race conditions with the internal
catastrophic error recovery functions, which will also detect the error
and this can cause the system to crash if both EEH and catas functions
try to reset the device.

This patch adds the necessary error recovery callbacks and makes sure
that the internal catastrophic error functions will not try to reset the
device in such scenarios. It also adds some calls to
pci_channel_offline() to suppress reads/writes on the bus when the slot
cannot accept I/O operations so we prevent unnecessary accesses to the
bus and speed up the device removal.

Signed-off-by: Kleber Sacilotto de Souza <klebers@linux.vnet.ibm.com>
Acked-by: Shlomo Pongratz <shlomop@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index 7e94987d..c8fef43 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -296,7 +296,12 @@
 
 static int cmd_pending(struct mlx4_dev *dev)
 {
-	u32 status = readl(mlx4_priv(dev)->cmd.hcr + HCR_STATUS_OFFSET);
+	u32 status;
+
+	if (pci_channel_offline(dev->pdev))
+		return -EIO;
+
+	status = readl(mlx4_priv(dev)->cmd.hcr + HCR_STATUS_OFFSET);
 
 	return (status & swab32(1 << HCR_GO_BIT)) ||
 		(mlx4_priv(dev)->cmd.toggle ==
@@ -314,11 +319,29 @@
 
 	mutex_lock(&cmd->hcr_mutex);
 
+	if (pci_channel_offline(dev->pdev)) {
+		/*
+		 * Device is going through error recovery
+		 * and cannot accept commands.
+		 */
+		ret = -EIO;
+		goto out;
+	}
+
 	end = jiffies;
 	if (event)
 		end += msecs_to_jiffies(GO_BIT_TIMEOUT_MSECS);
 
 	while (cmd_pending(dev)) {
+		if (pci_channel_offline(dev->pdev)) {
+			/*
+			 * Device is going through error recovery
+			 * and cannot accept commands.
+			 */
+			ret = -EIO;
+			goto out;
+		}
+
 		if (time_after_eq(jiffies, end)) {
 			mlx4_err(dev, "%s:cmd_pending failed\n", __func__);
 			goto out;
@@ -431,14 +454,33 @@
 
 	down(&priv->cmd.poll_sem);
 
+	if (pci_channel_offline(dev->pdev)) {
+		/*
+		 * Device is going through error recovery
+		 * and cannot accept commands.
+		 */
+		err = -EIO;
+		goto out;
+	}
+
 	err = mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0,
 			    in_modifier, op_modifier, op, CMD_POLL_TOKEN, 0);
 	if (err)
 		goto out;
 
 	end = msecs_to_jiffies(timeout) + jiffies;
-	while (cmd_pending(dev) && time_before(jiffies, end))
+	while (cmd_pending(dev) && time_before(jiffies, end)) {
+		if (pci_channel_offline(dev->pdev)) {
+			/*
+			 * Device is going through error recovery
+			 * and cannot accept commands.
+			 */
+			err = -EIO;
+			goto out;
+		}
+
 		cond_resched();
+	}
 
 	if (cmd_pending(dev)) {
 		err = -ETIMEDOUT;
@@ -532,6 +574,9 @@
 	       int out_is_imm, u32 in_modifier, u8 op_modifier,
 	       u16 op, unsigned long timeout, int native)
 {
+	if (pci_channel_offline(dev->pdev))
+		return -EIO;
+
 	if (!mlx4_is_mfunc(dev) || (native && mlx4_is_master(dev))) {
 		if (mlx4_priv(dev)->cmd.use_events)
 			return mlx4_cmd_wait(dev, in_param, out_param,