[SCSI] mpt2sas: driver fails to recover from injected PCIe bus errors

fixes surrounding PCIe enhanced error handling:

(1) We need to reject all request generated internaly inside the driver as well
as request arriving from the scsi mid layer when PCIe EEH is active. The fix is
to add a per adapter flag called pci_error_recovery which is checked thru out
the driver when request are generated.

(2) We don't need to call the pci_driver->remove directly from the PCIe
callbacks becuase its already called from the PCIe EEH code. In its place we are
shutting down the watchdog timer, and flushing back all pending IO.

(3) We need to save and restore the pci state across PCIe EEH handling.

Signed-off-by: Eric Moore <eric.moore@lsi.com>
Signed-off-by: James Bottomley <James.Bottomley@suse.de>
diff --git a/drivers/scsi/mpt2sas/mpt2sas_base.c b/drivers/scsi/mpt2sas/mpt2sas_base.c
index 1f22a76..57bcd5c 100644
--- a/drivers/scsi/mpt2sas/mpt2sas_base.c
+++ b/drivers/scsi/mpt2sas/mpt2sas_base.c
@@ -1311,6 +1311,9 @@
 	printk(MPT2SAS_INFO_FMT "ioport(0x%016llx), size(%d)\n",
 	    ioc->name, (unsigned long long)pio_chip, pio_sz);
 
+	/* Save PCI configuration state for recovery from PCI AER/EEH errors */
+	pci_save_state(pdev);
+
 	return 0;
 
  out_fail:
@@ -3407,6 +3410,9 @@
 	dinitprintk(ioc, printk(MPT2SAS_INFO_FMT "%s\n", ioc->name,
 	    __func__));
 
+	if (ioc->pci_error_recovery)
+		return 0;
+
 	ioc_state = mpt2sas_base_get_iocstate(ioc, 0);
 	dhsprintk(ioc, printk(MPT2SAS_INFO_FMT "%s: ioc_state(0x%08x)\n",
 	    ioc->name, __func__, ioc_state));
@@ -3869,6 +3875,13 @@
 	dtmprintk(ioc, printk(MPT2SAS_INFO_FMT "%s: enter\n", ioc->name,
 	    __func__));
 
+	if (ioc->pci_error_recovery) {
+		printk(MPT2SAS_ERR_FMT "%s: pci error recovery reset\n",
+		    ioc->name, __func__);
+		r = 0;
+		goto out;
+	}
+
 	if (mpt2sas_fwfault_debug)
 		mpt2sas_halt_firmware(ioc);
 
diff --git a/drivers/scsi/mpt2sas/mpt2sas_base.h b/drivers/scsi/mpt2sas/mpt2sas_base.h
index 0b0ef5e..0ebef0c 100644
--- a/drivers/scsi/mpt2sas/mpt2sas_base.h
+++ b/drivers/scsi/mpt2sas/mpt2sas_base.h
@@ -477,6 +477,7 @@
  * @ioc_link_reset_in_progress: phy/hard reset in progress
  * @ignore_loginfos: ignore loginfos during task managment
  * @remove_host: flag for when driver unloads, to avoid sending dev resets
+ * @pci_error_recovery: flag to prevent ioc access until slot reset completes
  * @wait_for_port_enable_to_complete:
  * @msix_enable: flag indicating msix is enabled
  * @msix_vector_count: number msix vectors
@@ -617,6 +618,7 @@
 
 	u8		ignore_loginfos;
 	u8		remove_host;
+	u8		pci_error_recovery;
 	u8		wait_for_port_enable_to_complete;
 
 	u8		msix_enable;
diff --git a/drivers/scsi/mpt2sas/mpt2sas_config.c b/drivers/scsi/mpt2sas/mpt2sas_config.c
index e26f920..6afd67b 100644
--- a/drivers/scsi/mpt2sas/mpt2sas_config.c
+++ b/drivers/scsi/mpt2sas/mpt2sas_config.c
@@ -401,7 +401,7 @@
 		if (ioc->config_cmds.smid == smid)
 			mpt2sas_base_free_smid(ioc, smid);
 		if ((ioc->shost_recovery) || (ioc->config_cmds.status &
-		    MPT2_CMD_RESET))
+		    MPT2_CMD_RESET) || ioc->pci_error_recovery)
 			goto retry_config;
 		issue_host_reset = 1;
 		r = -EFAULT;
diff --git a/drivers/scsi/mpt2sas/mpt2sas_ctl.c b/drivers/scsi/mpt2sas/mpt2sas_ctl.c
index 55ac1cb..b774973 100644
--- a/drivers/scsi/mpt2sas/mpt2sas_ctl.c
+++ b/drivers/scsi/mpt2sas/mpt2sas_ctl.c
@@ -2156,7 +2156,7 @@
 		    !ioc)
 			return -ENODEV;
 
-		if (ioc->shost_recovery)
+		if (ioc->shost_recovery || ioc->pci_error_recovery)
 			return -EAGAIN;
 
 		if (_IOC_SIZE(cmd) == sizeof(struct mpt2_ioctl_command)) {
@@ -2275,7 +2275,7 @@
 	if (_ctl_verify_adapter(karg32.hdr.ioc_number, &ioc) == -1 || !ioc)
 		return -ENODEV;
 
-	if (ioc->shost_recovery)
+	if (ioc->shost_recovery || ioc->pci_error_recovery)
 		return -EAGAIN;
 
 	memset(&karg, 0, sizeof(struct mpt2_ioctl_command));
diff --git a/drivers/scsi/mpt2sas/mpt2sas_scsih.c b/drivers/scsi/mpt2sas/mpt2sas_scsih.c
index 854cc91..6273abd 100644
--- a/drivers/scsi/mpt2sas/mpt2sas_scsih.c
+++ b/drivers/scsi/mpt2sas/mpt2sas_scsih.c
@@ -1997,7 +1997,8 @@
 		goto err_out;
 	}
 
-	if (ioc->shost_recovery || ioc->remove_host) {
+	if (ioc->shost_recovery || ioc->remove_host ||
+	    ioc->pci_error_recovery) {
 		printk(MPT2SAS_INFO_FMT "%s: host reset in progress!\n",
 		    __func__, ioc->name);
 		rc = FAILED;
@@ -2644,7 +2645,8 @@
 	unsigned long flags;
 	struct _tr_list *delayed_tr;
 
-	if (ioc->shost_recovery || ioc->remove_host) {
+	if (ioc->shost_recovery || ioc->remove_host ||
+	    ioc->pci_error_recovery) {
 		dewtprintk(ioc, printk(MPT2SAS_INFO_FMT "%s: host reset in "
 		   "progress!\n", __func__, ioc->name));
 		return;
@@ -2742,7 +2744,8 @@
 	u16 smid;
 	struct _tr_list *delayed_tr;
 
-	if (ioc->shost_recovery || ioc->remove_host) {
+	if (ioc->shost_recovery || ioc->remove_host ||
+	    ioc->pci_error_recovery) {
 		dewtprintk(ioc, printk(MPT2SAS_INFO_FMT "%s: host reset in "
 		   "progress!\n", __func__, ioc->name));
 		return;
@@ -2793,7 +2796,8 @@
 	Mpi2SCSITaskManagementReply_t *mpi_reply =
 	    mpt2sas_base_get_reply_virt_addr(ioc, reply);
 
-	if (ioc->shost_recovery || ioc->remove_host) {
+	if (ioc->shost_recovery || ioc->remove_host ||
+	    ioc->pci_error_recovery) {
 		dewtprintk(ioc, printk(MPT2SAS_INFO_FMT "%s: host reset in "
 		   "progress!\n", __func__, ioc->name));
 		return 1;
@@ -2845,7 +2849,8 @@
 	Mpi2SasIoUnitControlRequest_t *mpi_request;
 	u16 smid_sas_ctrl;
 
-	if (ioc->shost_recovery || ioc->remove_host) {
+	if (ioc->shost_recovery || ioc->remove_host ||
+	    ioc->pci_error_recovery) {
 		dewtprintk(ioc, printk(MPT2SAS_INFO_FMT "%s: host reset in "
 		   "progress!\n", __func__, ioc->name));
 		return 1;
@@ -3187,7 +3192,10 @@
 		count++;
 		mpt2sas_base_free_smid(ioc, smid);
 		scsi_dma_unmap(scmd);
-		scmd->result = DID_RESET << 16;
+		if (ioc->pci_error_recovery)
+			scmd->result = DID_NO_CONNECT << 16;
+		else
+			scmd->result = DID_RESET << 16;
 		scmd->scsi_done(scmd);
 	}
 	dtmprintk(ioc, printk(MPT2SAS_INFO_FMT "completing %d cmds\n",
@@ -3324,6 +3332,12 @@
 		return 0;
 	}
 
+	if (ioc->pci_error_recovery) {
+		scmd->result = DID_NO_CONNECT << 16;
+		scmd->scsi_done(scmd);
+		return 0;
+	}
+
 	sas_target_priv_data = sas_device_priv_data->sas_target;
 	/* invalid device handle */
 	if (sas_target_priv_data->handle == MPT2SAS_INVALID_DEVICE_HANDLE) {
@@ -4156,7 +4170,7 @@
 	if (!handle)
 		return -1;
 
-	if (ioc->shost_recovery)
+	if (ioc->shost_recovery || ioc->pci_error_recovery)
 		return -1;
 
 	if ((mpt2sas_config_get_expander_pg0(ioc, &mpi_reply, &expander_pg0,
@@ -4734,7 +4748,7 @@
 		_scsih_sas_topology_change_event_debug(ioc, event_data);
 #endif
 
-	if (ioc->shost_recovery || ioc->remove_host)
+	if (ioc->shost_recovery || ioc->remove_host || ioc->pci_error_recovery)
 		return;
 
 	if (!ioc->sas_hba.num_phys)
@@ -4773,7 +4787,8 @@
 			    "expander event\n", ioc->name));
 			return;
 		}
-		if (ioc->shost_recovery || ioc->remove_host)
+		if (ioc->shost_recovery || ioc->remove_host ||
+		    ioc->pci_error_recovery)
 			return;
 		phy_number = event_data->StartPhyNum + i;
 		reason_code = event_data->PHY[i].PhyStatus &
@@ -6273,7 +6288,8 @@
 	struct MPT2SAS_ADAPTER *ioc = fw_event->ioc;
 
 	/* the queue is being flushed so ignore this event */
-	if (ioc->remove_host || fw_event->cancel_pending_work) {
+	if (ioc->remove_host || fw_event->cancel_pending_work ||
+	    ioc->pci_error_recovery) {
 		_scsih_fw_event_free(ioc, fw_event);
 		return;
 	}
@@ -6355,7 +6371,7 @@
 	u16 sz;
 
 	/* events turned off due to host reset or driver unloading */
-	if (ioc->remove_host)
+	if (ioc->remove_host || ioc->pci_error_recovery)
 		return 1;
 
 	mpi_reply = mpt2sas_base_get_reply_virt_addr(ioc, reply);
@@ -7058,12 +7074,17 @@
 	case pci_channel_io_normal:
 		return PCI_ERS_RESULT_CAN_RECOVER;
 	case pci_channel_io_frozen:
+		/* Fatal error, prepare for slot reset */
+		ioc->pci_error_recovery = 1;
 		scsi_block_requests(ioc->shost);
 		mpt2sas_base_stop_watchdog(ioc);
 		mpt2sas_base_free_resources(ioc);
 		return PCI_ERS_RESULT_NEED_RESET;
 	case pci_channel_io_perm_failure:
-		_scsih_remove(pdev);
+		/* Permanent error, prepare for device removal */
+		ioc->pci_error_recovery = 1;
+		mpt2sas_base_stop_watchdog(ioc);
+		_scsih_flush_running_cmds(ioc);
 		return PCI_ERS_RESULT_DISCONNECT;
 	}
 	return PCI_ERS_RESULT_NEED_RESET;
@@ -7087,7 +7108,9 @@
 	printk(MPT2SAS_INFO_FMT "PCI error: slot reset callback!!\n",
 		ioc->name);
 
+	ioc->pci_error_recovery = 0;
 	ioc->pdev = pdev;
+	pci_restore_state(pdev);
 	rc = mpt2sas_base_map_resources(ioc);
 	if (rc)
 		return PCI_ERS_RESULT_DISCONNECT;
diff --git a/drivers/scsi/mpt2sas/mpt2sas_transport.c b/drivers/scsi/mpt2sas/mpt2sas_transport.c
index f29ea5e..b55c6dc 100644
--- a/drivers/scsi/mpt2sas/mpt2sas_transport.c
+++ b/drivers/scsi/mpt2sas/mpt2sas_transport.c
@@ -140,7 +140,7 @@
 	u32 device_info;
 	u32 ioc_status;
 
-	if (ioc->shost_recovery) {
+	if (ioc->shost_recovery || ioc->pci_error_recovery) {
 		printk(MPT2SAS_INFO_FMT "%s: host reset in progress!\n",
 		    __func__, ioc->name);
 		return -EFAULT;
@@ -302,7 +302,7 @@
 	u64 *sas_address_le;
 	u16 wait_state_count;
 
-	if (ioc->shost_recovery) {
+	if (ioc->shost_recovery || ioc->pci_error_recovery) {
 		printk(MPT2SAS_INFO_FMT "%s: host reset in progress!\n",
 		    __func__, ioc->name);
 		return -EFAULT;
@@ -894,7 +894,7 @@
 	struct _sas_node *sas_node;
 	struct _sas_phy *mpt2sas_phy;
 
-	if (ioc->shost_recovery)
+	if (ioc->shost_recovery || ioc->pci_error_recovery)
 		return;
 
 	spin_lock_irqsave(&ioc->sas_node_lock, flags);
@@ -997,7 +997,7 @@
 	u64 *sas_address_le;
 	u16 wait_state_count;
 
-	if (ioc->shost_recovery) {
+	if (ioc->shost_recovery || ioc->pci_error_recovery) {
 		printk(MPT2SAS_INFO_FMT "%s: host reset in progress!\n",
 		    __func__, ioc->name);
 		return -EFAULT;