e1000e: 82579 potential system hang on stress when ME enabled Previously, a workaround was added to address a hardware bug in the PCIm2PCI arbiter where a write by the driver of the Transmit/Receive Descriptor Tail register could happen concurrently with a write of any MAC CSR register by the Manageability Engine (ME) which could cause the Tail register to have an incorrect value. The arbiter is supposed to prevent the concurrent writes but there is a bug that can cause the Host (driver) access to be acknowledged later than it should. After further investigation, it was discovered that a driver write access of any MAC CSR register after being idle for some time can be lost when ME is accessing a MAC CSR register. When this happens, no further target access is claimed by the MAC which could hang the system. The workaround to check bit 24 in the FWSM register (set only when ME is accessing a MAC CSR register) and delay for a limited amount of time until it is cleared is now done for all driver writes of MAC CSR registers on 82579 with ME enabled. In the rare case when the driver is writing the Tail register and ME is accessing any MAC CSR register for a duration longer than the maximum delay, write the register and verify it has the correct value before continuing, otherwise reset the device. This patch also moves some pre-existing macros from the hardware-specific header file to the more appropriate generic driver header file. Signed-off-by: Bruce Allan <bruce.w.allan@intel.com> Tested-by: Jeff Pieper <jeffrey.e.pieper@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>

commit: bdc125f73f3c810754e858b942d54faf4ba6bffe [log] [tgz]
author: Bruce Allan <bruce.w.allan@intel.com> Tue Mar 20 03:47:52 2012 +0000
committer: Jeff Kirsher <jeffrey.t.kirsher@intel.com> Fri Apr 27 02:25:13 2012 -0700
tree: ad071aff2b90fc56efcb9552bc503e515a7dff42
parent: 36ceeb43cecaf98a488b94bb318a1f3dd5a87033 [diff] [blame]
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index 851f793..cdfb1d6 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c

@@ -538,43 +538,15 @@
 	adapter->hw_csum_good++;
 }
 
-/**
- * e1000e_update_tail_wa - helper function for e1000e_update_[rt]dt_wa()
- * @hw: pointer to the HW structure
- * @tail: address of tail descriptor register
- * @i: value to write to tail descriptor register
- *
- * When updating the tail register, the ME could be accessing Host CSR
- * registers at the same time.  Normally, this is handled in h/w by an
- * arbiter but on some parts there is a bug that acknowledges Host accesses
- * later than it should which could result in the descriptor register to
- * have an incorrect value.  Workaround this by checking the FWSM register
- * which has bit 24 set while ME is accessing Host CSR registers, wait
- * if it is set and try again a number of times.
- **/
-static inline s32 e1000e_update_tail_wa(struct e1000_hw *hw, void __iomem *tail,
-					unsigned int i)
-{
-	unsigned int j = 0;
-
-	while ((j++ < E1000_ICH_FWSM_PCIM2PCI_COUNT) &&
-	       (er32(FWSM) & E1000_ICH_FWSM_PCIM2PCI))
-		udelay(50);
-
-	writel(i, tail);
-
-	if ((j == E1000_ICH_FWSM_PCIM2PCI_COUNT) && (i != readl(tail)))
-		return E1000_ERR_SWFW_SYNC;
-
-	return 0;
-}
-
 static void e1000e_update_rdt_wa(struct e1000_ring *rx_ring, unsigned int i)
 {
 	struct e1000_adapter *adapter = rx_ring->adapter;
 	struct e1000_hw *hw = &adapter->hw;
+	s32 ret_val = __ew32_prepare(hw);
 
-	if (e1000e_update_tail_wa(hw, rx_ring->tail, i)) {
+	writel(i, rx_ring->tail);
+
+	if (unlikely(!ret_val && (i != readl(rx_ring->tail)))) {
 		u32 rctl = er32(RCTL);
 		ew32(RCTL, rctl & ~E1000_RCTL_EN);
 		e_err("ME firmware caused invalid RDT - resetting\n");
@@ -586,8 +558,11 @@
 {
 	struct e1000_adapter *adapter = tx_ring->adapter;
 	struct e1000_hw *hw = &adapter->hw;
+	s32 ret_val = __ew32_prepare(hw);
 
-	if (e1000e_update_tail_wa(hw, tx_ring->tail, i)) {
+	writel(i, tx_ring->tail);
+
+	if (unlikely(!ret_val && (i != readl(tx_ring->tail)))) {
 		u32 tctl = er32(TCTL);
 		ew32(TCTL, tctl & ~E1000_TCTL_EN);
 		e_err("ME firmware caused invalid TDT - resetting\n");
@@ -1646,7 +1621,10 @@
 	adapter->flags2 &= ~FLAG2_IS_DISCARDING;
 
 	writel(0, rx_ring->head);
-	writel(0, rx_ring->tail);
+	if (rx_ring->adapter->flags2 & FLAG2_PCIM2PCI_ARBITER_WA)
+		e1000e_update_rdt_wa(rx_ring, 0);
+	else
+		writel(0, rx_ring->tail);
 }
 
 static void e1000e_downshift_workaround(struct work_struct *work)
@@ -2319,7 +2297,10 @@
 	tx_ring->next_to_clean = 0;
 
 	writel(0, tx_ring->head);
-	writel(0, tx_ring->tail);
+	if (tx_ring->adapter->flags2 & FLAG2_PCIM2PCI_ARBITER_WA)
+		e1000e_update_tdt_wa(tx_ring, 0);
+	else
+		writel(0, tx_ring->tail);
 }
 
 /**
commit	bdc125f73f3c810754e858b942d54faf4ba6bffe	[log] [tgz]
author	Bruce Allan <bruce.w.allan@intel.com>	Tue Mar 20 03:47:52 2012 +0000
committer	Jeff Kirsher <jeffrey.t.kirsher@intel.com>	Fri Apr 27 02:25:13 2012 -0700
tree	ad071aff2b90fc56efcb9552bc503e515a7dff42
parent	36ceeb43cecaf98a488b94bb318a1f3dd5a87033 [diff] [blame]