[BNX2]: Fix tx race condition.

Fix a subtle race condition between bnx2_start_xmit() and bnx2_tx_int()
similar to the one in tg3 discovered by Herbert Xu:

CPU0					CPU1
bnx2_start_xmit()
	if (tx_ring_full) {
		tx_lock
					bnx2_tx_int()
						if (!netif_queue_stopped)
		netif_stop_queue()
		if (!tx_ring_full)
						update_tx_ring
			netif_wake_queue()
		tx_unlock
	}

Even though the tx_ring update comes before the netif_queue_stopped() check
in bnx2_tx_int() in program order, the CPU can reorder the two as shown
above.  This scenario can cause the tx queue to be stopped forever if
bnx2_tx_int() has just freed up the entire tx_ring.  The chance of this
happening should be very rare, though.
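
In code form, the pre-patch pattern looks roughly like this (condensed from
the removed lines in the patch below; ring bookkeeping and unrelated code
omitted).  Nothing orders the tx_cons store in bnx2_tx_int() against its
read of the queue state, and nothing orders the netif_stop_queue() store in
bnx2_start_xmit() against its re-read of the ring indexes, which is what
allows the interleaving above:

	/* bnx2_start_xmit(), before the fix */
	if (unlikely(bnx2_tx_avail(bp) <= MAX_SKB_FRAGS)) {
		spin_lock(&bp->tx_lock);
		netif_stop_queue(dev);
		/* This re-check can read a stale tx_cons, so a ring that
		 * bnx2_tx_int() just emptied still looks full here.
		 */
		if (bnx2_tx_avail(bp) > MAX_SKB_FRAGS)
			netif_wake_queue(dev);
		spin_unlock(&bp->tx_lock);
	}

	/* bnx2_tx_int(), before the fix */
	bp->tx_cons = sw_cons;
	/* The store above and the load below have no ordering guarantee
	 * between them; the CPU may effectively perform the load first,
	 * see the queue as not (yet) stopped, and skip the wakeup.
	 */
	if (unlikely(netif_queue_stopped(bp->dev))) {
		spin_lock(&bp->tx_lock);
		if (netif_queue_stopped(bp->dev) &&
		    (bnx2_tx_avail(bp) > MAX_SKB_FRAGS))
			netif_wake_queue(bp->dev);
		spin_unlock(&bp->tx_lock);
	}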

The following changes are made, very much identical to the tg3 fix:

1. Add memory barrier to fix the above race condition.

2. Eliminate the private tx_lock altogether and rely solely on
netif_tx_lock.  This removes one lock/unlock cycle in bnx2_start_xmit()
when the ring is full.

3. Because of 2, use netif_tx_lock in bnx2_tx_int() before calling
netif_wake_queue().

4. Add memory barrier to bnx2_tx_avail().

5. Add bp->tx_wake_thresh, which is set to half the tx ring size.

6. Check the full wake-queue condition (queue stopped and ring space above
the threshold) before taking netif_tx_lock in bnx2_tx_int().  This reduces
the number of unnecessary lock acquisitions when the tx ring is full in a
steady-state condition.  A condensed sketch of the combined stop/wake
pattern follows this list.
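
Condensed from the hunks below, the resulting stop/wake pattern is shown
here.  The smp_mb() added after the bp->tx_cons update pairs with the
smp_mb() now at the top of bnx2_tx_avail(), which bnx2_start_xmit() calls
again right after netif_stop_queue(); at least one side is therefore
guaranteed to observe the other's update, so the wakeup can no longer be
lost:

	/* bnx2_tx_int(), after the fix */
	bp->tx_cons = sw_cons;
	/* Publish tx_cons before testing the queue state. */
	smp_mb();
	if (unlikely(netif_queue_stopped(bp->dev)) &&
	    (bnx2_tx_avail(bp) > bp->tx_wake_thresh)) {
		netif_tx_lock(bp->dev);
		if (netif_queue_stopped(bp->dev) &&
		    (bnx2_tx_avail(bp) > bp->tx_wake_thresh))
			netif_wake_queue(bp->dev);
		netif_tx_unlock(bp->dev);
	}

	/* bnx2_start_xmit(), after the fix; already runs under netif_tx_lock */
	if (unlikely(bnx2_tx_avail(bp) <= MAX_SKB_FRAGS)) {
		netif_stop_queue(dev);
		/* bnx2_tx_avail() begins with smp_mb(), so this re-check is
		 * ordered after the queue is marked stopped and sees any
		 * tx_cons update made by bnx2_tx_int() in the meantime.
		 */
		if (bnx2_tx_avail(bp) > bp->tx_wake_thresh)
			netif_wake_queue(dev);
	}

Stopping the queue as soon as fewer than MAX_SKB_FRAGS descriptors remain
but waking it only once more than half the ring is free gives hysteresis,
which is what keeps the lock out of the fast path when the ring stays full.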

Signed-off-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index db73de0..2099edb 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -209,8 +209,10 @@
 
 static inline u32 bnx2_tx_avail(struct bnx2 *bp)
 {
-	u32 diff = TX_RING_IDX(bp->tx_prod) - TX_RING_IDX(bp->tx_cons);
+	u32 diff;
 
+	smp_mb();
+	diff = TX_RING_IDX(bp->tx_prod) - TX_RING_IDX(bp->tx_cons);
 	if (diff > MAX_TX_DESC_CNT)
 		diff = (diff & MAX_TX_DESC_CNT) - 1;
 	return (bp->tx_ring_size - diff);
@@ -1686,15 +1688,20 @@
 	}
 
 	bp->tx_cons = sw_cons;
+	/* Need to make the tx_cons update visible to bnx2_start_xmit()
+	 * before checking for netif_queue_stopped().  Without the
+	 * memory barrier, there is a small possibility that bnx2_start_xmit()
+	 * will miss it and cause the queue to be stopped forever.
+	 */
+	smp_mb();
 
-	if (unlikely(netif_queue_stopped(bp->dev))) {
-		spin_lock(&bp->tx_lock);
+	if (unlikely(netif_queue_stopped(bp->dev)) &&
+		     (bnx2_tx_avail(bp) > bp->tx_wake_thresh)) {
+		netif_tx_lock(bp->dev);
 		if ((netif_queue_stopped(bp->dev)) &&
-		    (bnx2_tx_avail(bp) > MAX_SKB_FRAGS)) {
-
+		    (bnx2_tx_avail(bp) > bp->tx_wake_thresh))
 			netif_wake_queue(bp->dev);
-		}
-		spin_unlock(&bp->tx_lock);
+		netif_tx_unlock(bp->dev);
 	}
 }
 
@@ -3503,6 +3510,8 @@
 	struct tx_bd *txbd;
 	u32 val;
 
+	bp->tx_wake_thresh = bp->tx_ring_size / 2;
+
 	txbd = &bp->tx_desc_ring[MAX_TX_DESC_CNT];
 		
 	txbd->tx_bd_haddr_hi = (u64) bp->tx_desc_mapping >> 32;
@@ -4390,10 +4399,8 @@
 #endif
 
 /* Called with netif_tx_lock.
- * hard_start_xmit is pseudo-lockless - a lock is only required when
- * the tx queue is full. This way, we get the benefit of lockless
- * operations most of the time without the complexities to handle
- * netif_stop_queue/wake_queue race conditions.
+ * bnx2_tx_int() runs without netif_tx_lock unless it needs to call
+ * netif_wake_queue().
  */
 static int
 bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev)
@@ -4512,12 +4519,9 @@
 	dev->trans_start = jiffies;
 
 	if (unlikely(bnx2_tx_avail(bp) <= MAX_SKB_FRAGS)) {
-		spin_lock(&bp->tx_lock);
 		netif_stop_queue(dev);
-		
-		if (bnx2_tx_avail(bp) > MAX_SKB_FRAGS)
+		if (bnx2_tx_avail(bp) > bp->tx_wake_thresh)
 			netif_wake_queue(dev);
-		spin_unlock(&bp->tx_lock);
 	}
 
 	return NETDEV_TX_OK;
@@ -5628,7 +5632,6 @@
 	bp->pdev = pdev;
 
 	spin_lock_init(&bp->phy_lock);
-	spin_lock_init(&bp->tx_lock);
 	INIT_WORK(&bp->reset_task, bnx2_reset_task, bp);
 
 	dev->base_addr = dev->mem_start = pci_resource_start(pdev, 0);