bnx2: Add prefetches to rx path.

Add prefetches of the skb and the next rx descriptor to speed up the rx path.

Use prefetchw() for the skb [suggested by Eric Dumazet].

The rx descriptor is in skb->data, which is mapped for streaming-mode DMA.
Eric Dumazet pointed out that we should not prefetch the data before
dma_sync.  So we prefetch the next rx descriptor only if dma_sync is a
no-op on the system (i.e. the DMA ops have no sync_single_for_cpu callback).

Signed-off-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index 320526b..667f419 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -2719,6 +2719,7 @@
 	}
 
 	rx_buf->skb = skb;
+	rx_buf->desc = (struct l2_fhdr *) skb->data;
 	dma_unmap_addr_set(rx_buf, mapping, mapping);
 
 	rxbd->rx_bd_haddr_hi = (u64) mapping >> 32;
@@ -2941,6 +2942,7 @@
 	rxr->rx_prod_bseq += bp->rx_buf_use_size;
 
 	prod_rx_buf->skb = skb;
+	prod_rx_buf->desc = (struct l2_fhdr *) skb->data;
 
 	if (cons == prod)
 		return;
@@ -3074,6 +3076,7 @@
 	u16 hw_cons, sw_cons, sw_ring_cons, sw_prod, sw_ring_prod;
 	struct l2_fhdr *rx_hdr;
 	int rx_pkt = 0, pg_ring_used = 0;
+	struct pci_dev *pdev = bp->pdev;
 
 	hw_cons = bnx2_get_hw_rx_cons(bnapi);
 	sw_cons = rxr->rx_cons;
@@ -3086,7 +3089,7 @@
 	while (sw_cons != hw_cons) {
 		unsigned int len, hdr_len;
 		u32 status;
-		struct sw_bd *rx_buf;
+		struct sw_bd *rx_buf, *next_rx_buf;
 		struct sk_buff *skb;
 		dma_addr_t dma_addr;
 		u16 vtag = 0;
@@ -3097,7 +3100,14 @@
 
 		rx_buf = &rxr->rx_buf_ring[sw_ring_cons];
 		skb = rx_buf->skb;
+		prefetchw(skb);
 
+		if (!get_dma_ops(&pdev->dev)->sync_single_for_cpu) {
+			next_rx_buf =
+				&rxr->rx_buf_ring[
+					RX_RING_IDX(NEXT_RX_BD(sw_cons))];
+			prefetch(next_rx_buf->desc);
+		}
 		rx_buf->skb = NULL;
 
 		dma_addr = dma_unmap_addr(rx_buf, mapping);
@@ -3106,7 +3116,7 @@
 			BNX2_RX_OFFSET + BNX2_RX_COPY_THRESH,
 			PCI_DMA_FROMDEVICE);
 
-		rx_hdr = (struct l2_fhdr *) skb->data;
+		rx_hdr = rx_buf->desc;
 		len = rx_hdr->l2_fhdr_pkt_len;
 		status = rx_hdr->l2_fhdr_status;
 
@@ -5764,7 +5774,7 @@
 	rx_buf = &rxr->rx_buf_ring[rx_start_idx];
 	rx_skb = rx_buf->skb;
 
-	rx_hdr = (struct l2_fhdr *) rx_skb->data;
+	rx_hdr = rx_buf->desc;
 	skb_reserve(rx_skb, BNX2_RX_OFFSET);
 
 	pci_dma_sync_single_for_cpu(bp->pdev,