[PATCH] spidernet: performance optimizations

Performance optimizations, changes in these areas:
  - RX and TX checksum offload
  - correct maximum MTU
  - don't use TX interrupts anymore, use a timer instead
  - remove some superfluous barriers
  - improve RX RAM full handling

From: Utz Bacher <utz.bacher@de.ibm.com>
Signed-off-by: Jens Osterkamp <jens.osterkamp@de.ibm.com>
Signed-off-by: Arnd Bergmann <arndb@de.ibm.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
diff --git a/drivers/net/spider_net.c b/drivers/net/spider_net.c
index 8696919..e2ad9ae 100644
--- a/drivers/net/spider_net.c
+++ b/drivers/net/spider_net.c
@@ -22,7 +22,6 @@
  */
 
 #include <linux/config.h>
-
 #include <linux/compiler.h>
 #include <linux/crc32.h>
 #include <linux/delay.h>
@@ -43,6 +42,7 @@
 #include <linux/slab.h>
 #include <linux/tcp.h>
 #include <linux/types.h>
+#include <linux/vmalloc.h>
 #include <linux/wait.h>
 #include <linux/workqueue.h>
 #include <asm/bitops.h>
@@ -108,42 +108,6 @@
 	writel(value, card->regs + reg);
 }
 
-/**
- * spider_net_write_reg_sync - writes to an SMMIO register of a card
- * @card: device structure
- * @reg: register to write to
- * @value: value to write into the specified SMMIO register
- *
- * Unlike spider_net_write_reg, this will also make sure the
- * data arrives on the card by reading the reg again.
- */
-static void
-spider_net_write_reg_sync(struct spider_net_card *card, u32 reg, u32 value)
-{
-	value = cpu_to_le32(value);
-	writel(value, card->regs + reg);
-	(void)readl(card->regs + reg);
-}
-
-/**
- * spider_net_rx_irq_off - switch off rx irq on this spider card
- * @card: device structure
- *
- * switches off rx irq by masking them out in the GHIINTnMSK register
- */
-static void
-spider_net_rx_irq_off(struct spider_net_card *card)
-{
-	u32 regvalue;
-	unsigned long flags;
-
-	spin_lock_irqsave(&card->intmask_lock, flags);
-	regvalue = spider_net_read_reg(card, SPIDER_NET_GHIINT0MSK);
-	regvalue &= ~SPIDER_NET_RXINT;
-	spider_net_write_reg_sync(card, SPIDER_NET_GHIINT0MSK, regvalue);
-	spin_unlock_irqrestore(&card->intmask_lock, flags);
-}
-
 /** spider_net_write_phy - write to phy register
  * @netdev: adapter to be written to
  * @mii_id: id of MII
@@ -199,6 +163,21 @@
 }
 
 /**
+ * spider_net_rx_irq_off - switch off rx irq on this spider card
+ * @card: device structure
+ *
+ * switches off rx irq by masking them out in the GHIINTnMSK register
+ */
+static void
+spider_net_rx_irq_off(struct spider_net_card *card)
+{
+	/* the INT0 mask value with the rx interrupt bits cleared */
+	const u32 mask = SPIDER_NET_INT0_MASK_VALUE & ~SPIDER_NET_RXINT;
+
+	spider_net_write_reg(card, SPIDER_NET_GHIINT0MSK, mask);
+}
+
+/**
  * spider_net_rx_irq_on - switch on rx irq on this spider card
  * @card: device structure
  *
@@ -208,51 +187,9 @@
 spider_net_rx_irq_on(struct spider_net_card *card)
 {
 	u32 regvalue;
-	unsigned long flags;
 
-	spin_lock_irqsave(&card->intmask_lock, flags);
-	regvalue = spider_net_read_reg(card, SPIDER_NET_GHIINT0MSK);
-	regvalue |= SPIDER_NET_RXINT;
-	spider_net_write_reg_sync(card, SPIDER_NET_GHIINT0MSK, regvalue);
-	spin_unlock_irqrestore(&card->intmask_lock, flags);
-}
-
-/**
- * spider_net_tx_irq_off - switch off tx irq on this spider card
- * @card: device structure
- *
- * switches off tx irq by masking them out in the GHIINTnMSK register
- */
-static void
-spider_net_tx_irq_off(struct spider_net_card *card)
-{
-	u32 regvalue;
-	unsigned long flags;
-
-	spin_lock_irqsave(&card->intmask_lock, flags);
-	regvalue = spider_net_read_reg(card, SPIDER_NET_GHIINT0MSK);
-	regvalue &= ~SPIDER_NET_TXINT;
-	spider_net_write_reg_sync(card, SPIDER_NET_GHIINT0MSK, regvalue);
-	spin_unlock_irqrestore(&card->intmask_lock, flags);
-}
-
-/**
- * spider_net_tx_irq_on - switch on tx irq on this spider card
- * @card: device structure
- *
- * switches on tx irq by enabling them in the GHIINTnMSK register
- */
-static void
-spider_net_tx_irq_on(struct spider_net_card *card)
-{
-	u32 regvalue;
-	unsigned long flags;
-
-	spin_lock_irqsave(&card->intmask_lock, flags);
-	regvalue = spider_net_read_reg(card, SPIDER_NET_GHIINT0MSK);
-	regvalue |= SPIDER_NET_TXINT;
-	spider_net_write_reg_sync(card, SPIDER_NET_GHIINT0MSK, regvalue);
-	spin_unlock_irqrestore(&card->intmask_lock, flags);
+	regvalue = SPIDER_NET_INT0_MASK_VALUE | SPIDER_NET_RXINT;
+	spider_net_write_reg(card, SPIDER_NET_GHIINT0MSK, regvalue);
 }
 
 /**
@@ -326,9 +263,8 @@
 spider_net_get_descr_status(struct spider_net_descr *descr)
 {
 	u32 cmd_status;
-	rmb();
+
 	cmd_status = descr->dmac_cmd_status;
-	rmb();
 	cmd_status >>= SPIDER_NET_DESCR_IND_PROC_SHIFT;
 	/* no need to mask out any bits, as cmd_status is 32 bits wide only
 	 * (and unsigned) */
@@ -349,7 +285,6 @@
 {
 	u32 cmd_status;
 	/* read the status */
-	mb();
 	cmd_status = descr->dmac_cmd_status;
 	/* clean the upper 4 bits */
 	cmd_status &= SPIDER_NET_DESCR_IND_PROC_MASKO;
@@ -357,7 +292,6 @@
 	cmd_status |= ((u32)status)<<SPIDER_NET_DESCR_IND_PROC_SHIFT;
 	/* and write it back */
 	descr->dmac_cmd_status = cmd_status;
-	wmb();
 }
 
 /**
@@ -398,8 +332,9 @@
 {
 	int i;
 	struct spider_net_descr *descr;
+	dma_addr_t buf;
 
-	spin_lock_init(&card->chain_lock);
+	atomic_set(&card->rx_chain_refill,0);
 
 	descr = start_descr;
 	memset(descr, 0, sizeof(*descr) * no);
@@ -408,14 +343,14 @@
 	for (i=0; i<no; i++, descr++) {
 		spider_net_set_descr_status(descr, SPIDER_NET_DESCR_NOT_IN_USE);
 
-		descr->bus_addr =
-			pci_map_single(card->pdev, descr,
-				       SPIDER_NET_DESCR_SIZE,
-				       PCI_DMA_BIDIRECTIONAL);
+		buf = pci_map_single(card->pdev, descr,
+				     SPIDER_NET_DESCR_SIZE,
+				     PCI_DMA_BIDIRECTIONAL);
 
-		if (descr->bus_addr == DMA_ERROR_CODE)
+		if (buf == DMA_ERROR_CODE)
 			goto iommu_error;
 
+		descr->bus_addr = buf;
 		descr->next = descr + 1;
 		descr->prev = descr - 1;
 
@@ -439,7 +374,8 @@
 	for (i=0; i < no; i++, descr++)
 		if (descr->bus_addr)
 			pci_unmap_single(card->pdev, descr->bus_addr,
-					 SPIDER_NET_DESCR_SIZE, PCI_DMA_BIDIRECTIONAL);
+					 SPIDER_NET_DESCR_SIZE,
+					 PCI_DMA_BIDIRECTIONAL);
 	return -ENOMEM;
 }
 
@@ -459,7 +395,7 @@
 		if (descr->skb) {
 			dev_kfree_skb(descr->skb);
 			pci_unmap_single(card->pdev, descr->buf_addr,
-					 SPIDER_NET_MAX_MTU,
+					 SPIDER_NET_MAX_FRAME,
 					 PCI_DMA_BIDIRECTIONAL);
 		}
 		descr = descr->next;
@@ -486,7 +422,7 @@
 	int bufsize;
 
 	/* we need to round up the buffer size to a multiple of 128 */
-	bufsize = (SPIDER_NET_MAX_MTU + SPIDER_NET_RXBUF_ALIGN - 1) &
+	bufsize = (SPIDER_NET_MAX_FRAME + SPIDER_NET_RXBUF_ALIGN - 1) &
 		(~(SPIDER_NET_RXBUF_ALIGN - 1));
 
 	/* and we need to have it 128 byte aligned, therefore we allocate a
@@ -494,10 +430,8 @@
 	/* allocate an skb */
 	descr->skb = dev_alloc_skb(bufsize + SPIDER_NET_RXBUF_ALIGN - 1);
 	if (!descr->skb) {
-		if (net_ratelimit())
-			if (netif_msg_rx_err(card))
-				pr_err("Not enough memory to allocate "
-					"rx buffer\n");
+		if (netif_msg_rx_err(card) && net_ratelimit())
+			pr_err("Not enough memory to allocate rx buffer\n");
 		return -ENOMEM;
 	}
 	descr->buf_size = bufsize;
@@ -512,12 +446,11 @@
 		skb_reserve(descr->skb, SPIDER_NET_RXBUF_ALIGN - offset);
 	/* io-mmu-map the skb */
 	buf = pci_map_single(card->pdev, descr->skb->data,
-					 SPIDER_NET_MAX_MTU,
-					 PCI_DMA_BIDIRECTIONAL);
+			     SPIDER_NET_MAX_FRAME, PCI_DMA_BIDIRECTIONAL);
 	descr->buf_addr = buf;
 	if (buf == DMA_ERROR_CODE) {
 		dev_kfree_skb_any(descr->skb);
-		if (netif_msg_rx_err(card))
+		if (netif_msg_rx_err(card) && net_ratelimit())
 			pr_err("Could not iommu-map rx buffer\n");
 		spider_net_set_descr_status(descr, SPIDER_NET_DESCR_NOT_IN_USE);
 	} else {
@@ -528,10 +461,10 @@
 }
 
 /**
- * spider_net_enable_rxctails - sets RX dmac chain tail addresses
+ * spider_net_enable_rxchtails - sets RX dmac chain tail addresses
  * @card: card structure
  *
- * spider_net_enable_rxctails sets the RX DMAC chain tail adresses in the
+ * spider_net_enable_rxchtails sets the RX DMAC chain tail addresses in the
  * chip by writing to the appropriate register. DMA is enabled in
  * spider_net_enable_rxdmac.
  */
@@ -553,6 +486,7 @@
 static void
 spider_net_enable_rxdmac(struct spider_net_card *card)
 {
+	wmb();	/* presumably so descriptor updates land before DMA starts */
 	spider_net_write_reg(card, SPIDER_NET_GDADMACCNTR,
 			     SPIDER_NET_DMA_RX_VALUE);
 }
@@ -561,32 +495,28 @@
  * spider_net_refill_rx_chain - refills descriptors/skbs in the rx chains
  * @card: card structure
  *
- * refills descriptors in all chains (last used chain first): allocates skbs
- * and iommu-maps them.
+ * refills descriptors in the rx chain: allocates skbs and iommu-maps them.
  */
 static void
 spider_net_refill_rx_chain(struct spider_net_card *card)
 {
 	struct spider_net_descr_chain *chain;
-	int count = 0;
-	unsigned long flags;
 
 	chain = &card->rx_chain;
 
-	spin_lock_irqsave(&card->chain_lock, flags);
-	while (spider_net_get_descr_status(chain->head) ==
-				SPIDER_NET_DESCR_NOT_IN_USE) {
-		if (spider_net_prepare_rx_descr(card, chain->head))
-			break;
-		count++;
-		chain->head = chain->head->next;
-	}
-	spin_unlock_irqrestore(&card->chain_lock, flags);
+	/* Only the context that raises the counter from 0 to 1 refills; a
+	 * concurrent caller skips the loop. That is ok: according to the
+	 * original author, NAPI processing calls us again and an interrupt
+	 * caller is followed by a NAPI cleanup anyway. */
+	if (atomic_inc_return(&card->rx_chain_refill) == 1)
+		while (spider_net_get_descr_status(chain->head) ==
+		       SPIDER_NET_DESCR_NOT_IN_USE) {
+			if (spider_net_prepare_rx_descr(card, chain->head))
+				break;
+			chain->head = chain->head->next;
+		}
 
-	/* could be optimized, only do that, if we know the DMA processing
-	 * has terminated */
-	if (count)
-		spider_net_enable_rxdmac(card);
+	atomic_dec(&card->rx_chain_refill);
 }
 
 /**
@@ -615,6 +545,7 @@
 	/* this will allocate the rest of the rx buffers; if not, it's
 	 * business as usual later on */
 	spider_net_refill_rx_chain(card);
+	spider_net_enable_rxdmac(card);
 	return 0;
 
 error:
@@ -651,24 +582,30 @@
  * @card: adapter structure
  * @brutal: if set, don't care about whether descriptor seems to be in use
  *
- * releases the tx descriptors that spider has finished with (if non-brutal)
- * or simply release tx descriptors (if brutal)
+ * returns 0 if the tx ring is empty, otherwise 1.
+ *
+ * spider_net_release_tx_chain releases the tx descriptors that spider has
+ * finished with (if non-brutal) or simply release tx descriptors (if brutal).
+ * If some other context is calling this function, we return 1 so that we're
+ * scheduled again (if we were scheduled) and will not lose initiative.
  */
-static void
+static int
 spider_net_release_tx_chain(struct spider_net_card *card, int brutal)
 {
 	struct spider_net_descr_chain *tx_chain = &card->tx_chain;
 	enum spider_net_descr_status status;
 
-	spider_net_tx_irq_off(card);
+	if (atomic_inc_return(&card->tx_chain_release) != 1) {
+		atomic_dec(&card->tx_chain_release);
+		return 1;
+	}
 
-	/* no lock for chain needed, if this is only executed once at a time */
-again:
 	for (;;) {
 		status = spider_net_get_descr_status(tx_chain->tail);
 		switch (status) {
 		case SPIDER_NET_DESCR_CARDOWNED:
-			if (!brutal) goto out;
+			if (!brutal)
+				goto out;
 			/* fallthrough, if we release the descriptors
 			 * brutally (then we don't care about
 			 * SPIDER_NET_DESCR_CARDOWNED) */
@@ -695,25 +632,30 @@
 		tx_chain->tail = tx_chain->tail->next;
 	}
 out:
+	atomic_dec(&card->tx_chain_release);
+
 	netif_wake_queue(card->netdev);
 
-	if (!brutal) {
-		/* switch on tx irqs (while we are still in the interrupt
-		 * handler, so we don't get an interrupt), check again
-		 * for done descriptors. This results in fewer interrupts */
-		spider_net_tx_irq_on(card);
-		status = spider_net_get_descr_status(tx_chain->tail);
-		switch (status) {
-			case SPIDER_NET_DESCR_RESPONSE_ERROR:
-			case SPIDER_NET_DESCR_PROTECTION_ERROR:
-			case SPIDER_NET_DESCR_FORCE_END:
-			case SPIDER_NET_DESCR_COMPLETE:
-				goto again;
-			default:
-				break;
-		}
-	}
+	if (status == SPIDER_NET_DESCR_CARDOWNED)
+		return 1;
+	return 0;
+}
 
+/**
+ * spider_net_cleanup_tx_ring - cleans up the TX ring
+ * @card: card structure
+ *
+ * spider_net_cleanup_tx_ring is called by the tx_timer (as we don't use
+ * interrupts to cleanup our TX ring) and returns sent packets to the stack
+ * by freeing them
+ */
+static void
+spider_net_cleanup_tx_ring(struct spider_net_card *card)
+{
+	if (!spider_net_release_tx_chain(card, 0))
+		return;	/* 0 means the tx ring is empty: no need to re-arm */
+	if (card->netdev->flags & IFF_UP)
+		mod_timer(&card->tx_timer, jiffies + SPIDER_NET_TX_TIMER);
 }
 
 /**
@@ -728,16 +670,22 @@
 static u8
 spider_net_get_multicast_hash(struct net_device *netdev, __u8 *addr)
 {
-	/* FIXME: an addr of 01:00:5e:00:00:01 must result in 0xa9,
-	 * ff:ff:ff:ff:ff:ff must result in 0xfd */
 	u32 crc;
 	u8 hash;
+	u8 addr_for_crc[ETH_ALEN] = { 0, };
+	int i, bit;
 
-	crc = crc32_be(~0, addr, netdev->addr_len);
+	/* feed the CRC the address with all of its bits in reversed order */
+	for (i = 0; i < ETH_ALEN * 8; i++) {
+		bit = (addr[i / 8] >> (i % 8)) & 1;
+		addr_for_crc[ETH_ALEN - 1 - i / 8] |= bit << (7 - (i % 8));
+	}
+
+	crc = crc32_be(~0, addr_for_crc, netdev->addr_len);
 
 	hash = (crc >> 27);
 	hash <<= 3;
 	hash |= crc & 7;
 
 	return hash;
 }
@@ -823,9 +771,11 @@
 {
 	struct spider_net_card *card = netdev_priv(netdev);
 
+	tasklet_kill(&card->rxram_full_tl);
 	netif_poll_disable(netdev);
 	netif_carrier_off(netdev);
 	netif_stop_queue(netdev);
+	del_timer_sync(&card->tx_timer);
 
 	/* disable/mask all interrupts */
 	spider_net_write_reg(card, SPIDER_NET_GHIINT0MSK, 0);
@@ -874,13 +824,15 @@
  * @skb: packet to consider
  *
  * fills out the command and status field of the descriptor structure,
- * depending on hardware checksum settings. This function assumes a wmb()
- * has executed before.
+ * depending on hardware checksum settings.
  */
 static void
 spider_net_set_txdescr_cmdstat(struct spider_net_descr *descr,
 			       struct sk_buff *skb)
 {
+	/* make sure the other fields in the descriptor are written */
+	wmb();
+
 	if (skb->ip_summed != CHECKSUM_HW) {
 		descr->dmac_cmd_status = SPIDER_NET_DMAC_CMDSTAT_NOCS;
 		return;
@@ -889,14 +841,13 @@
 	/* is packet ip?
 	 * if yes: tcp? udp? */
 	if (skb->protocol == htons(ETH_P_IP)) {
-		if (skb->nh.iph->protocol == IPPROTO_TCP) {
+		if (skb->nh.iph->protocol == IPPROTO_TCP)
 			descr->dmac_cmd_status = SPIDER_NET_DMAC_CMDSTAT_TCPCS;
-		} else if (skb->nh.iph->protocol == IPPROTO_UDP) {
+		else if (skb->nh.iph->protocol == IPPROTO_UDP)
 			descr->dmac_cmd_status = SPIDER_NET_DMAC_CMDSTAT_UDPCS;
-		} else { /* the stack should checksum non-tcp and non-udp
-			    packets on his own: NETIF_F_IP_CSUM */
+		else /* the stack should checksum non-tcp and non-udp
+			packets on its own: NETIF_F_IP_CSUM */
 			descr->dmac_cmd_status = SPIDER_NET_DMAC_CMDSTAT_NOCS;
-		}
 	}
 }
 
@@ -916,10 +867,12 @@
 			    struct spider_net_descr *descr,
 			    struct sk_buff *skb)
 {
-	dma_addr_t buf = pci_map_single(card->pdev, skb->data,
-					skb->len, PCI_DMA_BIDIRECTIONAL);
+	dma_addr_t buf;
+
+	buf = pci_map_single(card->pdev, skb->data,
+			     skb->len, PCI_DMA_BIDIRECTIONAL);
 	if (buf == DMA_ERROR_CODE) {
-		if (netif_msg_tx_err(card))
+		if (netif_msg_tx_err(card) && net_ratelimit())
 			pr_err("could not iommu-map packet (%p, %i). "
 				  "Dropping packet\n", skb->data, skb->len);
 		return -ENOMEM;
@@ -930,10 +883,6 @@
 	descr->skb = skb;
 	descr->data_status = 0;
 
-	/* make sure the above values are in memory before we change the
-	 * status */
-	wmb();
-
 	spider_net_set_txdescr_cmdstat(descr,skb);
 
 	return 0;
@@ -975,17 +924,12 @@
 	struct spider_net_descr *descr;
 	int result;
 
+	spider_net_release_tx_chain(card, 0);
+
 	descr = spider_net_get_next_tx_descr(card);
 
-	if (!descr) {
-		netif_stop_queue(netdev);
-
-		descr = spider_net_get_next_tx_descr(card);
-		if (!descr)
-			goto error;
-		else
-			netif_start_queue(netdev);
-	}
+	if (!descr)
+		goto error;
 
 	result = spider_net_prepare_tx_descr(card, descr, skb);
 	if (result)
@@ -993,19 +937,25 @@
 
 	card->tx_chain.head = card->tx_chain.head->next;
 
-	/* make sure the status from spider_net_prepare_tx_descr is in
-	 * memory before we check out the previous descriptor */
-	wmb();
-
 	if (spider_net_get_descr_status(descr->prev) !=
-	    SPIDER_NET_DESCR_CARDOWNED)
-		spider_net_kick_tx_dma(card, descr);
+	    SPIDER_NET_DESCR_CARDOWNED) {
+		/* make sure the current descriptor is in memory. Then
+		 * kicking it on again makes sense, if the previous is not
+		 * card-owned anymore. Check the previous descriptor twice
+		 * to omit an mb() in heavy traffic cases */
+		mb();
+		if (spider_net_get_descr_status(descr->prev) !=
+		    SPIDER_NET_DESCR_CARDOWNED)
+			spider_net_kick_tx_dma(card, descr);
+	}
+
+	mod_timer(&card->tx_timer, jiffies + SPIDER_NET_TX_TIMER);
 
 	return NETDEV_TX_OK;
 
 error:
 	card->netdev_stats.tx_dropped++;
-	return NETDEV_TX_LOCKED;
+	return NETDEV_TX_BUSY;
 }
 
 /**
@@ -1030,6 +980,7 @@
  * spider_net_pass_skb_up - takes an skb from a descriptor and passes it on
  * @descr: descriptor to process
  * @card: card structure
+ * @napi: whether caller is in NAPI context
  *
  * returns 1 on success, 0 if no packet was passed to the stack
  *
@@ -1038,7 +989,7 @@
  */
 static int
 spider_net_pass_skb_up(struct spider_net_descr *descr,
-		       struct spider_net_card *card)
+		       struct spider_net_card *card, int napi)
 {
 	struct sk_buff *skb;
 	struct net_device *netdev;
@@ -1049,22 +1000,20 @@
 
 	netdev = card->netdev;
 
-	/* check for errors in the data_error flag */
-	if ((data_error & SPIDER_NET_DATA_ERROR_MASK) &&
-	    netif_msg_rx_err(card))
-		pr_err("error in received descriptor found, "
-		       "data_status=x%08x, data_error=x%08x\n",
-		       data_status, data_error);
-
-	/* prepare skb, unmap descriptor */
-	skb = descr->skb;
-	pci_unmap_single(card->pdev, descr->buf_addr, SPIDER_NET_MAX_MTU,
+	/* unmap descriptor */
+	pci_unmap_single(card->pdev, descr->buf_addr, SPIDER_NET_MAX_FRAME,
 			 PCI_DMA_BIDIRECTIONAL);
 
 	/* the cases we'll throw away the packet immediately */
-	if (data_error & SPIDER_NET_DESTROY_RX_FLAGS)
+	if (data_error & SPIDER_NET_DESTROY_RX_FLAGS) {
+		if (netif_msg_rx_err(card))
+			pr_err("error in received descriptor found, "
+			       "data_status=x%08x, data_error=x%08x\n",
+			       data_status, data_error);
 		return 0;
+	}
 
+	skb = descr->skb;
 	skb->dev = netdev;
 	skb_put(skb, descr->valid_size);
 
@@ -1076,14 +1025,14 @@
 
 	/* checksum offload */
 	if (card->options.rx_csum) {
-		if ( (data_status & SPIDER_NET_DATA_STATUS_CHK_MASK) &&
-		     (!(data_error & SPIDER_NET_DATA_ERROR_CHK_MASK)) )
+		if ( ( (data_status & SPIDER_NET_DATA_STATUS_CKSUM_MASK) ==
+		       SPIDER_NET_DATA_STATUS_CKSUM_MASK) &&
+		     !(data_error & SPIDER_NET_DATA_ERR_CKSUM_MASK))
 			skb->ip_summed = CHECKSUM_UNNECESSARY;
 		else
 			skb->ip_summed = CHECKSUM_NONE;
-	} else {
+	} else
 		skb->ip_summed = CHECKSUM_NONE;
-	}
 
 	if (data_status & SPIDER_NET_VLAN_PACKET) {
 		/* further enhancements: HW-accel VLAN
@@ -1092,7 +1041,10 @@
 	}
 
 	/* pass skb up to stack */
-	netif_receive_skb(skb);
+	if (napi)
+		netif_receive_skb(skb);
+	else
+		netif_rx_ni(skb);
 
 	/* update netdevice statistics */
 	card->netdev_stats.rx_packets++;
@@ -1102,16 +1054,18 @@
 }
 
 /**
- * spider_net_decode_descr - processes an rx descriptor
+ * spider_net_decode_one_descr - processes an rx descriptor
  * @card: card structure
+ * @napi: whether caller is in NAPI context
  *
  * returns 1 if a packet has been sent to the stack, otherwise 0
  *
  * processes an rx descriptor by iommu-unmapping the data buffer and passing
- * the packet up to the stack
+ * the packet up to the stack. This function is called in softirq
+ * context, e.g. either bottom half from interrupt or NAPI polling context
  */
 static int
-spider_net_decode_one_descr(struct spider_net_card *card)
+spider_net_decode_one_descr(struct spider_net_card *card, int napi)
 {
 	enum spider_net_descr_status status;
 	struct spider_net_descr *descr;
@@ -1125,17 +1079,19 @@
 
 	if (status == SPIDER_NET_DESCR_CARDOWNED) {
 		/* nothing in the descriptor yet */
-		return 0;
+		result=0;
+		goto out;
 	}
 
 	if (status == SPIDER_NET_DESCR_NOT_IN_USE) {
-		/* not initialized yet, I bet chain->tail == chain->head
-		 * and the ring is empty */
+		/* not initialized yet, the ring must be empty */
 		spider_net_refill_rx_chain(card);
-		return 0;
+		spider_net_enable_rxdmac(card);
+		result=0;
+		goto out;
 	}
 
-	/* descriptor definitively used -- move on head */
+	/* descriptor definitively used -- move on tail */
 	chain->tail = descr->next;
 
 	result = 0;
@@ -1146,6 +1102,9 @@
 			pr_err("%s: dropping RX descriptor with state %d\n",
 			       card->netdev->name, status);
 		card->netdev_stats.rx_dropped++;
+		pci_unmap_single(card->pdev, descr->buf_addr,
+				 SPIDER_NET_MAX_FRAME, PCI_DMA_BIDIRECTIONAL);
+		dev_kfree_skb_irq(descr->skb);
 		goto refill;
 	}
 
@@ -1158,12 +1117,13 @@
 	}
 
 	/* ok, we've got a packet in descr */
-	result = spider_net_pass_skb_up(descr, card);
+	result = spider_net_pass_skb_up(descr, card, napi);
 refill:
 	spider_net_set_descr_status(descr, SPIDER_NET_DESCR_NOT_IN_USE);
 	/* change the descriptor state: */
-	spider_net_refill_rx_chain(card);
-
+	if (!napi)
+		spider_net_refill_rx_chain(card);
+out:
 	return result;
 }
 
@@ -1189,7 +1149,7 @@
 	packets_to_do = min(*budget, netdev->quota);
 
 	while (packets_to_do) {
-		if (spider_net_decode_one_descr(card)) {
+		if (spider_net_decode_one_descr(card, 1)) {
 			packets_done++;
 			packets_to_do--;
 		} else {
@@ -1201,6 +1161,7 @@
 
 	netdev->quota -= packets_done;
 	*budget -= packets_done;
+	spider_net_refill_rx_chain(card);
 
 	/* if all packets are in the stack, enable interrupts and return 0 */
 	/* if not, return 1 */
@@ -1345,6 +1306,24 @@
 }
 
 /**
+ * spider_net_handle_rxram_full - cleans up RX ring upon RX RAM full interrupt
+ * @card: card structure
+ *
+ * spider_net_handle_rxram_full empties the RX ring so that spider can put
+ * more packets in it and empty its RX RAM. This is called in bottom half
+ * context
+ */
+static void
+spider_net_handle_rxram_full(struct spider_net_card *card)
+{
+	while (spider_net_decode_one_descr(card, 0) != 0)
+		continue;	/* drain while packets are still passed up */
+	spider_net_enable_rxchtails(card);
+	spider_net_enable_rxdmac(card);
+	netif_rx_schedule(card->netdev);
+}
+
+/**
  * spider_net_handle_error_irq - handles errors raised by an interrupt
  * @card: card structure
  * @status_reg: interrupt status register 0 (GHIINT0STS)
@@ -1452,17 +1431,21 @@
 				switch (i)
 	{
 	case SPIDER_NET_GTMFLLINT:
-		if (netif_msg_intr(card))
+		if (netif_msg_intr(card) && net_ratelimit())
 			pr_err("Spider TX RAM full\n");
 		show_error = 0;
 		break;
+	case SPIDER_NET_GRFDFLLINT: /* fallthrough */
+	case SPIDER_NET_GRFCFLLINT: /* fallthrough */
+	case SPIDER_NET_GRFBFLLINT: /* fallthrough */
+	case SPIDER_NET_GRFAFLLINT: /* fallthrough */
 	case SPIDER_NET_GRMFLLINT:
-		if (netif_msg_intr(card))
+		if (netif_msg_intr(card) && net_ratelimit())
 			pr_err("Spider RX RAM full, incoming packets "
-			       "might be discarded !\n");
-		netif_rx_schedule(card->netdev);
-		spider_net_enable_rxchtails(card);
-		spider_net_enable_rxdmac(card);
+			       "might be discarded!\n");
+		spider_net_rx_irq_off(card);
+		tasklet_schedule(&card->rxram_full_tl);
+		show_error = 0;
 		break;
 
 	/* case SPIDER_NET_GTMSHTINT: problem, print a message */
@@ -1470,10 +1453,6 @@
 		/* allrighty. tx from previous descr ok */
 		show_error = 0;
 		break;
-	/* case SPIDER_NET_GRFDFLLINT: print a message down there */
-	/* case SPIDER_NET_GRFCFLLINT: print a message down there */
-	/* case SPIDER_NET_GRFBFLLINT: print a message down there */
-	/* case SPIDER_NET_GRFAFLLINT: print a message down there */
 
 	/* chain end */
 	case SPIDER_NET_GDDDCEINT: /* fallthrough */
@@ -1485,6 +1464,7 @@
 			       "restarting DMAC %c.\n",
 			       'D'+i-SPIDER_NET_GDDDCEINT);
 		spider_net_refill_rx_chain(card);
+		spider_net_enable_rxdmac(card);
 		show_error = 0;
 		break;
 
@@ -1495,6 +1475,7 @@
 	case SPIDER_NET_GDAINVDINT:
 		/* could happen when rx chain is full */
 		spider_net_refill_rx_chain(card);
+		spider_net_enable_rxdmac(card);
 		show_error = 0;
 		break;
 
@@ -1583,17 +1564,13 @@
 	if (!status_reg)
 		return IRQ_NONE;
 
-	if (status_reg & SPIDER_NET_TXINT)
-		spider_net_release_tx_chain(card, 0);
-
 	if (status_reg & SPIDER_NET_RXINT ) {
 		spider_net_rx_irq_off(card);
 		netif_rx_schedule(netdev);
 	}
 
-	/* we do this after rx and tx processing, as we want the tx chain
-	 * processed to see, whether we should restart tx dma processing */
-	spider_net_handle_error_irq(card, status_reg);
+	if (status_reg & SPIDER_NET_ERRINT )
+		spider_net_handle_error_irq(card, status_reg);
 
 	/* clear interrupt sources */
 	spider_net_write_reg(card, SPIDER_NET_GHIINT0STS, status_reg);
@@ -1834,26 +1811,27 @@
 /**
  * spider_net_download_firmware - loads firmware into the adapter
  * @card: card structure
- * @firmware: firmware pointer
+ * @firmware_ptr: pointer to firmware data
  *
- * spider_net_download_firmware loads the firmware opened by
- * spider_net_init_firmware into the adapter.
+ * spider_net_download_firmware loads the firmware data into the
+ * adapter. It assumes the length etc. to be all right.
  */
 static int
 spider_net_download_firmware(struct spider_net_card *card,
-			     const struct firmware *firmware)
+			     u8 *firmware_ptr)
 {
 	int sequencer, i;
-	u32 *fw_ptr = (u32 *)firmware->data;
+	u32 *fw_ptr = (u32 *)firmware_ptr;
 
 	/* stop sequencers */
 	spider_net_write_reg(card, SPIDER_NET_GSINIT,
 			     SPIDER_NET_STOP_SEQ_VALUE);
 
-	for (sequencer = 0; sequencer < 6; sequencer++) {
+	for (sequencer = 0; sequencer < SPIDER_NET_FIRMWARE_SEQS;
+	     sequencer++) {
 		spider_net_write_reg(card,
 				     SPIDER_NET_GSnPRGADR + sequencer * 8, 0);
-		for (i = 0; i < SPIDER_NET_FIRMWARE_LEN; i++) {
+		for (i = 0; i < SPIDER_NET_FIRMWARE_SEQWORDS; i++) {
 			spider_net_write_reg(card, SPIDER_NET_GSnPRGDAT +
 					     sequencer * 8, *fw_ptr);
 			fw_ptr++;
@@ -1898,41 +1876,53 @@
 static int
 spider_net_init_firmware(struct spider_net_card *card)
 {
-	struct firmware *firmware;
+	struct firmware *firmware = NULL;
 	struct device_node *dn;
-	u8 *fw_prop;
-	int err = -EIO;
+	u8 *fw_prop = NULL;
+	int fw_size, err = -ENOENT;
 
 	if (request_firmware((const struct firmware **)&firmware,
-			     SPIDER_NET_FIRMWARE_NAME, &card->pdev->dev) < 0) {
-		if (netif_msg_probe(card))
-			pr_err("Couldn't read in sequencer data file %s.\n",
-			       SPIDER_NET_FIRMWARE_NAME);
+			     SPIDER_NET_FIRMWARE_NAME, &card->pdev->dev) == 0) {
+		/* check the size even when probe messages are disabled */
+		if (firmware->size == SPIDER_NET_FIRMWARE_LEN)
+			err = spider_net_download_firmware(card,
+							   firmware->data);
+		else if (netif_msg_probe(card))
+			pr_err("Incorrect size of spidernet firmware in " \
+			       "filesystem. Looking in host firmware...\n");
 
-		dn = pci_device_to_OF_node(card->pdev);
-		if (!dn)
-			goto out;
+		/* always release: the image is not used past this point */
+		release_firmware(firmware);
+		if (err)
+			goto try_host_fw;
 
-		fw_prop = (u8 *)get_property(dn, "firmware", NULL);
-		if (!fw_prop)
-			goto out;
-
-		memcpy(firmware->data, fw_prop, 6 * SPIDER_NET_FIRMWARE_LEN * sizeof(u32));
-		firmware->size = 6 * SPIDER_NET_FIRMWARE_LEN * sizeof(u32);
+		goto done;
 	}
 
-	if (firmware->size != 6 * SPIDER_NET_FIRMWARE_LEN * sizeof(u32)) {
-		if (netif_msg_probe(card))
-			pr_err("Invalid size of sequencer data file %s.\n",
-			       SPIDER_NET_FIRMWARE_NAME);
-		goto out;
+try_host_fw:
+	dn = pci_device_to_OF_node(card->pdev);
+	if (!dn)
+		goto out_err;
+
+	fw_prop = (u8 *)get_property(dn, "firmware", &fw_size);
+	if (!fw_prop)
+		goto out_err;
+
+	if (fw_size != SPIDER_NET_FIRMWARE_LEN) {
+		if (netif_msg_probe(card))
+			pr_err("Incorrect size of spidernet firmware in " \
+			       "host firmware\n");
+		goto done;
 	}
 
-	if (!spider_net_download_firmware(card, firmware))
-		err = 0;
-out:
-	release_firmware(firmware);
+	err = spider_net_download_firmware(card, fw_prop);
 
+done:
+	return err;
+out_err:
+	if (netif_msg_probe(card))
+		pr_err("Couldn't find spidernet firmware in filesystem " \
+		       "or host firmware\n");
 	return err;
 }
 
@@ -1952,10 +1942,11 @@
 			     SPIDER_NET_CKRCTRL_RUN_VALUE);
 
 	/* empty sequencer data */
-	for (sequencer = 0; sequencer < 6; sequencer++) {
+	for (sequencer = 0; sequencer < SPIDER_NET_FIRMWARE_SEQS;
+	     sequencer++) {
 		spider_net_write_reg(card, SPIDER_NET_GSnPRGDAT +
 				     sequencer * 8, 0x0);
-		for (i = 0; i < SPIDER_NET_FIRMWARE_LEN; i++) {
+		for (i = 0; i < SPIDER_NET_FIRMWARE_SEQWORDS; i++) {
 			spider_net_write_reg(card, SPIDER_NET_GSnPRGDAT +
 					     sequencer * 8, 0x0);
 		}
@@ -2079,7 +2070,15 @@
 	SET_NETDEV_DEV(netdev, &card->pdev->dev);
 
 	pci_set_drvdata(card->pdev, netdev);
-	spin_lock_init(&card->intmask_lock);
+
+	atomic_set(&card->tx_chain_release,0);
+	card->rxram_full_tl.data = (unsigned long) card;
+	card->rxram_full_tl.func =
+		(void (*)(unsigned long)) spider_net_handle_rxram_full;
+	init_timer(&card->tx_timer);
+	card->tx_timer.function =
+		(void (*)(unsigned long)) spider_net_cleanup_tx_ring;
+	card->tx_timer.data = (unsigned long) card;
 	netdev->irq = card->pdev->irq;
 
 	card->options.rx_csum = SPIDER_NET_RX_CSUM_DEFAULT;
diff --git a/drivers/net/spider_net.h b/drivers/net/spider_net.h
index 98f11ec..5922b52 100644
--- a/drivers/net/spider_net.h
+++ b/drivers/net/spider_net.h
@@ -33,25 +33,32 @@
 
 extern char spider_net_driver_name[];
 
-#define SPIDER_NET_MAX_MTU			2308
+#define SPIDER_NET_MAX_FRAME			2312
+#define SPIDER_NET_MAX_MTU			2294
 #define SPIDER_NET_MIN_MTU			64
 
 #define SPIDER_NET_RXBUF_ALIGN			128
 
-#define SPIDER_NET_RX_DESCRIPTORS_DEFAULT	64
+#define SPIDER_NET_RX_DESCRIPTORS_DEFAULT	256
 #define SPIDER_NET_RX_DESCRIPTORS_MIN		16
-#define SPIDER_NET_RX_DESCRIPTORS_MAX		256
+#define SPIDER_NET_RX_DESCRIPTORS_MAX		512
 
-#define SPIDER_NET_TX_DESCRIPTORS_DEFAULT	64
+#define SPIDER_NET_TX_DESCRIPTORS_DEFAULT	256
 #define SPIDER_NET_TX_DESCRIPTORS_MIN		16
-#define SPIDER_NET_TX_DESCRIPTORS_MAX		256
+#define SPIDER_NET_TX_DESCRIPTORS_MAX		512
+
+#define SPIDER_NET_TX_TIMER			20
 
 #define SPIDER_NET_RX_CSUM_DEFAULT		1
 
-#define SPIDER_NET_WATCHDOG_TIMEOUT 5*HZ
-#define SPIDER_NET_NAPI_WEIGHT 64
+#define SPIDER_NET_WATCHDOG_TIMEOUT		(50*HZ)	/* 50 s in jiffies */
+#define SPIDER_NET_NAPI_WEIGHT			64
 
-#define SPIDER_NET_FIRMWARE_LEN		1024
+#define SPIDER_NET_FIRMWARE_SEQS	6
+#define SPIDER_NET_FIRMWARE_SEQWORDS	1024
+#define SPIDER_NET_FIRMWARE_LEN		(SPIDER_NET_FIRMWARE_SEQS * \
+					 SPIDER_NET_FIRMWARE_SEQWORDS * \
+					 sizeof(u32))
 #define SPIDER_NET_FIRMWARE_NAME	"spider_fw.bin"
 
 /** spider_net SMMIO registers */
@@ -142,14 +149,12 @@
 /** SCONFIG registers */
 #define SPIDER_NET_SCONFIG_IOACTE	0x00002810
 
-/** hardcoded register values */
-#define SPIDER_NET_INT0_MASK_VALUE	0x3f7fe3ff
-#define SPIDER_NET_INT1_MASK_VALUE	0xffffffff
+/** interrupt mask registers */
+#define SPIDER_NET_INT0_MASK_VALUE	0x3f7fe2c7
+#define SPIDER_NET_INT1_MASK_VALUE	0xffff7ff7
 /* no MAC aborts -> auto retransmission */
-#define SPIDER_NET_INT2_MASK_VALUE	0xfffffff1
+#define SPIDER_NET_INT2_MASK_VALUE	0xffef7ff1
 
-/* clear counter when interrupt sources are cleared
-#define SPIDER_NET_FRAMENUM_VALUE	0x0001f001 */
 /* we rely on flagged descriptor interrupts */
 #define SPIDER_NET_FRAMENUM_VALUE	0x00000000
 /* set this first, then the FRAMENUM_VALUE */
@@ -168,7 +173,7 @@
 #if 0
 #define SPIDER_NET_WOL_VALUE		0x00000000
 #endif
-#define SPIDER_NET_IPSECINIT_VALUE	0x00f000f8
+#define SPIDER_NET_IPSECINIT_VALUE	0x6f716f71
 
 /* pause frames: automatic, no upper retransmission count */
 /* outside loopback mode: ETOMOD signal dont matter, not connected */
@@ -318,6 +323,10 @@
 #define SPIDER_NET_RXINT	( (1 << SPIDER_NET_GDAFDCINT) | \
 				  (1 << SPIDER_NET_GRMFLLINT) )
 
+#define SPIDER_NET_ERRINT	( 0xffffffff & \
+				  (~SPIDER_NET_TXINT) & \
+				  (~SPIDER_NET_RXINT) )
+
 #define SPIDER_NET_GPREXEC		0x80000000
 #define SPIDER_NET_GPRDAT_MASK		0x0000ffff
 
@@ -358,9 +367,6 @@
 /* descr ready, descr is in middle of chain, get interrupt on completion */
 #define SPIDER_NET_DMAC_RX_CARDOWNED	0xa0800000
 
-/* multicast is no problem */
-#define SPIDER_NET_DATA_ERROR_MASK	0xffffbfff
-
 enum spider_net_descr_status {
 	SPIDER_NET_DESCR_COMPLETE		= 0x00, /* used in rx and tx */
 	SPIDER_NET_DESCR_RESPONSE_ERROR		= 0x01, /* used in rx and tx */
@@ -384,7 +390,7 @@
 
 	/* used in the driver */
 	struct sk_buff *skb;
-	dma_addr_t bus_addr;
+	u32 bus_addr;
 	struct spider_net_descr *next;
 	struct spider_net_descr *prev;
 } __attribute__((aligned(32)));
@@ -396,21 +402,21 @@
 };
 
 /* descriptor data_status bits */
-#define SPIDER_NET_RXIPCHK		29
-#define SPIDER_NET_TCPUDPIPCHK		28
-#define SPIDER_NET_DATA_STATUS_CHK_MASK	(1 << SPIDER_NET_RXIPCHK | \
-					 1 << SPIDER_NET_TCPUDPIPCHK)
-
+#define SPIDER_NET_RX_IPCHK		29
+#define SPIDER_NET_RX_TCPCHK		28
 #define SPIDER_NET_VLAN_PACKET		21
+#define SPIDER_NET_DATA_STATUS_CKSUM_MASK ( (1 << SPIDER_NET_RX_IPCHK) | \
+					  (1 << SPIDER_NET_RX_TCPCHK) )
 
 /* descriptor data_error bits */
-#define SPIDER_NET_RXIPCHKERR		27
-#define SPIDER_NET_RXTCPCHKERR		26
-#define SPIDER_NET_DATA_ERROR_CHK_MASK	(1 << SPIDER_NET_RXIPCHKERR | \
-					 1 << SPIDER_NET_RXTCPCHKERR)
+#define SPIDER_NET_RX_IPCHKERR		27
+#define SPIDER_NET_RX_RXTCPCHKERR	28
 
-/* the cases we don't pass the packet to the stack */
-#define SPIDER_NET_DESTROY_RX_FLAGS	0x70138000
+#define SPIDER_NET_DATA_ERR_CKSUM_MASK	(1 << SPIDER_NET_RX_IPCHKERR)
+
+/* the cases we don't pass the packet to the stack.
+ * 701b8000 would be correct, but every packet gets that flag */
+#define SPIDER_NET_DESTROY_RX_FLAGS	0x700b8000
 
 #define SPIDER_NET_DESCR_SIZE		32
 
@@ -445,13 +451,16 @@
 
 	struct spider_net_descr_chain tx_chain;
 	struct spider_net_descr_chain rx_chain;
-	spinlock_t chain_lock;
+	atomic_t rx_chain_refill;
+	atomic_t tx_chain_release;
 
 	struct net_device_stats netdev_stats;
 
 	struct spider_net_options options;
 
 	spinlock_t intmask_lock;
+	struct tasklet_struct rxram_full_tl;
+	struct timer_list tx_timer;
 
 	struct work_struct tx_timeout_task;
 	atomic_t tx_timeout_task_counter;
diff --git a/drivers/net/spider_net_ethtool.c b/drivers/net/spider_net_ethtool.c
index d42e60b..a5bb0b76 100644
--- a/drivers/net/spider_net_ethtool.c
+++ b/drivers/net/spider_net_ethtool.c
@@ -113,6 +113,23 @@
 	return 0;
 }
 
+static uint32_t
+spider_net_ethtool_get_tx_csum(struct net_device *netdev)
+{
+	return (netdev->features & NETIF_F_HW_CSUM) ? 1 : 0;
+}
+
+static int
+spider_net_ethtool_set_tx_csum(struct net_device *netdev, uint32_t data)
+{
+	/* any non-zero value sets NETIF_F_HW_CSUM, zero clears it */
+	if (data != 0)
+		netdev->features |= NETIF_F_HW_CSUM;
+	else
+		netdev->features &= ~NETIF_F_HW_CSUM;
+	return 0;
+}
+
 struct ethtool_ops spider_net_ethtool_ops = {
 	.get_settings		= spider_net_ethtool_get_settings,
 	.get_drvinfo		= spider_net_ethtool_get_drvinfo,
@@ -122,5 +139,7 @@
 	.nway_reset		= spider_net_ethtool_nway_reset,
 	.get_rx_csum		= spider_net_ethtool_get_rx_csum,
 	.set_rx_csum		= spider_net_ethtool_set_rx_csum,
+	.get_tx_csum		= spider_net_ethtool_get_tx_csum,
+	.set_tx_csum		= spider_net_ethtool_set_tx_csum,
 };