gianfar: Add paged allocation and Rx S/G

The eTSEC h/w is capable of scatter/gather on the receive side
too if MAXFRM > MRBLR, when the allowed maximum Rx frame size
is set to be greater than the maximum Rx buffer size (MRBLR).
It's about time the driver makes use of this h/w capability,
by supporting fixed buffer sizes and Rx S/G.

The buffer size given to eTSEC for reception is fixed to
1536B (must be multiple of 64), which is the same default
buffer size as before, used to accommodate standard MTU
(1500B) size frames.  As before, eTSEC can receive frames of
up to 9600B.  Individual Rx buffers are mapped to page halves
(page size for eTSEC systems is 4KB).  The skb is built around
the first buffer of a frame (using build_skb()).  In case the
frame spans multiple buffers, the trailing buffers are added
as Rx fragments to the skb.  The last buffer in frame is marked
by the L status flag.  A mechanism is in place to reuse the pages
owned by the driver (for Rx) for subsequent receptions.

Supporting fixed size buffers allows the implementation of Rx S/G,
which in turn removes the memory pressure issues the driver had
before when MTU was set for jumbo frame reception.
Also, in most cases, the Rx path becomes faster due to Rx page
reusal, since the overhead of allocating new rx buffers is removed
from the fast path.

Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c
index 7654d5e..648ca85 100644
--- a/drivers/net/ethernet/freescale/gianfar.c
+++ b/drivers/net/ethernet/freescale/gianfar.c
@@ -109,7 +109,7 @@
 
 #define TX_TIMEOUT      (1*HZ)
 
-const char gfar_driver_version[] = "1.3";
+const char gfar_driver_version[] = "2.0";
 
 static int gfar_enet_open(struct net_device *dev);
 static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev);
@@ -207,6 +207,7 @@
 
 		rx_queue->next_to_clean = 0;
 		rx_queue->next_to_use = 0;
+		rx_queue->next_to_alloc = 0;
 
 		/* make sure next_to_clean != next_to_use after this
 		 * by leaving at least 1 unused descriptor
@@ -222,7 +223,7 @@
 {
 	void *vaddr;
 	dma_addr_t addr;
-	int i, j, k;
+	int i, j;
 	struct gfar_private *priv = netdev_priv(ndev);
 	struct device *dev = priv->dev;
 	struct gfar_priv_tx_q *tx_queue = NULL;
@@ -262,6 +263,7 @@
 		rx_queue->rx_bd_base = vaddr;
 		rx_queue->rx_bd_dma_base = addr;
 		rx_queue->ndev = ndev;
+		rx_queue->dev = dev;
 		addr  += sizeof(struct rxbd8) * rx_queue->rx_ring_size;
 		vaddr += sizeof(struct rxbd8) * rx_queue->rx_ring_size;
 	}
@@ -276,21 +278,17 @@
 		if (!tx_queue->tx_skbuff)
 			goto cleanup;
 
-		for (k = 0; k < tx_queue->tx_ring_size; k++)
-			tx_queue->tx_skbuff[k] = NULL;
+		for (j = 0; j < tx_queue->tx_ring_size; j++)
+			tx_queue->tx_skbuff[j] = NULL;
 	}
 
 	for (i = 0; i < priv->num_rx_queues; i++) {
 		rx_queue = priv->rx_queue[i];
-		rx_queue->rx_skbuff =
-			kmalloc_array(rx_queue->rx_ring_size,
-				      sizeof(*rx_queue->rx_skbuff),
-				      GFP_KERNEL);
-		if (!rx_queue->rx_skbuff)
+		rx_queue->rx_buff = kcalloc(rx_queue->rx_ring_size,
+					    sizeof(*rx_queue->rx_buff),
+					    GFP_KERNEL);
+		if (!rx_queue->rx_buff)
 			goto cleanup;
-
-		for (j = 0; j < rx_queue->rx_ring_size; j++)
-			rx_queue->rx_skbuff[j] = NULL;
 	}
 
 	gfar_init_bds(ndev);
@@ -335,10 +333,8 @@
 	}
 }
 
-static void gfar_rx_buff_size_config(struct gfar_private *priv)
+static void gfar_rx_offload_en(struct gfar_private *priv)
 {
-	int frame_size = priv->ndev->mtu + ETH_HLEN + ETH_FCS_LEN;
-
 	/* set this when rx hw offload (TOE) functions are being used */
 	priv->uses_rxfcb = 0;
 
@@ -347,16 +343,6 @@
 
 	if (priv->hwts_rx_en)
 		priv->uses_rxfcb = 1;
-
-	if (priv->uses_rxfcb)
-		frame_size += GMAC_FCB_LEN;
-
-	frame_size += priv->padding;
-
-	frame_size = (frame_size & ~(INCREMENTAL_BUFFER_SIZE - 1)) +
-		     INCREMENTAL_BUFFER_SIZE;
-
-	priv->rx_buffer_size = frame_size;
 }
 
 static void gfar_mac_rx_config(struct gfar_private *priv)
@@ -590,7 +576,6 @@
 		if (!priv->rx_queue[i])
 			return -ENOMEM;
 
-		priv->rx_queue[i]->rx_skbuff = NULL;
 		priv->rx_queue[i]->qindex = i;
 		priv->rx_queue[i]->ndev = priv->ndev;
 	}
@@ -1184,12 +1169,11 @@
 
 	udelay(3);
 
-	/* Compute rx_buff_size based on config flags */
-	gfar_rx_buff_size_config(priv);
+	gfar_rx_offload_en(priv);
 
 	/* Initialize the max receive frame/buffer lengths */
-	gfar_write(&regs->maxfrm, priv->rx_buffer_size);
-	gfar_write(&regs->mrblr, priv->rx_buffer_size);
+	gfar_write(&regs->maxfrm, GFAR_JUMBO_FRAME_SIZE);
+	gfar_write(&regs->mrblr, GFAR_RXB_SIZE);
 
 	/* Initialize the Minimum Frame Length Register */
 	gfar_write(&regs->minflr, MINFLR_INIT_SETTINGS);
@@ -1197,12 +1181,11 @@
 	/* Initialize MACCFG2. */
 	tempval = MACCFG2_INIT_SETTINGS;
 
-	/* If the mtu is larger than the max size for standard
-	 * ethernet frames (ie, a jumbo frame), then set maccfg2
-	 * to allow huge frames, and to check the length
+	/* eTSEC74 erratum: Rx frames of length MAXFRM or MAXFRM-1
+	 * are marked as truncated.  Avoid this by MACCFG2[Huge Frame]=1,
+	 * and by checking RxBD[LG] and discarding larger than MAXFRM.
 	 */
-	if (priv->rx_buffer_size > DEFAULT_RX_BUFFER_SIZE ||
-	    gfar_has_errata(priv, GFAR_ERRATA_74))
+	if (gfar_has_errata(priv, GFAR_ERRATA_74))
 		tempval |= MACCFG2_HUGEFRAME | MACCFG2_LENGTHCHECK;
 
 	gfar_write(&regs->maccfg2, tempval);
@@ -1413,8 +1396,6 @@
 	    priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER)
 		dev->needed_headroom = GMAC_FCB_LEN;
 
-	priv->rx_buffer_size = DEFAULT_RX_BUFFER_SIZE;
-
 	/* Initializing some of the rx/tx queue level parameters */
 	for (i = 0; i < priv->num_tx_queues; i++) {
 		priv->tx_queue[i]->tx_ring_size = DEFAULT_TX_RING_SIZE;
@@ -1911,26 +1892,32 @@
 
 static void free_skb_rx_queue(struct gfar_priv_rx_q *rx_queue)
 {
-	struct rxbd8 *rxbdp;
-	struct gfar_private *priv = netdev_priv(rx_queue->ndev);
 	int i;
 
-	rxbdp = rx_queue->rx_bd_base;
+	struct rxbd8 *rxbdp = rx_queue->rx_bd_base;
+
+	if (rx_queue->skb)
+		dev_kfree_skb(rx_queue->skb);
 
 	for (i = 0; i < rx_queue->rx_ring_size; i++) {
-		if (rx_queue->rx_skbuff[i]) {
-			dma_unmap_single(priv->dev, be32_to_cpu(rxbdp->bufPtr),
-					 priv->rx_buffer_size,
-					 DMA_FROM_DEVICE);
-			dev_kfree_skb_any(rx_queue->rx_skbuff[i]);
-			rx_queue->rx_skbuff[i] = NULL;
-		}
+		struct	gfar_rx_buff *rxb = &rx_queue->rx_buff[i];
+
 		rxbdp->lstatus = 0;
 		rxbdp->bufPtr = 0;
 		rxbdp++;
+
+		if (!rxb->page)
+			continue;
+
+		dma_unmap_single(rx_queue->dev, rxb->dma,
+				 PAGE_SIZE, DMA_FROM_DEVICE);
+		__free_page(rxb->page);
+
+		rxb->page = NULL;
 	}
-	kfree(rx_queue->rx_skbuff);
-	rx_queue->rx_skbuff = NULL;
+
+	kfree(rx_queue->rx_buff);
+	rx_queue->rx_buff = NULL;
 }
 
 /* If there are any tx skbs or rx skbs still around, free them.
@@ -1955,7 +1942,7 @@
 
 	for (i = 0; i < priv->num_rx_queues; i++) {
 		rx_queue = priv->rx_queue[i];
-		if (rx_queue->rx_skbuff)
+		if (rx_queue->rx_buff)
 			free_skb_rx_queue(rx_queue);
 	}
 
@@ -2513,7 +2500,7 @@
 	struct gfar_private *priv = netdev_priv(dev);
 	int frame_size = new_mtu + ETH_HLEN;
 
-	if ((frame_size < 64) || (frame_size > JUMBO_FRAME_SIZE)) {
+	if ((frame_size < 64) || (frame_size > GFAR_JUMBO_FRAME_SIZE)) {
 		netif_err(priv, drv, dev, "Invalid MTU setting\n");
 		return -EINVAL;
 	}
@@ -2567,15 +2554,6 @@
 	schedule_work(&priv->reset_task);
 }
 
-static void gfar_align_skb(struct sk_buff *skb)
-{
-	/* We need the data buffer to be aligned properly.  We will reserve
-	 * as many bytes as needed to align the data properly
-	 */
-	skb_reserve(skb, RXBUF_ALIGNMENT -
-		    (((unsigned long) skb->data) & (RXBUF_ALIGNMENT - 1)));
-}
-
 /* Interrupt Handler for Transmit complete */
 static void gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue)
 {
@@ -2682,28 +2660,27 @@
 	netdev_tx_completed_queue(txq, howmany, bytes_sent);
 }
 
-static struct sk_buff *gfar_new_skb(struct net_device *ndev,
-				    dma_addr_t *bufaddr)
+static bool gfar_new_page(struct gfar_priv_rx_q *rxq, struct gfar_rx_buff *rxb)
 {
-	struct gfar_private *priv = netdev_priv(ndev);
-	struct sk_buff *skb;
+	struct page *page;
 	dma_addr_t addr;
 
-	skb = netdev_alloc_skb(ndev, priv->rx_buffer_size + RXBUF_ALIGNMENT);
-	if (!skb)
-		return NULL;
+	page = dev_alloc_page();
+	if (unlikely(!page))
+		return false;
 
-	gfar_align_skb(skb);
+	addr = dma_map_page(rxq->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);
+	if (unlikely(dma_mapping_error(rxq->dev, addr))) {
+		__free_page(page);
 
-	addr = dma_map_single(priv->dev, skb->data,
-			      priv->rx_buffer_size, DMA_FROM_DEVICE);
-	if (unlikely(dma_mapping_error(priv->dev, addr))) {
-		dev_kfree_skb_any(skb);
-		return NULL;
+		return false;
 	}
 
-	*bufaddr = addr;
-	return skb;
+	rxb->dma = addr;
+	rxb->page = page;
+	rxb->page_offset = 0;
+
+	return true;
 }
 
 static void gfar_rx_alloc_err(struct gfar_priv_rx_q *rx_queue)
@@ -2718,41 +2695,40 @@
 static void gfar_alloc_rx_buffs(struct gfar_priv_rx_q *rx_queue,
 				int alloc_cnt)
 {
-	struct net_device *ndev = rx_queue->ndev;
-	struct rxbd8 *bdp, *base;
-	dma_addr_t bufaddr;
+	struct rxbd8 *bdp;
+	struct gfar_rx_buff *rxb;
 	int i;
 
 	i = rx_queue->next_to_use;
-	base = rx_queue->rx_bd_base;
 	bdp = &rx_queue->rx_bd_base[i];
+	rxb = &rx_queue->rx_buff[i];
 
 	while (alloc_cnt--) {
-		struct sk_buff *skb = rx_queue->rx_skbuff[i];
-
-		if (likely(!skb)) {
-			skb = gfar_new_skb(ndev, &bufaddr);
-			if (unlikely(!skb)) {
+		/* try reuse page */
+		if (unlikely(!rxb->page)) {
+			if (unlikely(!gfar_new_page(rx_queue, rxb))) {
 				gfar_rx_alloc_err(rx_queue);
 				break;
 			}
-		} else { /* restore from sleep state */
-			bufaddr = be32_to_cpu(bdp->bufPtr);
 		}
 
-		rx_queue->rx_skbuff[i] = skb;
-
 		/* Setup the new RxBD */
-		gfar_init_rxbdp(rx_queue, bdp, bufaddr);
+		gfar_init_rxbdp(rx_queue, bdp,
+				rxb->dma + rxb->page_offset + RXBUF_ALIGNMENT);
 
 		/* Update to the next pointer */
-		bdp = next_bd(bdp, base, rx_queue->rx_ring_size);
+		bdp++;
+		rxb++;
 
-		if (unlikely(++i == rx_queue->rx_ring_size))
+		if (unlikely(++i == rx_queue->rx_ring_size)) {
 			i = 0;
+			bdp = rx_queue->rx_bd_base;
+			rxb = rx_queue->rx_buff;
+		}
 	}
 
 	rx_queue->next_to_use = i;
+	rx_queue->next_to_alloc = i;
 }
 
 static void count_errors(u32 lstatus, struct net_device *ndev)
@@ -2839,6 +2815,93 @@
 	return IRQ_HANDLED;
 }
 
+static bool gfar_add_rx_frag(struct gfar_rx_buff *rxb, u32 lstatus,
+			     struct sk_buff *skb, bool first)
+{
+	unsigned int size = lstatus & BD_LENGTH_MASK;
+	struct page *page = rxb->page;
+
+	/* Remove the FCS from the packet length */
+	if (likely(lstatus & BD_LFLAG(RXBD_LAST)))
+		size -= ETH_FCS_LEN;
+
+	if (likely(first))
+		skb_put(skb, size);
+	else
+		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
+				rxb->page_offset + RXBUF_ALIGNMENT,
+				size, GFAR_RXB_TRUESIZE);
+
+	/* try reuse page */
+	if (unlikely(page_count(page) != 1))
+		return false;
+
+	/* change offset to the other half */
+	rxb->page_offset ^= GFAR_RXB_TRUESIZE;
+
+	atomic_inc(&page->_count);
+
+	return true;
+}
+
+static void gfar_reuse_rx_page(struct gfar_priv_rx_q *rxq,
+			       struct gfar_rx_buff *old_rxb)
+{
+	struct gfar_rx_buff *new_rxb;
+	u16 nta = rxq->next_to_alloc;
+
+	new_rxb = &rxq->rx_buff[nta];
+
+	/* find next buf that can reuse a page */
+	nta++;
+	rxq->next_to_alloc = (nta < rxq->rx_ring_size) ? nta : 0;
+
+	/* copy page reference */
+	*new_rxb = *old_rxb;
+
+	/* sync for use by the device */
+	dma_sync_single_range_for_device(rxq->dev, old_rxb->dma,
+					 old_rxb->page_offset,
+					 GFAR_RXB_TRUESIZE, DMA_FROM_DEVICE);
+}
+
+static struct sk_buff *gfar_get_next_rxbuff(struct gfar_priv_rx_q *rx_queue,
+					    u32 lstatus, struct sk_buff *skb)
+{
+	struct gfar_rx_buff *rxb = &rx_queue->rx_buff[rx_queue->next_to_clean];
+	struct page *page = rxb->page;
+	bool first = false;
+
+	if (likely(!skb)) {
+		void *buff_addr = page_address(page) + rxb->page_offset;
+
+		skb = build_skb(buff_addr, GFAR_SKBFRAG_SIZE);
+		if (unlikely(!skb)) {
+			gfar_rx_alloc_err(rx_queue);
+			return NULL;
+		}
+		skb_reserve(skb, RXBUF_ALIGNMENT);
+		first = true;
+	}
+
+	dma_sync_single_range_for_cpu(rx_queue->dev, rxb->dma, rxb->page_offset,
+				      GFAR_RXB_TRUESIZE, DMA_FROM_DEVICE);
+
+	if (gfar_add_rx_frag(rxb, lstatus, skb, first)) {
+		/* reuse the free half of the page */
+		gfar_reuse_rx_page(rx_queue, rxb);
+	} else {
+		/* page cannot be reused, unmap it */
+		dma_unmap_page(rx_queue->dev, rxb->dma,
+			       PAGE_SIZE, DMA_FROM_DEVICE);
+	}
+
+	/* clear rxb content */
+	rxb->page = NULL;
+
+	return skb;
+}
+
 static inline void gfar_rx_checksum(struct sk_buff *skb, struct rxfcb *fcb)
 {
 	/* If valid headers were found, and valid sums
@@ -2902,14 +2965,14 @@
 int gfar_clean_rx_ring(struct gfar_priv_rx_q *rx_queue, int rx_work_limit)
 {
 	struct net_device *ndev = rx_queue->ndev;
-	struct rxbd8 *bdp, *base;
-	struct sk_buff *skb;
-	int i, howmany = 0;
-	int cleaned_cnt = gfar_rxbd_unused(rx_queue);
 	struct gfar_private *priv = netdev_priv(ndev);
+	struct rxbd8 *bdp;
+	int i, howmany = 0;
+	struct sk_buff *skb = rx_queue->skb;
+	int cleaned_cnt = gfar_rxbd_unused(rx_queue);
+	unsigned int total_bytes = 0, total_pkts = 0;
 
 	/* Get the first full descriptor */
-	base = rx_queue->rx_bd_base;
 	i = rx_queue->next_to_clean;
 
 	while (rx_work_limit--) {
@@ -2929,54 +2992,51 @@
 		rmb();
 
 		/* fetch next to clean buffer from the ring */
-		skb = rx_queue->rx_skbuff[i];
+		skb = gfar_get_next_rxbuff(rx_queue, lstatus, skb);
+		if (unlikely(!skb))
+			break;
 
-		dma_unmap_single(priv->dev, be32_to_cpu(bdp->bufPtr),
-				 priv->rx_buffer_size, DMA_FROM_DEVICE);
+		cleaned_cnt++;
+		howmany++;
 
-		if (unlikely(!(lstatus & BD_LFLAG(RXBD_ERR)) &&
-			     (lstatus & BD_LENGTH_MASK) > priv->rx_buffer_size))
-			lstatus |= BD_LFLAG(RXBD_LARGE);
+		if (unlikely(++i == rx_queue->rx_ring_size))
+			i = 0;
 
-		if (unlikely(!(lstatus & BD_LFLAG(RXBD_LAST)) ||
-			     (lstatus & BD_LFLAG(RXBD_ERR)))) {
+		rx_queue->next_to_clean = i;
+
+		/* fetch next buffer if not the last in frame */
+		if (!(lstatus & BD_LFLAG(RXBD_LAST)))
+			continue;
+
+		if (unlikely(lstatus & BD_LFLAG(RXBD_ERR))) {
 			count_errors(lstatus, ndev);
 
 			/* discard faulty buffer */
 			dev_kfree_skb(skb);
-
-		} else {
-			/* Increment the number of packets */
-			rx_queue->stats.rx_packets++;
-			howmany++;
-
-			if (likely(skb)) {
-				int pkt_len = (lstatus & BD_LENGTH_MASK) -
-					  ETH_FCS_LEN;
-				/* Remove the FCS from the packet length */
-				skb_put(skb, pkt_len);
-				rx_queue->stats.rx_bytes += pkt_len;
-				skb_record_rx_queue(skb, rx_queue->qindex);
-				gfar_process_frame(ndev, skb);
-
-				/* Send the packet up the stack */
-				napi_gro_receive(&rx_queue->grp->napi_rx, skb);
-
-			} else {
-				netif_warn(priv, rx_err, ndev, "Missing skb!\n");
-				rx_queue->stats.rx_dropped++;
-				atomic64_inc(&priv->extra_stats.rx_skbmissing);
-			}
-
+			skb = NULL;
+			rx_queue->stats.rx_dropped++;
+			continue;
 		}
 
-		rx_queue->rx_skbuff[i] = NULL;
-		cleaned_cnt++;
-		if (unlikely(++i == rx_queue->rx_ring_size))
-			i = 0;
+		/* Increment the number of packets */
+		total_pkts++;
+		total_bytes += skb->len;
+
+		skb_record_rx_queue(skb, rx_queue->qindex);
+
+		gfar_process_frame(ndev, skb);
+
+		/* Send the packet up the stack */
+		napi_gro_receive(&rx_queue->grp->napi_rx, skb);
+
+		skb = NULL;
 	}
 
-	rx_queue->next_to_clean = i;
+	/* Store incomplete frames for completion */
+	rx_queue->skb = skb;
+
+	rx_queue->stats.rx_packets += total_pkts;
+	rx_queue->stats.rx_bytes += total_bytes;
 
 	if (cleaned_cnt)
 		gfar_alloc_rx_buffs(rx_queue, cleaned_cnt);