spi: sh-msiof: Add DMA support

Add DMA support to the MSIOF driver using platform data.

As MSIOF DMA is limited to 32-bit words (requiring byte/wordswapping for
smaller wordsizes), and the group length is limited to 256 words, DMA is
performed on two fixed pages, allocated and mapped at driver initialization
time.

Performance figures (in Mbps) on r8a7791/koelsch at different SPI clock
frequencies for 1024-byte and 4096-byte transfers:

                   1024 bytes           4096 bytes
  -  3.25 MHz: PIO  2.1, DMA  2.6 | PIO  2.8, DMA  3.1
  -  6.5  MHz: PIO  3.2, DMA  4.4 | PIO  5.0, DMA  5.9
  - 13    MHz: PIO  4.2, DMA  6.6 | PIO  8.2, DMA 10.7
  - 26    MHz: PIO  5.9, DMA 10.4 | PIO 12.4, DMA 18.4

Note that DMA is only faster than PIO for transfers that exceed the FIFO
size (typically 64 words / 256 bytes).

Also note that large transfers (larger than the group length for DMA, or
larger than the FIFO size for PIO), should use cs-gpio (with the
appropriate pinmux setup), as the hardware chipselect will be deasserted in
between chunks.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Mark Brown <broonie@linaro.org>
diff --git a/drivers/spi/spi-sh-msiof.c b/drivers/spi/spi-sh-msiof.c
index 824f44e..9922ed3 100644
--- a/drivers/spi/spi-sh-msiof.c
+++ b/drivers/spi/spi-sh-msiof.c
@@ -2,6 +2,7 @@
  * SuperH MSIOF SPI Master Interface
  *
  * Copyright (c) 2009 Magnus Damm
+ * Copyright (C) 2014 Glider bvba
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -13,6 +14,8 @@
 #include <linux/clk.h>
 #include <linux/completion.h>
 #include <linux/delay.h>
+#include <linux/dma-mapping.h>
+#include <linux/dmaengine.h>
 #include <linux/err.h>
 #include <linux/gpio.h>
 #include <linux/interrupt.h>
@@ -23,6 +26,7 @@
 #include <linux/of_device.h>
 #include <linux/platform_device.h>
 #include <linux/pm_runtime.h>
+#include <linux/sh_dma.h>
 
 #include <linux/spi/sh_msiof.h>
 #include <linux/spi/spi.h>
@@ -37,6 +41,7 @@
 };
 
 struct sh_msiof_spi_priv {
+	struct spi_master *master;
 	void __iomem *mapbase;
 	struct clk *clk;
 	struct platform_device *pdev;
@@ -45,6 +50,10 @@
 	struct completion done;
 	int tx_fifo_size;
 	int rx_fifo_size;
+	void *tx_dma_page;
+	void *rx_dma_page;
+	dma_addr_t tx_dma_addr;
+	dma_addr_t rx_dma_addr;
 };
 
 #define TMDR1	0x00	/* Transmit Mode Register 1 */
@@ -84,6 +93,8 @@
 #define MDR2_WDLEN1(i)	(((i) - 1) << 16) /* Word Count (1-64/256 (SH, A1))) */
 #define MDR2_GRPMASK1	0x00000001 /* Group Output Mask 1 (SH, A1) */
 
+#define MAX_WDLEN	256U
+
 /* TSCR and RSCR */
 #define SCR_BRPS_MASK	    0x1f00 /* Prescaler Setting (1-32) */
 #define SCR_BRPS(i)	(((i) - 1) << 8)
@@ -282,8 +293,6 @@
 	 *    1    0         11     11    0    0
 	 *    1    1         11     11    1    1
 	 */
-	sh_msiof_write(p, FCTR, 0);
-
 	tmp = MDR1_SYNCMD_SPI | 1 << MDR1_FLD_SHIFT | MDR1_XXSTP;
 	tmp |= !cs_high << MDR1_SYNCAC_SHIFT;
 	tmp |= lsb_first << MDR1_BITLSB_SHIFT;
@@ -319,8 +328,6 @@
 
 	if (rx_buf)
 		sh_msiof_write(p, RMDR2, dr2);
-
-	sh_msiof_write(p, IER, STR_TEOF | STR_REOF);
 }
 
 static void sh_msiof_reset_str(struct sh_msiof_spi_priv *p)
@@ -563,8 +570,12 @@
 	/* the fifo contents need shifting */
 	fifo_shift = 32 - bits;
 
+	/* default FIFO watermarks for PIO */
+	sh_msiof_write(p, FCTR, 0);
+
 	/* setup msiof transfer mode registers */
 	sh_msiof_spi_set_mode_regs(p, tx_buf, rx_buf, bits, words);
+	sh_msiof_write(p, IER, IER_TEOFE | IER_REOFE);
 
 	/* write tx fifo */
 	if (tx_buf)
@@ -609,11 +620,170 @@
 	return ret;
 }
 
+static void sh_msiof_dma_complete(void *arg)
+{
+	struct sh_msiof_spi_priv *p = arg;
+
+	sh_msiof_write(p, IER, 0);
+	complete(&p->done);
+}
+
+static int sh_msiof_dma_once(struct sh_msiof_spi_priv *p, const void *tx,
+			     void *rx, unsigned int len)
+{
+	u32 ier_bits = 0;
+	struct dma_async_tx_descriptor *desc_tx = NULL, *desc_rx = NULL;
+	dma_cookie_t cookie;
+	int ret;
+
+	/* 1 stage FIFO watermarks for DMA */
+	sh_msiof_write(p, FCTR, FCTR_TFWM_1 | FCTR_RFWM_1);
+
+	/* setup msiof transfer mode registers (32-bit words) */
+	sh_msiof_spi_set_mode_regs(p, tx, rx, 32, len / 4);
+
+	if (tx) {
+		ier_bits |= IER_TDREQE | IER_TDMAE;
+		dma_sync_single_for_device(&p->pdev->dev, p->tx_dma_addr, len,
+					   DMA_TO_DEVICE);
+		desc_tx = dmaengine_prep_slave_single(p->master->dma_tx,
+					p->tx_dma_addr, len, DMA_TO_DEVICE,
+					DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+		if (!desc_tx)
+			return -EIO;
+	}
+
+	if (rx) {
+		ier_bits |= IER_RDREQE | IER_RDMAE;
+		desc_rx = dmaengine_prep_slave_single(p->master->dma_rx,
+					p->rx_dma_addr, len, DMA_FROM_DEVICE,
+					DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+		if (!desc_rx)
+			return -EIO;
+	}
+	sh_msiof_write(p, IER, ier_bits);
+
+	reinit_completion(&p->done);
+
+	if (rx) {
+		desc_rx->callback = sh_msiof_dma_complete;
+		desc_rx->callback_param = p;
+		cookie = dmaengine_submit(desc_rx);
+		if (dma_submit_error(cookie)) {
+			ret = cookie;
+			goto stop_ier;
+		}
+		dma_async_issue_pending(p->master->dma_rx);
+	}
+
+	if (tx) {
+		if (rx) {
+			/* No callback */
+			desc_tx->callback = NULL;
+		} else {
+			desc_tx->callback = sh_msiof_dma_complete;
+			desc_tx->callback_param = p;
+		}
+		cookie = dmaengine_submit(desc_tx);
+		if (dma_submit_error(cookie)) {
+			ret = cookie;
+			goto stop_rx;
+		}
+		dma_async_issue_pending(p->master->dma_tx);
+	}
+
+	ret = sh_msiof_spi_start(p, rx);
+	if (ret) {
+		dev_err(&p->pdev->dev, "failed to start hardware\n");
+		goto stop_tx;
+	}
+
+	/* wait for tx fifo to be emptied / rx fifo to be filled */
+	ret = wait_for_completion_timeout(&p->done, HZ);
+	if (!ret) {
+		dev_err(&p->pdev->dev, "DMA timeout\n");
+		ret = -ETIMEDOUT;
+		goto stop_reset;
+	}
+
+	/* clear status bits */
+	sh_msiof_reset_str(p);
+
+	ret = sh_msiof_spi_stop(p, rx);
+	if (ret) {
+		dev_err(&p->pdev->dev, "failed to shut down hardware\n");
+		return ret;
+	}
+
+	if (rx)
+		dma_sync_single_for_cpu(&p->pdev->dev, p->rx_dma_addr, len,
+					DMA_FROM_DEVICE);
+
+	return 0;
+
+stop_reset:
+	sh_msiof_reset_str(p);
+	sh_msiof_spi_stop(p, rx);
+stop_tx:
+	if (tx)
+		dmaengine_terminate_all(p->master->dma_tx);
+stop_rx:
+	if (rx)
+		dmaengine_terminate_all(p->master->dma_rx);
+stop_ier:
+	sh_msiof_write(p, IER, 0);
+	return ret;
+}
+
+static void copy_bswap32(u32 *dst, const u32 *src, unsigned int words)
+{
+	/* src or dst can be unaligned, but not both */
+	if ((unsigned long)src & 3) {
+		while (words--) {
+			*dst++ = swab32(get_unaligned(src));
+			src++;
+		}
+	} else if ((unsigned long)dst & 3) {
+		while (words--) {
+			put_unaligned(swab32(*src++), dst);
+			dst++;
+		}
+	} else {
+		while (words--)
+			*dst++ = swab32(*src++);
+	}
+}
+
+static void copy_wswap32(u32 *dst, const u32 *src, unsigned int words)
+{
+	/* src or dst can be unaligned, but not both */
+	if ((unsigned long)src & 3) {
+		while (words--) {
+			*dst++ = swahw32(get_unaligned(src));
+			src++;
+		}
+	} else if ((unsigned long)dst & 3) {
+		while (words--) {
+			put_unaligned(swahw32(*src++), dst);
+			dst++;
+		}
+	} else {
+		while (words--)
+			*dst++ = swahw32(*src++);
+	}
+}
+
+static void copy_plain32(u32 *dst, const u32 *src, unsigned int words)
+{
+	memcpy(dst, src, words * 4);
+}
+
 static int sh_msiof_transfer_one(struct spi_master *master,
 				 struct spi_device *spi,
 				 struct spi_transfer *t)
 {
 	struct sh_msiof_spi_priv *p = spi_master_get_devdata(master);
+	void (*copy32)(u32 *, const u32 *, unsigned int);
 	void (*tx_fifo)(struct sh_msiof_spi_priv *, const void *, int, int);
 	void (*rx_fifo)(struct sh_msiof_spi_priv *, void *, int, int);
 	const void *tx_buf = t->tx_buf;
@@ -624,7 +794,48 @@
 	unsigned int words;
 	int n;
 	bool swab;
+	int ret;
 
+	/* setup clocks (clock already enabled in chipselect()) */
+	sh_msiof_spi_set_clk_regs(p, clk_get_rate(p->clk), t->speed_hz);
+
+	while (master->dma_tx && len > 15) {
+		/*
+		 *  DMA supports 32-bit words only, hence pack 8-bit and 16-bit
+		 *  words, with byte resp. word swapping.
+		 */
+		unsigned int l = min(len, MAX_WDLEN * 4);
+
+		if (bits <= 8) {
+			if (l & 3)
+				break;
+			copy32 = copy_bswap32;
+		} else if (bits <= 16) {
+			if (l & 1)
+				break;
+			copy32 = copy_wswap32;
+		} else {
+			copy32 = copy_plain32;
+		}
+
+		if (tx_buf)
+			copy32(p->tx_dma_page, tx_buf, l / 4);
+
+		ret = sh_msiof_dma_once(p, tx_buf, rx_buf, l);
+		if (ret)
+			return ret;
+
+		if (rx_buf) {
+			copy32(rx_buf, p->rx_dma_page, l / 4);
+			rx_buf += l;
+		}
+		if (tx_buf)
+			tx_buf += l;
+
+		len -= l;
+		if (!len)
+			return 0;
+	}
 
 	if (bits <= 8 && len > 15 && !(len & 3)) {
 		bits = 32;
@@ -673,9 +884,6 @@
 			rx_fifo = sh_msiof_spi_read_fifo_32;
 	}
 
-	/* setup clocks (clock already enabled in chipselect()) */
-	sh_msiof_spi_set_clk_regs(p, clk_get_rate(p->clk), t->speed_hz);
-
 	/* transfer in fifo sized chunks */
 	words = len / bytes_per_word;
 
@@ -745,6 +953,123 @@
 }
 #endif
 
+static struct dma_chan *sh_msiof_request_dma_chan(struct device *dev,
+	enum dma_transfer_direction dir, unsigned int id, dma_addr_t port_addr)
+{
+	dma_cap_mask_t mask;
+	struct dma_chan *chan;
+	struct dma_slave_config cfg;
+	int ret;
+
+	dma_cap_zero(mask);
+	dma_cap_set(DMA_SLAVE, mask);
+
+	chan = dma_request_channel(mask, shdma_chan_filter,
+				  (void *)(unsigned long)id);
+	if (!chan) {
+		dev_warn(dev, "dma_request_channel failed\n");
+		return NULL;
+	}
+
+	memset(&cfg, 0, sizeof(cfg));
+	cfg.slave_id = id;
+	cfg.direction = dir;
+	if (dir == DMA_MEM_TO_DEV)
+		cfg.dst_addr = port_addr;
+	else
+		cfg.src_addr = port_addr;
+
+	ret = dmaengine_slave_config(chan, &cfg);
+	if (ret) {
+		dev_warn(dev, "dmaengine_slave_config failed %d\n", ret);
+		dma_release_channel(chan);
+		return NULL;
+	}
+
+	return chan;
+}
+
+static int sh_msiof_request_dma(struct sh_msiof_spi_priv *p)
+{
+	struct platform_device *pdev = p->pdev;
+	struct device *dev = &pdev->dev;
+	const struct sh_msiof_spi_info *info = dev_get_platdata(dev);
+	const struct resource *res;
+	struct spi_master *master;
+
+	if (!info || !info->dma_tx_id || !info->dma_rx_id)
+		return 0;	/* The driver assumes no error */
+
+	/* The DMA engine uses the second register set, if present */
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+	if (!res)
+		res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+
+	master = p->master;
+	master->dma_tx = sh_msiof_request_dma_chan(dev, DMA_MEM_TO_DEV,
+						   info->dma_tx_id,
+						   res->start + TFDR);
+	if (!master->dma_tx)
+		return -ENODEV;
+
+	master->dma_rx = sh_msiof_request_dma_chan(dev, DMA_DEV_TO_MEM,
+						   info->dma_rx_id,
+						   res->start + RFDR);
+	if (!master->dma_rx)
+		goto free_tx_chan;
+
+	p->tx_dma_page = (void *)__get_free_page(GFP_KERNEL | GFP_DMA);
+	if (!p->tx_dma_page)
+		goto free_rx_chan;
+
+	p->rx_dma_page = (void *)__get_free_page(GFP_KERNEL | GFP_DMA);
+	if (!p->rx_dma_page)
+		goto free_tx_page;
+
+	p->tx_dma_addr = dma_map_single(dev, p->tx_dma_page, PAGE_SIZE,
+					DMA_TO_DEVICE);
+	if (dma_mapping_error(dev, p->tx_dma_addr))
+		goto free_rx_page;
+
+	p->rx_dma_addr = dma_map_single(dev, p->rx_dma_page, PAGE_SIZE,
+					DMA_FROM_DEVICE);
+	if (dma_mapping_error(dev, p->rx_dma_addr))
+		goto unmap_tx_page;
+
+	dev_info(dev, "DMA available");
+	return 0;
+
+unmap_tx_page:
+	dma_unmap_single(dev, p->tx_dma_addr, PAGE_SIZE, DMA_TO_DEVICE);
+free_rx_page:
+	free_page((unsigned long)p->rx_dma_page);
+free_tx_page:
+	free_page((unsigned long)p->tx_dma_page);
+free_rx_chan:
+	dma_release_channel(master->dma_rx);
+free_tx_chan:
+	dma_release_channel(master->dma_tx);
+	master->dma_tx = NULL;
+	return -ENODEV;
+}
+
+static void sh_msiof_release_dma(struct sh_msiof_spi_priv *p)
+{
+	struct spi_master *master = p->master;
+	struct device *dev;
+
+	if (!master->dma_tx)
+		return;
+
+	dev = &p->pdev->dev;
+	dma_unmap_single(dev, p->rx_dma_addr, PAGE_SIZE, DMA_FROM_DEVICE);
+	dma_unmap_single(dev, p->tx_dma_addr, PAGE_SIZE, DMA_TO_DEVICE);
+	free_page((unsigned long)p->rx_dma_page);
+	free_page((unsigned long)p->tx_dma_page);
+	dma_release_channel(master->dma_rx);
+	dma_release_channel(master->dma_tx);
+}
+
 static int sh_msiof_spi_probe(struct platform_device *pdev)
 {
 	struct resource	*r;
@@ -763,6 +1088,7 @@
 	p = spi_master_get_devdata(master);
 
 	platform_set_drvdata(pdev, p);
+	p->master = master;
 
 	of_id = of_match_device(sh_msiof_match, &pdev->dev);
 	if (of_id) {
@@ -833,6 +1159,10 @@
 	master->auto_runtime_pm = true;
 	master->transfer_one = sh_msiof_transfer_one;
 
+	ret = sh_msiof_request_dma(p);
+	if (ret < 0)
+		dev_warn(&pdev->dev, "DMA not available, using PIO\n");
+
 	ret = devm_spi_register_master(&pdev->dev, master);
 	if (ret < 0) {
 		dev_err(&pdev->dev, "spi_register_master error.\n");
@@ -842,6 +1172,7 @@
 	return 0;
 
  err2:
+	sh_msiof_release_dma(p);
 	pm_runtime_disable(&pdev->dev);
  err1:
 	spi_master_put(master);
@@ -850,6 +1181,9 @@
 
 static int sh_msiof_spi_remove(struct platform_device *pdev)
 {
+	struct sh_msiof_spi_priv *p = platform_get_drvdata(pdev);
+
+	sh_msiof_release_dma(p);
 	pm_runtime_disable(&pdev->dev);
 	return 0;
 }
diff --git a/include/linux/spi/sh_msiof.h b/include/linux/spi/sh_msiof.h
index 2e8db3d..88a14d8 100644
--- a/include/linux/spi/sh_msiof.h
+++ b/include/linux/spi/sh_msiof.h
@@ -5,6 +5,8 @@
 	int tx_fifo_override;
 	int rx_fifo_override;
 	u16 num_chipselect;
+	unsigned int dma_tx_id;
+	unsigned int dma_rx_id;
 };
 
 #endif /* __SPI_SH_MSIOF_H__ */