usb: ISP1760: improve pre-fetch timing

ISP1760 requires a delay of 90ns between programming the address and
reading the data.  Current driver solves this by a mdelay(1) which is
very heavy weighted and slow.  This patch applies the workaround from
the ISP1760 FAQ by using two different banks for PTD and payload data
and using a common wait for them.  This wait is done by an additional
ISP1760 access (whose timing constraints guarantee the 90ns delay).
This improves speed when reading from an USB stick from:

  $ time dd if=/dev/sda of=/dev/zero bs=65536 count=1638
  real    1m 15.43s
  user    0m 0.44s
  sys     0m 39.46s

to

  $ time dd if=/dev/sda of=/dev/zero bs=65536 count=1638
  real    0m 18.53s
  user    0m 0.16s
  sys     0m 12.97s

[bigeasy@linutronix.de: fixed comment formating, moved define into
   header file, obey 80 char rule]

Signed-off-by: Enrico Scholz <enrico.scholz@sigma-chemnitz.de>
Signed-off-by: Sebastian Siewior <bigeasy@linutronix.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

diff --git a/drivers/usb/host/isp1760-hcd.c b/drivers/usb/host/isp1760-hcd.c
index db5f4f5..e209b3b 100644
--- a/drivers/usb/host/isp1760-hcd.c
+++ b/drivers/usb/host/isp1760-hcd.c
@@ -126,9 +126,8 @@
  * doesn't quite work because some people have to enforce 32-bit access
  */
 static void priv_read_copy(struct isp1760_hcd *priv, u32 *src,
-		__u32 __iomem *dst, u32 offset, u32 len)
+		__u32 __iomem *dst, u32 len)
 {
-	struct usb_hcd *hcd = priv_to_hcd(priv);
 	u32 val;
 	u8 *buff8;
 
@@ -136,11 +135,6 @@
 		printk(KERN_ERR "ERROR: buffer: %p len: %d\n", src, len);
 		return;
 	}
-	isp1760_writel(offset,  hcd->regs + HC_MEMORY_REG);
-	/* XXX
-	 * 90nsec delay, the spec says something how this could be avoided.
-	 */
-	mdelay(1);
 
 	while (len >= 4) {
 		*src = __raw_readl(dst);
@@ -987,8 +981,20 @@
 			printk(KERN_ERR "qh is 0\n");
 			continue;
 		}
-		priv_read_copy(priv, (u32 *)&ptd, usb_hcd->regs + atl_regs,
-				atl_regs, sizeof(ptd));
+		isp1760_writel(atl_regs + ISP_BANK(0), usb_hcd->regs +
+				HC_MEMORY_REG);
+		isp1760_writel(payload  + ISP_BANK(1), usb_hcd->regs +
+				HC_MEMORY_REG);
+		/*
+		 * write bank1 address twice to ensure the 90ns delay (time
+		 * between BANK0 write and the priv_read_copy() call is at
+		 * least 3*t_WHWL + 2*t_w11 = 3*25ns + 2*17ns = 92ns)
+		 */
+		isp1760_writel(payload  + ISP_BANK(1), usb_hcd->regs +
+				HC_MEMORY_REG);
+
+		priv_read_copy(priv, (u32 *)&ptd, usb_hcd->regs + atl_regs +
+				ISP_BANK(0), sizeof(ptd));
 
 		dw1 = le32_to_cpu(ptd.dw1);
 		dw2 = le32_to_cpu(ptd.dw2);
@@ -1091,7 +1097,7 @@
 			case IN_PID:
 				priv_read_copy(priv,
 					priv->atl_ints[queue_entry].data_buffer,
-					usb_hcd->regs + payload, payload,
+					usb_hcd->regs + payload + ISP_BANK(1),
 					length);
 
 			case OUT_PID:
@@ -1206,8 +1212,20 @@
 			continue;
 		}
 
-		priv_read_copy(priv, (u32 *)&ptd, usb_hcd->regs + int_regs,
-				int_regs, sizeof(ptd));
+		isp1760_writel(int_regs + ISP_BANK(0), usb_hcd->regs +
+				HC_MEMORY_REG);
+		isp1760_writel(payload  + ISP_BANK(1), usb_hcd->regs +
+				HC_MEMORY_REG);
+		/*
+		 * write bank1 address twice to ensure the 90ns delay (time
+		 * between BANK0 write and the priv_read_copy() call is at
+		 * least 3*t_WHWL + 2*t_w11 = 3*25ns + 2*17ns = 92ns)
+		 */
+		isp1760_writel(payload  + ISP_BANK(1), usb_hcd->regs +
+				HC_MEMORY_REG);
+
+		priv_read_copy(priv, (u32 *)&ptd, usb_hcd->regs + int_regs +
+				ISP_BANK(0), sizeof(ptd));
 		dw1 = le32_to_cpu(ptd.dw1);
 		dw3 = le32_to_cpu(ptd.dw3);
 		check_int_err_status(le32_to_cpu(ptd.dw4));
@@ -1242,7 +1260,7 @@
 			case IN_PID:
 				priv_read_copy(priv,
 					priv->int_ints[queue_entry].data_buffer,
-					usb_hcd->regs + payload , payload,
+					usb_hcd->regs + payload + ISP_BANK(1),
 					length);
 			case OUT_PID:
 
diff --git a/drivers/usb/host/isp1760-hcd.h b/drivers/usb/host/isp1760-hcd.h
index 6473dd8..3c61cd5 100644
--- a/drivers/usb/host/isp1760-hcd.h
+++ b/drivers/usb/host/isp1760-hcd.h
@@ -54,6 +54,8 @@
 #define BUFFER_MAP		0x7
 
 #define HC_MEMORY_REG		0x33c
+#define ISP_BANK(x)		((x) << 16)
+
 #define HC_PORT1_CTRL		0x374
 #define PORT1_POWER		(3 << 3)
 #define PORT1_INIT1		(1 << 7)