VME: Prevent D16 cycles being split into 8-bit blocks

The memcpy_fromio() and memcpy_toio() functions use __memcpy(), at least on
x86. That function carries out transfers smaller than 32 bits as multiple
8-bit transfers, so a single (aligned) 16-bit transfer is split into two
8-bit transfers, which may not be supported by the target VME device.
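
For example, on x86 an aligned two-byte copy through memcpy_fromio() may
reach the VME bus as two D8 cycles, whereas ioread16() issues a single
16-bit access (a minimal illustration using the standard kernel I/O
accessors; buf, image and offset as in the functions patched below):

	/* May be split into two 8-bit reads by __memcpy(): */
	memcpy_fromio(buf, image->kern_base + offset, 2);

	/* Performs a single 16-bit read, i.e. one D16 cycle: */
	*(u16 *)buf = ioread16(image->kern_base + offset);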

Commit 53059aa05988761a738fa8bc082bbf3c5d4462d1 fixed this for the ca91cx42;
however, the tsi148 was not fixed at the time. This patch applies the same
algorithm to the tsi148.
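
In outline: leading byte and word transfers bring the VME address up to
32-bit alignment, the aligned middle of the transfer goes through
memcpy_fromio()/memcpy_toio() as 32-bit cycles, and trailing word and
byte transfers complete it. As a worked example (address hypothetical),
a 10-byte read starting at a VME address that is 2 modulo 4 breaks down
as:

	ioread16()        2 bytes  brings the address to 32-bit alignment
	memcpy_fromio()   8 bytes  bulk of the transfer in 32-bit cycles
	                 10 bytes  total, with no 8-bit cycles on the bus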

Reported-by: Daniel Lambert <daniel.lambert@clermont.in2p3.fr>
Signed-off-by: Martyn Welch <martyn.welch@ge.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
diff --git a/drivers/vme/bridges/vme_tsi148.c b/drivers/vme/bridges/vme_tsi148.c
index f6385f7..880d924 100644
--- a/drivers/vme/bridges/vme_tsi148.c
+++ b/drivers/vme/bridges/vme_tsi148.c
@@ -1263,12 +1263,55 @@
 	u32 aspace, cycle, dwidth;
 	struct vme_bus_error *vme_err = NULL;
 	struct vme_bridge *tsi148_bridge;
+	void __iomem *addr = image->kern_base + offset;
+	unsigned int done = 0;
+	unsigned int count32;
 
 	tsi148_bridge = image->parent;
 
 	spin_lock(&image->lock);
 
-	memcpy_fromio(buf, image->kern_base + offset, (unsigned int)count);
+	/* The following code handles VME address alignment. We cannot use
+	 * memcpy_xxx directly here because it may cut small data transfers
+	 * into 8-bit cycles, making D16 cycles impossible.
+	 * On the other hand, the bridge itself ensures that the maximum data
+	 * width configured for the transfer is used, splitting it
+	 * automatically for non-aligned addresses, so we don't want the
+	 * overhead of needlessly forcing small transfers for the entire cycle.
+	 */
+	if ((uintptr_t)addr & 0x1) {
+		*(u8 *)buf = ioread8(addr);
+		done += 1;
+		if (done == count)
+			goto out;
+	}
+	if ((uintptr_t)(addr + done) & 0x2) {
+		if ((count - done) < 2) {
+			*(u8 *)(buf + done) = ioread8(addr + done);
+			done += 1;
+			goto out;
+		} else {
+			*(u16 *)(buf + done) = ioread16(addr + done);
+			done += 2;
+		}
+	}
+
+	count32 = (count - done) & ~0x3;
+	if (count32 > 0) {
+		memcpy_fromio(buf + done, addr + done, count32);
+		done += count32;
+	}
+
+	if ((count - done) & 0x2) {
+		*(u16 *)(buf + done) = ioread16(addr + done);
+		done += 2;
+	}
+	if ((count - done) & 0x1) {
+		*(u8 *)(buf + done) = ioread8(addr + done);
+		done += 1;
+	}
+
+out:
 	retval = count;
 
 	if (!err_chk)
@@ -1301,6 +1344,9 @@
 	int retval = 0, enabled;
 	unsigned long long vme_base, size;
 	u32 aspace, cycle, dwidth;
+	void __iomem *addr = image->kern_base + offset;
+	unsigned int done = 0;
+	unsigned int count32;
 
 	struct vme_bus_error *vme_err = NULL;
 	struct vme_bridge *tsi148_bridge;
@@ -1312,7 +1358,42 @@
 
 	spin_lock(&image->lock);
 
-	memcpy_toio(image->kern_base + offset, buf, (unsigned int)count);
+	/* Here we apply the same strategy as in the master_read
+	 * function above, in order to assure D16 cycles when required.
+	 */
+	if ((uintptr_t)addr & 0x1) {
+		iowrite8(*(u8 *)buf, addr);
+		done += 1;
+		if (done == count)
+			goto out;
+	}
+	if ((uintptr_t)(addr + done) & 0x2) {
+		if ((count - done) < 2) {
+			iowrite8(*(u8 *)(buf + done), addr + done);
+			done += 1;
+			goto out;
+		} else {
+			iowrite16(*(u16 *)(buf + done), addr + done);
+			done += 2;
+		}
+	}
+
+	count32 = (count - done) & ~0x3;
+	if (count32 > 0) {
+		memcpy_toio(addr + done, buf + done, count32);
+		done += count32;
+	}
+
+	if ((count - done) & 0x2) {
+		iowrite16(*(u16 *)(buf + done), addr + done);
+		done += 2;
+	}
+	if ((count - done) & 0x1) {
+		iowrite8(*(u8 *)(buf + done), addr + done);
+		done += 1;
+	}
+
+out:
 	retval = count;
 
 	/*