m68knommu: add optimize memmove() function

Add an m68k/coldfire optimized memmove() function for the m68knommu arch.
This is the same function as used by m68k. Simple speed tests show this
is faster once buffers are larger than 4 bytes, and significantly faster
on much larger buffers (4 times faster above about 100 bytes).

This also goes part of the way to fixing a regression caused by commit
ea61bc461d09e8d331a307916530aaae808c72a2 ("m68k/m68knommu: merge MMU and
non-MMU string.h"), which breaks non-coldfire non-mmu builds (which is
the 68x328 and 68360 families). They currently have no memmove() fucntion
defined, since there was none in the m68knommu/lib functions.

Signed-off-by: Greg Ungerer <gerg@uclinux.org>
diff --git a/arch/m68k/include/asm/string.h b/arch/m68k/include/asm/string.h
index ffc3c3f..3219845 100644
--- a/arch/m68k/include/asm/string.h
+++ b/arch/m68k/include/asm/string.h
@@ -99,10 +99,10 @@
 		: "+a" (cs), "+a" (ct), "=d" (res));
 	return res;
 }
+#endif /* CONFIG_COLDFIRE */
 
 #define __HAVE_ARCH_MEMMOVE
 extern void *memmove(void *, const void *, __kernel_size_t);
-#endif /* CONFIG_COLDFIRE */
 
 #define memcmp(d, s, n) __builtin_memcmp(d, s, n)
 
diff --git a/arch/m68knommu/lib/Makefile b/arch/m68knommu/lib/Makefile
index d94d709..32d852e 100644
--- a/arch/m68knommu/lib/Makefile
+++ b/arch/m68knommu/lib/Makefile
@@ -4,4 +4,4 @@
 
 lib-y	:= ashldi3.o ashrdi3.o lshrdi3.o \
 	   muldi3.o mulsi3.o divsi3.o udivsi3.o modsi3.o umodsi3.o \
-	   checksum.o memcpy.o memset.o delay.o
+	   checksum.o memcpy.o memmove.o memset.o delay.o
diff --git a/arch/m68knommu/lib/memmove.c b/arch/m68knommu/lib/memmove.c
new file mode 100644
index 0000000..b3dcfe9
--- /dev/null
+++ b/arch/m68knommu/lib/memmove.c
@@ -0,0 +1,105 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of this archive
+ * for more details.
+ */
+
+#define __IN_STRING_C
+
+#include <linux/module.h>
+#include <linux/string.h>
+
+void *memmove(void *dest, const void *src, size_t n)
+{
+	void *xdest = dest;
+	size_t temp;
+
+	if (!n)
+		return xdest;
+
+	if (dest < src) {
+		if ((long)dest & 1) {
+			char *cdest = dest;
+			const char *csrc = src;
+			*cdest++ = *csrc++;
+			dest = cdest;
+			src = csrc;
+			n--;
+		}
+		if (n > 2 && (long)dest & 2) {
+			short *sdest = dest;
+			const short *ssrc = src;
+			*sdest++ = *ssrc++;
+			dest = sdest;
+			src = ssrc;
+			n -= 2;
+		}
+		temp = n >> 2;
+		if (temp) {
+			long *ldest = dest;
+			const long *lsrc = src;
+			temp--;
+			do
+				*ldest++ = *lsrc++;
+			while (temp--);
+			dest = ldest;
+			src = lsrc;
+		}
+		if (n & 2) {
+			short *sdest = dest;
+			const short *ssrc = src;
+			*sdest++ = *ssrc++;
+			dest = sdest;
+			src = ssrc;
+		}
+		if (n & 1) {
+			char *cdest = dest;
+			const char *csrc = src;
+			*cdest = *csrc;
+		}
+	} else {
+		dest = (char *)dest + n;
+		src = (const char *)src + n;
+		if ((long)dest & 1) {
+			char *cdest = dest;
+			const char *csrc = src;
+			*--cdest = *--csrc;
+			dest = cdest;
+			src = csrc;
+			n--;
+		}
+		if (n > 2 && (long)dest & 2) {
+			short *sdest = dest;
+			const short *ssrc = src;
+			*--sdest = *--ssrc;
+			dest = sdest;
+			src = ssrc;
+			n -= 2;
+		}
+		temp = n >> 2;
+		if (temp) {
+			long *ldest = dest;
+			const long *lsrc = src;
+			temp--;
+			do
+				*--ldest = *--lsrc;
+			while (temp--);
+			dest = ldest;
+			src = lsrc;
+		}
+		if (n & 2) {
+			short *sdest = dest;
+			const short *ssrc = src;
+			*--sdest = *--ssrc;
+			dest = sdest;
+			src = ssrc;
+		}
+		if (n & 1) {
+			char *cdest = dest;
+			const char *csrc = src;
+			*--cdest = *--csrc;
+		}
+	}
+	return xdest;
+}
+EXPORT_SYMBOL(memmove);