ARM: 7877/1: use built-in byte swap function

Enable the compiler intrinsic for byte swapping on arch ARM. This
allows the compiler to detect and be able to optimize out byte
swappings, and has a very modest benefit on vmlinux size (Linaro gcc
4.8):

text data bss dec hex filename
2840310 123932 61960 3026202 2e2d1a vmlinux-lart #orig
2840152 123932 61960 3026044 2e2c7c vmlinux-lart #builtin-bswap

6473120 314840 5616016 12403976 bd4508 vmlinux-mxs #orig
6472586 314848 5616016 12403450 bd42fa vmlinux-mxs #builtin-bswap

7419872 318372 379556 8117800 7bde28 vmlinux-imx_v6_v7 #orig
7419170 318364 379556 8117090 7bdb62 vmlinux-imx_v6_v7 #builtin-bswap

Signed-off-by: Kim Phillips <kim.phillips@freescale.com>
Reviewed-by: Nicolas Pitre <nico@linaro.org>
Acked-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index 47d7338..0573faa 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -13,7 +13,7 @@
 		   ashldi3.o ashrdi3.o lshrdi3.o muldi3.o             \
 		   ucmpdi2.o lib1funcs.o div64.o                      \
 		   io-readsb.o io-writesb.o io-readsl.o io-writesl.o  \
-		   call_with_stack.o
+		   call_with_stack.o bswapsdi2.o
 
 mmu-y	:= clear_user.o copy_page.o getuser.o putuser.o
 
diff --git a/arch/arm/lib/bswapsdi2.S b/arch/arm/lib/bswapsdi2.S
new file mode 100644
index 0000000..9fcdd15
--- /dev/null
+++ b/arch/arm/lib/bswapsdi2.S
@@ -0,0 +1,36 @@
+#include <linux/linkage.h>
+
+#if __LINUX_ARM_ARCH__ >= 6
+ENTRY(__bswapsi2)
+	rev r0, r0
+	bx lr
+ENDPROC(__bswapsi2)
+
+ENTRY(__bswapdi2)
+	rev r3, r0
+	rev r0, r1
+	mov r1, r3
+	bx lr
+ENDPROC(__bswapdi2)
+#else
+ENTRY(__bswapsi2)
+	eor r3, r0, r0, ror #16
+	mov r3, r3, lsr #8
+	bic r3, r3, #0xff00
+	eor r0, r3, r0, ror #8
+	mov pc, lr
+ENDPROC(__bswapsi2)
+
+ENTRY(__bswapdi2)
+	mov ip, r1
+	eor r3, ip, ip, ror #16
+	eor r1, r0, r0, ror #16
+	mov r1, r1, lsr #8
+	mov r3, r3, lsr #8
+	bic r3, r3, #0xff00
+	bic r1, r1, #0xff00
+	eor r1, r1, r0, ror #8
+	eor r0, r3, ip, ror #8
+	mov pc, lr
+ENDPROC(__bswapdi2)
+#endif