sh: __copy_user() optimizations for small copies.

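This implements a fast-path for small (less than 12 bytes) copies,
with the existing path treated as the slow-path and left as the default
behaviour for all other copy sizes. The longword-aligned destination
path also gains a 32-byte unrolled loop using movca.l for copies of 60
bytes or more, and the endianness checks switch from __LITTLE_ENDIAN__
to CONFIG_CPU_LITTLE_ENDIAN.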

Signed-off-by: Stuart Menefy <stuart.menefy@st.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
diff --git a/arch/sh/mm/copy_page.S b/arch/sh/mm/copy_page.S
index ae039f2..a81dbdb 100644
--- a/arch/sh/mm/copy_page.S
+++ b/arch/sh/mm/copy_page.S
@@ -141,47 +141,38 @@
 	.long 9999b, 6000f	;	\
 	.previous
 ENTRY(__copy_user)
-	tst	r6,r6		! Check explicitly for zero
-	bf	1f
-	rts
-	 mov	#0,r0		! normal return
-1:
-	mov.l	r10,@-r15
-	mov.l	r9,@-r15
-	mov.l	r8,@-r15
+	! Small copy check (r4 = dest, r5 = src, r6 = len); stack untouched so far
+	mov	#11,r0
 	mov	r4,r3
-	add	r6,r3		! last destination address
-	mov	#12,r0		! Check if small number of bytes
-	cmp/gt	r0,r6
-	bt	2f
-	bra	.L_cleanup_loop
-	 nop
-2:
-	neg	r5,r0		! Calculate bytes needed to align source
-	add	#4,r0
-	and	#3,r0
-	tst	r0,r0
-	bt	.L_jump
-	mov	r0,r1
+	cmp/gt	r0,r6		! r6 (len) > r0 (11)
+	bf/s	.L_cleanup_loop_no_pop
+	 add	r6,r3		! last destination address

-.L_loop1:
-	! Copy bytes to align source
-EX(	mov.b	@r5+,r0		)
-	dt	r1
-EX(	mov.b	r0,@r4		)
+	! Calculate bytes needed to align to src. r8-r11 are not pushed until
+	! label 2: a fault in the byte loop below is routed to the first 6000
+	! fixup after it, which returns without popping anything.
+	neg	r5,r0
+	add	#4,r0
+	and	#3,r0
+	tst	r0,r0
+	bt	2f
+1:	! Copy bytes to long word align src
+EX(	mov.b	@r5+,r1		)
+	dt	r0
 	add	#-1,r6
-	bf/s	.L_loop1
+EX(	mov.b	r1,@r4		)
+	bf/s	1b
 	 add	#1,r4

-.L_jump:
-	mov	r6,r2		! Calculate number of longwords to copy
+	! Save r8-r11, then jump to appropriate routine depending on dest
+2:	mov.l	r11,@-r15
+	mov	#3,r1
+	mov.l	r10,@-r15
+	mov	r6, r2		! r2 = len, shifted to a longword count below
+	mov.l	r9,@-r15
+	and	r4,r1		! r1 = dest & 3
+	mov.l	r8,@-r15
 	shlr2	r2
-	tst	r2,r2
-	bt	.L_cleanup
-
-	mov	r4,r0		! Jump to appropriate routine
-	and	#3,r0
-	mov	r0,r1
 	shll2	r1
 	mova	.L_jump_tbl,r0
 	mov.l	@(r0,r1),r1
@@ -195,43 +186,97 @@
 	.long	.L_dest10
 	.long	.L_dest11
 
+/*
+ * Come here if there are less than 12 bytes to copy. Nothing has been
+ * pushed on this path, so the exit below is a bare rts with no pops.
+ * Keep the branch target close, so the bf/s displacement doesn't overflow
+ * and result in a more expensive branch being inserted. This is the
+ * fast-path for small copies, the jump via the jump table will hit the
+ * default slow-path cleanup. -PFM.
+ */
+.L_cleanup_loop_no_pop:
+	tst	r6,r6		! Check explicitly for zero
+	bt	1f		! len == 0: nothing to copy
+
+2:
+EX(	mov.b	@r5+,r0		)
+	dt	r6		! decrement len, T set once it reaches zero
+EX(	mov.b	r0,@r4		)
+	bf/s	2b		! loop while bytes remain
+	 add	#1,r4		! (delay slot) advance dest
+
+1:	mov	#0,r0		! normal return
+5000:				! fixup below rejoins here with r0 already set
+	! NOTE(review): EX entries placed above here bind 6000f to this handler,
+# Exception handler:
+.section .fixup, "ax"
+6000:		! so they must run with nothing pushed - confirm for the align loop
+	mov.l	8000f,r1	! r1 = address of the 5000 label above
+	mov	r3,r0		! r3 = last destination address
+	jmp	@r1
+	 sub	r4,r0		! (delay slot) r0 = r3 - r4 = bytes left uncopied
+	.align	2
+8000:	.long	5000b
+	! Back in the previous section: plain return, no registers to restore
+.previous
+	rts
+	 nop
+
 ! Destination = 00
 
 .L_dest00:
-	mov	r2,r7
-	shlr2	r7
-	shlr	r7
-	tst	r7,r7
-	mov	#7,r0
-	bt/s	1f
-	 and	r0,r2
-	.align 2
+	! Skip the large copy for small transfers
+	mov	#(32+32-4), r0	! 60 = 32 + 28 (presumably one block plus max align bytes)
+	cmp/gt	r6, r0		! r0 (60) > r6 (len)
+	bt	1f		! len < 60: longword loop only
+
+	! Align dest to a 32 byte boundary
+	neg	r4,r0
+	add	#0x20, r0
+	and	#0x1f, r0	! r0 = bytes up to the next 32 byte boundary
+	tst	r0, r0
+	bt	2f		! dest already 32 byte aligned
+
+	sub	r0, r6		! len -= align bytes
+	shlr2	r0		! align bytes -> longword count
+3:
+EX(	mov.l	@r5+,r1		)
+	dt	r0
+EX(	mov.l	r1,@r4		)
+	bf/s	3b
+	 add	#4,r4		! (delay slot) advance dest
+
 2:
 EX(	mov.l	@r5+,r0		)
+EX(	mov.l	@r5+,r1		)
+EX(	mov.l	@r5+,r2		)
+EX(	mov.l	@r5+,r7		)
 EX(	mov.l	@r5+,r8		)
 EX(	mov.l	@r5+,r9		)
 EX(	mov.l	@r5+,r10	)
-EX(	mov.l	r0,@r4		)
-EX(	mov.l	r8,@(4,r4)	)
-EX(	mov.l	r9,@(8,r4)	)
-EX(	mov.l	r10,@(12,r4)	)
-EX(	mov.l	@r5+,r0		)
-EX(	mov.l	@r5+,r8		)
-EX(	mov.l	@r5+,r9		)
-EX(	mov.l	@r5+,r10	)
-	dt	r7
-EX(	mov.l	r0,@(16,r4)	)
-EX(	mov.l	r8,@(20,r4)	)
-EX(	mov.l	r9,@(24,r4)	)
-EX(	mov.l	r10,@(28,r4)	)
+EX(	mov.l	@r5+,r11	)
+EX(	movca.l	r0,@r4		)	! NOTE(review): movca.l is SH-4 only - confirm other CPU builds
+	add	#-32, r6	! len -= 32 for the block being stored
+EX(	mov.l	r1,@(4,r4)	)
+	mov	#32, r0		! r0 is free again once the movca.l has stored it
+EX(	mov.l	r2,@(8,r4)	)
+	cmp/gt	r6, r0		! r0 (32) > r6 (len)
+EX(	mov.l	r7,@(12,r4)	)
+EX(	mov.l	r8,@(16,r4)	)
+EX(	mov.l	r9,@(20,r4)	)
+EX(	mov.l	r10,@(24,r4)	)
+EX(	mov.l	r11,@(28,r4)	)
 	bf/s	2b
 	 add	#32,r4
-	tst	r2,r2
+
+1:	mov	r6, r0
+	shlr2	r0		! r0 = remaining whole longwords
+	tst	r0, r0
 	bt	.L_cleanup
 1:
-EX(	mov.l	@r5+,r0		)
-	dt	r2
-EX(	mov.l	r0,@r4		)
+EX(	mov.l	@r5+,r1		)
+	dt	r0
+EX(	mov.l	r1,@r4		)
 	bf/s	1b
 	 add	#4,r4
 
@@ -250,7 +295,7 @@
 	 and	r0,r2
 2:
 	dt	r7
-#ifdef __LITTLE_ENDIAN__
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
 EX(	mov.l	@r5+,r0		)
 EX(	mov.l	@r5+,r1		)
 EX(	mov.l	@r5+,r8		)
@@ -320,7 +365,7 @@
 1:	! Read longword, write two words per iteration
 EX(	mov.l	@r5+,r0		)
 	dt	r2
-#ifdef __LITTLE_ENDIAN__
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
 EX(	mov.w	r0,@r4		)
 	shlr16	r0
 EX(	mov.w 	r0,@(2,r4)	)
@@ -342,7 +387,7 @@
 	! Read longword, write byte, word, byte per iteration
 EX(	mov.l	@r5+,r0		)
 	dt	r2
-#ifdef __LITTLE_ENDIAN__
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
 EX(	mov.b	r0,@r4		)
 	shlr8	r0
 	add	#1,r4
@@ -379,6 +424,7 @@
 
 .L_exit:
 	mov	#0,r0		! normal return
+
 5000:
 
 # Exception handler:
@@ -394,5 +440,6 @@
 .previous
 	mov.l	@r15+,r8
 	mov.l	@r15+,r9
+	mov.l	@r15+,r10
 	rts
-	 mov.l	@r15+,r10
+	 mov.l	@r15+,r11