powerpc: Update 64bit __copy_tofrom_user() using CPU_FTR_UNALIGNED_LD_STD

In exactly the same way that we updated memcpy() with new feature
sections in commit 25d6e2d7c58ddc4a3b614fc5381591c0cfe66556 ("powerpc:
Update 64bit memcpy() using CPU_FTR_UNALIGNED_LD_STD"), we do the same
thing here for __copy_tofrom_user().  Once again this is purely a
performance tweak for Cell and Power6 - this has no effect on all the
other 64bit powerpc chips.

We can make these same changes to __copy_tofrom_user() because the
basic copy algorithm is the same as in memcpy() - this version just
has all the exception handling logic needed when copying to or from
userspace as well as a special case for copying whole 4K pages that
are page aligned.

CPU_FTR_UNALIGNED_LD_STD CPU was added in commit
4ec577a28980a0790df3c3dfe9c81f6e2222acfb ("powerpc: Add new CPU
feature: CPU_FTR_UNALIGNED_LD_STD").

We also make the same simple one line change from cmpldi r1,... to
cmpldi cr1,... for consistency.

Signed-off-by: Mark Nelson <markn@au1.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
diff --git a/arch/powerpc/lib/copyuser_64.S b/arch/powerpc/lib/copyuser_64.S
index 25ec537..70693a5 100644
--- a/arch/powerpc/lib/copyuser_64.S
+++ b/arch/powerpc/lib/copyuser_64.S
@@ -26,11 +26,24 @@
 	andi.	r6,r6,7
 	PPC_MTOCRF	0x01,r5
 	blt	cr1,.Lshort_copy
+/* Below we want to nop out the bne if we're on a CPU that has the
+ * CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
+ * cleared.
+ * At the time of writing the only CPU that has this combination of bits
+ * set is Power6.
+ */
+BEGIN_FTR_SECTION
+	nop
+FTR_SECTION_ELSE
 	bne	.Ldst_unaligned
+ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
+		    CPU_FTR_UNALIGNED_LD_STD)
 .Ldst_aligned:
-	andi.	r0,r4,7
 	addi	r3,r3,-16
+BEGIN_FTR_SECTION
+	andi.	r0,r4,7
 	bne	.Lsrc_unaligned
+END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 	srdi	r7,r5,4
 20:	ld	r9,0(r4)
 	addi	r4,r4,-8
@@ -138,7 +151,7 @@
 	PPC_MTOCRF	0x01,r6		/* put #bytes to 8B bdry into cr7 */
 	subf	r5,r6,r5
 	li	r7,0
-	cmpldi	r1,r5,16
+	cmpldi	cr1,r5,16
 	bf	cr7*4+3,1f
 35:	lbz	r0,0(r4)
 81:	stb	r0,0(r3)