powerpc32: optimise csum_partial() loop

On the 8xx, load latency is 2 cycles and taking branches also takes
2 cycles. So let's unroll the loop.

This patch improves csum_partial() speed by around 10% on both:
* 8xx (single issue processor with parallel execution)
* 83xx (superscalar 6xx processor with dual instruction fetch
  and parallel execution)

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Scott Wood <oss@buserror.net>

commit: f867d556dd8525fe6ff0d22a34249528e590f994 [log] [tgz]
author: Christophe Leroy <christophe.leroy@c-s.fr> Tue Sep 22 16:34:32 2015 +0200
committer: Scott Wood <oss@buserror.net> Fri Mar 04 23:03:45 2016 -0600
tree: 32ebba9cfc1b00d1f394b480d5cfab443382864e
parent: 48821a34b1bdc5d89505cb814b3f7c166940f200 [diff]
diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S
index 9c12602..0d34f47c 100644
--- a/arch/powerpc/lib/checksum_32.S
+++ b/arch/powerpc/lib/checksum_32.S

@@ -38,10 +38,24 @@
 	srwi.	r6,r4,2		/* # words to do */
 	adde	r5,r5,r0
 	beq	3f
-1:	mtctr	r6
+1:	andi.	r6,r6,3		/* Prepare to handle words 4 by 4 */
+	beq	21f
+	mtctr	r6
 2:	lwzu	r0,4(r3)
 	adde	r5,r5,r0
 	bdnz	2b
+21:	srwi.	r6,r4,4		/* # blocks of 4 words to do */
+	beq	3f
+	mtctr	r6
+22:	lwz	r0,4(r3)
+	lwz	r6,8(r3)
+	lwz	r7,12(r3)
+	lwzu	r8,16(r3)
+	adde	r5,r5,r0
+	adde	r5,r5,r6
+	adde	r5,r5,r7
+	adde	r5,r5,r8
+	bdnz	22b
 3:	andi.	r0,r4,2
 	beq+	4f
 	lhz	r0,4(r3)
commit	f867d556dd8525fe6ff0d22a34249528e590f994	[log] [tgz]
author	Christophe Leroy <christophe.leroy@c-s.fr>	Tue Sep 22 16:34:32 2015 +0200
committer	Scott Wood <oss@buserror.net>	Fri Mar 04 23:03:45 2016 -0600
tree	32ebba9cfc1b00d1f394b480d5cfab443382864e
parent	48821a34b1bdc5d89505cb814b3f7c166940f200 [diff]