Diff - b3f271e86e5a440713716bb222e1aa1227994c50^! - kernel/msm-4.9

commit	b3f271e86e5a440713716bb222e1aa1227994c50	[log] [tgz]
author	Anton Blanchard <anton@samba.org>	Wed May 30 20:22:09 2012 +0000
committer	Benjamin Herrenschmidt <benh@kernel.crashing.org>	Tue Jul 03 14:14:46 2012 +1000
tree	a215f44357cf20c9f2b342a7bdb840bb85ca4cb1
parent	bce4b4bd91efab9dca693ac37c8ddf88103280d8 [diff] [blame]

powerpc: POWER7 optimised memcpy using VMX and enhanced prefetch

Implement a POWER7 optimised memcpy using VMX and enhanced prefetch
instructions.

This is a copy of the POWER7 optimised copy_to_user/copy_from_user
loop. Implementation and performance details can be found in
commit a66086b8197d (powerpc: POWER7 optimised
copy_to_user/copy_from_user using VMX).

I noticed memcpy issues when profiling a RAID6 workload:

	.memcpy
	.async_memcpy
	.async_copy_data
	.__raid_run_ops
	.handle_stripe
	.raid5d
	.md_thread

I created a simplified testcase by building a RAID6 array with four 1GB
ramdisks (booting with brd.rd_size=1048576):

# mdadm -CR -e 1.2 /dev/md0 --level=6 -n4 /dev/ram[0-3]

I then timed how long it took to write to the entire array:

# dd if=/dev/zero of=/dev/md0 bs=1M

Before: 892 MB/s
After:  999 MB/s

A 12% improvement.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

diff --git a/arch/powerpc/lib/memcpy_64.S b/arch/powerpc/lib/memcpy_64.S
index 82fea39..d2bbbc8 100644
--- a/arch/powerpc/lib/memcpy_64.S
+++ b/arch/powerpc/lib/memcpy_64.S

@@ -11,7 +11,11 @@
 
 	.align	7
 _GLOBAL(memcpy)
+BEGIN_FTR_SECTION
 	std	r3,48(r1)	/* save destination pointer for return value */
+FTR_SECTION_ELSE
+	b	memcpy_power7
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
 	PPC_MTOCRF(0x01,r5)
 	cmpldi	cr1,r5,16
 	neg	r6,r3		# LS 3 bits = # bytes to 8-byte dest bdry