x86-64: Handle byte-wise tail copying in memcpy() without a loop. While the effect is hard to measure, reducing the number of possibly/likely mispredicted branches can generally be expected to be slightly better. Contrary to what it may seem at first glance, this also doesn't grow the function size (the alignment gap to the next function just gets smaller). Signed-off-by: Jan Beulich <jbeulich@suse.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Link: http://lkml.kernel.org/r/4F218584020000780006F422@nat28.tlf.novell.com Signed-off-by: Ingo Molnar <mingo@elte.hu>

commit: 9d8e22777e66f420e46490e9fc6f8cb7e0e2222b [log] [tgz]
author: Jan Beulich <JBeulich@suse.com> Thu Jan 26 15:55:32 2012 +0000
committer: Ingo Molnar <mingo@elte.hu> Thu Jan 26 21:19:20 2012 +0100
tree: dd0ec6122dda1409206dda70f6ae4fd3c9a2cd35
parent: 2ab560911a427fdc73bfd3a7d2944d8ee0ca6db8 [diff]
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 1235b04..1c273be 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S

@@ -164,18 +164,19 @@
 	retq
 	.p2align 4
 .Lless_3bytes:
-	cmpl $0, %edx
-	je .Lend
+	subl $1, %edx
+	jb .Lend
 	/*
 	 * Move data from 1 bytes to 3 bytes.
 	 */
-.Lloop_1:
-	movb (%rsi), %r8b
-	movb %r8b, (%rdi)
-	incq %rdi
-	incq %rsi
-	decl %edx
-	jnz .Lloop_1
+	movzbl (%rsi), %ecx
+	jz .Lstore_1byte
+	movzbq 1(%rsi), %r8
+	movzbq (%rsi, %rdx), %r9
+	movb %r8b, 1(%rdi)
+	movb %r9b, (%rdi, %rdx)
+.Lstore_1byte:
+	movb %cl, (%rdi)
 
 .Lend:
 	retq
commit	9d8e22777e66f420e46490e9fc6f8cb7e0e2222b	[log] [tgz]
author	Jan Beulich <JBeulich@suse.com>	Thu Jan 26 15:55:32 2012 +0000
committer	Ingo Molnar <mingo@elte.hu>	Thu Jan 26 21:19:20 2012 +0100
tree	dd0ec6122dda1409206dda70f6ae4fd3c9a2cd35
parent	2ab560911a427fdc73bfd3a7d2944d8ee0ca6db8 [diff]