X86 memcpy: use REPMOVSB instead of REPMOVS{Q,D,W} for inline copies
when the subtarget has fast strings.
This has two advantages:
- Speed is improved. For example, on Haswell throughput improvements increase
linearly with size from 256 to 512 bytes, after which they plateau:
(e.g. 1% for 260 bytes, 25% for 400 bytes, 40% for 508 bytes).
- Code is much smaller (no need to handle boundaries).
llvm-svn: 300957
diff --git a/llvm/test/CodeGen/X86/memcpy-struct-by-value.ll b/llvm/test/CodeGen/X86/memcpy-struct-by-value.ll
new file mode 100644
index 0000000..4bb022e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/memcpy-struct-by-value.ll
@@ -0,0 +1,15 @@
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=-fast-string < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOFAST
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+fast-string < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST
+
+%struct.large = type { [4096 x i8] } ; 4 KiB: large enough that the byval copy lowers to a rep-movs sequence
+
+declare void @foo(%struct.large* align 8 byval) nounwind ; byval forces a struct copy at every call site
+
+define void @test1(%struct.large* nocapture %x) nounwind { ; checks which rep-movs width the copy uses
+ call void @foo(%struct.large* align 8 byval %x) ; this byval copy is the memcpy under test
+ ret void
+
+; ALL-LABEL: test1:
+; NOFAST: rep;movsq
+; FAST: rep;movsb
+}