X86 memcpy: use REPMOVSB instead of REPMOVS{Q,D,W} for inline copies when the subtarget has fast strings. This has two advantages: - Speed is improved. For example, on Haswell, throughput improvements increase linearly with size from 256 to 512 bytes, after which they plateau (e.g. 1% for 260 bytes, 25% for 400 bytes, 40% for 508 bytes). - Code is much smaller (no need to handle boundaries). llvm-svn: 300957

commit: 1ce3b82dea8eb35e77974fc9d97f9a08c690c53d [log] [tgz]
author: Clement Courbet <courbet@google.com> Fri Apr 21 09:20:39 2017 +0000
committer: Clement Courbet <courbet@google.com> Fri Apr 21 09:20:39 2017 +0000
tree: 10718c7c21b90322462a789d671de895ccb18d54
parent: f8a964252643c4e65d0c091105cc9d4cbe813690 [diff] [blame]
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index d0d88d3..2b858c2 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h

@@ -232,6 +232,9 @@
   /// True if SHLD based rotate is fast.
   bool HasFastSHLDRotate;
 
+  /// True if the processor has fast REP MOVS.
+  bool HasFastString;
+
   /// True if the short functions should be padded to prevent
   /// a stall when returning too early.
   bool PadShortFunctions;
@@ -472,6 +475,7 @@
   bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
   bool hasFastLZCNT() const { return HasFastLZCNT; }
   bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
+  bool hasFastString() const { return HasFastString; }
   bool hasSlowDivide32() const { return HasSlowDivide32; }
   bool hasSlowDivide64() const { return HasSlowDivide64; }
   bool padShortFunctions() const { return PadShortFunctions; }
commit	1ce3b82dea8eb35e77974fc9d97f9a08c690c53d	[log] [tgz]
author	Clement Courbet <courbet@google.com>	Fri Apr 21 09:20:39 2017 +0000
committer	Clement Courbet <courbet@google.com>	Fri Apr 21 09:20:39 2017 +0000
tree	10718c7c21b90322462a789d671de895ccb18d54
parent	f8a964252643c4e65d0c091105cc9d4cbe813690 [diff] [blame]