X86 memcpy: use REPMOVSB instead of REPMOVS{Q,D,W} for inline copies
when the subtarget has fast strings.

This has two advantages:
  - Speed is improved. For example, on Haswell thoughput improvements increase
    linearly with size from 256 to 512 bytes, after which they plateau:
    (e.g. 1% for 260 bytes, 25% for 400 bytes, 40% for 508 bytes).
  - Code is much smaller (no need to handle boundaries).

llvm-svn: 300957
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 8fcc8e3..99d6c69 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -273,6 +273,13 @@
           "fast-shld-rotate", "HasFastSHLDRotate", "true",
           "SHLD can be used as a faster rotate">;
 
+// String operations (e.g. REP MOVS) are fast. See "REP String Enhancement" in
+// the Intel Software Development Manual.
+def FeatureFastString
+    : SubtargetFeature<
+          "fast-string", "HasFastString", "true",
+          "REP MOVS/STOS are fast">;
+
 //===----------------------------------------------------------------------===//
 // X86 processors supported.
 //===----------------------------------------------------------------------===//
@@ -498,6 +505,7 @@
   FeatureAVX2,
   FeatureBMI,
   FeatureBMI2,
+  FeatureFastString,
   FeatureFMA,
   FeatureLZCNT,
   FeatureMOVBE,
diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
index e31d276..456a204 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -897,6 +897,7 @@
 def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
 def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
 def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
+def HasFastString : Predicate<"Subtarget->hasFastString()">;
 def HasMFence    : Predicate<"Subtarget->hasMFence()">;
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
index 2ab4ecb..d893738 100644
--- a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -215,7 +215,12 @@
     return SDValue();
 
   MVT AVT;
-  if (Align & 1)
+  if (Subtarget.hasFastString())
+    // If the target has fast strings, then it's at least as fast to use
+    // REP MOVSB instead of REP MOVS{W,D,Q}, and it avoids having to handle
+    // BytesLeft.
+    AVT = MVT::i8;
+  else if (Align & 1)
     AVT = MVT::i8;
   else if (Align & 2)
     AVT = MVT::i16;
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index 92a6875..0bd29a5 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -303,6 +303,7 @@
   HasFastVectorFSQRT = false;
   HasFastLZCNT = false;
   HasFastSHLDRotate = false;
+  HasFastString = false;
   HasSlowDivide32 = false;
   HasSlowDivide64 = false;
   PadShortFunctions = false;
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index d0d88d3..2b858c2 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -232,6 +232,9 @@
   /// True if SHLD based rotate is fast.
   bool HasFastSHLDRotate;
 
+  /// True if the processor has fast REP MOVS.
+  bool HasFastString;
+
   /// True if the short functions should be padded to prevent
   /// a stall when returning too early.
   bool PadShortFunctions;
@@ -472,6 +475,7 @@
   bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
   bool hasFastLZCNT() const { return HasFastLZCNT; }
   bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
+  bool hasFastString() const { return HasFastString; }
   bool hasSlowDivide32() const { return HasSlowDivide32; }
   bool hasSlowDivide64() const { return HasSlowDivide64; }
   bool padShortFunctions() const { return PadShortFunctions; }