rename "slow-unaligned-mem-under-32" to "slow-unaligned-mem-16" (NFCI)
This is a follow-on suggested by:
http://reviews.llvm.org/D12154 ( http://reviews.llvm.org/rL245729 )
http://reviews.llvm.org/D10662 ( http://reviews.llvm.org/rL245075 )
This makes the attribute name match most of the existing lowering logic
and regression test expectations.
But the current use of this attribute is inconsistent; see the FIXME
comment for "allowsMisalignedMemoryAccesses()". Fixing that inconsistency
will result in functional changes and should be coming soon.
llvm-svn: 246585
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index d00a111..3a3b038 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -79,9 +79,10 @@
                                        "Bit testing of memory is slow">;
 def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
                                        "SHLD instruction is slow">;
-def FeatureSlowUAMem : SubtargetFeature<"slow-unaligned-mem-under-32",
-                                "IsUAMemUnder32Slow", "true",
-                                "Slow unaligned 16-byte-or-less memory access">;
+// FIXME: This should not apply to CPUs that do not have SSE.
+def FeatureSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16",
+                                "IsUAMem16Slow", "true",
+                                "Slow unaligned 16-byte memory access">;
 def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32",
                                 "IsUAMem32Slow", "true",
                                 "Slow unaligned 32-byte memory access">;
@@ -209,42 +210,45 @@
 class Proc<string Name, list<SubtargetFeature> Features>
  : ProcessorModel<Name, GenericModel, Features>;
 
-def : Proc<"generic",         [FeatureSlowUAMem]>;
-def : Proc<"i386",            [FeatureSlowUAMem]>;
-def : Proc<"i486",            [FeatureSlowUAMem]>;
-def : Proc<"i586",            [FeatureSlowUAMem]>;
-def : Proc<"pentium",         [FeatureSlowUAMem]>;
-def : Proc<"pentium-mmx",     [FeatureSlowUAMem, FeatureMMX]>;
-def : Proc<"i686",            [FeatureSlowUAMem]>;
-def : Proc<"pentiumpro",      [FeatureSlowUAMem, FeatureCMOV]>;
-def : Proc<"pentium2",        [FeatureSlowUAMem, FeatureMMX, FeatureCMOV]>;
-def : Proc<"pentium3",        [FeatureSlowUAMem, FeatureSSE1]>;
-def : Proc<"pentium3m",       [FeatureSlowUAMem, FeatureSSE1, FeatureSlowBTMem]>;
-def : Proc<"pentium-m",       [FeatureSlowUAMem, FeatureSSE2, FeatureSlowBTMem]>;
-def : Proc<"pentium4",        [FeatureSlowUAMem, FeatureSSE2]>;
-def : Proc<"pentium4m",       [FeatureSlowUAMem, FeatureSSE2, FeatureSlowBTMem]>;
+def : Proc<"generic",         [FeatureSlowUAMem16]>;
+def : Proc<"i386",            [FeatureSlowUAMem16]>;
+def : Proc<"i486",            [FeatureSlowUAMem16]>;
+def : Proc<"i586",            [FeatureSlowUAMem16]>;
+def : Proc<"pentium",         [FeatureSlowUAMem16]>;
+def : Proc<"pentium-mmx",     [FeatureSlowUAMem16, FeatureMMX]>;
+def : Proc<"i686",            [FeatureSlowUAMem16]>;
+def : Proc<"pentiumpro",      [FeatureSlowUAMem16, FeatureCMOV]>;
+def : Proc<"pentium2",        [FeatureSlowUAMem16, FeatureMMX, FeatureCMOV]>;
+def : Proc<"pentium3",        [FeatureSlowUAMem16, FeatureSSE1]>;
+def : Proc<"pentium3m",       [FeatureSlowUAMem16, FeatureSSE1,
+                               FeatureSlowBTMem]>;
+def : Proc<"pentium-m",       [FeatureSlowUAMem16, FeatureSSE2,
+                               FeatureSlowBTMem]>;
+def : Proc<"pentium4",        [FeatureSlowUAMem16, FeatureSSE2]>;
+def : Proc<"pentium4m",       [FeatureSlowUAMem16, FeatureSSE2,
+                               FeatureSlowBTMem]>;
 
 // Intel Core Duo.
 def : ProcessorModel<"yonah", SandyBridgeModel,
-                     [FeatureSlowUAMem, FeatureSSE3, FeatureSlowBTMem]>;
+                     [FeatureSlowUAMem16, FeatureSSE3, FeatureSlowBTMem]>;
 
 // NetBurst.
-def : Proc<"prescott", [FeatureSlowUAMem, FeatureSSE3, FeatureSlowBTMem]>;
-def : Proc<"nocona",   [FeatureSlowUAMem, FeatureSSE3, FeatureCMPXCHG16B,
+def : Proc<"prescott", [FeatureSlowUAMem16, FeatureSSE3, FeatureSlowBTMem]>;
+def : Proc<"nocona",   [FeatureSlowUAMem16, FeatureSSE3, FeatureCMPXCHG16B,
                         FeatureSlowBTMem]>;
 
 // Intel Core 2 Solo/Duo.
 def : ProcessorModel<"core2", SandyBridgeModel,
-                     [FeatureSlowUAMem, FeatureSSSE3, FeatureCMPXCHG16B,
+                     [FeatureSlowUAMem16, FeatureSSSE3, FeatureCMPXCHG16B,
                       FeatureSlowBTMem]>;
 def : ProcessorModel<"penryn", SandyBridgeModel,
-                     [FeatureSlowUAMem, FeatureSSE41, FeatureCMPXCHG16B,
+                     [FeatureSlowUAMem16, FeatureSSE41, FeatureCMPXCHG16B,
                       FeatureSlowBTMem]>;
 
 // Atom CPUs.
 class BonnellProc<string Name> : ProcessorModel<Name, AtomModel, [
                                    ProcIntelAtom,
-                                   FeatureSlowUAMem,
+                                   FeatureSlowUAMem16,
                                    FeatureSSSE3,
                                    FeatureCMPXCHG16B,
                                    FeatureMOVBE,
@@ -399,38 +403,38 @@
 
 // AMD CPUs.
 
-def : Proc<"k6",              [FeatureSlowUAMem, FeatureMMX]>;
-def : Proc<"k6-2",            [FeatureSlowUAMem, Feature3DNow]>;
-def : Proc<"k6-3",            [FeatureSlowUAMem, Feature3DNow]>;
-def : Proc<"athlon",          [FeatureSlowUAMem, Feature3DNowA,
+def : Proc<"k6",              [FeatureSlowUAMem16, FeatureMMX]>;
+def : Proc<"k6-2",            [FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"k6-3",            [FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"athlon",          [FeatureSlowUAMem16, Feature3DNowA,
                                FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"athlon-tbird",    [FeatureSlowUAMem, Feature3DNowA,
+def : Proc<"athlon-tbird",    [FeatureSlowUAMem16, Feature3DNowA,
                                FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"athlon-4",        [FeatureSlowUAMem, FeatureSSE1, Feature3DNowA,
+def : Proc<"athlon-4",        [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA,
                                FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"athlon-xp",       [FeatureSlowUAMem, FeatureSSE1, Feature3DNowA,
+def : Proc<"athlon-xp",       [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA,
                                FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"athlon-mp",       [FeatureSlowUAMem, FeatureSSE1, Feature3DNowA,
+def : Proc<"athlon-mp",       [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA,
                                FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"k8",              [FeatureSlowUAMem, FeatureSSE2, Feature3DNowA,
+def : Proc<"k8",              [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA,
                                Feature64Bit, FeatureSlowBTMem,
                                FeatureSlowSHLD]>;
-def : Proc<"opteron",         [FeatureSlowUAMem, FeatureSSE2, Feature3DNowA,
+def : Proc<"opteron",         [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA,
                                Feature64Bit, FeatureSlowBTMem,
                                FeatureSlowSHLD]>;
-def : Proc<"athlon64",        [FeatureSlowUAMem, FeatureSSE2,   Feature3DNowA,
+def : Proc<"athlon64",        [FeatureSlowUAMem16, FeatureSSE2,   Feature3DNowA,
                                Feature64Bit, FeatureSlowBTMem,
                                FeatureSlowSHLD]>;
-def : Proc<"athlon-fx",       [FeatureSlowUAMem, FeatureSSE2,   Feature3DNowA,
+def : Proc<"athlon-fx",       [FeatureSlowUAMem16, FeatureSSE2,   Feature3DNowA,
                                Feature64Bit, FeatureSlowBTMem,
                                FeatureSlowSHLD]>;
-def : Proc<"k8-sse3",         [FeatureSlowUAMem, FeatureSSE3,   Feature3DNowA,
+def : Proc<"k8-sse3",         [FeatureSlowUAMem16, FeatureSSE3,   Feature3DNowA,
                                FeatureCMPXCHG16B, FeatureSlowBTMem,
                                FeatureSlowSHLD]>;
-def : Proc<"opteron-sse3",    [FeatureSlowUAMem, FeatureSSE3,   Feature3DNowA,
+def : Proc<"opteron-sse3",    [FeatureSlowUAMem16, FeatureSSE3,   Feature3DNowA,
                                FeatureCMPXCHG16B, FeatureSlowBTMem,
                                FeatureSlowSHLD]>;
-def : Proc<"athlon64-sse3",   [FeatureSlowUAMem, FeatureSSE3,   Feature3DNowA,
+def : Proc<"athlon64-sse3",   [FeatureSlowUAMem16, FeatureSSE3,   Feature3DNowA,
                                FeatureCMPXCHG16B, FeatureSlowBTMem,
                                FeatureSlowSHLD]>;
 def : Proc<"amdfam10",        [FeatureSSE4A,
@@ -483,12 +487,12 @@
                                FeatureTBM, FeatureFMA, FeatureSSE4A,
                                FeatureFSGSBase]>;
 
-def : Proc<"geode",           [FeatureSlowUAMem, Feature3DNowA]>;
+def : Proc<"geode",           [FeatureSlowUAMem16, Feature3DNowA]>;
 
-def : Proc<"winchip-c6",      [FeatureSlowUAMem, FeatureMMX]>;
-def : Proc<"winchip2",        [FeatureSlowUAMem, Feature3DNow]>;
-def : Proc<"c3",              [FeatureSlowUAMem, Feature3DNow]>;
-def : Proc<"c3-2",            [FeatureSlowUAMem, FeatureSSE1]>;
+def : Proc<"winchip-c6",      [FeatureSlowUAMem16, FeatureMMX]>;
+def : Proc<"winchip2",        [FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"c3",              [FeatureSlowUAMem16, Feature3DNow]>;
+def : Proc<"c3-2",            [FeatureSlowUAMem16, FeatureSSE1]>;
 
 // We also provide a generic 64-bit specific x86 processor model which tries to
 // be good for modern chips without enabling instruction set encodings past the
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 707fa5e..bfa4145 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1869,7 +1869,7 @@
   if ((!IsMemset || ZeroMemset) &&
       !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
     if (Size >= 16 &&
-        (!Subtarget->isUnalignedMemUnder32Slow() ||
+        (!Subtarget->isUnalignedMem16Slow() ||
          ((DstAlign == 0 || DstAlign >= 16) &&
           (SrcAlign == 0 || SrcAlign >= 16)))) {
       if (Size >= 32) {
@@ -1916,7 +1916,9 @@
     if (VT.getSizeInBits() == 256)
       *Fast = !Subtarget->isUnalignedMem32Slow();
     else
-      *Fast = !Subtarget->isUnalignedMemUnder32Slow();
+      // FIXME: We should always return that 8-byte and under accesses are fast.
+      // That is what other x86 lowering code assumes.
+      *Fast = !Subtarget->isUnalignedMem16Slow();
   }
   return true;
 }
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 7a37d4c..cf9d8a8 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -5511,7 +5511,7 @@
   // TODO: Check if 32-byte or greater accesses are slow too?
   if (!MI->hasOneMemOperand() &&
       RC == &X86::VR128RegClass &&
-      Subtarget.isUnalignedMemUnder32Slow())
+      Subtarget.isUnalignedMem16Slow())
     // Without memoperands, loadRegFromAddr and storeRegToStackSlot will
     // conservatively assume the address is unaligned. That's bad for
     // performance.
@@ -5659,7 +5659,7 @@
                             cast<MachineSDNode>(N)->memoperands_end());
     if (!(*MMOs.first) &&
         RC == &X86::VR128RegClass &&
-        Subtarget.isUnalignedMemUnder32Slow())
+        Subtarget.isUnalignedMem16Slow())
       // Do not introduce a slow unaligned load.
       return false;
     // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
@@ -5704,7 +5704,7 @@
                              cast<MachineSDNode>(N)->memoperands_end());
     if (!(*MMOs.first) &&
         RC == &X86::VR128RegClass &&
-        Subtarget.isUnalignedMemUnder32Slow())
+        Subtarget.isUnalignedMem16Slow())
       // Do not introduce a slow unaligned store.
       return false;
     // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index b23b3c0..5b53ca9 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -197,7 +197,7 @@
   // introduced with Intel's Nehalem/Silvermont and AMD's Family10h
   // micro-architectures respectively.
   if (hasSSE42() || hasSSE4A())
-    IsUAMemUnder32Slow = false;
+    IsUAMem16Slow = false;
   
   InstrItins = getInstrItineraryForCPU(CPUName);
 
@@ -262,7 +262,7 @@
   HasMPX = false;
   IsBTMemSlow = false;
   IsSHLDSlow = false;
-  IsUAMemUnder32Slow = false;
+  IsUAMem16Slow = false;
   IsUAMem32Slow = false;
   HasSSEUnalignedMem = false;
   HasCmpxchg16b = false;
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index d5d0027..c5d74e6 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -146,8 +146,8 @@
   /// True if SHLD instructions are slow.
   bool IsSHLDSlow;
 
-  /// True if unaligned memory accesses of 16-bytes or smaller are slow.
-  bool IsUAMemUnder32Slow;
+  /// True if unaligned memory accesses of 16-bytes are slow.
+  bool IsUAMem16Slow;
 
   /// True if unaligned memory accesses of 32-bytes are slow.
   bool IsUAMem32Slow;
@@ -357,7 +357,7 @@
   bool hasRDSEED() const { return HasRDSEED; }
   bool isBTMemSlow() const { return IsBTMemSlow; }
   bool isSHLDSlow() const { return IsSHLDSlow; }
-  bool isUnalignedMemUnder32Slow() const { return IsUAMemUnder32Slow; }
+  bool isUnalignedMem16Slow() const { return IsUAMem16Slow; }
   bool isUnalignedMem32Slow() const { return IsUAMem32Slow; }
   bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
   bool hasCmpxchg16b() const { return HasCmpxchg16b; }