Teach if-converter to be more careful with predicating instructions that would
take multiple cycles to decode.
For the current if-converter clients (actually only ARM), instructions that
are predicated on false are not nops. They still take machine cycles to
decode. Micro-coded instructions such as LDM / STM can take multiple cycles
to decode. The if-converter should take this into account rather than
treating them as simple, non-micro-coded instructions.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@113570 91177308-0d34-0410-b5e6-96231b3b80d8
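
For illustration, the point of the change is that an if-conversion heuristic
should weigh decode micro-ops rather than instruction count, since a
predicated-false LDM / STM still occupies the decoder for several cycles.
A minimal standalone sketch of that idea (not the actual IfConverter code;
the function name and the Budget parameter are hypothetical):

  #include <vector>

  // Decode cost of one candidate instruction, in micro-ops (e.g. a
  // 4-register LDM on Cortex-A8 decodes as 3 micro-ops).
  struct InstrCost {
    unsigned NumMicroOps;
  };

  // Sum the decode micro-ops of the block that would be predicated and
  // compare against an illustrative budget; predicated-false instructions
  // still consume these decode cycles.
  static bool worthIfConverting(const std::vector<InstrCost> &Block,
                                unsigned Budget) {
    unsigned Cycles = 0;
    for (size_t i = 0, e = Block.size(); i != e; ++i)
      Cycles += Block[i].NumMicroOps;
    return Cycles <= Budget;
  }
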
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index d6a8f19..f3693e3 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -91,6 +91,15 @@
 
 include "ARMSchedule.td"
 
+// ARM processor families.
+def ProcOthers  : SubtargetFeature<"others", "ARMProcFamily", "Others",
+                                   "One of the other ARM processor families">;
+def ProcA8      : SubtargetFeature<"a8", "ARMProcFamily", "CortexA8",
+                                   "Cortex-A8 ARM processors",
+                                   [FeatureSlowFPBrcc, FeatureNEONForFP]>;
+def ProcA9      : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9",
+                                   "Cortex-A9 ARM processors">;
+
 class ProcNoItin<string Name, list<SubtargetFeature> Features>
  : Processor<Name, GenericItineraries, Features>;
 
@@ -150,10 +159,10 @@
 
 // V7 Processors.
 def : Processor<"cortex-a8",        CortexA8Itineraries,
-                [ArchV7A, FeatureHasSlowVMLx,
-                 FeatureSlowFPBrcc, FeatureNEONForFP, FeatureT2XtPk]>;
+                                    [ArchV7A, ProcA8,
+                                     FeatureHasSlowVMLx, FeatureT2XtPk]>;
 def : Processor<"cortex-a9",        CortexA9Itineraries,
-                [ArchV7A, FeatureT2XtPk]>;
+                                    [ArchV7A, ProcA9, FeatureT2XtPk]>;
 
 // V7M Processors.
 def : ProcNoItin<"cortex-m3",       [ArchV7M]>;
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index c824b8b..e7b35c6 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -1415,13 +1415,13 @@
 
 unsigned
 ARMBaseInstrInfo::getNumMicroOps(const MachineInstr *MI,
-                                 const InstrItineraryData &ItinData) const {
-  if (ItinData.isEmpty())
+                                 const InstrItineraryData *ItinData) const {
+  if (!ItinData || ItinData->isEmpty())
     return 1;
 
   const TargetInstrDesc &Desc = MI->getDesc();
   unsigned Class = Desc.getSchedClass();
-  unsigned UOps = ItinData.Itineratries[Class].NumMicroOps;
+  unsigned UOps = ItinData->Itineratries[Class].NumMicroOps;
   if (UOps)
     return UOps;
 
@@ -1430,16 +1430,19 @@
   default:
     llvm_unreachable("Unexpected multi-uops instruction!");
     break;
+  case ARM::VLDMQ:
   case ARM::VSTMQ:
     return 2;
 
   // The number of uOps for load / store multiple are determined by the number
   // registers.
-  // On Cortex-A8, each odd / even pair of register loads / stores
-  // (e.g. r5 + r6) can be completed on the same cycle. The minimum is
-  // 2. For VFP / NEON load / store multiple, the formula is
+  // On Cortex-A8, each pair of register loads / stores can be scheduled on the
+  // same cycle. The scheduling for the first load / store must be done
+  // separately by assuming the address is not 64-bit aligned.
+  // On Cortex-A9, the formula is simply (#reg / 2) + (#reg % 2). If the address
+  // is not 64-bit aligned, then the AGU takes an extra cycle.
+  // For VFP / NEON load / store multiple, the formula is
   // (#reg / 2) + (#reg % 2) + 1.
-  // On Cortex-A9, the formula is simply (#reg / 2) + (#reg % 2).
   case ARM::VLDMD:
   case ARM::VLDMS:
   case ARM::VLDMD_UPD:
@@ -1467,11 +1470,24 @@
   case ARM::t2LDM_UPD:
   case ARM::t2STM:
   case ARM::t2STM_UPD: {
-    // FIXME: Distinquish between Cortex-A8 / Cortex-A9 and other processor
-    // families.
-    unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands();
-    UOps = (NumRegs / 2) + (NumRegs % 2);
-    return (UOps > 2) ? UOps : 2;
+    unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands() + 1;
+    if (Subtarget.isCortexA8()) {
+      // 4 registers would be issued: 1, 2, 1.
+      // 5 registers would be issued: 1, 2, 2.
+      return 1 + (NumRegs / 2);
+    } else if (Subtarget.isCortexA9()) {
+      UOps = (NumRegs / 2);
+      // If there is an odd number of registers or if it's not 64-bit aligned,
+      // then it takes an extra AGU (Address Generation Unit) cycle.
+      if ((NumRegs % 2) ||
+          !MI->hasOneMemOperand() ||
+          (*MI->memoperands_begin())->getAlignment() < 8)
+        ++UOps;
+      return UOps;
+    } else {
+      // Assume the worst.
+      return NumRegs;
+    }
   }
   }
 }
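
The LDM / STM micro-op formulas added above can be sanity-checked with a
small standalone mirror (plain C++, not LLVM code; alignment is passed in
explicitly here, whereas the real code reads it from the instruction's
memory operand):

  #include <cassert>

  // Cortex-A8: the first load / store issues alone (the address may not be
  // 64-bit aligned), then one slot per register pair: 1 + NumRegs / 2.
  static unsigned uopsCortexA8(unsigned NumRegs) {
    return 1 + NumRegs / 2;
  }

  // Cortex-A9: one slot per register pair, plus one extra AGU cycle when the
  // register count is odd or the access is not 64-bit aligned.
  static unsigned uopsCortexA9(unsigned NumRegs, bool Aligned64) {
    unsigned UOps = NumRegs / 2;
    if ((NumRegs % 2) != 0 || !Aligned64)
      ++UOps;
    return UOps;
  }

  int main() {
    assert(uopsCortexA8(4) == 3);                        // issued 1, 2, 1
    assert(uopsCortexA8(5) == 3);                        // issued 1, 2, 2
    assert(uopsCortexA9(4, /*Aligned64=*/true)  == 2);
    assert(uopsCortexA9(5, /*Aligned64=*/true)  == 3);   // odd count: +1 AGU
    assert(uopsCortexA9(4, /*Aligned64=*/false) == 3);   // unaligned: +1 AGU
    return 0;
  }
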
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index b3abdee..f471b67 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -350,7 +350,7 @@
                                     MachineInstr *CmpInstr) const;
 
   virtual unsigned getNumMicroOps(const MachineInstr *MI,
-                                  const InstrItineraryData &ItinData) const;
+                                  const InstrItineraryData *ItinData) const;
 };
 
 static inline
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index d4198a5..637c6e3 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -177,6 +177,7 @@
     : TargetLowering(TM, createTLOF(TM)) {
   Subtarget = &TM.getSubtarget<ARMSubtarget>();
   RegInfo = TM.getRegisterInfo();
+  Itins = TM.getInstrItineraryData();
 
   if (Subtarget->isTargetDarwin()) {
     // Uses VFP for Thumb libfuncs if available.
@@ -749,8 +750,7 @@
   if (TID.mayLoad())
     return Sched::Latency;
 
-  const InstrItineraryData &Itins = getTargetMachine().getInstrItineraryData();
-  if (!Itins.isEmpty() && Itins.getStageLatency(TID.getSchedClass()) > 2)
+  if (!Itins->isEmpty() && Itins->getStageLatency(TID.getSchedClass()) > 2)
     return Sched::Latency;
   return Sched::RegPressure;
 }
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index ba9ea7f..58b8b9e 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -301,6 +301,8 @@
 
     const TargetRegisterInfo *RegInfo;
 
+    const InstrItineraryData *Itins;
+
     /// ARMPCLabelIndex - Keep track of the number of ARM PC labels created.
     ///
     unsigned ARMPCLabelIndex;
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index cb539f4..8a4052b 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -30,6 +30,7 @@
 ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS,
                            bool isT)
   : ARMArchVersion(V4)
+  , ARMProcFamily(Others)
   , ARMFPUType(None)
   , UseNEONForSinglePrecisionFP(false)
   , SlowVMLx(false)
@@ -50,7 +51,7 @@
   , CPUString("generic")
   , TargetType(isELF) // Default to ELF unless otherwise specified.
   , TargetABI(ARM_ABI_APCS) {
-  // default to soft float ABI
+  // Default to soft float ABI
   if (FloatABIType == FloatABI::Default)
     FloatABIType = FloatABI::Soft;
 
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 67e5803..34f571f 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -29,6 +29,10 @@
     V4, V4T, V5T, V5TE, V6, V6M, V6T2, V7A, V7M
   };
 
+  enum ARMProcFamilyEnum {
+    Others, CortexA8, CortexA9
+  };
+
   enum ARMFPEnum {
     None, VFPv2, VFPv3, NEON
   };
@@ -42,6 +46,9 @@
   /// V6, V6T2, V7A, V7M.
   ARMArchEnum ARMArchVersion;
 
+  /// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others.
+  ARMProcFamilyEnum ARMProcFamily;
+
   /// ARMFPUType - Floating Point Unit type.
   ARMFPEnum ARMFPUType;
 
@@ -143,6 +150,9 @@
   bool hasV6T2Ops() const { return ARMArchVersion >= V6T2; }
   bool hasV7Ops()   const { return ARMArchVersion >= V7A;  }
 
+  bool isCortexA8() const { return ARMProcFamily == CortexA8; }
+  bool isCortexA9() const { return ARMProcFamily == CortexA9; }
+
   bool hasARMOps() const { return !NoARM; }
 
   bool hasVFP2() const { return ARMFPUType >= VFPv2; }
diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h
index 17e5425..9b375d7 100644
--- a/lib/Target/ARM/ARMTargetMachine.h
+++ b/lib/Target/ARM/ARMTargetMachine.h
@@ -45,8 +45,8 @@
   virtual const ARMFrameInfo     *getFrameInfo() const { return &FrameInfo; }
   virtual       ARMJITInfo       *getJITInfo()         { return &JITInfo; }
   virtual const ARMSubtarget  *getSubtargetImpl() const { return &Subtarget; }
-  virtual const InstrItineraryData getInstrItineraryData() const {
-    return InstrItins;
+  virtual const InstrItineraryData *getInstrItineraryData() const {
+    return &InstrItins;
   }
 
   // Pass Pipeline Configuration
diff --git a/lib/Target/ARM/Thumb2HazardRecognizer.h b/lib/Target/ARM/Thumb2HazardRecognizer.h
index 4726658..aa4411f 100644
--- a/lib/Target/ARM/Thumb2HazardRecognizer.h
+++ b/lib/Target/ARM/Thumb2HazardRecognizer.h
@@ -26,7 +26,7 @@
   MachineInstr *ITBlockMIs[4];
 
 public:
-  Thumb2HazardRecognizer(const InstrItineraryData &ItinData) :
+  Thumb2HazardRecognizer(const InstrItineraryData *ItinData) :
     PostRAHazardRecognizer(ItinData) {}
 
   virtual HazardType getHazardType(SUnit *SU);
diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp
index 442f41d..962b312 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -194,7 +194,7 @@
 }
 
 ScheduleHazardRecognizer *Thumb2InstrInfo::
-CreateTargetPostRAHazardRecognizer(const InstrItineraryData &II) const {
+CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II) const {
   return (ScheduleHazardRecognizer *)new Thumb2HazardRecognizer(II);
 }
 
diff --git a/lib/Target/ARM/Thumb2InstrInfo.h b/lib/Target/ARM/Thumb2InstrInfo.h
index 3a9f8b1..b66be8e 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.h
+++ b/lib/Target/ARM/Thumb2InstrInfo.h
@@ -72,7 +72,7 @@
   const Thumb2RegisterInfo &getRegisterInfo() const { return RI; }
 
   ScheduleHazardRecognizer *
-  CreateTargetPostRAHazardRecognizer(const InstrItineraryData &II) const;
+  CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II) const;
 };
 
 /// getITInstrPredicate - Valid only in Thumb2 mode. This function is identical
diff --git a/lib/Target/CellSPU/SPUTargetMachine.h b/lib/Target/CellSPU/SPUTargetMachine.h
index 7e02701..e306883 100644
--- a/lib/Target/CellSPU/SPUTargetMachine.h
+++ b/lib/Target/CellSPU/SPUTargetMachine.h
@@ -75,8 +75,8 @@
     return &DataLayout;
   }
 
-  virtual const InstrItineraryData getInstrItineraryData() const {
-    return InstrItins;
+  virtual const InstrItineraryData *getInstrItineraryData() const {
+    return &InstrItins;
   }
   
   // Pass Pipeline Configuration
diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h
index 626ddbb..6f0fb15 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.h
+++ b/lib/Target/PowerPC/PPCTargetMachine.h
@@ -58,8 +58,8 @@
   
   virtual const TargetData    *getTargetData() const    { return &DataLayout; }
   virtual const PPCSubtarget  *getSubtargetImpl() const { return &Subtarget; }
-  virtual const InstrItineraryData getInstrItineraryData() const {  
-    return InstrItins;
+  virtual const InstrItineraryData *getInstrItineraryData() const {  
+    return &InstrItins;
   }
 
   // Pass Pipeline Configuration
diff --git a/lib/Target/TargetInstrInfo.cpp b/lib/Target/TargetInstrInfo.cpp
index 118afd4..47cd0fb 100644
--- a/lib/Target/TargetInstrInfo.cpp
+++ b/lib/Target/TargetInstrInfo.cpp
@@ -50,12 +50,12 @@
 
 unsigned
 TargetInstrInfo::getNumMicroOps(const MachineInstr *MI,
-                                const InstrItineraryData &ItinData) const {
-  if (ItinData.isEmpty())
+                                const InstrItineraryData *ItinData) const {
+  if (!ItinData || ItinData->isEmpty())
     return 1;
 
   unsigned Class = MI->getDesc().getSchedClass();
-  unsigned UOps = ItinData.Itineratries[Class].NumMicroOps;
+  unsigned UOps = ItinData->Itineratries[Class].NumMicroOps;
   if (UOps)
     return UOps;