Teach if-converter to be more careful with predicating instructions that would
take multiple cycles to decode.
For the current if-converter clients (actually only ARM), instructions that are
predicated on false are not nops: they still take machine cycles to decode.
Micro-coded instructions such as LDM / STM can take multiple cycles to decode,
so the if-converter should not treat them as non-micro-coded simple
instructions.
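
A rough sketch of the intended accounting (a hypothetical helper, not the
in-tree IfConverter code): weigh each instruction in a candidate block by its
micro-op count rather than as a single decode slot, since a predicated-false
LDM / STM still occupies all of its slots.

  // Hypothetical sketch: cost a candidate block in decode micro-ops.
  // Each entry of NumMicroOps would come from getNumMicroOps(MI, ItinData).
  static unsigned blockDecodeCost(const unsigned *NumMicroOps, unsigned Size) {
    unsigned Cost = 0;
    for (unsigned i = 0; i != Size; ++i)
      Cost += NumMicroOps[i] ? NumMicroOps[i] : 1;  // never cheaper than one slot
    return Cost;
  }

A client would then compare this cost, not the raw instruction count, against
its predication threshold.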


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@113570 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index c824b8b..e7b35c6 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -1415,13 +1415,13 @@
 
 unsigned
 ARMBaseInstrInfo::getNumMicroOps(const MachineInstr *MI,
-                                 const InstrItineraryData &ItinData) const {
-  if (ItinData.isEmpty())
+                                 const InstrItineraryData *ItinData) const {
+  if (!ItinData || ItinData->isEmpty())
     return 1;
 
   const TargetInstrDesc &Desc = MI->getDesc();
   unsigned Class = Desc.getSchedClass();
-  unsigned UOps = ItinData.Itineratries[Class].NumMicroOps;
+  unsigned UOps = ItinData->Itineratries[Class].NumMicroOps;
   if (UOps)
     return UOps;
 
@@ -1430,16 +1430,19 @@
   default:
     llvm_unreachable("Unexpected multi-uops instruction!");
     break;
+  case ARM::VLDMQ:
   case ARM::VSTMQ:
     return 2;
 
   // The number of uOps for load / store multiple are determined by the number
   // registers.
-  // On Cortex-A8, each odd / even pair of register loads / stores
-  // (e.g. r5 + r6) can be completed on the same cycle. The minimum is
-  // 2. For VFP / NEON load / store multiple, the formula is
+  // On Cortex-A8, each pair of register loads / stores can be scheduled on the
+  // same cycle. The scheduling for the first load / store must be done
+  // separately by assuming the address is not 64-bit aligned.
+  // On Cortex-A9, the formula is simply (#reg / 2) + (#reg % 2). If the address
+  // is not 64-bit aligned, then AGU would take an extra cycle.
+  // For VFP / NEON load / store multiple, the formula is
   // (#reg / 2) + (#reg % 2) + 1.
-  // On Cortex-A9, the formula is simply (#reg / 2) + (#reg % 2).
   case ARM::VLDMD:
   case ARM::VLDMS:
   case ARM::VLDMD_UPD:
@@ -1467,11 +1470,24 @@
   case ARM::t2LDM_UPD:
   case ARM::t2STM:
   case ARM::t2STM_UPD: {
-    // FIXME: Distinquish between Cortex-A8 / Cortex-A9 and other processor
-    // families.
-    unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands();
-    UOps = (NumRegs / 2) + (NumRegs % 2);
-    return (UOps > 2) ? UOps : 2;
+    unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands() + 1;
+    if (Subtarget.isCortexA8()) {
+      // 4 registers would be issued: 1, 2, 1.
+      // 5 registers would be issued: 1, 2, 2.
+      return 1 + (NumRegs / 2);
+    } else if (Subtarget.isCortexA9()) {
+      UOps = (NumRegs / 2);
+      // If there are odd number of registers or if it's not 64-bit aligned,
+      // then it takes an extra AGU (Address Generation Unit) cycle.
+      if ((NumRegs % 2) ||
+          !MI->hasOneMemOperand() ||
+          (*MI->memoperands_begin())->getAlignment() < 8)
+        ++UOps;
+      return UOps;
+    } else {
+      // Assume the worst.
+      return NumRegs;
+    }
   }
   }
 }
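
For reference, a standalone sketch of the LDM / STM micro-op formulas
implemented above (the subtarget split and alignment handling mirror the patch;
the helper itself is illustrative, not LLVM API):

  // Standalone sketch of the per-subtarget uop formulas for load / store
  // multiple. NumRegs is the number of registers transferred; Aligned64 says
  // whether the address is known to be 64-bit aligned.
  #include <cstdio>

  enum CPU { CortexA8, CortexA9, Other };

  static unsigned ldmStmMicroOps(CPU Cpu, unsigned NumRegs, bool Aligned64) {
    if (Cpu == CortexA8)
      // First access issues alone, then one uop per register pair:
      // 4 regs -> 3 uops (1, 2, 1), 5 regs -> 3 uops (1, 2, 2).
      return 1 + NumRegs / 2;
    if (Cpu == CortexA9) {
      unsigned UOps = NumRegs / 2;
      // An odd register count or a non-64-bit-aligned address costs an extra
      // AGU (Address Generation Unit) cycle.
      if ((NumRegs % 2) || !Aligned64)
        ++UOps;
      return UOps;
    }
    // Unknown core: assume the worst, one uop per register.
    return NumRegs;
  }

  int main() {
    // e.g. an 8-register LDM: 5 uops on A8, 4 on an aligned A9, 8 elsewhere.
    printf("%u %u %u\n", ldmStmMicroOps(CortexA8, 8, true),
           ldmStmMicroOps(CortexA9, 8, true), ldmStmMicroOps(Other, 8, true));
    return 0;
  }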