Enhance the InstrStage object to enable the specification of an Itinerary with overlapping stages. The default is to maintain the current behavior that the "next" stage immediately follows the previous one.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@78827 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/CodeGen/ExactHazardRecognizer.cpp b/lib/CodeGen/ExactHazardRecognizer.cpp
index 48043f2..8bac08a 100644
--- a/lib/CodeGen/ExactHazardRecognizer.cpp
+++ b/lib/CodeGen/ExactHazardRecognizer.cpp
@@ -34,12 +34,12 @@
       // If the begin stage of an itinerary has 0 cycles and units,
       // then we have reached the end of the itineraries.
       const InstrStage *IS = ItinData.begin(idx), *E = ItinData.end(idx);
-      if ((IS->Cycles == 0) && (IS->Units == 0))
+      if ((IS->getCycles() == 0) && (IS->getUnits() == 0))
         break;
 
       unsigned ItinDepth = 0;
       for (; IS != E; ++IS)
-        ItinDepth += std::max(1U, IS->Cycles);
+        ItinDepth += IS->getCycles();
 
       ScoreboardDepth = std::max(ScoreboardDepth, ItinDepth);
     }
@@ -89,27 +89,25 @@
   unsigned idx = SU->getInstr()->getDesc().getSchedClass();
   for (const InstrStage *IS = ItinData.begin(idx), *E = ItinData.end(idx);
        IS != E; ++IS) {
-    // If the stages cycles are 0, then we must have the FU free in
-    // the current cycle, but we don't advance the cycle time .
-    unsigned StageCycles = std::max(1U, IS->Cycles);
-
     // We must find one of the stage's units free for every cycle the
-    // stage is occupied.
-    for (unsigned int i = 0; i < StageCycles; ++i) {
-      assert((cycle < ScoreboardDepth) && "Scoreboard depth exceeded!");
+    // stage is occupied. FIXME it would be more accurate to find the
+    // same unit free in all the cycles.
+    for (unsigned int i = 0; i < IS->getCycles(); ++i) {
+      assert(((cycle + i) < ScoreboardDepth) && 
+             "Scoreboard depth exceeded!");
 
-      unsigned index = getFutureIndex(cycle);
-      unsigned freeUnits = IS->Units & ~Scoreboard[index];
+      unsigned index = getFutureIndex(cycle + i);
+      unsigned freeUnits = IS->getUnits() & ~Scoreboard[index];
       if (!freeUnits) {
-        DEBUG(errs() << "*** Hazard in cycle " << cycle << ", ");
+        DEBUG(errs() << "*** Hazard in cycle " << (cycle + i) << ", ");
         DEBUG(errs() << "SU(" << SU->NodeNum << "): ");
         DEBUG(SU->getInstr()->dump());
         return Hazard;
       }
-
-      if (IS->Cycles > 0)
-        ++cycle;
     }
+
+    // Advance the cycle to the next stage.
+    cycle += IS->getNextCycles();
   }
 
   return NoHazard;
@@ -123,17 +121,15 @@
   unsigned idx = SU->getInstr()->getDesc().getSchedClass();
   for (const InstrStage *IS = ItinData.begin(idx), *E = ItinData.end(idx);
        IS != E; ++IS) {
-    // If the stages cycles are 0, then we must reserve the FU in the
-    // current cycle, but we don't advance the cycle time .
-    unsigned StageCycles = std::max(1U, IS->Cycles);
-
     // We must reserve one of the stage's units for every cycle the
-    // stage is occupied.
-    for (unsigned int i = 0; i < StageCycles; ++i) {
-      assert((cycle < ScoreboardDepth) && "Scoreboard depth exceeded!");
+    // stage is occupied. FIXME it would be more accurate to reserve
+    // the same unit free in all the cycles.
+    for (unsigned int i = 0; i < IS->getCycles(); ++i) {
+      assert(((cycle + i) < ScoreboardDepth) &&
+             "Scoreboard depth exceeded!");
 
-      unsigned index = getFutureIndex(cycle);
-      unsigned freeUnits = IS->Units & ~Scoreboard[index];
+      unsigned index = getFutureIndex(cycle + i);
+      unsigned freeUnits = IS->getUnits() & ~Scoreboard[index];
       
       // reduce to a single unit
       unsigned freeUnit = 0;
@@ -144,10 +140,10 @@
 
       assert(freeUnit && "No function unit available!");
       Scoreboard[index] |= freeUnit;
-      
-      if (IS->Cycles > 0)
-        ++cycle;
     }
+
+    // Advance the cycle to the next stage.
+    cycle += IS->getNextCycles();
   }
 
   DEBUG(dumpScoreboard());
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 0985265..8e5a01d 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -615,7 +615,7 @@
                     []>;
 
 // On non-Darwin platforms R9 is callee-saved.
-let isCall = 1, Itinerary = IIC_Br,
+let isCall = 1,
   Defs = [R0,  R1,  R2,  R3,  R12, LR,
           D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,
           D16, D17, D18, D19, D20, D21, D22, D23,
@@ -652,7 +652,7 @@
 }
 
 // On Darwin R9 is call-clobbered.
-let isCall = 1, Itinerary = IIC_Br,
+let isCall = 1,
   Defs = [R0,  R1,  R2,  R3,  R9,  R12, LR,
           D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,
           D16, D17, D18, D19, D20, D21, D22, D23,
@@ -685,7 +685,7 @@
   }
 }
 
-let isBranch = 1, isTerminator = 1, Itinerary = IIC_Br in {
+let isBranch = 1, isTerminator = 1 in {
   // B is "predicable" since it can be xformed into a Bcc.
   let isBarrier = 1 in {
     let isPredicable = 1 in
@@ -1057,7 +1057,7 @@
                           BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
 
 def BFC    : I<(outs GPR:$dst), (ins GPR:$src, bf_inv_mask_imm:$imm),
-               AddrMode1, Size4Bytes, IndexModeNone, DPFrm, NoItinerary,
+               AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iALU,
                "bfc", " $dst, $imm", "$src = $dst",
                [(set GPR:$dst, (and GPR:$src, bf_inv_mask_imm:$imm))]>,
                Requires<[IsARM, HasV6T2]> {
@@ -1084,16 +1084,16 @@
 //
 
 let isCommutable = 1 in
-def MUL   : AsMul1I<0b0000000, (outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iALU,
+def MUL   : AsMul1I<0b0000000, (outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMPY,
                     "mul", " $dst, $a, $b",
                    [(set GPR:$dst, (mul GPR:$a, GPR:$b))]>;
 
 def MLA   : AsMul1I<0b0000001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
-                    IIC_iALU, "mla", " $dst, $a, $b, $c",
+                    IIC_iMPY, "mla", " $dst, $a, $b, $c",
                    [(set GPR:$dst, (add (mul GPR:$a, GPR:$b), GPR:$c))]>;
 
 def MLS   : AMul1I<0b0000011, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
-                   IIC_iALU, "mls", " $dst, $a, $b, $c",
+                   IIC_iMPY, "mls", " $dst, $a, $b, $c",
                    [(set GPR:$dst, (sub GPR:$c, (mul GPR:$a, GPR:$b)))]>,
                    Requires<[IsARM, HasV6T2]>;
 
@@ -1101,32 +1101,32 @@
 let neverHasSideEffects = 1 in {
 let isCommutable = 1 in {
 def SMULL : AsMul1I<0b0000110, (outs GPR:$ldst, GPR:$hdst),
-                               (ins GPR:$a, GPR:$b), IIC_iALU,
+                               (ins GPR:$a, GPR:$b), IIC_iMPY,
                     "smull", " $ldst, $hdst, $a, $b", []>;
 
 def UMULL : AsMul1I<0b0000100, (outs GPR:$ldst, GPR:$hdst),
-                               (ins GPR:$a, GPR:$b), IIC_iALU,
+                               (ins GPR:$a, GPR:$b), IIC_iMPY,
                     "umull", " $ldst, $hdst, $a, $b", []>;
 }
 
 // Multiply + accumulate
 def SMLAL : AsMul1I<0b0000111, (outs GPR:$ldst, GPR:$hdst),
-                               (ins GPR:$a, GPR:$b), IIC_iALU,
+                               (ins GPR:$a, GPR:$b), IIC_iMPY,
                     "smlal", " $ldst, $hdst, $a, $b", []>;
 
 def UMLAL : AsMul1I<0b0000101, (outs GPR:$ldst, GPR:$hdst),
-                               (ins GPR:$a, GPR:$b), IIC_iALU,
+                               (ins GPR:$a, GPR:$b), IIC_iMPY,
                     "umlal", " $ldst, $hdst, $a, $b", []>;
 
 def UMAAL : AMul1I <0b0000010, (outs GPR:$ldst, GPR:$hdst),
-                               (ins GPR:$a, GPR:$b), IIC_iALU,
+                               (ins GPR:$a, GPR:$b), IIC_iMPY,
                     "umaal", " $ldst, $hdst, $a, $b", []>,
                     Requires<[IsARM, HasV6]>;
 } // neverHasSideEffects
 
 // Most significant word multiply
 def SMMUL : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
-               IIC_iALU, "smmul", " $dst, $a, $b",
+               IIC_iMPY, "smmul", " $dst, $a, $b",
                [(set GPR:$dst, (mulhs GPR:$a, GPR:$b))]>,
             Requires<[IsARM, HasV6]> {
   let Inst{7-4}   = 0b0001;
@@ -1134,7 +1134,7 @@
 }
 
 def SMMLA : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
-               IIC_iALU, "smmla", " $dst, $a, $b, $c",
+               IIC_iMPY, "smmla", " $dst, $a, $b, $c",
                [(set GPR:$dst, (add (mulhs GPR:$a, GPR:$b), GPR:$c))]>,
             Requires<[IsARM, HasV6]> {
   let Inst{7-4}   = 0b0001;
@@ -1142,7 +1142,7 @@
 
 
 def SMMLS : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
-               IIC_iALU, "smmls", " $dst, $a, $b, $c",
+               IIC_iMPY, "smmls", " $dst, $a, $b, $c",
                [(set GPR:$dst, (sub GPR:$c, (mulhs GPR:$a, GPR:$b)))]>,
             Requires<[IsARM, HasV6]> {
   let Inst{7-4}   = 0b1101;
@@ -1150,7 +1150,7 @@
 
 multiclass AI_smul<string opc, PatFrag opnode> {
   def BB : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
-              IIC_iALU, !strconcat(opc, "bb"), " $dst, $a, $b",
+              IIC_iMPY, !strconcat(opc, "bb"), " $dst, $a, $b",
               [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16),
                                       (sext_inreg GPR:$b, i16)))]>,
            Requires<[IsARM, HasV5TE]> {
@@ -1159,7 +1159,7 @@
            }
 
   def BT : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
-              IIC_iALU, !strconcat(opc, "bt"), " $dst, $a, $b",
+              IIC_iMPY, !strconcat(opc, "bt"), " $dst, $a, $b",
               [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16),
                                       (sra GPR:$b, (i32 16))))]>,
            Requires<[IsARM, HasV5TE]> {
@@ -1168,7 +1168,7 @@
            }
 
   def TB : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
-              IIC_iALU, !strconcat(opc, "tb"), " $dst, $a, $b",
+              IIC_iMPY, !strconcat(opc, "tb"), " $dst, $a, $b",
               [(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)),
                                       (sext_inreg GPR:$b, i16)))]>,
            Requires<[IsARM, HasV5TE]> {
@@ -1177,7 +1177,7 @@
            }
 
   def TT : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
-              IIC_iALU, !strconcat(opc, "tt"), " $dst, $a, $b",
+              IIC_iMPY, !strconcat(opc, "tt"), " $dst, $a, $b",
               [(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)),
                                       (sra GPR:$b, (i32 16))))]>,
             Requires<[IsARM, HasV5TE]> {
@@ -1186,7 +1186,7 @@
            }
 
   def WB : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
-              IIC_iALU, !strconcat(opc, "wb"), " $dst, $a, $b",
+              IIC_iMPY, !strconcat(opc, "wb"), " $dst, $a, $b",
               [(set GPR:$dst, (sra (opnode GPR:$a,
                                     (sext_inreg GPR:$b, i16)), (i32 16)))]>,
            Requires<[IsARM, HasV5TE]> {
@@ -1195,7 +1195,7 @@
            }
 
   def WT : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
-              IIC_iALU, !strconcat(opc, "wt"), " $dst, $a, $b",
+              IIC_iMPY, !strconcat(opc, "wt"), " $dst, $a, $b",
               [(set GPR:$dst, (sra (opnode GPR:$a,
                                     (sra GPR:$b, (i32 16))), (i32 16)))]>,
             Requires<[IsARM, HasV5TE]> {
@@ -1207,7 +1207,7 @@
 
 multiclass AI_smla<string opc, PatFrag opnode> {
   def BB : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
-              IIC_iALU, !strconcat(opc, "bb"), " $dst, $a, $b, $acc",
+              IIC_iMPY, !strconcat(opc, "bb"), " $dst, $a, $b, $acc",
               [(set GPR:$dst, (add GPR:$acc,
                                (opnode (sext_inreg GPR:$a, i16),
                                        (sext_inreg GPR:$b, i16))))]>,
@@ -1217,7 +1217,7 @@
            }
 
   def BT : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
-              IIC_iALU, !strconcat(opc, "bt"), " $dst, $a, $b, $acc",
+              IIC_iMPY, !strconcat(opc, "bt"), " $dst, $a, $b, $acc",
               [(set GPR:$dst, (add GPR:$acc, (opnode (sext_inreg GPR:$a, i16),
                                                      (sra GPR:$b, (i32 16)))))]>,
            Requires<[IsARM, HasV5TE]> {
@@ -1226,7 +1226,7 @@
            }
 
   def TB : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
-              IIC_iALU, !strconcat(opc, "tb"), " $dst, $a, $b, $acc",
+              IIC_iMPY, !strconcat(opc, "tb"), " $dst, $a, $b, $acc",
               [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)),
                                                  (sext_inreg GPR:$b, i16))))]>,
            Requires<[IsARM, HasV5TE]> {
@@ -1235,7 +1235,7 @@
            }
 
   def TT : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
-              IIC_iALU, !strconcat(opc, "tt"), " $dst, $a, $b, $acc",
+              IIC_iMPY, !strconcat(opc, "tt"), " $dst, $a, $b, $acc",
               [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)),
                                                      (sra GPR:$b, (i32 16)))))]>,
             Requires<[IsARM, HasV5TE]> {
@@ -1244,7 +1244,7 @@
            }
 
   def WB : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
-              IIC_iALU, !strconcat(opc, "wb"), " $dst, $a, $b, $acc",
+              IIC_iMPY, !strconcat(opc, "wb"), " $dst, $a, $b, $acc",
               [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a,
                                        (sext_inreg GPR:$b, i16)), (i32 16))))]>,
            Requires<[IsARM, HasV5TE]> {
@@ -1253,7 +1253,7 @@
            }
 
   def WT : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
-              IIC_iALU, !strconcat(opc, "wt"), " $dst, $a, $b, $acc",
+              IIC_iMPY, !strconcat(opc, "wt"), " $dst, $a, $b, $acc",
               [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a,
                                          (sra GPR:$b, (i32 16))), (i32 16))))]>,
             Requires<[IsARM, HasV5TE]> {
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index 0da798e..1a82331 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -508,7 +508,7 @@
 
 // multiply register
 let isCommutable = 1 in
-def tMUL : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALU,
+def tMUL : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMPY,
                  "mul", " $dst, $rhs",
                  [(set tGPR:$dst, (mul tGPR:$lhs, tGPR:$rhs))]>;
 
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 0ef9cc0..5800a43 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -808,80 +808,80 @@
 //  Multiply Instructions.
 //
 let isCommutable = 1 in
-def t2MUL: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iALU,
+def t2MUL: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMPY,
                 "mul", " $dst, $a, $b",
                 [(set GPR:$dst, (mul GPR:$a, GPR:$b))]>;
 
-def t2MLA: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iALU,
+def t2MLA: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMPY,
 		"mla", " $dst, $a, $b, $c",
 		[(set GPR:$dst, (add (mul GPR:$a, GPR:$b), GPR:$c))]>;
 
-def t2MLS: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iALU,
+def t2MLS: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMPY,
 		"mls", " $dst, $a, $b, $c",
                 [(set GPR:$dst, (sub GPR:$c, (mul GPR:$a, GPR:$b)))]>;
 
 // Extra precision multiplies with low / high results
 let neverHasSideEffects = 1 in {
 let isCommutable = 1 in {
-def t2SMULL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iALU,
+def t2SMULL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMPY,
                    "smull", " $ldst, $hdst, $a, $b", []>;
 
-def t2UMULL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iALU,
+def t2UMULL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMPY,
                    "umull", " $ldst, $hdst, $a, $b", []>;
 }
 
 // Multiply + accumulate
-def t2SMLAL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iALU,
+def t2SMLAL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMPY,
                   "smlal", " $ldst, $hdst, $a, $b", []>;
 
-def t2UMLAL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iALU,
+def t2UMLAL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMPY,
                   "umlal", " $ldst, $hdst, $a, $b", []>;
 
-def t2UMAAL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iALU,
+def t2UMAAL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMPY,
                   "umaal", " $ldst, $hdst, $a, $b", []>;
 } // neverHasSideEffects
 
 // Most significant word multiply
-def t2SMMUL : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iALU,
+def t2SMMUL : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMPY,
                   "smmul", " $dst, $a, $b",
                   [(set GPR:$dst, (mulhs GPR:$a, GPR:$b))]>;
 
-def t2SMMLA : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iALU,
+def t2SMMLA : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMPY,
                   "smmla", " $dst, $a, $b, $c",
                   [(set GPR:$dst, (add (mulhs GPR:$a, GPR:$b), GPR:$c))]>;
 
 
-def t2SMMLS : T2I <(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iALU,
+def t2SMMLS : T2I <(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMPY,
                    "smmls", " $dst, $a, $b, $c",
                    [(set GPR:$dst, (sub GPR:$c, (mulhs GPR:$a, GPR:$b)))]>;
 
 multiclass T2I_smul<string opc, PatFrag opnode> {
-  def BB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iALU,
+  def BB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMPY,
               !strconcat(opc, "bb"), " $dst, $a, $b",
               [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16),
                                       (sext_inreg GPR:$b, i16)))]>;
 
-  def BT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iALU,
+  def BT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMPY,
               !strconcat(opc, "bt"), " $dst, $a, $b",
               [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16),
                                       (sra GPR:$b, (i32 16))))]>;
 
-  def TB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iALU,
+  def TB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMPY,
               !strconcat(opc, "tb"), " $dst, $a, $b",
               [(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)),
                                       (sext_inreg GPR:$b, i16)))]>;
 
-  def TT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iALU,
+  def TT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMPY,
               !strconcat(opc, "tt"), " $dst, $a, $b",
               [(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)),
                                       (sra GPR:$b, (i32 16))))]>;
 
-  def WB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iALU,
+  def WB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMPY,
               !strconcat(opc, "wb"), " $dst, $a, $b",
               [(set GPR:$dst, (sra (opnode GPR:$a,
                                     (sext_inreg GPR:$b, i16)), (i32 16)))]>;
 
-  def WT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iALU,
+  def WT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMPY,
               !strconcat(opc, "wt"), " $dst, $a, $b",
               [(set GPR:$dst, (sra (opnode GPR:$a,
                                     (sra GPR:$b, (i32 16))), (i32 16)))]>;
@@ -889,33 +889,33 @@
 
 
 multiclass T2I_smla<string opc, PatFrag opnode> {
-  def BB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iALU,
+  def BB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMPY,
               !strconcat(opc, "bb"), " $dst, $a, $b, $acc",
               [(set GPR:$dst, (add GPR:$acc,
                                (opnode (sext_inreg GPR:$a, i16),
                                        (sext_inreg GPR:$b, i16))))]>;
 
-  def BT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iALU,
+  def BT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMPY,
              !strconcat(opc, "bt"), " $dst, $a, $b, $acc",
              [(set GPR:$dst, (add GPR:$acc, (opnode (sext_inreg GPR:$a, i16),
                                                     (sra GPR:$b, (i32 16)))))]>;
 
-  def TB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iALU,
+  def TB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMPY,
               !strconcat(opc, "tb"), " $dst, $a, $b, $acc",
               [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)),
                                                  (sext_inreg GPR:$b, i16))))]>;
 
-  def TT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iALU,
+  def TT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMPY,
               !strconcat(opc, "tt"), " $dst, $a, $b, $acc",
              [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)),
                                                     (sra GPR:$b, (i32 16)))))]>;
 
-  def WB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iALU,
+  def WB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMPY,
               !strconcat(opc, "wb"), " $dst, $a, $b, $acc",
               [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a,
                                        (sext_inreg GPR:$b, i16)), (i32 16))))]>;
 
-  def WT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iALU,
+  def WT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMPY,
               !strconcat(opc, "wt"), " $dst, $a, $b, $acc",
               [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a,
                                          (sra GPR:$b, (i32 16))), (i32 16))))]>;
diff --git a/lib/Target/ARM/ARMScheduleV7.td b/lib/Target/ARM/ARMScheduleV7.td
index 8a7b42e..a789d71 100644
--- a/lib/Target/ARM/ARMScheduleV7.td
+++ b/lib/Target/ARM/ARMScheduleV7.td
@@ -16,34 +16,34 @@
   // two fully-pipelined integer ALU pipelines
   InstrItinData<IIC_iALU    , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>]>,
   // one fully-pipelined integer Multiply pipeline
-  // function units are used in alpha order, so use FU_Pipe1
-  // for the Multiple pipeline
-  InstrItinData<IIC_iMPY    , [InstrStage<1, [FU_Pipe1]>]>,
+  // function units are reserved by the scheduler in reverse alpha order,
+  // so use FU_Pipe0 for the Multiple pipeline
+  InstrItinData<IIC_iMPY    , [InstrStage<1, [FU_Pipe0]>]>,
   // loads have an extra cycle of latency, but are fully pipelined
-  // use a 0 cycle FU_Issue to enforce the 1 load/store per cycle limit
-  InstrItinData<IIC_iLoad   , [InstrStage<0, [FU_Issue]>, 
+  // use FU_Issue to enforce the 1 load/store per cycle limit
+  InstrItinData<IIC_iLoad   , [InstrStage<1, [FU_Issue], 0>, 
                                InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
                                InstrStage<1, [FU_LdSt0]>]>,
   // fully-pipelined stores
-  // use a 0 cycle FU_Issue to enforce the 1 load/store per cycle limit
-  InstrItinData<IIC_iStore  , [InstrStage<0, [FU_Issue]>, 
+  // use FU_Issue to enforce the 1 load/store per cycle limit
+  InstrItinData<IIC_iStore  , [InstrStage<1, [FU_Issue], 0>, 
                                InstrStage<1, [FU_Pipe0, FU_Pipe1]>]>,
   // no delay slots, so the latency of a branch is unimportant
   InstrItinData<IIC_Br      , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>]>,
 
-  // VFP ALU is not pipelined so stall all issues 
-  // FIXME assume NFP pipeline and 7 cycle non-pipelined latency
-  InstrItinData<IIC_fpALU   , [InstrStage<7, [FU_Pipe0, FU_Pipe1]>]>,
+  // NFP ALU is not pipelined so stall all issues 
+  InstrItinData<IIC_fpALU   , [InstrStage<7, [FU_Pipe0], 0>,
+                               InstrStage<7, [FU_Pipe1], 0>]>,
   // VFP MPY is not pipelined so stall all issues 
-  // FIXME assume NFP pipeline and 7 cycle non-pipelined latency
-  InstrItinData<IIC_fpMPY   , [InstrStage<7, [FU_Pipe0, FU_Pipe1]>]>,
+  InstrItinData<IIC_fpMPY   , [InstrStage<7, [FU_Pipe0], 0>,
+                               InstrStage<7, [FU_Pipe1], 0>]>,
   // loads have an extra cycle of latency, but are fully pipelined
-  // use a 0 cycle FU_Issue to enforce the 1 load/store per cycle limit
-  InstrItinData<IIC_fpLoad  , [InstrStage<0, [FU_Issue]>, 
+  // use FU_Issue to enforce the 1 load/store per cycle limit
+  InstrItinData<IIC_fpLoad  , [InstrStage<1, [FU_Issue], 0>, 
                                InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
                                InstrStage<1, [FU_LdSt0]>]>,
-  // use a 0 cycle FU_Issue to enforce the 1 load/store per cycle limit
-  InstrItinData<IIC_fpStore , [InstrStage<0, [FU_Issue]>, 
+  // use FU_Issue to enforce the 1 load/store per cycle limit
+  InstrItinData<IIC_fpStore , [InstrStage<1, [FU_Issue], 0>, 
                                InstrStage<1, [FU_Pipe0, FU_Pipe1]>]>
 ]>;