Subzero: Make the register allocator more robust when -reg-use and -reg-exclude are used.

The problem is that overly aggressive use of -reg-use or -reg-exclude can cause translation failures, both because some code sequences have inherently high register pressure and because the "specialty" register classes further restrict which registers are usable.

For example, when you combine load optimization, address mode inference, local register availability optimization, and the div instruction, you can end up needing 5 simultaneously live infinite-weight registers.

The fix/enhancement here is to keep track of a "reserve" set of registers for each register class, and allow the register allocator to draw from it as a last resort.  This behavior is guarded by the -reg-reserve flag.
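
A minimal, self-contained sketch of the intended fallback order (using std::bitset and an invented pickFreeRegister() helper purely for illustration, rather than the IterationState / llvm::SmallBitVector machinery in the actual change below):

  #include <bitset>
  #include <cstdint>

  // Sketch only, not Subzero code.  Free is the set of free registers after
  // -reg-use / -reg-exclude filtering; FreeUnfiltered additionally includes
  // the excluded "reserve" registers.  Returns the chosen register number,
  // or -1 if the caller must fall back to eviction, spill/fill, or a
  // translation error.
  constexpr size_t NumRegs = 16;

  int32_t pickFreeRegister(const std::bitset<NumRegs> &Free,
                           const std::bitset<NumRegs> &FreeUnfiltered,
                           bool UseReserve) {
    for (size_t Reg = 0; Reg < NumRegs; ++Reg)
      if (Free[Reg])
        return static_cast<int32_t>(Reg); // normal path: restrictions honored
    if (UseReserve)
      for (size_t Reg = 0; Reg < NumRegs; ++Reg)
        if (FreeUnfiltered[Reg])
          return static_cast<int32_t>(Reg); // last resort: use a reserve register
    return -1;
  }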

This CL also includes two improvements to x86 lowering sequences that reduce register pressure: in the 64-bit multiply lowering, the second imul is moved down to just before its result is needed, and in the i64 store lowering, the low half of the value is legalized only after the high half has been stored.
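
The live-range idea behind the multiply change, as a plain C++ analogue (not the emitted x86 sequence and not Subzero lowering code; names are made up for illustration): the second partial product is computed immediately before its single use, so it does not stay live across the widening multiply and the first add.

  #include <cstdint>

  // 64-bit multiply from 32-bit halves, mirroring the lowering's new ordering.
  uint64_t mul64(uint32_t ALo, uint32_t AHi, uint32_t BLo, uint32_t BHi) {
    uint32_t T1 = AHi * BLo;                         // first partial product
    uint64_t T4 = static_cast<uint64_t>(ALo) * BLo;  // widening mul (edx:eax)
    uint32_t Hi = static_cast<uint32_t>(T4 >> 32) + T1;
    uint32_t T2 = BHi * ALo;  // moved down: defined right before its only use
    Hi += T2;
    return (static_cast<uint64_t>(Hi) << 32) | static_cast<uint32_t>(T4);
  }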

BUG= none
R=kschimpf@google.com

Review URL: https://codereview.chromium.org/1641653004 .
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index 4e50248..99c55ac 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -55,6 +55,30 @@
 
 Cfg::~Cfg() { assert(ICE_TLS_GET_FIELD(CurrentCfg) == nullptr); }
 
+/// Create a string like "foo(i=123:b=9)" indicating the function name, number
+/// of high-level instructions, and number of basic blocks.  This string is only
+/// used for dumping and other diagnostics, and the idea is that given a set of
+/// functions to debug a problem on, it's easy to find the smallest or simplest
+/// function to attack.  Note that the counts may change somewhat depending on
+/// where in the translation passes it is called.
+IceString Cfg::getFunctionNameAndSize() const {
+  if (!BuildDefs::dump())
+    return getFunctionName();
+  SizeT NodeCount = 0;
+  SizeT InstCount = 0;
+  for (CfgNode *Node : getNodes()) {
+    ++NodeCount;
+    // Note: deleted instructions are *not* ignored.
+    InstCount += Node->getPhis().size();
+    for (Inst &I : Node->getInsts()) {
+      if (!llvm::isa<InstTarget>(&I))
+        ++InstCount;
+    }
+  }
+  return getFunctionName() + "(i=" + std::to_string(InstCount) + ":b=" +
+         std::to_string(NodeCount) + ")";
+}
+
 void Cfg::setError(const IceString &Message) {
   HasError = true;
   ErrorMessage = Message;
@@ -1075,7 +1099,9 @@
       Str << Args[i]->getType() << " ";
       Args[i]->dump(this);
     }
-    Str << ") {\n";
+    // Append an extra copy of the function name here, in order to print its
+    // size stats but not mess up lit tests.
+    Str << ") { # " << getFunctionNameAndSize() << "\n";
   }
   resetCurrentNode();
   if (isVerbose(IceV_Liveness)) {
diff --git a/src/IceCfg.h b/src/IceCfg.h
index 47bdd04..8e81614 100644
--- a/src/IceCfg.h
+++ b/src/IceCfg.h
@@ -64,7 +64,8 @@
   /// \name Manage the name and return type of the function being translated.
   /// @{
   void setFunctionName(const IceString &Name) { FunctionName = Name; }
-  IceString getFunctionName() const { return FunctionName; }
+  const IceString &getFunctionName() const { return FunctionName; }
+  IceString getFunctionNameAndSize() const;
   void setReturnType(Type Ty) { ReturnType = Ty; }
   Type getReturnType() const { return ReturnType; }
   /// @}
diff --git a/src/IceClFlags.cpp b/src/IceClFlags.cpp
index 28db1b0..f776634 100644
--- a/src/IceClFlags.cpp
+++ b/src/IceClFlags.cpp
@@ -185,6 +185,13 @@
                                 cl::desc("Randomize register allocation"),
                                 cl::init(false));
 
+/// Allow failsafe access to registers that were restricted via -reg-use or
+/// -reg-exclude.
+cl::opt<bool>
+    RegAllocReserve("reg-reserve",
+                    cl::desc("Let register allocation use reserve registers"),
+                    cl::init(false));
+
 /// Repeat register allocation until convergence.
 cl::opt<bool>
     RepeatRegAlloc("regalloc-repeat",
@@ -545,6 +552,7 @@
   OutFlags.setShouldReorderBasicBlocks(::ReorderBasicBlocks);
   OutFlags.setShouldDoNopInsertion(::ShouldDoNopInsertion);
   OutFlags.setShouldRandomizeRegAlloc(::RandomizeRegisterAllocation);
+  OutFlags.setRegAllocReserve(::RegAllocReserve);
   OutFlags.setShouldRepeatRegAlloc(::RepeatRegAlloc);
   OutFlags.setShouldReorderFunctions(::ReorderFunctions);
   OutFlags.setShouldReorderGlobalVariables(::ReorderGlobalVariables);
diff --git a/src/IceClFlags.h b/src/IceClFlags.h
index ee650c4..d3e8b26 100644
--- a/src/IceClFlags.h
+++ b/src/IceClFlags.h
@@ -169,6 +169,11 @@
   /// Set ClFlags::RandomRegAlloc to a new value
   void setShouldRandomizeRegAlloc(bool NewValue) { RandomRegAlloc = NewValue; }
 
+  /// Get the value of ClFlags::RegAllocReserve
+  bool getRegAllocReserve() const { return RegAllocReserve; }
+  /// Set ClFlags::RegAllocReserve to a new value
+  void setRegAllocReserve(bool NewValue) { RegAllocReserve = NewValue; }
+
   /// Get the value of ClFlags::RepeatRegAlloc
   bool shouldRepeatRegAlloc() const { return RepeatRegAlloc; }
   /// Set ClFlags::RepeatRegAlloc to a new value
@@ -425,6 +430,8 @@
   bool RandomNopInsertion;
   /// see anonymous_namespace{IceClFlags.cpp}::RandomizeRegisterAllocation
   bool RandomRegAlloc;
+  /// see anonymous_namespace{IceClFlags.cpp}::RegAllocReserve
+  bool RegAllocReserve;
   /// see anonymous_namespace{IceClFlags.cpp}::RepeatRegAlloc
   bool RepeatRegAlloc;
   /// see anonymous_namespace{IceClFlags.cpp}::ReorderBasicBlocks
diff --git a/src/IceGlobalContext.cpp b/src/IceGlobalContext.cpp
index 0b1dc76..4bb9b7d 100644
--- a/src/IceGlobalContext.cpp
+++ b/src/IceGlobalContext.cpp
@@ -309,7 +309,8 @@
       getErrorStatus()->assign(EC_Translation);
       OstreamLocker L(this);
       getStrError() << "ICE translation error: " << Func->getFunctionName()
-                    << ": " << Func->getError() << "\n";
+                    << ": " << Func->getError() << ": "
+                    << Func->getFunctionNameAndSize() << "\n";
       Item = new EmitterWorkItem(Func->getSequenceNumber());
     } else {
       Func->getAssembler<>()->setInternal(Func->getInternal());
@@ -320,7 +321,7 @@
         // The Cfg has already emitted into the assembly buffer, so
         // stats have been fully collected into this thread's TLS.
         // Dump them before TLS is reset for the next Cfg.
-        dumpStats(Func->getFunctionName());
+        dumpStats(Func->getFunctionNameAndSize());
         Assembler *Asm = Func->releaseAssembler();
         // Copy relevant fields into Asm before Func is deleted.
         Asm->setFunctionName(Func->getFunctionName());
@@ -549,7 +550,7 @@
         Cfg::setCurrentCfg(Func.get());
         Func->emit();
         Cfg::setCurrentCfg(nullptr);
-        dumpStats(Func->getFunctionName());
+        dumpStats(Func->getFunctionNameAndSize());
       } break;
       }
     }
diff --git a/src/IceRegAlloc.cpp b/src/IceRegAlloc.cpp
index 395c5aa..fc46e2b 100644
--- a/src/IceRegAlloc.cpp
+++ b/src/IceRegAlloc.cpp
@@ -76,11 +76,25 @@
   Str << "  Range=" << Var->getLiveRange();
 }
 
+int32_t findMinWeightIndex(
+    const llvm::SmallBitVector &RegMask,
+    const llvm::SmallVector<RegWeight, LinearScan::REGS_SIZE> &Weights) {
+  int32_t MinWeightIndex = RegMask.find_first();
+  assert(MinWeightIndex >= 0);
+  for (int32_t i = RegMask.find_next(MinWeightIndex); i != -1;
+       i = RegMask.find_next(i)) {
+    if (Weights[i] < Weights[MinWeightIndex])
+      MinWeightIndex = i;
+  }
+  return MinWeightIndex;
+}
+
 } // end of anonymous namespace
 
 LinearScan::LinearScan(Cfg *Func)
     : Func(Func), Ctx(Func->getContext()), Target(Func->getTarget()),
-      Verbose(BuildDefs::dump() && Func->isVerbose(IceV_LinearScan)) {}
+      Verbose(BuildDefs::dump() && Func->isVerbose(IceV_LinearScan)),
+      UseReserve(Ctx->getFlags().getRegAllocReserve()) {}
 
 // Prepare for full register allocation of all variables. We depend on liveness
 // analysis to have calculated live ranges.
@@ -545,8 +559,8 @@
   }
 }
 
-// Remove registers from the Free[] list where an Inactive range overlaps with
-// the current range.
+// Remove registers from the Iter.Free[] list where an Inactive range overlaps
+// with the current range.
 void LinearScan::filterFreeWithInactiveRanges(IterationState &Iter) {
   for (const Variable *Item : Inactive) {
     if (!Item->rangeOverlaps(Iter.Cur))
@@ -555,10 +569,11 @@
     // TODO(stichnot): Do this with bitvector ops, not a loop, for efficiency.
     for (int32_t RegAlias = Aliases.find_first(); RegAlias >= 0;
          RegAlias = Aliases.find_next(RegAlias)) {
-      // Don't assert(Free[RegNum]) because in theory (though probably never in
-      // practice) there could be two inactive variables that were marked with
-      // AllowOverlap.
+      // Don't assert(Iter.Free[RegNum]) because in theory (though probably
+      // never in practice) there could be two inactive variables that were
+      // marked with AllowOverlap.
       Iter.Free[RegAlias] = false;
+      Iter.FreeUnfiltered[RegAlias] = false;
       // Disable AllowOverlap if an Inactive variable, which is not Prefer,
       // shares Prefer's register, and has a definition within Cur's live range.
       if (Iter.AllowOverlap && Item != Iter.Prefer &&
@@ -570,11 +585,11 @@
   }
 }
 
-// Remove registers from the Free[] list where an Unhandled pre-colored range
-// overlaps with the current range, and set those registers to infinite weight
-// so that they aren't candidates for eviction. Cur->rangeEndsBefore(Item) is an
-// early exit check that turns a guaranteed O(N^2) algorithm into expected
-// linear complexity.
+// Remove registers from the Iter.Free[] list where an Unhandled pre-colored
+// range overlaps with the current range, and set those registers to infinite
+// weight so that they aren't candidates for eviction.
+// Cur->rangeEndsBefore(Item) is an early exit check that turns a guaranteed
+// O(N^2) algorithm into expected linear complexity.
 void LinearScan::filterFreeWithPrecoloredRanges(IterationState &Iter) {
   // TODO(stichnot): Partition UnhandledPrecolored according to register class,
   // to restrict the number of overlap comparisons needed.
@@ -590,6 +605,7 @@
          RegAlias = Aliases.find_next(RegAlias)) {
       Iter.Weights[RegAlias].setWeight(RegWeight::Inf);
       Iter.Free[RegAlias] = false;
+      Iter.FreeUnfiltered[RegAlias] = false;
       Iter.PrecoloredUnhandledMask[RegAlias] = true;
       // Disable Iter.AllowOverlap if the preferred register is one of these
       // pre-colored unhandled overlapping ranges.
@@ -630,10 +646,14 @@
   Active.push_back(Iter.Cur);
 }
 
-void LinearScan::allocateFreeRegister(IterationState &Iter) {
-  int32_t RegNum = Iter.Free.find_first();
+void LinearScan::allocateFreeRegister(IterationState &Iter, bool Filtered) {
+  const int32_t RegNum =
+      Filtered ? Iter.Free.find_first() : Iter.FreeUnfiltered.find_first();
   Iter.Cur->setRegNumTmp(RegNum);
-  dumpLiveRangeTrace("Allocating   ", Iter.Cur);
+  if (Filtered)
+    dumpLiveRangeTrace("Allocating   ", Iter.Cur);
+  else
+    dumpLiveRangeTrace("Allocating X ", Iter.Cur);
   const llvm::SmallBitVector &Aliases = *RegAliases[RegNum];
   for (int32_t RegAlias = Aliases.find_first(); RegAlias >= 0;
        RegAlias = Aliases.find_next(RegAlias)) {
@@ -672,77 +692,97 @@
   }
 
   // All the weights are now calculated. Find the register with smallest weight.
-  int32_t MinWeightIndex = Iter.RegMask.find_first();
-  // MinWeightIndex must be valid because of the initial RegMask.any() test.
-  assert(MinWeightIndex >= 0);
-  for (SizeT i = MinWeightIndex + 1; i < Iter.Weights.size(); ++i) {
-    if (Iter.RegMask[i] && Iter.Weights[i] < Iter.Weights[MinWeightIndex])
-      MinWeightIndex = i;
-  }
+  int32_t MinWeightIndex = findMinWeightIndex(Iter.RegMask, Iter.Weights);
 
   if (Iter.Cur->getWeight(Func) <= Iter.Weights[MinWeightIndex]) {
-    // Cur doesn't have priority over any other live ranges, so don't allocate
-    // any register to it, and move it to the Handled state.
-    Handled.push_back(Iter.Cur);
-    if (Iter.Cur->mustHaveReg()) {
-      if (Kind == RAK_Phi) {
-        addSpillFill(Iter);
-      } else {
-        dumpLiveRangeTrace("Failing      ", Iter.Cur);
-        Func->setError("Unable to find a physical register for an "
-                       "infinite-weight live range: " +
-                       Iter.Cur->getName(Func));
+    if (!Iter.Cur->mustHaveReg()) {
+      // Iter.Cur doesn't have priority over any other live ranges, so don't
+      // allocate any register to it, and move it to the Handled state.
+      Handled.push_back(Iter.Cur);
+      return;
+    }
+    if (Kind == RAK_Phi) {
+      // Iter.Cur is infinite-weight but all physical registers are already
+      // taken, so we need to force one to be temporarily available.
+      addSpillFill(Iter);
+      Handled.push_back(Iter.Cur);
+      return;
+    }
+    // The remaining portion of the enclosing "if" block should only be
+    // reachable if we are manually limiting physical registers for testing.
+    if (UseReserve) {
+      if (Iter.FreeUnfiltered.any()) {
+        // There is some available physical register held in reserve, so use it.
+        constexpr bool NotFiltered = false;
+        allocateFreeRegister(Iter, NotFiltered);
+        // Iter.Cur is now on the Active list.
+        return;
       }
+      // At this point, we need to find some reserve register that is already
+      // assigned to a non-infinite-weight variable.  This could happen if some
+      // variable was previously assigned an alias of such a register.
+      MinWeightIndex = findMinWeightIndex(Iter.RegMaskUnfiltered, Iter.Weights);
     }
-  } else {
-    // Evict all live ranges in Active that register number MinWeightIndex is
-    // assigned to.
-    const llvm::SmallBitVector &Aliases = *RegAliases[MinWeightIndex];
-    for (SizeT I = Active.size(); I > 0; --I) {
-      const SizeT Index = I - 1;
-      Variable *Item = Active[Index];
-      int32_t RegNum = Item->getRegNumTmp();
-      if (Aliases[RegNum]) {
-        dumpLiveRangeTrace("Evicting A   ", Item);
-        const llvm::SmallBitVector &Aliases = *RegAliases[RegNum];
-        for (int32_t RegAlias = Aliases.find_first(); RegAlias >= 0;
-             RegAlias = Aliases.find_next(RegAlias)) {
-          --RegUses[RegAlias];
-          assert(RegUses[RegAlias] >= 0);
-        }
-        Item->setRegNumTmp(Variable::NoRegister);
-        moveItem(Active, Index, Handled);
-        Evicted.push_back(Item);
-      }
+    if (Iter.Cur->getWeight(Func) <= Iter.Weights[MinWeightIndex]) {
+      dumpLiveRangeTrace("Failing      ", Iter.Cur);
+      Func->setError("Unable to find a physical register for an "
+                     "infinite-weight live range "
+                     "(consider using -reg-reserve): " +
+                     Iter.Cur->getName(Func));
+      Handled.push_back(Iter.Cur);
+      return;
     }
-    // Do the same for Inactive.
-    for (SizeT I = Inactive.size(); I > 0; --I) {
-      const SizeT Index = I - 1;
-      Variable *Item = Inactive[Index];
-      // Note: The Item->rangeOverlaps(Cur) clause is not part of the
-      // description of AssignMemLoc() in the original paper. But there doesn't
-      // seem to be any need to evict an inactive live range that doesn't
-      // overlap with the live range currently being considered. It's especially
-      // bad if we would end up evicting an infinite-weight but
-      // currently-inactive live range. The most common situation for this would
-      // be a scratch register kill set for call instructions.
-      if (Aliases[Item->getRegNumTmp()] && Item->rangeOverlaps(Iter.Cur)) {
-        dumpLiveRangeTrace("Evicting I   ", Item);
-        Item->setRegNumTmp(Variable::NoRegister);
-        moveItem(Inactive, Index, Handled);
-        Evicted.push_back(Item);
-      }
-    }
-    // Assign the register to Cur.
-    Iter.Cur->setRegNumTmp(MinWeightIndex);
-    for (int32_t RegAlias = Aliases.find_first(); RegAlias >= 0;
-         RegAlias = Aliases.find_next(RegAlias)) {
-      assert(RegUses[RegAlias] >= 0);
-      ++RegUses[RegAlias];
-    }
-    Active.push_back(Iter.Cur);
-    dumpLiveRangeTrace("Allocating   ", Iter.Cur);
+    // At this point, MinWeightIndex points to a valid reserve register to
+    // reallocate to Iter.Cur, so drop into the eviction code.
   }
+
+  // Evict all live ranges in Active that register number MinWeightIndex is
+  // assigned to.
+  const llvm::SmallBitVector &Aliases = *RegAliases[MinWeightIndex];
+  for (SizeT I = Active.size(); I > 0; --I) {
+    const SizeT Index = I - 1;
+    Variable *Item = Active[Index];
+    int32_t RegNum = Item->getRegNumTmp();
+    if (Aliases[RegNum]) {
+      dumpLiveRangeTrace("Evicting A   ", Item);
+      const llvm::SmallBitVector &Aliases = *RegAliases[RegNum];
+      for (int32_t RegAlias = Aliases.find_first(); RegAlias >= 0;
+           RegAlias = Aliases.find_next(RegAlias)) {
+        --RegUses[RegAlias];
+        assert(RegUses[RegAlias] >= 0);
+      }
+      Item->setRegNumTmp(Variable::NoRegister);
+      moveItem(Active, Index, Handled);
+      Evicted.push_back(Item);
+    }
+  }
+  // Do the same for Inactive.
+  for (SizeT I = Inactive.size(); I > 0; --I) {
+    const SizeT Index = I - 1;
+    Variable *Item = Inactive[Index];
+    // Note: The Item->rangeOverlaps(Cur) clause is not part of the description
+    // of AssignMemLoc() in the original paper. But there doesn't seem to be any
+    // need to evict an inactive live range that doesn't overlap with the live
+    // range currently being considered. It's especially bad if we would end up
+    // evicting an infinite-weight but currently-inactive live range. The most
+    // common situation for this would be a scratch register kill set for call
+    // instructions.
+    if (Aliases[Item->getRegNumTmp()] && Item->rangeOverlaps(Iter.Cur)) {
+      dumpLiveRangeTrace("Evicting I   ", Item);
+      Item->setRegNumTmp(Variable::NoRegister);
+      moveItem(Inactive, Index, Handled);
+      Evicted.push_back(Item);
+    }
+  }
+  // Assign the register to Cur.
+  Iter.Cur->setRegNumTmp(MinWeightIndex);
+  for (int32_t RegAlias = Aliases.find_first(); RegAlias >= 0;
+       RegAlias = Aliases.find_next(RegAlias)) {
+    assert(RegUses[RegAlias] >= 0);
+    ++RegUses[RegAlias];
+  }
+  Active.push_back(Iter.Cur);
+  dumpLiveRangeTrace("Allocating   ", Iter.Cur);
 }
 
 void LinearScan::assignFinalRegisters(
@@ -843,6 +883,8 @@
     dumpLiveRangeTrace("\nConsidering  ", Iter.Cur);
     assert(Target->getRegistersForVariable(Iter.Cur).any());
     Iter.RegMask = RegMaskFull & Target->getRegistersForVariable(Iter.Cur);
+    Iter.RegMaskUnfiltered =
+        RegMaskFull & Target->getAllRegistersForVariable(Iter.Cur);
     KillsRange.trim(Iter.Cur->getLiveRange().getStart());
 
     // Check for pre-colored ranges. If Cur is pre-colored, it definitely gets
@@ -857,11 +899,14 @@
     handleActiveRangeExpiredOrInactive(Iter.Cur);
     handleInactiveRangeExpiredOrReactivated(Iter.Cur);
 
-    // Calculate available registers into Free[].
+    // Calculate available registers into Iter.Free[] and Iter.FreeUnfiltered[].
     Iter.Free = Iter.RegMask;
+    Iter.FreeUnfiltered = Iter.RegMaskUnfiltered;
     for (SizeT i = 0; i < Iter.RegMask.size(); ++i) {
-      if (RegUses[i] > 0)
+      if (RegUses[i] > 0) {
         Iter.Free[i] = false;
+        Iter.FreeUnfiltered[i] = false;
+      }
     }
 
     findRegisterPreference(Iter);
@@ -889,11 +934,12 @@
 
     filterFreeWithPrecoloredRanges(Iter);
 
-    // Remove scratch registers from the Free[] list, and mark their Weights[]
-    // as infinite, if KillsRange overlaps Cur's live range.
+    // Remove scratch registers from the Iter.Free[] list, and mark their
+    // Iter.Weights[] as infinite, if KillsRange overlaps Cur's live range.
     constexpr bool UseTrimmed = true;
     if (Iter.Cur->getLiveRange().overlaps(KillsRange, UseTrimmed)) {
       Iter.Free.reset(KillsMask);
+      Iter.FreeUnfiltered.reset(KillsMask);
       for (int i = KillsMask.find_first(); i != -1;
            i = KillsMask.find_next(i)) {
         Iter.Weights[i].setWeight(RegWeight::Inf);
@@ -906,7 +952,7 @@
     if (BuildDefs::dump() && Verbose) {
       Ostream &Str = Ctx->getStrDump();
       for (SizeT i = 0; i < Iter.RegMask.size(); ++i) {
-        if (Iter.RegMask[i]) {
+        if (Iter.RegMaskUnfiltered[i]) {
           Str << Target->getRegName(i, Iter.Cur->getType())
               << "(U=" << RegUses[i] << ",F=" << Iter.Free[i]
               << ",P=" << Iter.PrecoloredUnhandledMask[i] << ") ";
@@ -921,7 +967,8 @@
       allocatePreferredRegister(Iter);
     } else if (Iter.Free.any()) {
       // Second choice: any free register.
-      allocateFreeRegister(Iter);
+      constexpr bool Filtered = true;
+      allocateFreeRegister(Iter, Filtered);
     } else {
       // Fallback: there are no free registers, so we look for the lowest-weight
       // register and see if Cur has higher weight.
diff --git a/src/IceRegAlloc.h b/src/IceRegAlloc.h
index 2776f5b..01c1f3c 100644
--- a/src/IceRegAlloc.h
+++ b/src/IceRegAlloc.h
@@ -61,7 +61,9 @@
     int32_t PreferReg = Variable::NoRegister;
     bool AllowOverlap = false;
     llvm::SmallBitVector RegMask;
+    llvm::SmallBitVector RegMaskUnfiltered;
     llvm::SmallBitVector Free;
+    llvm::SmallBitVector FreeUnfiltered;
     llvm::SmallBitVector PrecoloredUnhandledMask; // Note: only used for dumping
     llvm::SmallVector<RegWeight, REGS_SIZE> Weights;
   };
@@ -98,7 +100,7 @@
   void filterFreeWithPrecoloredRanges(IterationState &Iter);
   void allocatePrecoloredRegister(Variable *Cur);
   void allocatePreferredRegister(IterationState &Iter);
-  void allocateFreeRegister(IterationState &Iter);
+  void allocateFreeRegister(IterationState &Iter, bool Filtered);
   void handleNoFreeRegisters(IterationState &Iter);
   void assignFinalRegisters(const llvm::SmallBitVector &RegMaskFull,
                             const llvm::SmallBitVector &PreDefinedRegisters,
@@ -130,6 +132,7 @@
   bool FindOverlap = false;
 
   const bool Verbose;
+  const bool UseReserve;
 };
 
 } // end of namespace Ice
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index 3d3452e..d406fb5 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -275,8 +275,15 @@
 
   virtual llvm::SmallBitVector getRegisterSet(RegSetMask Include,
                                               RegSetMask Exclude) const = 0;
+  /// Get the set of physical registers available for the specified Variable's
+  /// register class, applying register restrictions from the command line.
   virtual const llvm::SmallBitVector &
   getRegistersForVariable(const Variable *Var) const = 0;
+  /// Get the set of *all* physical registers available for the specified
+  /// Variable's register class, *not* applying register restrictions from the
+  /// command line.
+  virtual const llvm::SmallBitVector &
+  getAllRegistersForVariable(const Variable *Var) const = 0;
   virtual const llvm::SmallBitVector &getAliasesForRegister(SizeT) const = 0;
 
   void regAlloc(RegAllocKind Kind);
diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp
index f369ff2..72ba954 100644
--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -342,6 +342,9 @@
   TypeToRegisterSet[IceType_v4i32] = VectorRegisters;
   TypeToRegisterSet[IceType_v4f32] = VectorRegisters;
 
+  for (size_t i = 0; i < llvm::array_lengthof(TypeToRegisterSet); ++i)
+    TypeToRegisterSetUnfiltered[i] = TypeToRegisterSet[i];
+
   filterTypeToRegisterSet(
       Ctx, RegARM32::Reg_NUM, TypeToRegisterSet,
       llvm::array_lengthof(TypeToRegisterSet), [](int32_t RegNum) -> IceString {
@@ -6514,6 +6517,8 @@
 }
 
 llvm::SmallBitVector TargetARM32::TypeToRegisterSet[RegARM32::RCARM32_NUM];
+llvm::SmallBitVector
+    TargetARM32::TypeToRegisterSetUnfiltered[RegARM32::RCARM32_NUM];
 llvm::SmallBitVector TargetARM32::RegisterAliases[RegARM32::Reg_NUM];
 
 } // end of namespace ARM32
diff --git a/src/IceTargetLoweringARM32.h b/src/IceTargetLoweringARM32.h
index b8c1040..af55141 100644
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -88,6 +88,12 @@
     assert(RC < RC_Target);
     return TypeToRegisterSet[RC];
   }
+  const llvm::SmallBitVector &
+  getAllRegistersForVariable(const Variable *Var) const override {
+    RegClass RC = Var->getRegClass();
+    assert(RC < RC_Target);
+    return TypeToRegisterSetUnfiltered[RC];
+  }
   const llvm::SmallBitVector &getAliasesForRegister(SizeT Reg) const override {
     return RegisterAliases[Reg];
   }
@@ -1020,6 +1026,8 @@
   uint32_t MaxOutArgsSizeBytes = 0;
   // TODO(jpp): std::array instead of array.
   static llvm::SmallBitVector TypeToRegisterSet[RegARM32::RCARM32_NUM];
+  static llvm::SmallBitVector
+      TypeToRegisterSetUnfiltered[RegARM32::RCARM32_NUM];
   static llvm::SmallBitVector RegisterAliases[RegARM32::Reg_NUM];
   llvm::SmallBitVector RegsUsed;
   VarList PhysicalRegisters[IceType_NUM];
diff --git a/src/IceTargetLoweringMIPS32.cpp b/src/IceTargetLoweringMIPS32.cpp
index 3cbf03e..d8cd65c 100644
--- a/src/IceTargetLoweringMIPS32.cpp
+++ b/src/IceTargetLoweringMIPS32.cpp
@@ -116,6 +116,9 @@
   TypeToRegisterSet[IceType_v4i32] = VectorRegisters;
   TypeToRegisterSet[IceType_v4f32] = VectorRegisters;
 
+  for (size_t i = 0; i < llvm::array_lengthof(TypeToRegisterSet); ++i)
+    TypeToRegisterSetUnfiltered[i] = TypeToRegisterSet[i];
+
   filterTypeToRegisterSet(Ctx, RegMIPS32::Reg_NUM, TypeToRegisterSet,
                           llvm::array_lengthof(TypeToRegisterSet),
                           RegMIPS32::getRegName, getRegClassName);
@@ -1126,6 +1129,7 @@
 }
 
 llvm::SmallBitVector TargetMIPS32::TypeToRegisterSet[RCMIPS32_NUM];
+llvm::SmallBitVector TargetMIPS32::TypeToRegisterSetUnfiltered[RCMIPS32_NUM];
 llvm::SmallBitVector TargetMIPS32::RegisterAliases[RegMIPS32::Reg_NUM];
 
 } // end of namespace MIPS32
diff --git a/src/IceTargetLoweringMIPS32.h b/src/IceTargetLoweringMIPS32.h
index 4fdb081..472eeb5 100644
--- a/src/IceTargetLoweringMIPS32.h
+++ b/src/IceTargetLoweringMIPS32.h
@@ -57,6 +57,12 @@
     assert(RC < RC_Target);
     return TypeToRegisterSet[RC];
   }
+  const llvm::SmallBitVector &
+  getAllRegistersForVariable(const Variable *Var) const override {
+    RegClass RC = Var->getRegClass();
+    assert(RC < RC_Target);
+    return TypeToRegisterSetUnfiltered[RC];
+  }
   const llvm::SmallBitVector &getAliasesForRegister(SizeT Reg) const override {
     return RegisterAliases[Reg];
   }
@@ -263,6 +269,7 @@
   bool UsesFramePointer = false;
   bool NeedsStackAlignment = false;
   static llvm::SmallBitVector TypeToRegisterSet[RCMIPS32_NUM];
+  static llvm::SmallBitVector TypeToRegisterSetUnfiltered[RCMIPS32_NUM];
   static llvm::SmallBitVector RegisterAliases[RegMIPS32::Reg_NUM];
   llvm::SmallBitVector RegsUsed;
   VarList PhysicalRegisters[IceType_NUM];
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 3e45c64..73e3a7b 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -107,6 +107,10 @@
     TargetX86Base<X8632::Traits>::TypeToRegisterSet = {{}};
 
 template <>
+std::array<llvm::SmallBitVector, RCX86_NUM>
+    TargetX86Base<X8632::Traits>::TypeToRegisterSetUnfiltered = {{}};
+
+template <>
 std::array<llvm::SmallBitVector,
            TargetX86Base<X8632::Traits>::Traits::RegisterSet::Reg_NUM>
     TargetX86Base<X8632::Traits>::RegisterAliases = {{}};
diff --git a/src/IceTargetLoweringX8664.cpp b/src/IceTargetLoweringX8664.cpp
index aed9b30..758023b 100644
--- a/src/IceTargetLoweringX8664.cpp
+++ b/src/IceTargetLoweringX8664.cpp
@@ -107,6 +107,10 @@
     TargetX86Base<X8664::Traits>::TypeToRegisterSet = {{}};
 
 template <>
+std::array<llvm::SmallBitVector, RCX86_NUM>
+    TargetX86Base<X8664::Traits>::TypeToRegisterSetUnfiltered = {{}};
+
+template <>
 std::array<llvm::SmallBitVector,
            TargetX86Base<X8664::Traits>::Traits::RegisterSet::Reg_NUM>
     TargetX86Base<X8664::Traits>::RegisterAliases = {{}};
diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h
index 631d06b..f4119f0 100644
--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -124,6 +124,13 @@
     return TypeToRegisterSet[RC];
   }
 
+  const llvm::SmallBitVector &
+  getAllRegistersForVariable(const Variable *Var) const override {
+    RegClass RC = Var->getRegClass();
+    assert(static_cast<RegClassX86>(RC) < RCX86_NUM);
+    return TypeToRegisterSetUnfiltered[RC];
+  }
+
   const llvm::SmallBitVector &getAliasesForRegister(SizeT Reg) const override {
     assert(Reg < Traits::RegisterSet::Reg_NUM);
     return RegisterAliases[Reg];
@@ -974,6 +981,8 @@
   bool PrologEmitsFixedAllocas = false;
   uint32_t MaxOutArgsSizeBytes = 0;
   static std::array<llvm::SmallBitVector, RCX86_NUM> TypeToRegisterSet;
+  static std::array<llvm::SmallBitVector, RCX86_NUM>
+      TypeToRegisterSetUnfiltered;
   static std::array<llvm::SmallBitVector, Traits::RegisterSet::Reg_NUM>
       RegisterAliases;
   llvm::SmallBitVector RegsUsed;
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index 48c342f..d2a3530 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -379,6 +379,8 @@
 void TargetX86Base<TraitsType>::staticInit(GlobalContext *Ctx) {
   Traits::initRegisterSet(Ctx->getFlags(), &TypeToRegisterSet,
                           &RegisterAliases);
+  for (size_t i = 0; i < TypeToRegisterSet.size(); ++i)
+    TypeToRegisterSetUnfiltered[i] = TypeToRegisterSet[i];
   filterTypeToRegisterSet(Ctx, Traits::RegisterSet::Reg_NUM,
                           TypeToRegisterSet.data(), TypeToRegisterSet.size(),
                           Traits::getRegName, getRegClassName);
@@ -1945,8 +1947,6 @@
       Src1Lo = legalize(Src1Lo, Legal_Reg | Legal_Mem);
       _mov(T_1, Src0Hi);
       _imul(T_1, Src1Lo);
-      _mov(T_2, Src1Hi);
-      _imul(T_2, Src0Lo);
       _mov(T_3, Src0Lo, Traits::RegisterSet::Reg_eax);
       _mul(T_4Lo, T_3, Src1Lo);
       // The mul instruction produces two dest variables, edx:eax. We create a
@@ -1954,6 +1954,8 @@
       Context.insert<InstFakeDef>(T_4Hi, T_4Lo);
       _mov(DestLo, T_4Lo);
       _add(T_4Hi, T_1);
+      _mov(T_2, Src1Hi);
+      _imul(T_2, Src0Lo);
       _add(T_4Hi, T_2);
       _mov(DestHi, T_4Hi);
     } break;
@@ -5801,8 +5803,8 @@
   if (!Traits::Is64Bit && Ty == IceType_i64) {
     Value = legalizeUndef(Value);
     Operand *ValueHi = legalize(hiOperand(Value), Legal_Reg | Legal_Imm);
-    Operand *ValueLo = legalize(loOperand(Value), Legal_Reg | Legal_Imm);
     _store(ValueHi, llvm::cast<X86OperandMem>(hiOperand(NewAddr)));
+    Operand *ValueLo = legalize(loOperand(Value), Legal_Reg | Legal_Imm);
     _store(ValueLo, llvm::cast<X86OperandMem>(loOperand(NewAddr)));
   } else if (isVectorType(Ty)) {
     _storep(legalizeToReg(Value), NewAddr);
diff --git a/tests_lit/llvm2ice_tests/64bit.pnacl.ll b/tests_lit/llvm2ice_tests/64bit.pnacl.ll
index e783b85..241c4ec 100644
--- a/tests_lit/llvm2ice_tests/64bit.pnacl.ll
+++ b/tests_lit/llvm2ice_tests/64bit.pnacl.ll
@@ -125,7 +125,7 @@
 ; OPTM1-LABEL: pass64BitConstArg
 ; OPTM1:      sub     esp
 ; OPTM1:      mov     DWORD PTR [esp+0x4]
-; OPTM1-NEXT: mov     DWORD PTR [esp]
+; OPTM1:      mov     DWORD PTR [esp]
 ; OPTM1-NEXT: mov     DWORD PTR [esp+0x8],0x7b
 ; Bundle padding might be added (so not using -NEXT).
 ; OPTM1:      mov     DWORD PTR [esp+0x10],0xdeadbeef
@@ -277,16 +277,16 @@
 }
 ; CHECK-LABEL: mul64BitSigned
 ; CHECK: imul
-; CHECK: imul
 ; CHECK: mul
 ; CHECK: add
+; CHECK: imul
 ; CHECK: add
 ;
 ; OPTM1-LABEL: mul64BitSigned
 ; OPTM1: imul
-; OPTM1: imul
 ; OPTM1: mul
 ; OPTM1: add
+; OPTM1: imul
 ; OPTM1: add
 
 ; ARM32-LABEL: mul64BitSigned
@@ -302,16 +302,16 @@
 }
 ; CHECK-LABEL: mul64BitUnsigned
 ; CHECK: imul
-; CHECK: imul
 ; CHECK: mul
 ; CHECK: add
+; CHECK: imul
 ; CHECK: add
 ;
 ; OPTM1-LABEL: mul64BitUnsigned
 ; OPTM1: imul
-; OPTM1: imul
 ; OPTM1: mul
 ; OPTM1: add
+; OPTM1: imul
 ; OPTM1: add
 
 ; ARM32-LABEL: mul64BitUnsigned