Subzero. ARM32. Combine allocas.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1465213002 .
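
Constant-sized allocas whose frame offsets are known are now combined into a
fixed area of the stack frame and rematerialized as stack/frame pointer plus
a constant, instead of adjusting SP at each alloca site. When the combined
allocas are addressed relative to the stack pointer, their offsets must also
skip the out-args area that the prolog reserves at the bottom of the frame.
A rough, self-contained C++ sketch of that offset computation (the helper
name and the enum members other than BVT_StackPointer are illustrative
stand-ins, not Subzero API):

  #include <cstdint>

  enum BaseVariableTypeT { BVT_StackPointer, BVT_FramePointer, BVT_UserPointer };

  // SP-relative allocas live above the out-args area; frame-pointer and
  // user-pointer addressing needs no bias.
  uint32_t allocaOffset(uint32_t CurrentOffset, BaseVariableTypeT BaseType,
                        uint32_t MaxOutArgsSizeBytes) {
    return CurrentOffset +
           (BaseType == BVT_StackPointer ? MaxOutArgsSizeBytes : 0);
  }
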
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index 3fe37f3..f8d40a5 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -484,8 +484,14 @@
     } else {
       // Addressing is relative to the stack pointer or to a user pointer.  Add
       // the offset before adding the size of the object, because it grows
-      // upwards from the stack pointer.
-      Offsets.push_back(CurrentOffset);
+      // upwards from the stack pointer. In addition, if the addressing is
+      // relative to the stack pointer, we need to add the pre-computed
+      // maximum out-args size in bytes.
+      const uint32_t OutArgsOffsetOrZero =
+          (BaseVariableType == BVT_StackPointer)
+              ? getTarget()->maxOutArgsSizeBytes()
+              : 0;
+      Offsets.push_back(CurrentOffset + OutArgsOffsetOrZero);
     }
     // Update the running offset of the fused alloca region.
     CurrentOffset += Size;
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index 1350afa..bac403f 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -211,6 +211,7 @@
   virtual uint32_t getStackAlignment() const = 0;
   virtual void reserveFixedAllocaArea(size_t Size, size_t Align) = 0;
   virtual int32_t getFrameFixedAllocaOffset() const = 0;
+  virtual uint32_t maxOutArgsSizeBytes() const { return 0; }
 
   /// Return whether a 64-bit Variable should be split into a Variable64On32.
   virtual bool shouldSplitToVariable64On32(Type Ty) const = 0;
diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp
index 12810f6..f23609b 100644
--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -265,7 +265,7 @@
 }
 
 void TargetARM32::findMaxStackOutArgsSize() {
-  // MinNeededOutArgsBytes should be updated if the Target ever creates an
+  // MinNeededOutArgsBytes should be updated if the Target ever creates a
   // high-level InstCall that requires more stack bytes.
   constexpr size_t MinNeededOutArgsBytes = 0;
   MaxOutArgsSizeBytes = MinNeededOutArgsBytes;
@@ -291,7 +291,7 @@
   findMaxStackOutArgsSize();
 
-  // Do not merge Alloca instructions, and lay out the stack.
-  static constexpr bool SortAndCombineAllocas = false;
+  // Merge Alloca instructions, and lay out the stack.
+  static constexpr bool SortAndCombineAllocas = true;
   Func->processAllocas(SortAndCombineAllocas);
   Func->dump("After Alloca processing");
 
@@ -356,6 +356,7 @@
   regAlloc(RAK_Global);
   if (Func->hasError())
     return;
+
   copyRegAllocFromInfWeightVariable64On32(Func->getVariables());
   Func->dump("After linear scan regalloc");
 
@@ -364,6 +365,8 @@
     Func->dump("After advanced Phi lowering");
   }
 
+  ForbidTemporaryWithoutReg _(this);
+
   // Stack frame mapping.
   Func->genFrame();
   if (Func->hasError())
@@ -399,8 +402,8 @@
   findMaxStackOutArgsSize();
 
   // Do not merge Alloca instructions, and lay out the stack.
-  static constexpr bool SortAndCombineAllocas = false;
-  Func->processAllocas(SortAndCombineAllocas);
+  static constexpr bool DontSortAndCombineAllocas = false;
+  Func->processAllocas(DontSortAndCombineAllocas);
   Func->dump("After Alloca processing");
 
   Func->placePhiLoads();
@@ -424,9 +427,12 @@
   regAlloc(RAK_InfOnly);
   if (Func->hasError())
     return;
+
   copyRegAllocFromInfWeightVariable64On32(Func->getVariables());
   Func->dump("After regalloc of infinite-weight variables");
 
+  ForbidTemporaryWithoutReg _(this);
+
   Func->genFrame();
   if (Func->hasError())
     return;
@@ -520,6 +526,7 @@
     llvm::report_fatal_error(
         "Infinite-weight Variable has no register assigned");
   }
+  assert(!Var->isRematerializable());
   int32_t Offset = Var->getStackOffset();
   int32_t BaseRegNum = Var->getBaseRegNum();
   if (BaseRegNum == Variable::NoRegister) {
@@ -850,6 +857,9 @@
     SpillAreaSizeBytes = StackSize - StackOffset;
   }
 
+  // Combine fixed alloca with SpillAreaSize.
+  SpillAreaSizeBytes += FixedAllocaSizeBytes;
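+  // The combined fixed allocas are thus allocated by the single "sub sp"
+  // below; lowerAlloca() emits no separate stack adjustment for them.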
+
   // Generate "sub sp, SpillAreaSizeBytes"
   if (SpillAreaSizeBytes) {
     // Use the scratch register if needed to legalize the immediate.
@@ -857,7 +867,11 @@
                                   Legal_Reg | Legal_Flex, getReservedTmpReg());
     Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
     _sub(SP, SP, SubAmount);
+    if (FixedAllocaAlignBytes > ARM32_STACK_ALIGNMENT_BYTES) {
+      alignRegisterPow2(SP, FixedAllocaAlignBytes, getReservedTmpReg());
+    }
   }
+
   Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);
 
   // Fill in stack offsets for stack args, and copy args into registers for
@@ -1034,6 +1048,7 @@
                                                Variable *OrigBaseReg,
                                                Variable **NewBaseReg,
                                                int32_t *NewBaseOffset) {
+  assert(!OrigBaseReg->isRematerializable());
   if (isLegalMemOffset(Ty, Offset)) {
     return OperandARM32Mem::create(
         Func, Ty, OrigBaseReg,
@@ -1053,6 +1068,7 @@
     OffsetDiff = 0;
   }
 
+  assert(!(*NewBaseReg)->isRematerializable());
   return OperandARM32Mem::create(
       Func, Ty, *NewBaseReg,
       llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(OffsetDiff)),
@@ -1076,8 +1092,9 @@
 
   bool Legalized = false;
   if (!Dest->hasReg()) {
-    auto *const SrcR = llvm::cast<Variable>(Src);
+    auto *SrcR = llvm::cast<Variable>(Src);
     assert(SrcR->hasReg());
+    assert(!SrcR->isRematerializable());
     const int32_t Offset = Dest->getStackOffset();
     // This is a _mov(Mem(), Variable), i.e., a store.
     _str(SrcR, createMemOperand(DestTy, Offset, OrigBaseReg, NewBaseReg,
@@ -1087,12 +1104,26 @@
     Context.insert(InstFakeDef::create(Func, Dest));
     Legalized = true;
   } else if (auto *Var = llvm::dyn_cast<Variable>(Src)) {
-    if (!Var->hasReg()) {
-      const int32_t Offset = Var->getStackOffset();
-      _ldr(Dest, createMemOperand(DestTy, Offset, OrigBaseReg, NewBaseReg,
-                                  NewBaseOffset),
-           MovInstr->getPredicate());
+    if (Var->isRematerializable()) {
+      // Rematerialization arithmetic.
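+      // A rematerializable variable is never loaded from the stack: it is
+      // recomputed as its base register plus its stack offset, biased by the
+      // fixed-alloca offset when the base is the frame pointer.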
+      const int32_t ExtraOffset =
+          (static_cast<SizeT>(Var->getRegNum()) == getFrameReg())
+              ? getFrameFixedAllocaOffset()
+              : 0;
+
+      const int32_t Offset = Var->getStackOffset() + ExtraOffset;
+      Operand *OffsetRF = legalize(Ctx->getConstantInt32(Offset),
+                                   Legal_Reg | Legal_Flex, Dest->getRegNum());
+      _add(Dest, Var, OffsetRF);
       Legalized = true;
+    } else {
+      if (!Var->hasReg()) {
+        const int32_t Offset = Var->getStackOffset();
+        _ldr(Dest, createMemOperand(DestTy, Offset, OrigBaseReg, NewBaseReg,
+                                    NewBaseOffset),
+             MovInstr->getPredicate());
+        Legalized = true;
+      }
     }
   }
 
@@ -1163,13 +1194,15 @@
     // increment) in case of duplication.
     assert(Mem->getAddrMode() == OperandARM32Mem::Offset ||
            Mem->getAddrMode() == OperandARM32Mem::NegOffset);
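+    // The base (and index) may be rematerializable variables, so force them
+    // into physical registers before recreating the memory operand.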
+    Variable *BaseR = legalizeToReg(Mem->getBase());
     if (Mem->isRegReg()) {
-      return OperandARM32Mem::create(Func, IceType_i32, Mem->getBase(),
-                                     Mem->getIndex(), Mem->getShiftOp(),
-                                     Mem->getShiftAmt(), Mem->getAddrMode());
+      Variable *IndexR = legalizeToReg(Mem->getIndex());
+      return OperandARM32Mem::create(Func, IceType_i32, BaseR, IndexR,
+                                     Mem->getShiftOp(), Mem->getShiftAmt(),
+                                     Mem->getAddrMode());
     } else {
-      return OperandARM32Mem::create(Func, IceType_i32, Mem->getBase(),
-                                     Mem->getOffset(), Mem->getAddrMode());
+      return OperandARM32Mem::create(Func, IceType_i32, BaseR, Mem->getOffset(),
+                                     Mem->getAddrMode());
     }
   }
   llvm_unreachable("Unsupported operand type");
@@ -1201,7 +1234,9 @@
       Variable *NewBase = Func->makeVariable(Base->getType());
       lowerArithmetic(InstArithmetic::create(Func, InstArithmetic::Add, NewBase,
                                              Base, Four));
-      return OperandARM32Mem::create(Func, SplitType, NewBase, Mem->getIndex(),
+      Variable *BaseR = legalizeToReg(NewBase);
+      Variable *IndexR = legalizeToReg(Mem->getIndex());
+      return OperandARM32Mem::create(Func, SplitType, BaseR, IndexR,
                                      Mem->getShiftOp(), Mem->getShiftAmt(),
                                      Mem->getAddrMode());
     } else {
@@ -1216,16 +1251,17 @@
         // mode into a RegReg addressing mode. Since NaCl sandboxing disallows
         // RegReg addressing modes, prefer adding to base and replacing
         // instead. Thus we leave the old offset alone.
-        Constant *Four = Ctx->getConstantInt32(4);
+        Constant *_4 = Ctx->getConstantInt32(4);
         Variable *NewBase = Func->makeVariable(Base->getType());
         lowerArithmetic(InstArithmetic::create(Func, InstArithmetic::Add,
-                                               NewBase, Base, Four));
+                                               NewBase, Base, _4));
         Base = NewBase;
       } else {
         Offset =
             llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(NextOffsetVal));
       }
-      return OperandARM32Mem::create(Func, SplitType, Base, Offset,
+      Variable *BaseR = legalizeToReg(Base);
+      return OperandARM32Mem::create(Func, SplitType, BaseR, Offset,
                                      Mem->getAddrMode());
     }
   }
@@ -1264,7 +1300,6 @@
 }
 
 void TargetARM32::lowerAlloca(const InstAlloca *Inst) {
-  UsesFramePointer = true;
   // Conservatively require the stack to be aligned. Some stack adjustment
   // operations implemented below assume that the stack is aligned before the
   // alloca. All the alloca code ensures that the stack alignment is preserved
@@ -1272,29 +1307,53 @@
   // cases.
   NeedsStackAlignment = true;
 
-  // TODO(stichnot): minimize the number of adjustments of SP, etc.
-  Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
-  Variable *Dest = Inst->getDest();
-  uint32_t AlignmentParam = Inst->getAlignInBytes();
   // For default align=0, set it to the real value 1, to avoid any
   // bit-manipulation problems below.
-  AlignmentParam = std::max(AlignmentParam, 1u);
+  const uint32_t AlignmentParam = std::max(1u, Inst->getAlignInBytes());
 
   // LLVM enforces power of 2 alignment.
   assert(llvm::isPowerOf2_32(AlignmentParam));
   assert(llvm::isPowerOf2_32(ARM32_STACK_ALIGNMENT_BYTES));
 
-  uint32_t Alignment = std::max(AlignmentParam, ARM32_STACK_ALIGNMENT_BYTES);
-  if (Alignment > ARM32_STACK_ALIGNMENT_BYTES) {
+  const uint32_t Alignment =
+      std::max(AlignmentParam, ARM32_STACK_ALIGNMENT_BYTES);
+  const bool OverAligned = Alignment > ARM32_STACK_ALIGNMENT_BYTES;
+  const bool OptM1 = Ctx->getFlags().getOptLevel() == Opt_m1;
+  const bool AllocaWithKnownOffset = Inst->getKnownFrameOffset();
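+  // An alloca can only live in the fixed frame area when it is not
+  // over-aligned, its frame offset is known, and we are optimizing;
+  // anything else falls back to frame-pointer-based addressing.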
+  const bool UseFramePointer =
+      hasFramePointer() || OverAligned || !AllocaWithKnownOffset || OptM1;
+
+  if (UseFramePointer)
+    setHasFramePointer();
+
+  Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
+  if (OverAligned) {
     alignRegisterPow2(SP, Alignment);
   }
+
+  Variable *Dest = Inst->getDest();
   Operand *TotalSize = Inst->getSizeInBytes();
+
   if (const auto *ConstantTotalSize =
           llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
-    uint32_t Value = ConstantTotalSize->getValue();
-    Value = Utils::applyAlignment(Value, Alignment);
-    Operand *SubAmount = legalize(Ctx->getConstantInt32(Value));
-    _sub(SP, SP, SubAmount);
+    const uint32_t Value =
+        Utils::applyAlignment(ConstantTotalSize->getValue(), Alignment);
+    // Constant size alloca.
+    if (!UseFramePointer) {
+      // If we don't need a Frame Pointer, this alloca has a known offset from
+      // the stack pointer. We don't need to adjust the stack pointer, nor
+      // assign any value to Dest, as Dest is rematerializable.
+      assert(Dest->isRematerializable());
+      FixedAllocaSizeBytes += Value;
+      Context.insert(InstFakeDef::create(Func, Dest));
+      return;
+    }
+
+    // If a frame pointer is required, then we need to store the alloca'd result
+    // in Dest.
+    Operand *SubAmountRF =
+        legalize(Ctx->getConstantInt32(Value), Legal_Reg | Legal_Flex);
+    _sub(SP, SP, SubAmountRF);
   } else {
     // Non-constant sizes need to be adjusted to the next highest multiple of
     // the required alignment at runtime.
@@ -1306,6 +1365,8 @@
     alignRegisterPow2(T, Alignment);
     _sub(SP, SP, T);
   }
+
+  // Compute Dest as SP plus the out-args area size: the alloca'd memory
+  // sits above the out-args area.
   Variable *T = SP;
   if (MaxOutArgsSizeBytes != 0) {
     T = makeReg(getPointerType());
@@ -1313,6 +1374,7 @@
         Ctx->getConstantInt32(MaxOutArgsSizeBytes), Legal_Reg | Legal_Flex);
     _add(T, SP, OutArgsSizeRF);
   }
+
   _mov(Dest, T);
 }
 
@@ -1976,6 +2038,12 @@
 
 void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
   Variable *Dest = Inst->getDest();
+
+  if (Dest->isRematerializable()) {
+    Context.insert(InstFakeDef::create(Func, Dest));
+    return;
+  }
+
   if (Dest->getType() == IceType_i1) {
     lowerInt1Arithmetic(Inst);
     return;
@@ -2139,8 +2207,8 @@
     if (Srcs.hasConstOperand()) {
-      // TODO(jpp): lowering Src0R here is wrong -- Src0R it is not guaranteed
-      // to be used.
-      Variable *Src0R = Srcs.src0R(this);
       if (Srcs.immediateIsFlexEncodable()) {
+        Variable *Src0R = Srcs.src0R(this);
         Operand *Src1RF = Srcs.src1RF(this);
         if (Srcs.swappedOperands()) {
           _rsb(T, Src0R, Src1RF);
@@ -2151,6 +2219,7 @@
         return;
       }
       if (!Srcs.swappedOperands() && Srcs.negatedImmediateIsFlexEncodable()) {
+        Variable *Src0R = Srcs.src0R(this);
         Operand *Src1F = Srcs.negatedSrc1F(this);
         _add(T, Src0R, Src1F);
         _mov(Dest, T);
@@ -2215,6 +2284,12 @@
 
 void TargetARM32::lowerAssign(const InstAssign *Inst) {
   Variable *Dest = Inst->getDest();
+
+  if (Dest->isRematerializable()) {
+    Context.insert(InstFakeDef::create(Func, Dest));
+    return;
+  }
+
   Operand *Src0 = Inst->getSrc(0);
   assert(Dest->getType() == Src0->getType());
   if (Dest->getType() == IceType_i64) {
@@ -4425,13 +4500,17 @@
   assert(OffsetImm < 0 ? (ValidImmMask & -OffsetImm) == -OffsetImm
                        : (ValidImmMask & OffsetImm) == OffsetImm);
 
+  Variable *BaseR = makeReg(getPointerType());
+  Context.insert(InstAssign::create(Func, BaseR, BaseVar));
   if (OffsetReg != nullptr) {
-    return OperandARM32Mem::create(Func, Ty, BaseVar, OffsetReg, ShiftKind,
+    Variable *OffsetR = makeReg(getPointerType());
+    Context.insert(InstAssign::create(Func, OffsetR, OffsetReg));
+    return OperandARM32Mem::create(Func, Ty, BaseR, OffsetR, ShiftKind,
                                    OffsetRegShamt);
   }
 
   return OperandARM32Mem::create(
-      Func, Ty, BaseVar,
+      Func, Ty, BaseR,
       llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(OffsetImm)));
 }
 
@@ -4630,7 +4709,8 @@
   if (RegNum == Variable::NoRegister) {
     if (Variable *Subst = getContext().availabilityGet(From)) {
       // At this point we know there is a potential substitution available.
-      if (Subst->mustHaveReg() && !Subst->hasReg()) {
+      if (!Subst->isRematerializable() && Subst->mustHaveReg() &&
+          !Subst->hasReg()) {
         // At this point we know the substitution will have a register.
         if (From->getType() == Subst->getType()) {
           // At this point we know the substitution's register is compatible.
@@ -4788,6 +4868,13 @@
   }
 
   if (auto *Var = llvm::dyn_cast<Variable>(From)) {
+    if (Var->isRematerializable()) {
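+      // Rematerializable variables have no stack slot; copy them into a
+      // register and let the mov legalization post-pass rewrite the copy as
+      // base-plus-offset arithmetic.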
+      // TODO(jpp): We don't need to rematerialize Var if legalize() was invoked
+      // for a Variable in a Mem operand.
+      Variable *T = makeReg(Var->getType(), RegNum);
+      _mov(T, Var);
+      return T;
+    }
     // Check if the variable is guaranteed a physical register. This can happen
     // either when the variable is pre-colored or when it is assigned infinite
     // weight.
@@ -4844,9 +4931,9 @@
   // If we didn't do address mode optimization, then we only have a
   // base/offset to work with. ARM always requires a base register, so
   // just use that to hold the operand.
-  Variable *Base = legalizeToReg(Operand);
+  Variable *BaseR = legalizeToReg(Operand);
   return OperandARM32Mem::create(
-      Func, Ty, Base,
+      Func, Ty, BaseR,
       llvm::cast<ConstantInteger32>(Ctx->getConstantZero(IceType_i32)));
 }
 
@@ -4863,6 +4950,7 @@
 Variable *TargetARM32::makeReg(Type Type, int32_t RegNum) {
   // There aren't any 64-bit integer registers for ARM32.
   assert(Type != IceType_i64);
+  assert(AllowTemporaryWithNoReg || RegNum != Variable::NoRegister);
   Variable *Reg = Func->makeVariable(Type);
   if (RegNum == Variable::NoRegister)
     Reg->setMustHaveReg();
@@ -4871,7 +4959,8 @@
   return Reg;
 }
 
-void TargetARM32::alignRegisterPow2(Variable *Reg, uint32_t Align) {
+void TargetARM32::alignRegisterPow2(Variable *Reg, uint32_t Align,
+                                    int32_t TmpRegNum) {
   assert(llvm::isPowerOf2_32(Align));
   uint32_t RotateAmt;
   uint32_t Immed_8;
@@ -4880,10 +4969,12 @@
   // it fits at all). Assume Align is usually small, in which case BIC works
   // better. Thus, this rounds down to the alignment.
   if (OperandARM32FlexImm::canHoldImm(Align - 1, &RotateAmt, &Immed_8)) {
-    Mask = legalize(Ctx->getConstantInt32(Align - 1), Legal_Reg | Legal_Flex);
+    Mask = legalize(Ctx->getConstantInt32(Align - 1), Legal_Reg | Legal_Flex,
+                    TmpRegNum);
     _bic(Reg, Reg, Mask);
   } else {
-    Mask = legalize(Ctx->getConstantInt32(-Align), Legal_Reg | Legal_Flex);
+    Mask = legalize(Ctx->getConstantInt32(-Align), Legal_Reg | Legal_Flex,
+                    TmpRegNum);
     _and(Reg, Reg, Mask);
   }
 }
diff --git a/src/IceTargetLoweringARM32.h b/src/IceTargetLoweringARM32.h
index cf38fe7..e8f2d1a 100644
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -99,16 +99,15 @@
   }
   uint32_t getStackAlignment() const override;
   void reserveFixedAllocaArea(size_t Size, size_t Align) override {
-    // TODO(sehr,jpp): Implement fixed stack layout.
-    (void)Size;
-    (void)Align;
-    llvm::report_fatal_error("Not yet implemented");
+    FixedAllocaSizeBytes = Size;
+    assert(llvm::isPowerOf2_32(Align));
+    FixedAllocaAlignBytes = Align;
+    PrologEmitsFixedAllocas = true;
   }
   int32_t getFrameFixedAllocaOffset() const override {
-    // TODO(sehr,jpp): Implement fixed stack layout.
-    llvm::report_fatal_error("Not yet implemented");
-    return 0;
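+    // Fixed allocas live inside the spill area (see addProlog()), just
+    // above the out-args area at the bottom of the frame.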
+    return FixedAllocaSizeBytes - (SpillAreaSizeBytes - MaxOutArgsSizeBytes);
   }
+  uint32_t maxOutArgsSizeBytes() const override { return MaxOutArgsSizeBytes; }
 
   bool shouldSplitToVariable64On32(Type Ty) const override {
     return Ty == IceType_i64;
@@ -250,7 +249,8 @@
   Variable *makeReg(Type Ty, int32_t RegNum = Variable::NoRegister);
   static Type stackSlotType();
   Variable *copyToReg(Operand *Src, int32_t RegNum = Variable::NoRegister);
-  void alignRegisterPow2(Variable *Reg, uint32_t Align);
+  void alignRegisterPow2(Variable *Reg, uint32_t Align,
+                         int32_t TmpRegNum = Variable::NoRegister);
 
   /// Returns a vector in a register with the given constant entries.
   Variable *makeVectorOfZeros(Type Ty, int32_t RegNum = Variable::NoRegister);
@@ -811,7 +811,7 @@
   }
 
   // Iterates over the CFG and determines the maximum outgoing stack arguments
-  // bytes. This information is later used during addProlog() do pre-allocate
+  // bytes. This information is later used during addProlog() to pre-allocate
   // the outargs area.
   // TODO(jpp): This could live in the Parser, if we provided a Target-specific
   // method that the Parser could call.
@@ -852,6 +852,9 @@
   bool NeedsStackAlignment = false;
   bool MaybeLeafFunc = true;
   size_t SpillAreaSizeBytes = 0;
+  size_t FixedAllocaSizeBytes = 0;
+  size_t FixedAllocaAlignBytes = 0;
+  bool PrologEmitsFixedAllocas = false;
   uint32_t MaxOutArgsSizeBytes = 0;
   // TODO(jpp): std::array instead of array.
   static llvm::SmallBitVector TypeToRegisterSet[RCARM32_NUM];
@@ -970,6 +973,29 @@
   };
 
   BoolComputationTracker BoolComputations;
+
+  // AllowTemporaryWithNoReg indicates whether TargetARM32::makeReg() can be
+  // invoked without specifying a physical register. This is needed for
+  // creating unbound temporaries during Ice -> ARM lowering, before register
+  // allocation. It is a safeguard ensuring that no unbound temporaries are
+  // created during the legalization post-passes.
+  bool AllowTemporaryWithNoReg = true;
+  // ForbidTemporaryWithoutReg is a RAII class that manages
+  // AllowTemporaryWithNoReg.
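+  // Declaring "ForbidTemporaryWithoutReg _(this);" disables unbound
+  // temporaries for the rest of the enclosing scope, as done before frame
+  // generation in the translate passes above.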
+  class ForbidTemporaryWithoutReg {
+    ForbidTemporaryWithoutReg() = delete;
+    ForbidTemporaryWithoutReg(const ForbidTemporaryWithoutReg &) = delete;
+    ForbidTemporaryWithoutReg &
+    operator=(const ForbidTemporaryWithoutReg &) = delete;
+
+  public:
+    explicit ForbidTemporaryWithoutReg(TargetARM32 *Target) : Target(Target) {
+      Target->AllowTemporaryWithNoReg = false;
+    }
+    ~ForbidTemporaryWithoutReg() { Target->AllowTemporaryWithNoReg = true; }
+
+  private:
+    TargetARM32 *const Target;
+  };
 };
 
 class TargetDataARM32 final : public TargetDataLowering {
diff --git a/tests_lit/assembler/arm32/bic.ll b/tests_lit/assembler/arm32/bic.ll
index 8d50c27..2849a74 100644
--- a/tests_lit/assembler/arm32/bic.ll
+++ b/tests_lit/assembler/arm32/bic.ll
@@ -28,70 +28,65 @@
 
 ; ASM-LABEL:AllocBigAlign:
 ; ASM-NEXT:.LAllocBigAlign$__0:
-; ASM-NEXT:  push    {fp}
-; ASM-NEXT:  mov     fp, sp
-; ASM-NEXT:  sub     sp, sp, #12
-; ASM-NEXT:  bic     sp, sp, #31
-; ASM-NEXT:  sub     sp, sp, #32
-; ASM-NEXT:  mov     r0, sp
-; ASM-NEXT:  mov     sp, fp
-; ASM-NEXT:  pop     {fp}
-; ASM-NEXT:  # fp = def.pseudo 
-; ASM-NEXT:  bx      lr
-
 ; DIS-LABEL:00000000 <AllocBigAlign>:
-; DIS-NEXT:   0:        e52db004
-; DIS-NEXT:   4:        e1a0b00d
-; DIS-NEXT:   8:        e24dd00c
-; DIS-NEXT:   c:        e3cdd01f
-; DIS-NEXT:  10:        e24dd020
-; DIS-NEXT:  14:        e1a0000d
-; DIS-NEXT:  18:        e1a0d00b
-; DIS-NEXT:  1c:        e49db004
-; DIS-NEXT:  20:        e12fff1e
-
 ; IASM-LABEL:AllocBigAlign:
 ; IASM-NEXT:.LAllocBigAlign$__0:
+
+; ASM-NEXT:  push    {fp}
+; DIS-NEXT:   0:        e52db004
 ; IASM-NEXT:    .byte 0x4
 ; IASM-NEXT:    .byte 0xb0
 ; IASM-NEXT:    .byte 0x2d
 ; IASM-NEXT:    .byte 0xe5
 
+; ASM-NEXT:  mov     fp, sp
+; DIS-NEXT:   4:        e1a0b00d
 ; IASM:         .byte 0xd
 ; IASM-NEXT:    .byte 0xb0
 ; IASM-NEXT:    .byte 0xa0
 ; IASM-NEXT:    .byte 0xe1
 
-; IASM:         .byte 0xc
-; IASM-NEXT:    .byte 0xd0
-; IASM-NEXT:    .byte 0x4d
-; IASM-NEXT:    .byte 0xe2
-
-; IASM:         .byte 0x1f
-; IASM-NEXT:    .byte 0xd0
-; IASM-NEXT:    .byte 0xcd
-; IASM-NEXT:    .byte 0xe3
-
+; ASM-NEXT:  sub     sp, sp, #32
+; DIS-NEXT:   8:        e24dd020
 ; IASM:         .byte 0x20
 ; IASM-NEXT:    .byte 0xd0
 ; IASM-NEXT:    .byte 0x4d
 ; IASM-NEXT:    .byte 0xe2
 
-; IASM:         .byte 0xd
-; IASM-NEXT:    .byte 0x0
-; IASM-NEXT:    .byte 0xa0
-; IASM-NEXT:    .byte 0xe1
+; ASM-NEXT:  bic     sp, sp, #31
+; DIS-NEXT:   c:        e3cdd01f
+; IASM:         .byte 0x1f
+; IASM-NEXT:    .byte 0xd0
+; IASM-NEXT:    .byte 0xcd
+; IASM-NEXT:    .byte 0xe3
 
+; ASM-NEXT:  # sp = def.pseudo 
+
+; ASM-NEXT:  add     r0, sp, #0
+; DIS-NEXT:  10:        e28d0000
+; IASM:         .byte 0x0
+; IASM-NEXT:    .byte 0x0
+; IASM-NEXT:    .byte 0x8d
+; IASM-NEXT:    .byte 0xe2
+
+; ASM-NEXT:  mov     sp, fp
+; DIS-NEXT:  14:        e1a0d00b
 ; IASM:         .byte 0xb
 ; IASM-NEXT:    .byte 0xd0
 ; IASM-NEXT:    .byte 0xa0
 ; IASM-NEXT:    .byte 0xe1
 
+; ASM-NEXT:  pop     {fp}
+; DIS-NEXT:  18:        e49db004
 ; IASM-NEXT:    .byte 0x4
 ; IASM-NEXT:    .byte 0xb0
 ; IASM-NEXT:    .byte 0x9d
 ; IASM-NEXT:    .byte 0xe4
 
+; ASM-NEXT:  # fp = def.pseudo 
+
+; ASM-NEXT:  bx      lr
+; DIS-NEXT:  1c:        e12fff1e
 ; IASM:         .byte 0x1e
 ; IASM-NEXT:    .byte 0xff
 ; IASM-NEXT:    .byte 0x2f
diff --git a/tests_lit/llvm2ice_tests/alloc.ll b/tests_lit/llvm2ice_tests/alloc.ll
index 7b08afb..c90b678 100644
--- a/tests_lit/llvm2ice_tests/alloc.ll
+++ b/tests_lit/llvm2ice_tests/alloc.ll
@@ -17,14 +17,14 @@
 ; RUN:   --disassemble --target arm32 -i %s --args -O2 --skip-unimplemented \
 ; RUN:   -allow-externally-defined-symbols \
 ; RUN:   | %if --need=target_ARM32 --need=allow_dump \
-; RUN:   --command FileCheck --check-prefix ARM32 %s
+; RUN:   --command FileCheck --check-prefix ARM32 --check-prefix=ARM32-OPT2 %s
 
 ; RUN: %if --need=target_ARM32 --need=allow_dump \
 ; RUN:   --command %p2i --filetype=asm --assemble \
 ; RUN:   --disassemble --target arm32 -i %s --args -Om1 --skip-unimplemented \
 ; RUN:   -allow-externally-defined-symbols \
 ; RUN:   | %if --need=target_ARM32 --need=allow_dump \
-; RUN:   --command FileCheck --check-prefix ARM32 %s
+; RUN:   --command FileCheck --check-prefix ARM32 --check-prefix=ARM32-OPTM1 %s
 
 define internal void @fixed_416_align_16(i32 %n) {
 entry:
@@ -47,8 +47,9 @@
 ; CHECK-OPTM1:      call {{.*}} R_{{.*}}    f1
 
 ; ARM32-LABEL: fixed_416_align_16
-; ARM32:      sub sp, sp, #416
-; ARM32:      bl {{.*}} R_{{.*}}    f1
+; ARM32-OPT2:  sub sp, sp, #428
+; ARM32-OPTM1: sub sp, sp, #416
+; ARM32:       bl {{.*}} R_{{.*}}    f1
 
 define internal void @fixed_416_align_32(i32 %n) {
 entry:
@@ -67,9 +68,10 @@
 ; CHECK:      call {{.*}} R_{{.*}}    f1
 
 ; ARM32-LABEL: fixed_416_align_32
-; ARM32:      bic sp, sp, #31
-; ARM32:      sub sp, sp, #416
-; ARM32:      bl {{.*}} R_{{.*}}    f1
+; ARM32-OPT2:  sub sp, sp, #424
+; ARM32-OPTM1: sub sp, sp, #416
+; ARM32:       bic sp, sp, #31
+; ARM32:       bl {{.*}} R_{{.*}}    f1
 
 ; Show that the amount to allocate will be rounded up.
 define internal void @fixed_351_align_16(i32 %n) {
@@ -91,8 +93,9 @@
 ; CHECK-OPTM1:      call {{.*}} R_{{.*}}    f1
 
 ; ARM32-LABEL: fixed_351_align_16
-; ARM32:      sub sp, sp, #352
-; ARM32:      bl {{.*}} R_{{.*}}    f1
+; ARM32-OPT2:  sub sp, sp, #364
+; ARM32-OPTM1: sub sp, sp, #352
+; ARM32:       bl {{.*}} R_{{.*}}    f1
 
 define internal void @fixed_351_align_32(i32 %n) {
 entry:
@@ -111,9 +114,10 @@
 ; CHECK:      call {{.*}} R_{{.*}}    f1
 
 ; ARM32-LABEL: fixed_351_align_32
-; ARM32:      bic sp, sp, #31
-; ARM32:      sub sp, sp, #352
-; ARM32:      bl {{.*}} R_{{.*}}    f1
+; ARM32-OPT2:  sub sp, sp, #360
+; ARM32-OPTM1: sub sp, sp, #352
+; ARM32:       bic sp, sp, #31
+; ARM32:       bl {{.*}} R_{{.*}}    f1
 
 declare void @f1(i32 %ignored)