ARM: Add a postRA pass to legalize stack offsets. Greedy approach (reserve IP).

Make a post-register allocation and post-addProlog pass to
go through variables with stack offsets and legalize them
in case the offsets are not encodeable. The naive approach
is to reserve IP, and use IP to movw/movt the offset, then
add/sub the frame/stack pointer to IP and use IP as the new
base instead of the frame/stack pointer. We do some amount
of CSE within a basic block, and share the IP base pointer
when it is (a) within range for later stack references,
and (b) IP hasn't been clobbered (e.g., by a function call).
I chose to do this greedy approach for both Om1 and O2,
since it should just be a linear pass, and it reduces the
amount of variables/instructions created compared to the
super-naive peephole approach (so might be faster?).

Introduce a test-only flag and use that to artificially
bloat the stack frame so that spill offsets are out
of range for ARM. Use that flag for cross tests to
stress this new code a bit more (than would have been
stressed by simply doing a lit test + FileCheck).

Also, the previous version of emitVariable() was using the
Var's type to determine the range (only +/- 255 for i16,
vs +/- 4095 for i32), even though mov's emit() always
uses a full 32-bit "ldr" instead of a 16-bit "ldrh".
Use a common legality check, which uses the stackSlotType
instead of the Var's type. This previously caused the
test_bitmanip to spuriously complain, even though the
offsets for Om1 were "only" in the 300 byte range. With this
fixed, we can then enable the test_bitmanip test too.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1241763002 .
diff --git a/Makefile.standalone b/Makefile.standalone
index ae9d505..a892870 100644
--- a/Makefile.standalone
+++ b/Makefile.standalone
@@ -349,6 +349,7 @@
 	  -i x8632,sandbox,sse4.1,Om1 \
 	  -i arm32,native,neon,simple_loop \
 	  -i arm32,native,neon,mem_intrin \
+	  -i arm32,native,neon,test_bitmanip \
 	  -i arm32,native,neon,test_stacksave \
 	  -i arm32,native,neon,test_strengthreduce
 	PNACL_BIN_PATH=$(PNACL_BIN_PATH) \
diff --git a/pydir/crosstest.py b/pydir/crosstest.py
index 7a479c3..c8ab306 100755
--- a/pydir/crosstest.py
+++ b/pydir/crosstest.py
@@ -29,8 +29,13 @@
                  'arm32': targets.ARM32Target }
     arch_sz_flags = { 'x8632': [],
                       'x8664': [],
-                      # TODO(jvoung): remove skip-unimplemented when implemented
-                      'arm32': ['--skip-unimplemented']
+                      # TODO(jvoung): remove skip-unimplemented when
+                      # implemented.
+                      # For ARM, test a large stack offset as well, until we
+                      # are more confident. +/- 4095 is the limit, so test
+                      # somewhere near that boundary.
+                      'arm32': ['--skip-unimplemented',
+                                '--test-stack-extra', '4084']
     }
     arch_llc_flags_extra = {
         # Use sse2 instructions regardless of input -mattr
diff --git a/src/IceCfg.h b/src/IceCfg.h
index 499e226..cf572c1 100644
--- a/src/IceCfg.h
+++ b/src/IceCfg.h
@@ -119,8 +119,8 @@
 
   /// Create a new Variable with a particular type and an optional
   /// name.  The Node argument is the node where the variable is defined.
-  // TODO(jpp): untemplate this with two separate methods: makeVariable and
-  // makeSpillVariable.
+  // TODO(jpp): untemplate this with separate methods: makeVariable,
+  // makeSpillVariable, and makeStackVariable.
   template <typename T = Variable> T *makeVariable(Type Ty) {
     SizeT Index = Variables.size();
     T *Var = T::create(this, Ty, Index);
diff --git a/src/IceClFlags.cpp b/src/IceClFlags.cpp
index 5066a4b..aa4b4b6 100644
--- a/src/IceClFlags.cpp
+++ b/src/IceClFlags.cpp
@@ -145,6 +145,13 @@
         clEnumValN(Ice::Target_MIPS32, "mips", "mips32"),
         clEnumValN(Ice::Target_MIPS32, "mips32", "mips32 (same as mips)"),
         clEnumValEnd));
+
+cl::opt<uint32_t> TestStackExtra(
+    "test-stack-extra",
+    cl::desc(
+        "Extra amount of stack to add to the frame in bytes (for testing)."),
+    cl::init(0));
+
 cl::opt<Ice::TargetInstructionSet> TargetInstructionSet(
     "mattr", cl::desc("Target architecture attributes"),
     cl::init(Ice::BaseInstructionSet),
@@ -364,6 +371,7 @@
   OutFlags.RandomizeAndPoolImmediatesThreshold = 0xffff;
   OutFlags.ReorderFunctionsWindowSize = 8;
   OutFlags.TArch = TargetArch_NUM;
+  OutFlags.TestStackExtra = 0;
   OutFlags.VMask = IceV_None;
   // IceString fields.
   OutFlags.DefaultFunctionPrefix = "";
@@ -421,6 +429,7 @@
   OutFlags.setTargetArch(::TargetArch);
   OutFlags.setTargetInstructionSet(::TargetInstructionSet);
   OutFlags.setTestPrefix(::TestPrefix);
+  OutFlags.setTestStackExtra(::TestStackExtra);
   OutFlags.setTimeEachFunction(::TimeEachFunction);
   OutFlags.setTimingFocusOn(::TimingFocusOn);
   OutFlags.setTranslateOnly(::TranslateOnly);
diff --git a/src/IceClFlags.h b/src/IceClFlags.h
index 244790b..2698236 100644
--- a/src/IceClFlags.h
+++ b/src/IceClFlags.h
@@ -132,6 +132,14 @@
   void setTargetInstructionSet(TargetInstructionSet NewValue) {
     TInstrSet = NewValue;
   }
+  uint32_t getTestStackExtra() const {
+    return BuildDefs::minimal() ? 0 : TestStackExtra;
+  }
+  void setTestStackExtra(uint32_t NewValue) {
+    if (BuildDefs::minimal())
+      return;
+    TestStackExtra = NewValue;
+  }
 
   VerboseMask getVerbose() const {
     return BuildDefs::dump() ? VMask : (VerboseMask)IceV_None;
@@ -251,6 +259,7 @@
   int RandomNopProbabilityAsPercentage;
   uint32_t ReorderFunctionsWindowSize;
   TargetArch TArch;
+  uint32_t TestStackExtra;
   TargetInstructionSet TInstrSet;
   VerboseMask VMask;
 
diff --git a/src/IceInstARM32.cpp b/src/IceInstARM32.cpp
index ddc94a7..70b6791 100644
--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -407,7 +407,9 @@
     Operand *Src0 = getSrc(0);
     if (const auto *Src0V = llvm::dyn_cast<Variable>(Src0)) {
       if (!Src0V->hasReg()) {
-        Opcode = IceString("ldr"); // Always use the whole stack slot.
+        // Always use the whole stack slot. A 32-bit load has a larger range
+        // of offsets than 16-bit, etc.
+        Opcode = IceString("ldr");
       }
     } else {
       if (llvm::isa<OperandARM32Mem>(Src0))
diff --git a/src/IceInstARM32.def b/src/IceInstARM32.def
index 93b1e22..5bf54ee 100644
--- a/src/IceInstARM32.def
+++ b/src/IceInstARM32.def
@@ -20,6 +20,9 @@
 // TODO(jvoung): Allow r9 to be isInt when sandboxing is turned off
 // (native mode).
 //
+// IP is not considered isInt to reserve it as a scratch register. A scratch
+// register is useful for expanding instructions post-register allocation.
+//
 // LR is not considered isInt to avoid being allocated as a register.
 // It is technically preserved, but save/restore is handled separately,
 // based on whether or not the function MaybeLeafFunc.
@@ -37,7 +40,7 @@
   X(Reg_r9,  = Reg_r0 + 9,   "r9",  0, 1, 0, 0, 0, 0)                   \
   X(Reg_r10, = Reg_r0 + 10,  "r10", 0, 1, 0, 0, 1, 0)                   \
   X(Reg_fp,  = Reg_r0 + 11,  "fp",  0, 1, 0, 1, 1, 0)                   \
-  X(Reg_ip,  = Reg_r0 + 12,  "ip",  1, 0, 0, 0, 1, 0)                   \
+  X(Reg_ip,  = Reg_r0 + 12,  "ip",  1, 0, 0, 0, 0, 0)                   \
   X(Reg_sp,  = Reg_r0 + 13,  "sp",  0, 0, 1, 0, 0, 0)                   \
   X(Reg_lr,  = Reg_r0 + 14,  "lr",  0, 0, 0, 0, 0, 0)                   \
   X(Reg_pc,  = Reg_r0 + 15,  "pc",  0, 0, 0, 0, 0, 0)                   \
diff --git a/src/IceInstARM32.h b/src/IceInstARM32.h
index 030c194..d254180 100644
--- a/src/IceInstARM32.h
+++ b/src/IceInstARM32.h
@@ -242,6 +242,35 @@
   Operand *ShiftAmt;
 };
 
+/// StackVariable represents a Var that isn't assigned a register (stack-only).
+/// It is assigned a stack slot, but the slot's offset may be too large to
+/// represent in the native addressing mode, and so it has a separate
+/// base register from SP/FP, where the offset from that base register is
+/// then in range.
+class StackVariable final : public Variable {
+  StackVariable() = delete;
+  StackVariable(const StackVariable &) = delete;
+  StackVariable &operator=(const StackVariable &) = delete;
+
+public:
+  static StackVariable *create(Cfg *Func, Type Ty, SizeT Index) {
+    return new (Func->allocate<StackVariable>()) StackVariable(Ty, Index);
+  }
+  const static OperandKind StackVariableKind =
+      static_cast<OperandKind>(kVariable_Target);
+  static bool classof(const Operand *Operand) {
+    return Operand->getKind() == StackVariableKind;
+  }
+  void setBaseRegNum(int32_t RegNum) { BaseRegNum = RegNum; }
+  int32_t getBaseRegNum() const override { return BaseRegNum; }
+  // Inherit dump() and emit() from Variable.
+
+private:
+  StackVariable(Type Ty, SizeT Index)
+      : Variable(StackVariableKind, Ty, Index) {}
+  int32_t BaseRegNum = Variable::NoRegister;
+};
+
 /// Base class for ARM instructions. While most ARM instructions can be
 /// conditionally executed, a few of them are not predicable (halt,
 /// memory barriers, etc.).
@@ -778,6 +807,7 @@
   void emitIAS(const Cfg *Func) const override;
   void dump(const Cfg *Func) const override;
   static bool classof(const Inst *Inst) { return isClassof(Inst, Adjuststack); }
+  SizeT getAmount() const { return Amount; }
 
 private:
   InstARM32AdjustStack(Cfg *Func, Variable *SP, SizeT Amount,
diff --git a/src/IceOperand.cpp b/src/IceOperand.cpp
index 815efaf..e46fac0 100644
--- a/src/IceOperand.cpp
+++ b/src/IceOperand.cpp
@@ -409,9 +409,11 @@
   } else if (Func->getTarget()->hasComputedFrame()) {
     if (Func->isVerbose(IceV_RegOrigins))
       Str << ":";
+    int32_t BaseRegisterNumber = getBaseRegNum();
+    if (BaseRegisterNumber == NoRegister)
+      BaseRegisterNumber = Func->getTarget()->getFrameOrStackReg();
     Str << "["
-        << Func->getTarget()->getRegName(
-               Func->getTarget()->getFrameOrStackReg(), IceType_i32);
+        << Func->getTarget()->getRegName(BaseRegisterNumber, IceType_i32);
     int32_t Offset = getStackOffset();
     if (Offset) {
       if (Offset > 0)
diff --git a/src/IceOperand.h b/src/IceOperand.h
index ccfefb9..5bf4676 100644
--- a/src/IceOperand.h
+++ b/src/IceOperand.h
@@ -506,6 +506,9 @@
   using Operand::dump;
   void dump(const Cfg *Func, Ostream &Str) const override;
 
+  /// Return reg num of base register, if different from stack/frame register.
+  virtual int32_t getBaseRegNum() const { return NoRegister; }
+
   static bool classof(const Operand *Operand) {
     OperandKind Kind = Operand->getKind();
     return Kind >= kVariable && Kind <= kVariable_Num;
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index 417ad2f..d1ca570 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -359,6 +359,9 @@
       *SpillAreaSizeBytes += Increment;
     }
   }
+  // For testing legalization of large stack offsets on targets with limited
+  // offset bits in instruction encodings, add some padding.
+  *SpillAreaSizeBytes += Ctx->getFlags().getTestStackExtra();
 }
 
 void TargetLowering::alignStackSpillAreas(uint32_t SpillAreaStartOffset,
@@ -390,10 +393,21 @@
                                          size_t GlobalsAndSubsequentPaddingSize,
                                          bool UsesFramePointer) {
   const VariablesMetadata *VMetadata = Func->getVMetadata();
+  // For testing legalization of large stack offsets on targets with limited
+  // offset bits in instruction encodings, add some padding. This assumes that
+  // SpillAreaSizeBytes has accounted for the extra test padding.
+  // When UsesFramePointer is true, the offset depends on the padding,
+  // not just the SpillAreaSizeBytes. On the other hand, when UsesFramePointer
+  // is false, the offsets depend on the gap between SpillAreaSizeBytes
+  // and SpillAreaPaddingBytes, so we don't increment that.
+  size_t TestPadding = Ctx->getFlags().getTestStackExtra();
+  if (UsesFramePointer)
+    SpillAreaPaddingBytes += TestPadding;
   size_t GlobalsSpaceUsed = SpillAreaPaddingBytes;
   size_t NextStackOffset = SpillAreaPaddingBytes;
   std::vector<size_t> LocalsSize(Func->getNumNodes());
   const bool SimpleCoalescing = !callsReturnsTwice();
+
   for (Variable *Var : SortedSpilledVariables) {
     size_t Increment = typeWidthInBytesOnStack(Var->getType());
     if (SimpleCoalescing && VMetadata->isTracked(Var)) {
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index c7ef918..2d03a76 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -93,6 +93,24 @@
   void advanceForward(InstList::iterator &I) const;
 };
 
+/// A helper class to advance the LoweringContext at each loop iteration.
+class PostIncrLoweringContext {
+  PostIncrLoweringContext() = delete;
+  PostIncrLoweringContext(const PostIncrLoweringContext &) = delete;
+  PostIncrLoweringContext &operator=(const PostIncrLoweringContext &) = delete;
+
+public:
+  explicit PostIncrLoweringContext(LoweringContext &Context)
+      : Context(Context) {}
+  ~PostIncrLoweringContext() {
+    Context.advanceCur();
+    Context.advanceNext();
+  }
+
+private:
+  LoweringContext &Context;
+};
+
 class TargetLowering {
   TargetLowering() = delete;
   TargetLowering(const TargetLowering &) = delete;
diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp
index 5a87a0b..f137a93 100644
--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -285,6 +285,11 @@
     return;
   Func->dump("After stack frame mapping");
 
+  legalizeStackSlots();
+  if (Func->hasError())
+    return;
+  Func->dump("After legalizeStackSlots");
+
   Func->contractEmptyNodes();
   Func->reorderNodes();
 
@@ -335,6 +340,11 @@
     return;
   Func->dump("After stack frame mapping");
 
+  legalizeStackSlots();
+  if (Func->hasError())
+    return;
+  Func->dump("After legalizeStackSlots");
+
   // Nop insertion
   if (Ctx->getFlags().shouldDoNopInsertion()) {
     Func->doNopInsertion();
@@ -390,6 +400,8 @@
 }
 
 void TargetARM32::emitVariable(const Variable *Var) const {
+  if (!BuildDefs::dump())
+    return;
   Ostream &Str = Ctx->getStrEmit();
   if (Var->hasReg()) {
     Str << getRegName(Var->getRegNum(), Var->getType());
@@ -400,16 +412,17 @@
         "Infinite-weight Variable has no register assigned");
   }
   int32_t Offset = Var->getStackOffset();
-  if (!hasFramePointer())
-    Offset += getStackAdjustment();
-  // TODO(jvoung): Handle out of range. Perhaps we need a scratch register
-  // to materialize a larger offset.
-  constexpr bool SignExt = false;
-  if (!OperandARM32Mem::canHoldOffset(Var->getType(), SignExt, Offset)) {
+  int32_t BaseRegNum = Var->getBaseRegNum();
+  if (BaseRegNum == Variable::NoRegister) {
+    BaseRegNum = getFrameOrStackReg();
+    if (!hasFramePointer())
+      Offset += getStackAdjustment();
+  }
+  if (!isLegalVariableStackOffset(Offset)) {
     llvm::report_fatal_error("Illegal stack offset");
   }
-  const Type FrameSPTy = IceType_i32;
-  Str << "[" << getRegName(getFrameOrStackReg(), FrameSPTy);
+  const Type FrameSPTy = stackSlotType();
+  Str << "[" << getRegName(BaseRegNum, FrameSPTy);
   if (Offset != 0) {
     Str << ", " << getConstantPrefix() << Offset;
   }
@@ -562,7 +575,7 @@
   // | 1. preserved registers |
   // +------------------------+
   // | 2. padding             |
-  // +------------------------+
+  // +------------------------+ <--- FramePointer (if used)
   // | 3. global spill area   |
   // +------------------------+
   // | 4. padding             |
@@ -572,7 +585,7 @@
   // | 6. padding             |
   // +------------------------+
   // | 7. allocas             |
-  // +------------------------+
+  // +------------------------+ <--- StackPointer
   //
   // The following variables record the size in bytes of the given areas:
   //  * PreservedRegsSizeBytes: area 1
@@ -687,10 +700,9 @@
 
   // Generate "sub sp, SpillAreaSizeBytes"
   if (SpillAreaSizeBytes) {
-    // Use the IP inter-procedural scratch register if needed to legalize
-    // the immediate.
+    // Use the scratch register if needed to legalize the immediate.
     Operand *SubAmount = legalize(Ctx->getConstantInt32(SpillAreaSizeBytes),
-                                  Legal_Reg | Legal_Flex, RegARM32::Reg_ip);
+                                  Legal_Reg | Legal_Flex, getReservedTmpReg());
     Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
     _sub(SP, SP, SubAmount);
   }
@@ -791,10 +803,10 @@
   } else {
     // add SP, SpillAreaSizeBytes
     if (SpillAreaSizeBytes) {
-      // Use the IP inter-procedural scratch register if needed to legalize
-      // the immediate. It shouldn't be live at this point.
-      Operand *AddAmount = legalize(Ctx->getConstantInt32(SpillAreaSizeBytes),
-                                    Legal_Reg | Legal_Flex, RegARM32::Reg_ip);
+      // Use the scratch register if needed to legalize the immediate.
+      Operand *AddAmount =
+          legalize(Ctx->getConstantInt32(SpillAreaSizeBytes),
+                   Legal_Reg | Legal_Flex, getReservedTmpReg());
       _add(SP, SP, AddAmount);
     }
   }
@@ -844,6 +856,156 @@
   RI->setDeleted();
 }
 
+bool TargetARM32::isLegalVariableStackOffset(int32_t Offset) const {
+  constexpr bool SignExt = false;
+  return OperandARM32Mem::canHoldOffset(stackSlotType(), SignExt, Offset);
+}
+
+StackVariable *TargetARM32::legalizeVariableSlot(Variable *Var,
+                                                 Variable *OrigBaseReg) {
+  int32_t Offset = Var->getStackOffset();
+  // Legalize will likely need a movw/movt combination, but if the top
+  // bits are all 0 from negating the offset and subtracting, we could
+  // use that instead.
+  bool ShouldSub = (-Offset & 0xFFFF0000) == 0;
+  if (ShouldSub)
+    Offset = -Offset;
+  Operand *OffsetVal = legalize(Ctx->getConstantInt32(Offset),
+                                Legal_Reg | Legal_Flex, getReservedTmpReg());
+  Variable *ScratchReg = makeReg(IceType_i32, getReservedTmpReg());
+  if (ShouldSub)
+    _sub(ScratchReg, OrigBaseReg, OffsetVal);
+  else
+    _add(ScratchReg, OrigBaseReg, OffsetVal);
+  StackVariable *NewVar = Func->makeVariable<StackVariable>(stackSlotType());
+  NewVar->setWeight(RegWeight::Zero);
+  NewVar->setBaseRegNum(ScratchReg->getRegNum());
+  constexpr int32_t NewOffset = 0;
+  NewVar->setStackOffset(NewOffset);
+  return NewVar;
+}
+
+void TargetARM32::legalizeStackSlots() {
+  // If a stack variable's frame offset doesn't fit, convert from:
+  //   ldr X, OFF[SP]
+  // to:
+  //   movw/movt TMP, OFF_PART
+  //   add TMP, TMP, SP
+  //   ldr X, OFF_MORE[TMP]
+  //
+  // This is safe because we have reserved TMP, and add for ARM does not
+  // clobber the flags register.
+  Func->dump("Before legalizeStackSlots");
+  assert(hasComputedFrame());
+  // Early exit, if SpillAreaSizeBytes is really small.
+  if (isLegalVariableStackOffset(SpillAreaSizeBytes))
+    return;
+  Variable *OrigBaseReg = getPhysicalRegister(getFrameOrStackReg());
+  int32_t StackAdjust = 0;
+  // Do a fairly naive greedy clustering for now.  Pick the first stack slot
+  // that's out of bounds and make a new base reg using the architecture's temp
+  // register. If that works for the next slot, then great. Otherwise, create
+  // a new base register, clobbering the previous base register.  Never share a
+  // base reg across different basic blocks.  This isn't ideal if local and
+  // multi-block variables are far apart and their references are interspersed.
+  // It may help to be more coordinated about assigning stack slot numbers
+  // and may help to assign smaller offsets to higher-weight variables
+  // so that they don't depend on this legalization.
+  for (CfgNode *Node : Func->getNodes()) {
+    Context.init(Node);
+    StackVariable *NewBaseReg = nullptr;
+    int32_t NewBaseOffset = 0;
+    while (!Context.atEnd()) {
+      PostIncrLoweringContext PostIncrement(Context);
+      Inst *CurInstr = Context.getCur();
+      Variable *Dest = CurInstr->getDest();
+      // Check if the previous NewBaseReg is clobbered, and reset if needed.
+      if ((Dest && NewBaseReg && Dest->hasReg() &&
+           Dest->getRegNum() == NewBaseReg->getBaseRegNum()) ||
+          llvm::isa<InstFakeKill>(CurInstr)) {
+        NewBaseReg = nullptr;
+        NewBaseOffset = 0;
+      }
+      // The stack adjustment only matters if we are using SP instead of FP.
+      if (!hasFramePointer()) {
+        if (auto *AdjInst = llvm::dyn_cast<InstARM32AdjustStack>(CurInstr)) {
+          StackAdjust += AdjInst->getAmount();
+          NewBaseOffset += AdjInst->getAmount();
+          continue;
+        }
+        if (llvm::isa<InstARM32Call>(CurInstr)) {
+          NewBaseOffset -= StackAdjust;
+          StackAdjust = 0;
+          continue;
+        }
+      }
+      // For now, only Mov instructions can have stack variables.  We need to
+      // know the type of instruction because we currently create a fresh one
+      // to replace Dest/Source, rather than mutate in place.
+      auto *MovInst = llvm::dyn_cast<InstARM32Mov>(CurInstr);
+      if (!MovInst) {
+        continue;
+      }
+      if (!Dest->hasReg()) {
+        int32_t Offset = Dest->getStackOffset();
+        Offset += StackAdjust;
+        if (!isLegalVariableStackOffset(Offset)) {
+          if (NewBaseReg) {
+            int32_t OffsetDiff = Offset - NewBaseOffset;
+            if (isLegalVariableStackOffset(OffsetDiff)) {
+              StackVariable *NewDest =
+                  Func->makeVariable<StackVariable>(stackSlotType());
+              NewDest->setWeight(RegWeight::Zero);
+              NewDest->setBaseRegNum(NewBaseReg->getBaseRegNum());
+              NewDest->setStackOffset(OffsetDiff);
+              Variable *NewDestVar = NewDest;
+              _mov(NewDestVar, MovInst->getSrc(0));
+              MovInst->setDeleted();
+              continue;
+            }
+          }
+          StackVariable *LegalDest = legalizeVariableSlot(Dest, OrigBaseReg);
+          assert(LegalDest != Dest);
+          Variable *LegalDestVar = LegalDest;
+          _mov(LegalDestVar, MovInst->getSrc(0));
+          MovInst->setDeleted();
+          NewBaseReg = LegalDest;
+          NewBaseOffset = Offset;
+          continue;
+        }
+      }
+      assert(MovInst->getSrcSize() == 1);
+      Variable *Var = llvm::dyn_cast<Variable>(MovInst->getSrc(0));
+      if (Var && !Var->hasReg()) {
+        int32_t Offset = Var->getStackOffset();
+        Offset += StackAdjust;
+        if (!isLegalVariableStackOffset(Offset)) {
+          if (NewBaseReg) {
+            int32_t OffsetDiff = Offset - NewBaseOffset;
+            if (isLegalVariableStackOffset(OffsetDiff)) {
+              StackVariable *NewVar =
+                  Func->makeVariable<StackVariable>(stackSlotType());
+              NewVar->setWeight(RegWeight::Zero);
+              NewVar->setBaseRegNum(NewBaseReg->getBaseRegNum());
+              NewVar->setStackOffset(OffsetDiff);
+              _mov(Dest, NewVar);
+              MovInst->setDeleted();
+              continue;
+            }
+          }
+          StackVariable *LegalVar = legalizeVariableSlot(Var, OrigBaseReg);
+          assert(LegalVar != Var);
+          _mov(Dest, LegalVar);
+          MovInst->setDeleted();
+          NewBaseReg = LegalVar;
+          NewBaseOffset = Offset;
+          continue;
+        }
+      }
+    }
+  }
+}
+
 void TargetARM32::split64(Variable *Var) {
   assert(Var->getType() == IceType_i64);
   Variable *Lo = Var->getLo();
@@ -2080,7 +2242,9 @@
     if (Val->getType() == IceType_i64) {
       Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
       Constant *Zero = Ctx->getConstantZero(IceType_i32);
-      _mov(DestHi, Zero);
+      Variable *T = nullptr;
+      _mov(T, Zero);
+      _mov(DestHi, T);
     }
     return;
   }
@@ -2232,7 +2396,9 @@
     // prolong the liveness of T2 as if it was used as a source.
     _set_dest_nonkillable();
     _mov(DestLo, T2);
-    _mov(DestHi, Ctx->getConstantZero(IceType_i32));
+    Variable *T3 = nullptr;
+    _mov(T3, Zero);
+    _mov(DestHi, T3);
     return;
   }
   _mov(Dest, T);
diff --git a/src/IceTargetLoweringARM32.h b/src/IceTargetLoweringARM32.h
index 58deb24..b60b6a6 100644
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -72,6 +72,8 @@
   SizeT getFrameOrStackReg() const override {
     return UsesFramePointer ? RegARM32::Reg_fp : RegARM32::Reg_sp;
   }
+  SizeT getReservedTmpReg() const { return RegARM32::Reg_ip; }
+
   size_t typeWidthInBytesOnStack(Type Ty) const override {
     // Round up to the next multiple of 4 bytes.  In particular, i1,
     // i8, and i16 are rounded up to 4 bytes.
@@ -380,6 +382,16 @@
     Context.insert(InstARM32Uxt::create(Func, Dest, Src0, Pred));
   }
 
+  /// Run a pass through stack variables and ensure that the offsets are legal.
+  /// If the offset is not legal, use a new base register that accounts for
+  /// the offset, such that the addressing mode offset bits are now legal.
+  void legalizeStackSlots();
+  /// Returns true if the given Offset can be represented in a stack ldr/str.
+  bool isLegalVariableStackOffset(int32_t Offset) const;
+  /// Assuming Var needs its offset legalized, define a new base register
+  /// centered on the given Var's offset and use it.
+  StackVariable *legalizeVariableSlot(Variable *Var, Variable *OrigBaseReg);
+
   TargetARM32Features CPUFeatures;
   bool UsesFramePointer = false;
   bool NeedsStackAlignment = false;
diff --git a/src/IceTargetLoweringMIPS32.cpp b/src/IceTargetLoweringMIPS32.cpp
index d55fce6..5e87fa7 100644
--- a/src/IceTargetLoweringMIPS32.cpp
+++ b/src/IceTargetLoweringMIPS32.cpp
@@ -261,6 +261,8 @@
 }
 
 void TargetMIPS32::emitVariable(const Variable *Var) const {
+  if (!BuildDefs::dump())
+    return;
   Ostream &Str = Ctx->getStrEmit();
   (void)Var;
   (void)Str;
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index 3c7ad5e..88abe09 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -751,6 +751,8 @@
 
 template <class Machine>
 void TargetX86Base<Machine>::emitVariable(const Variable *Var) const {
+  if (!BuildDefs::dump())
+    return;
   Ostream &Str = Ctx->getStrEmit();
   if (Var->hasReg()) {
     Str << "%" << getRegName(Var->getRegNum(), Var->getType());
@@ -760,12 +762,16 @@
     llvm_unreachable("Infinite-weight Variable has no register assigned");
   }
   int32_t Offset = Var->getStackOffset();
-  if (!hasFramePointer())
-    Offset += getStackAdjustment();
+  int32_t BaseRegNum = Var->getBaseRegNum();
+  if (BaseRegNum == Variable::NoRegister) {
+    BaseRegNum = getFrameOrStackReg();
+    if (!hasFramePointer())
+      Offset += getStackAdjustment();
+  }
   if (Offset)
     Str << Offset;
   const Type FrameSPTy = IceType_i32;
-  Str << "(%" << getRegName(getFrameOrStackReg(), FrameSPTy) << ")";
+  Str << "(%" << getRegName(BaseRegNum, FrameSPTy) << ")";
 }
 
 template <class Machine>
@@ -777,10 +783,14 @@
     llvm_unreachable("Infinite-weight Variable has no register assigned");
   }
   int32_t Offset = Var->getStackOffset();
-  if (!hasFramePointer())
-    Offset += getStackAdjustment();
+  int32_t BaseRegNum = Var->getBaseRegNum();
+  if (Var->getBaseRegNum() == Variable::NoRegister) {
+    BaseRegNum = getFrameOrStackReg();
+    if (!hasFramePointer())
+      Offset += getStackAdjustment();
+  }
   return typename Traits::Address(
-      Traits::RegisterSet::getEncodedGPR(getFrameOrStackReg()), Offset);
+      Traits::RegisterSet::getEncodedGPR(BaseRegNum), Offset);
 }
 
 template <class Machine> void TargetX86Base<Machine>::lowerArguments() {
diff --git a/tests_lit/llvm2ice_tests/large_stack_offs.ll b/tests_lit/llvm2ice_tests/large_stack_offs.ll
new file mode 100644
index 0000000..6fba4df
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/large_stack_offs.ll
@@ -0,0 +1,133 @@
+; This tries to create variables with very large stack offsets.
+; This requires a lot of variables/register pressure. To simplify this
+; we assume poor register allocation from Om1, and a flag that forces
+; the frame to add K amount of unused stack for testing.
+; We only need to test ARM and other architectures which have limited space
+; for specifying an offset within an instruction.
+
+; RUN: %if --need=target_ARM32 --need=allow_dump \
+; RUN:   --command %p2i --filetype=asm --assemble --disassemble --target arm32 \
+; RUN:   -i %s --args -Om1 --skip-unimplemented --test-stack-extra 4096 \
+; RUN:   | %if --need=target_ARM32 --need=allow_dump \
+; RUN:   --command FileCheck --check-prefix ARM32 %s
+
+declare i64 @dummy(i32 %t1, i32 %t2, i32 %t3, i64 %t4, i64 %t5)
+
+; Test a function that requires lots of stack (due to test flag), and uses
+; SP as the base register (originally).
+define internal i64 @lotsOfStack(i32 %a, i32 %b, i32 %c, i32 %d) {
+entry:
+  %t1 = xor i32 %a, %b
+  %t2 = or i32 %c, %d
+  %cmp = icmp eq i32 %t1, %t2
+  br i1 %cmp, label %br_1, label %br_2
+
+br_1:
+  %x1 = zext i32 %t1 to i64
+  %y1 = ashr i64 %x1, 17
+  ; Use some stack during the call, so that references to %t1 and %t2's
+  ; stack slots require stack adjustment.
+  %r1 = call i64 @dummy(i32 123, i32 321, i32 %t2, i64 %x1, i64 %y1)
+  %z1 = sub i64 %r1, %y1
+  br label %end
+
+br_2:
+  %x2 = zext i32 %t2 to i64
+  %y2 = and i64 %x2, 123
+  %r2 = call i64 @dummy(i32 123, i32 321, i32 %t2, i64 %x2, i64 %y2)
+  %z2 = and i64 %r2, %y2
+  br label %end
+
+end:
+  %x3 = phi i64 [ %x1, %br_1 ], [ %x2, %br_2 ]
+  %z3 = phi i64 [ %z1, %br_1 ], [ %z2, %br_2 ]
+  %r3 = and i64 %x3, %z3
+  ret i64 %r3
+}
+; ARM32-LABEL: lotsOfStack
+; ARM32-NOT: mov fp, sp
+; ARM32: movw ip, #4{{.*}}
+; ARM32-NEXT: sub sp, sp, ip
+; ARM32: movw ip, #4232
+; ARM32-NEXT: add ip, sp, ip
+; ARM32-NOT: movw ip
+; %t2 is the result of the "or", and %t2 will be passed via r1 to the call.
+; Use that to check the stack offset of %t2. The first offset and the
+; later offset right before the call should be 16 bytes apart,
+; because of the sub sp, sp, #16.
+; ARM32: orr [[REG:r.*]], {{.*}},
+; I.e., the slot for t2 is (sp0 + 4232 - 20) == sp0 + 4212.
+; ARM32: str [[REG]], [ip, #-20]
+; ARM32: b {{[a-f0-9]+}}
+; Now skip ahead to where the call in br_1 begins, to check how %t2 is used.
+; ARM32: movw ip, #4216
+; ARM32-NEXT: add ip, sp, ip
+; ARM32: sub sp, sp, #16
+; Now sp1 = sp0 - 16, but ip is still in terms of sp0.
+; So, sp0 + 4212 == ip - 4.
+; ARM32: ldr r2, [ip, #-4]
+; ARM32: bl {{.*}} dummy
+; ARM32: add sp, sp
+; The call clobbers ip, so we need to re-create the base register.
+; ARM32: movw ip, #4{{.*}}
+; ARM32: b {{[a-f0-9]+}}
+; ARM32: bl {{.*}} dummy
+
+; Similar, but test a function that uses FP as the base register (originally).
+define internal i64 @usesFrameReg(i32 %a, i32 %b, i32 %c, i32 %d) {
+entry:
+  %p = alloca i8, i32 %d, align 4
+  %t1 = xor i32 %a, %b
+  %t2 = or i32 %c, %d
+  %cmp = icmp eq i32 %t1, %t2
+  br i1 %cmp, label %br_1, label %br_2
+
+br_1:
+  %x1 = zext i32 %t1 to i64
+  %y1 = ashr i64 %x1, 17
+  %p32 = ptrtoint i8* %p to i32
+  %r1 = call i64 @dummy(i32 %p32, i32 321, i32 %t2, i64 %x1, i64 %y1)
+  %z1 = sub i64 %r1, %y1
+  br label %end
+
+br_2:
+  %x2 = zext i32 %t2 to i64
+  %y2 = and i64 %x2, 123
+  %r2 = call i64 @dummy(i32 123, i32 321, i32 %d, i64 %x2, i64 %y2)
+  %z2 = and i64 %r2, %y2
+  br label %end
+
+end:
+  %x3 = phi i64 [ %x1, %br_1 ], [ %x2, %br_2 ]
+  %z3 = phi i64 [ %z1, %br_1 ], [ %z2, %br_2 ]
+  %r3 = and i64 %x3, %z3
+  ret i64 %r3
+}
+; ARM32-LABEL: usesFrameReg
+; ARM32: mov fp, sp
+; ARM32: movw ip, #4{{.*}}
+; ARM32-NEXT: sub sp, sp, ip
+; ARM32: movw ip, #4100
+; ARM32-NEXT: sub ip, fp, ip
+; ARM32-NOT: movw ip
+; %t2 is the result of the "or", and %t2 will be passed via r1 to the call.
+; Use that to check the stack offset of %t2. It should be the same offset
+; even after sub sp, sp, #16, because the base register was originally
+; the FP and not the SP.
+; ARM32: orr [[REG:r.*]], {{.*}},
+; I.e., the slot for t2 is (fp0 - 4100 - 24) == fp0 - 4124
+; ARM32: str [[REG]], [ip, #-24]
+; ARM32: b {{[a-f0-9]+}}
+; Now skip ahead to where the call in br_1 begins, to check how %t2 is used.
+; ARM32: movw ip, #4120
+; ARM32-NEXT: sub ip, fp, ip
+; ARM32: sub sp, sp, #16
+; Now sp1 = sp0 - 16, but ip is still in terms of fp0.
+; So, fp0 - 4124 == ip - 4.
+; ARM32: ldr r2, [ip, #-4]
+; ARM32: bl {{.*}} dummy
+; ARM32: add sp, sp
+; The call clobbers ip, so we need to re-create the base register.
+; ARM32: movw ip, #4{{.*}}
+; ARM32: b {{[a-f0-9]+}}
+; ARM32: bl {{.*}} dummy