Subzero ARM: addProlog/addEpilog -- share some code with x86.

Split out some of the addProlog code from x86 and reuse it
for ARM. Mainly, the code that does not concern preserved
registers or stack arguments is shared.

ARM push and pop take a whole list of registers (the registers
need not be consecutive, but they must be listed in ascending
order). There is also "vpush" for callee-saved float/vector
registers, but we do not handle that yet (the register numbers
for vpush have to be consecutive).
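
For illustration, with a few callee-saved GPRs in use (hypothetical
register choice), the emitted prologue/epilogue would look roughly like:

  push  {r4, r5, r10, fp, lr}
  ...
  pop   {r4, r5, r10, fp, lr}
  bx    lr

The (not yet handled) float/vector counterpart would be, e.g.,
"vpush {d8, d9, d10}", where the d-registers must be consecutive.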

Enable some of the int-arg.ll tests, which rely on addProlog's
finishArgumentLowering to pull arguments from the correct
stack slots.
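
A sketch of what finishArgumentLowering produces for a stack-passed
argument that was assigned a register (register and offset are
illustrative, assuming only fp and lr were pushed):

  ldr   r4, [fp, #8]    @ load the first stack-passed i32 argument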

Test some of the frame pointer usage (push/pop) when
handling a variable-sized alloca.
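
Sketch of the fp-based frame being exercised (sizes illustrative):

  push  {fp, lr}
  mov   fp, sp
  sub   sp, sp, #16     @ fixed-size spill area
  ...                   @ the alloca adjusts sp further at runtime
  mov   sp, fp          @ epilogue recovers sp from fp
  pop   {fp, lr}
  bx    lr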

Also change the classification of LR and PC so that they are
not "CalleeSave". We don't want to push LR unless it is
clobbered by a call. It will certainly be "used" by the return,
however. The prologue code only checks whether a CalleeSave
register is used somewhere before deciding to preserve it. We
could make that stricter and check whether the register is also
written to, but some additional writes are not visible until
after the push/pop are generated (e.g., the copy from an
argument's stack slot to its assigned register). Instead, keep
checking use only, and handle LR as a special case
(MaybeLeafFunc).
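
In other words (hypothetical functions, no other callee-saved
registers in use):

  @ leaf: lr is only read by the return, so nothing is saved
  bx    lr

  @ non-leaf: lr is clobbered by bl, so it is saved and restored
  push  {lr}
  bl    callee
  pop   {lr}
  bx    lr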

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1159013002
diff --git a/src/IceInstARM32.cpp b/src/IceInstARM32.cpp
index 713df3a..424acf0 100644
--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -260,6 +260,20 @@
   addSource(Acc);
 }
 
+InstARM32Pop::InstARM32Pop(Cfg *Func, const VarList &Dests)
+    : InstARM32(Func, InstARM32::Pop, 0, nullptr), Dests(Dests) {
+  // Track modifications to Dests separately via FakeDefs.
+  // Also, a pop instruction affects the stack pointer, so it should not be
+  // dead-code eliminated. Leaving the Dest as nullptr takes care of that
+  // automatically.
+}
+
+InstARM32Push::InstARM32Push(Cfg *Func, const VarList &Srcs)
+    : InstARM32(Func, InstARM32::Push, Srcs.size(), nullptr) {
+  for (Variable *Source : Srcs)
+    addSource(Source);
+}
+
 InstARM32Ret::InstARM32Ret(Cfg *Func, Variable *LR, Variable *Source)
     : InstARM32(Func, InstARM32::Ret, Source ? 2 : 1, nullptr) {
   addSource(LR);
@@ -554,6 +568,66 @@
   }
 }
 
+void InstARM32Pop::emit(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  assert(Dests.size() > 0);
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << "\t"
+      << "pop"
+      << "\t{";
+  for (SizeT I = 0; I < Dests.size(); ++I) {
+    if (I > 0)
+      Str << ", ";
+    Dests[I]->emit(Func);
+  }
+  Str << "}";
+}
+
+void InstARM32Pop::emitIAS(const Cfg *Func) const {
+  (void)Func;
+  llvm_unreachable("Not yet implemented");
+}
+
+void InstARM32Pop::dump(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "pop"
+      << " ";
+  for (SizeT I = 0; I < Dests.size(); ++I) {
+    if (I > 0)
+      Str << ", ";
+    Dests[I]->dump(Func);
+  }
+}
+
+void InstARM32Push::emit(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  assert(getSrcSize() > 0);
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << "\t"
+      << "push"
+      << "\t{";
+  emitSources(Func);
+  Str << "}";
+}
+
+void InstARM32Push::emitIAS(const Cfg *Func) const {
+  (void)Func;
+  llvm_unreachable("Not yet implemented");
+}
+
+void InstARM32Push::dump(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "push"
+      << " ";
+  dumpSources(Func);
+}
+
 void InstARM32Ret::emit(const Cfg *Func) const {
   if (!ALLOW_DUMP)
     return;
@@ -683,7 +757,7 @@
   } else {
     getOffset()->dump(Func, Str);
   }
-  Str << "] AddrMode==" << getAddrMode() << "\n";
+  Str << "] AddrMode==" << getAddrMode();
 }
 
 void OperandARM32FlexImm::emit(const Cfg *Func) const {
diff --git a/src/IceInstARM32.def b/src/IceInstARM32.def
index d381e1b..93b1e22 100644
--- a/src/IceInstARM32.def
+++ b/src/IceInstARM32.def
@@ -15,9 +15,14 @@
 #define SUBZERO_SRC_ICEINSTARM32_DEF
 
 // NOTE: PC and SP are not considered isInt, to avoid register allocating.
+//
 // For the NaCl sandbox we also need to r9 for TLS, so just reserve always.
 // TODO(jvoung): Allow r9 to be isInt when sandboxing is turned off
 // (native mode).
+//
+// LR is not considered isInt to avoid being allocated as a register.
+// It is technically preserved, but save/restore is handled separately,
+// based on whether or not the function is a leaf (see MaybeLeafFunc).
 #define REGARM32_GPR_TABLE                                                     \
   /* val, encode, name, scratch, preserved, stackptr, frameptr, isInt, isFP */ \
   X(Reg_r0,  = 0,            "r0",  1, 0, 0, 0, 1, 0)                   \
@@ -33,9 +38,9 @@
   X(Reg_r10, = Reg_r0 + 10,  "r10", 0, 1, 0, 0, 1, 0)                   \
   X(Reg_fp,  = Reg_r0 + 11,  "fp",  0, 1, 0, 1, 1, 0)                   \
   X(Reg_ip,  = Reg_r0 + 12,  "ip",  1, 0, 0, 0, 1, 0)                   \
-  X(Reg_sp,  = Reg_r0 + 13,  "sp",  0, 1, 1, 0, 0, 0)                   \
-  X(Reg_lr,  = Reg_r0 + 14,  "lr",  0, 1, 0, 0, 1, 0)                   \
-  X(Reg_pc,  = Reg_r0 + 15,  "pc",  0, 1, 0, 0, 0, 0)                   \
+  X(Reg_sp,  = Reg_r0 + 13,  "sp",  0, 0, 1, 0, 0, 0)                   \
+  X(Reg_lr,  = Reg_r0 + 14,  "lr",  0, 0, 0, 0, 0, 0)                   \
+  X(Reg_pc,  = Reg_r0 + 15,  "pc",  0, 0, 0, 0, 0, 0)                   \
 //#define X(val, encode, name, scratch, preserved, stackptr, frameptr,
 //          isInt, isFP)
 
diff --git a/src/IceInstARM32.h b/src/IceInstARM32.h
index e65ef4a..9e0bb65 100644
--- a/src/IceInstARM32.h
+++ b/src/IceInstARM32.h
@@ -113,7 +113,14 @@
   AddrMode getAddrMode() const { return Mode; }
 
   bool isRegReg() const { return Index != nullptr; }
-  bool isNegAddrMode() const { return Mode >= NegOffset; }
+  bool isNegAddrMode() const {
+    // Positive address modes have the "U" bit set, and negative modes don't.
+    static_assert((PreIndex & (4 << 21)) != 0,
+                  "Positive addr modes should have U bit set.");
+    static_assert((NegPreIndex & (4 << 21)) == 0,
+                  "Negative addr modes should have U bit clear.");
+    return (Mode & (4 << 21)) == 0;
+  }
 
   void emit(const Cfg *Func) const override;
   using OperandARM32::dump;
@@ -266,6 +273,8 @@
     Mul,
     Mvn,
     Orr,
+    Pop,
+    Push,
     Ret,
     Sbc,
     Sub,
@@ -682,6 +691,49 @@
   ~InstARM32Mla() override {}
 };
 
+// Pop into a list of GPRs. Technically this can be predicated, but we don't
+// need that functionality.
+class InstARM32Pop : public InstARM32 {
+  InstARM32Pop() = delete;
+  InstARM32Pop(const InstARM32Pop &) = delete;
+  InstARM32Pop &operator=(const InstARM32Pop &) = delete;
+
+public:
+  static InstARM32Pop *create(Cfg *Func, const VarList &Dests) {
+    return new (Func->allocate<InstARM32Pop>()) InstARM32Pop(Func, Dests);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Pop); }
+
+private:
+  InstARM32Pop(Cfg *Func, const VarList &Dests);
+  ~InstARM32Pop() override {}
+  VarList Dests;
+};
+
+// Push a list of GPRs. Technically this can be predicated, but we don't
+// need that functionality.
+class InstARM32Push : public InstARM32 {
+  InstARM32Push() = delete;
+  InstARM32Push(const InstARM32Push &) = delete;
+  InstARM32Push &operator=(const InstARM32Push &) = delete;
+
+public:
+  static InstARM32Push *create(Cfg *Func, const VarList &Srcs) {
+    return new (Func->allocate<InstARM32Push>()) InstARM32Push(Func, Srcs);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Push); }
+
+private:
+  InstARM32Push(Cfg *Func, const VarList &Srcs);
+  ~InstARM32Push() override {}
+};
+
 // Ret pseudo-instruction.  This is actually a "bx" instruction with
 // an "lr" register operand, but epilogue lowering will search for a Ret
 // instead of a generic "bx". This instruction also takes a Source
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index db332cb..bbbdc64 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -244,6 +244,159 @@
   }
 }
 
+void TargetLowering::sortVarsByAlignment(VarList &Dest,
+                                         const VarList &Source) const {
+  Dest = Source;
+  // Instead of std::sort, we could do a bucket sort with log2(alignment)
+  // as the buckets, if performance is an issue.
+  std::sort(Dest.begin(), Dest.end(),
+            [this](const Variable *V1, const Variable *V2) {
+              return typeWidthInBytesOnStack(V1->getType()) >
+                  typeWidthInBytesOnStack(V2->getType());
+            });
+}
+
+void TargetLowering::getVarStackSlotParams(
+    VarList &SortedSpilledVariables, llvm::SmallBitVector &RegsUsed,
+    size_t *GlobalsSize, size_t *SpillAreaSizeBytes,
+    uint32_t *SpillAreaAlignmentBytes, uint32_t *LocalsSlotsAlignmentBytes,
+    std::function<bool(Variable *)> TargetVarHook) {
+  const VariablesMetadata *VMetadata = Func->getVMetadata();
+  llvm::BitVector IsVarReferenced(Func->getNumVariables());
+  for (CfgNode *Node : Func->getNodes()) {
+    for (Inst &Inst : Node->getInsts()) {
+      if (Inst.isDeleted())
+        continue;
+      if (const Variable *Var = Inst.getDest())
+        IsVarReferenced[Var->getIndex()] = true;
+      for (SizeT I = 0; I < Inst.getSrcSize(); ++I) {
+        Operand *Src = Inst.getSrc(I);
+        SizeT NumVars = Src->getNumVars();
+        for (SizeT J = 0; J < NumVars; ++J) {
+          const Variable *Var = Src->getVar(J);
+          IsVarReferenced[Var->getIndex()] = true;
+        }
+      }
+    }
+  }
+
+  // If SimpleCoalescing is false, each variable without a register
+  // gets its own unique stack slot, which leads to large stack
+  // frames.  If SimpleCoalescing is true, then each "global" variable
+  // without a register gets its own slot, but "local" variable slots
+  // are reused across basic blocks.  E.g., if A and B are local to
+  // block 1 and C is local to block 2, then C may share a slot with A or B.
+  //
+  // We cannot coalesce stack slots if this function calls a "returns twice"
+  // function. In that case, basic blocks may be revisited, and variables
+  // local to those basic blocks are actually live until after the
+  // called function returns a second time.
+  const bool SimpleCoalescing = !callsReturnsTwice();
+
+  std::vector<size_t> LocalsSize(Func->getNumNodes());
+  const VarList &Variables = Func->getVariables();
+  VarList SpilledVariables;
+  for (Variable *Var : Variables) {
+    if (Var->hasReg()) {
+      RegsUsed[Var->getRegNum()] = true;
+      continue;
+    }
+    // An argument either does not need a stack slot (if passed in a
+    // register) or already has one (if passed on the stack).
+    if (Var->getIsArg())
+      continue;
+    // An unreferenced variable doesn't need a stack slot.
+    if (!IsVarReferenced[Var->getIndex()])
+      continue;
+    // Check for a target-specific variable; it may end up sharing stack
+    // slots and thus not need accounting here.
+    if (TargetVarHook(Var))
+      continue;
+    SpilledVariables.push_back(Var);
+  }
+
+  SortedSpilledVariables.reserve(SpilledVariables.size());
+  sortVarsByAlignment(SortedSpilledVariables, SpilledVariables);
+
+  for (Variable *Var : SortedSpilledVariables) {
+    size_t Increment = typeWidthInBytesOnStack(Var->getType());
+    // We have sorted by alignment, so the first variable we encounter that
+    // is located in each area determines the max alignment for the area.
+    if (!*SpillAreaAlignmentBytes)
+      *SpillAreaAlignmentBytes = Increment;
+    if (SimpleCoalescing && VMetadata->isTracked(Var)) {
+      if (VMetadata->isMultiBlock(Var)) {
+        *GlobalsSize += Increment;
+      } else {
+        SizeT NodeIndex = VMetadata->getLocalUseNode(Var)->getIndex();
+        LocalsSize[NodeIndex] += Increment;
+        if (LocalsSize[NodeIndex] > *SpillAreaSizeBytes)
+          *SpillAreaSizeBytes = LocalsSize[NodeIndex];
+        if (!*LocalsSlotsAlignmentBytes)
+          *LocalsSlotsAlignmentBytes = Increment;
+      }
+    } else {
+      *SpillAreaSizeBytes += Increment;
+    }
+  }
+}
+
+void TargetLowering::alignStackSpillAreas(uint32_t SpillAreaStartOffset,
+                                          uint32_t SpillAreaAlignmentBytes,
+                                          size_t GlobalsSize,
+                                          uint32_t LocalsSlotsAlignmentBytes,
+                                          uint32_t *SpillAreaPaddingBytes,
+                                          uint32_t *LocalsSlotsPaddingBytes) {
+  if (SpillAreaAlignmentBytes) {
+    uint32_t PaddingStart = SpillAreaStartOffset;
+    uint32_t SpillAreaStart =
+        Utils::applyAlignment(PaddingStart, SpillAreaAlignmentBytes);
+    *SpillAreaPaddingBytes = SpillAreaStart - PaddingStart;
+  }
+
+  // If there are separate globals and locals areas, make sure the
+  // locals area is aligned by padding the end of the globals area.
+  if (LocalsSlotsAlignmentBytes) {
+    uint32_t GlobalsAndSubsequentPaddingSize = GlobalsSize;
+    GlobalsAndSubsequentPaddingSize =
+        Utils::applyAlignment(GlobalsSize, LocalsSlotsAlignmentBytes);
+    *LocalsSlotsPaddingBytes = GlobalsAndSubsequentPaddingSize - GlobalsSize;
+  }
+}
+
+void TargetLowering::assignVarStackSlots(VarList &SortedSpilledVariables,
+                                         size_t SpillAreaPaddingBytes,
+                                         size_t SpillAreaSizeBytes,
+                                         size_t GlobalsAndSubsequentPaddingSize,
+                                         bool UsesFramePointer) {
+  const VariablesMetadata *VMetadata = Func->getVMetadata();
+  size_t GlobalsSpaceUsed = SpillAreaPaddingBytes;
+  size_t NextStackOffset = SpillAreaPaddingBytes;
+  std::vector<size_t> LocalsSize(Func->getNumNodes());
+  const bool SimpleCoalescing = !callsReturnsTwice();
+  for (Variable *Var : SortedSpilledVariables) {
+    size_t Increment = typeWidthInBytesOnStack(Var->getType());
+    if (SimpleCoalescing && VMetadata->isTracked(Var)) {
+      if (VMetadata->isMultiBlock(Var)) {
+        GlobalsSpaceUsed += Increment;
+        NextStackOffset = GlobalsSpaceUsed;
+      } else {
+        SizeT NodeIndex = VMetadata->getLocalUseNode(Var)->getIndex();
+        LocalsSize[NodeIndex] += Increment;
+        NextStackOffset = SpillAreaPaddingBytes +
+                          GlobalsAndSubsequentPaddingSize +
+                          LocalsSize[NodeIndex];
+      }
+    } else {
+      NextStackOffset += Increment;
+    }
+    if (UsesFramePointer)
+      Var->setStackOffset(-NextStackOffset);
+    else
+      Var->setStackOffset(SpillAreaSizeBytes - NextStackOffset);
+  }
+}
+
 InstCall *TargetLowering::makeHelperCall(const IceString &Name, Variable *Dest,
                                          SizeT MaxSrcs) {
   const bool HasTailCall = false;
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index c875b26..a2eac4b 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -165,6 +165,7 @@
   virtual bool hasFramePointer() const { return false; }
   virtual SizeT getFrameOrStackReg() const = 0;
   virtual size_t typeWidthInBytesOnStack(Type Ty) const = 0;
+
   bool hasComputedFrame() const { return HasComputedFrame; }
   // Returns true if this function calls a function that has the
   // "returns twice" attribute.
@@ -259,10 +260,66 @@
   // to keep liveness analysis consistent.
   void inferTwoAddress();
 
+  // Make a pass over the Cfg to determine which variables need stack slots
+  // and place them in a sorted list (SortedSpilledVariables). Among those
+  // variables, classify each spill variable as local to its basic block vs
+  // global (multi-block) in order to compute the parameters GlobalsSize
+  // and SpillAreaSizeBytes (which represents the locals, or all spilled
+  // vars if coalescing of locals is disallowed), along with the alignments
+  // required for variables in each area. We rely on accurate VMetadata to
+  // classify a variable as global vs local (otherwise it is conservatively
+  // global). The size/alignment out-parameters should be initialized to 0.
+  //
+  // This is only a pre-pass and the actual stack slot assignment is
+  // handled separately.
+  //
+  // There may be target-specific Variable types, which will be handled
+  // by TargetVarHook. If the TargetVarHook returns true, then the variable
+  // is skipped and not considered with the rest of the spilled variables.
+  void getVarStackSlotParams(VarList &SortedSpilledVariables,
+                             llvm::SmallBitVector &RegsUsed,
+                             size_t *GlobalsSize, size_t *SpillAreaSizeBytes,
+                             uint32_t *SpillAreaAlignmentBytes,
+                             uint32_t *LocalsSlotsAlignmentBytes,
+                             std::function<bool(Variable *)> TargetVarHook);
+
+  // Calculate the amount of padding needed to align the local and global
+  // areas to the required alignment.  This assumes the globals/locals layout
+  // used by getVarStackSlotParams and assignVarStackSlots.
+  void alignStackSpillAreas(uint32_t SpillAreaStartOffset,
+                            uint32_t SpillAreaAlignmentBytes,
+                            size_t GlobalsSize,
+                            uint32_t LocalsSlotsAlignmentBytes,
+                            uint32_t *SpillAreaPaddingBytes,
+                            uint32_t *LocalsSlotsPaddingBytes);
+
+  // Make a pass through the SortedSpilledVariables and actually assign
+  // stack slots. SpillAreaPaddingBytes takes into account stack alignment
+  // padding. The SpillArea starts after that amount of padding.
+  // This matches the scheme in getVarStackSlotParams, where there may
+  // be a separate multi-block global var spill area and a local var
+  // spill area.
+  void assignVarStackSlots(VarList &SortedSpilledVariables,
+                           size_t SpillAreaPaddingBytes,
+                           size_t SpillAreaSizeBytes,
+                           size_t GlobalsAndSubsequentPaddingSize,
+                           bool UsesFramePointer);
+
+  // Sort the variables in Source based on required alignment.
+  // The variables with the largest alignment requirements are placed at
+  // the front of the Dest list.
+  void sortVarsByAlignment(VarList &Dest, const VarList &Source) const;
+
   // Make a call to an external helper function.
   InstCall *makeHelperCall(const IceString &Name, Variable *Dest,
                            SizeT MaxSrcs);
 
+  void
+  _bundle_lock(InstBundleLock::Option BundleOption = InstBundleLock::Opt_None) {
+    Context.insert(InstBundleLock::create(Func, BundleOption));
+  }
+  void _bundle_unlock() { Context.insert(InstBundleUnlock::create(Func)); }
+
   Cfg *Func;
   GlobalContext *Ctx;
   bool HasComputedFrame;
diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp
index 2305a1b..e160a3c 100644
--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -126,10 +126,17 @@
 // Stack alignment
 const uint32_t ARM32_STACK_ALIGNMENT_BYTES = 16;
 
+// Value is in bytes. Returns Value rounded up to the next multiple of the
+// stack alignment.
+uint32_t applyStackAlignment(uint32_t Value) {
+  return Utils::applyAlignment(Value, ARM32_STACK_ALIGNMENT_BYTES);
+}
+
 } // end of anonymous namespace
 
 TargetARM32::TargetARM32(Cfg *Func)
-    : TargetLowering(Func), UsesFramePointer(false) {
+    : TargetLowering(Func), UsesFramePointer(false), NeedsStackAlignment(false),
+      MaybeLeafFunc(true), SpillAreaSizeBytes(0) {
   // TODO: Don't initialize IntegerRegisters and friends every time.
   // Instead, initialize in some sort of static initializer for the
   // class.
@@ -396,21 +403,21 @@
     } else if (Ty == IceType_i64) {
       if (NumGPRRegsUsed >= ARM32_MAX_GPR_ARG)
         continue;
-      int32_t RegLo = RegARM32::Reg_r0 + NumGPRRegsUsed;
-      int32_t RegHi = 0;
-      ++NumGPRRegsUsed;
+      int32_t RegLo;
+      int32_t RegHi;
       // Always start i64 registers at an even register, so this may end
       // up padding away a register.
-      if (RegLo % 2 != 0) {
-        ++RegLo;
+      if (NumGPRRegsUsed % 2 != 0) {
         ++NumGPRRegsUsed;
       }
-      // If this leaves us without room to consume another register,
-      // leave any previously speculatively consumed registers as consumed.
-      if (NumGPRRegsUsed >= ARM32_MAX_GPR_ARG)
-        continue;
+      RegLo = RegARM32::Reg_r0 + NumGPRRegsUsed;
+      ++NumGPRRegsUsed;
       RegHi = RegARM32::Reg_r0 + NumGPRRegsUsed;
       ++NumGPRRegsUsed;
+      // If this bumps us past the boundary, don't allocate to a register
+      // and leave any previously speculatively consumed registers as consumed.
+      if (NumGPRRegsUsed > ARM32_MAX_GPR_ARG)
+        continue;
       Variable *RegisterArg = Func->makeVariable(Ty);
       Variable *RegisterLo = Func->makeVariable(IceType_i32);
       Variable *RegisterHi = Func->makeVariable(IceType_i32);
@@ -450,16 +457,344 @@
   }
 }
 
+// Helper function for addProlog().
+//
+// This assumes Arg is an argument passed on the stack.  This sets the
+// frame offset for Arg and updates InArgsSizeBytes according to Arg's
+// width.  For an I64 arg that has been split into Lo and Hi components,
+// it calls itself recursively on the components, taking care to handle
+// Lo first because of the little-endian architecture.  Lastly, this
+// function generates an instruction to copy Arg into its assigned
+// register if applicable.
+void TargetARM32::finishArgumentLowering(Variable *Arg, Variable *FramePtr,
+                                         size_t BasicFrameOffset,
+                                         size_t &InArgsSizeBytes) {
+  Variable *Lo = Arg->getLo();
+  Variable *Hi = Arg->getHi();
+  Type Ty = Arg->getType();
+  if (Lo && Hi && Ty == IceType_i64) {
+    assert(Lo->getType() != IceType_i64); // don't want infinite recursion
+    assert(Hi->getType() != IceType_i64); // don't want infinite recursion
+    finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, InArgsSizeBytes);
+    finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, InArgsSizeBytes);
+    return;
+  }
+  if (isVectorType(Ty)) {
+    InArgsSizeBytes = applyStackAlignment(InArgsSizeBytes);
+  }
+  Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes);
+  InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
+  // If the argument variable has been assigned a register, we need to load
+  // the value from the stack slot.
+  if (Arg->hasReg()) {
+    assert(Ty != IceType_i64);
+    OperandARM32Mem *Mem = OperandARM32Mem::create(
+        Func, Ty, FramePtr, llvm::cast<ConstantInteger32>(
+                                Ctx->getConstantInt32(Arg->getStackOffset())));
+    if (isVectorType(Arg->getType())) {
+      UnimplementedError(Func->getContext()->getFlags());
+    } else {
+      _ldr(Arg, Mem);
+    }
+    // This argument-copying instruction uses an explicit
+    // OperandARM32Mem operand instead of a Variable, so its
+    // fill-from-stack operation has to be tracked separately for
+    // statistics.
+    Ctx->statsUpdateFills();
+  }
+}
+
 Type TargetARM32::stackSlotType() { return IceType_i32; }
 
 void TargetARM32::addProlog(CfgNode *Node) {
-  (void)Node;
-  UnimplementedError(Func->getContext()->getFlags());
+  // Stack frame layout:
+  //
+  // +------------------------+
+  // | 1. preserved registers |
+  // +------------------------+
+  // | 2. padding             |
+  // +------------------------+
+  // | 3. global spill area   |
+  // +------------------------+
+  // | 4. padding             |
+  // +------------------------+
+  // | 5. local spill area    |
+  // +------------------------+
+  // | 6. padding             |
+  // +------------------------+
+  // | 7. allocas             |
+  // +------------------------+
+  //
+  // The following variables record the size in bytes of the given areas:
+  //  * PreservedRegsSizeBytes: area 1
+  //  * SpillAreaPaddingBytes:  area 2
+  //  * GlobalsSize:            area 3
+  //  * GlobalsAndSubsequentPaddingSize: areas 3 - 4
+  //  * LocalsSpillAreaSize:    area 5
+  //  * SpillAreaSizeBytes:     areas 2 - 6
+  // Determine stack frame offsets for each Variable without a
+  // register assignment.  This can be done as one variable per stack
+  // slot.  Or, do coalescing by running the register allocator again
+  // with an infinite set of registers (as a side effect, this gives
+  // variables a second chance at physical register assignment).
+  //
+  // A middle ground approach is to leverage sparsity and allocate one
+  // block of space on the frame for globals (variables with
+  // multi-block lifetime), and one block to share for locals
+  // (single-block lifetime).
+
+  Context.init(Node);
+  Context.setInsertPoint(Context.getCur());
+
+  llvm::SmallBitVector CalleeSaves =
+      getRegisterSet(RegSet_CalleeSave, RegSet_None);
+  RegsUsed = llvm::SmallBitVector(CalleeSaves.size());
+  VarList SortedSpilledVariables;
+  size_t GlobalsSize = 0;
+  // If there is a separate locals area, this represents that area.
+  // Otherwise it counts any variable not counted by GlobalsSize.
+  SpillAreaSizeBytes = 0;
+  // If there is a separate locals area, this specifies the alignment
+  // for it.
+  uint32_t LocalsSlotsAlignmentBytes = 0;
+  // The entire spill locations area gets aligned to largest natural
+  // alignment of the variables that have a spill slot.
+  uint32_t SpillAreaAlignmentBytes = 0;
+  // For now, we don't have target-specific variables that need special
+  // treatment (no stack-slot-linked SpillVariable type).
+  std::function<bool(Variable *)> TargetVarHook =
+      [](Variable *) { return false; };
+
+  // Compute the list of spilled variables and bounds for GlobalsSize, etc.
+  getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize,
+                        &SpillAreaSizeBytes, &SpillAreaAlignmentBytes,
+                        &LocalsSlotsAlignmentBytes, TargetVarHook);
+  uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes;
+  SpillAreaSizeBytes += GlobalsSize;
+
+  // Add push instructions for preserved registers.
+  // On ARM, "push" can push a whole list of GPRs via a bitmask (0-15).
+  // Unlike x86, ARM also has callee-saved float/vector registers.
+  // The "vpush" instruction can handle a whole list of float/vector
+  // registers, but it only handles contiguous sequences of registers
+  // by specifying the start and the length.
+  VarList GPRsToPreserve;
+  GPRsToPreserve.reserve(CalleeSaves.size());
+  uint32_t NumCallee = 0;
+  size_t PreservedRegsSizeBytes = 0;
+  // Consider FP and LR as callee-save / used as needed.
+  if (UsesFramePointer) {
+    CalleeSaves[RegARM32::Reg_fp] = true;
+    assert(RegsUsed[RegARM32::Reg_fp] == false);
+    RegsUsed[RegARM32::Reg_fp] = true;
+  }
+  if (!MaybeLeafFunc) {
+    CalleeSaves[RegARM32::Reg_lr] = true;
+    RegsUsed[RegARM32::Reg_lr] = true;
+  }
+  for (SizeT i = 0; i < CalleeSaves.size(); ++i) {
+    if (CalleeSaves[i] && RegsUsed[i]) {
+      // TODO(jvoung): do a separate vpush for each contiguous floating point
+      // register segment, adding 4 or 8 bytes per register depending on type.
+      ++NumCallee;
+      PreservedRegsSizeBytes += 4;
+      GPRsToPreserve.push_back(getPhysicalRegister(i));
+    }
+  }
+  Ctx->statsUpdateRegistersSaved(NumCallee);
+  if (!GPRsToPreserve.empty())
+    _push(GPRsToPreserve);
+
+  // Generate "mov FP, SP" if needed.
+  if (UsesFramePointer) {
+    Variable *FP = getPhysicalRegister(RegARM32::Reg_fp);
+    Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
+    _mov(FP, SP);
+    // Keep FP live for late-stage liveness analysis (e.g. asm-verbose mode).
+    Context.insert(InstFakeUse::create(Func, FP));
+  }
+
+  // Align the variables area. SpillAreaPaddingBytes is the size of
+  // the region after the preserved registers and before the spill areas.
+  // LocalsSlotsPaddingBytes is the amount of padding between the globals
+  // and locals area if they are separate.
+  assert(SpillAreaAlignmentBytes <= ARM32_STACK_ALIGNMENT_BYTES);
+  assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
+  uint32_t SpillAreaPaddingBytes = 0;
+  uint32_t LocalsSlotsPaddingBytes = 0;
+  alignStackSpillAreas(PreservedRegsSizeBytes, SpillAreaAlignmentBytes,
+                       GlobalsSize, LocalsSlotsAlignmentBytes,
+                       &SpillAreaPaddingBytes, &LocalsSlotsPaddingBytes);
+  SpillAreaSizeBytes += SpillAreaPaddingBytes + LocalsSlotsPaddingBytes;
+  uint32_t GlobalsAndSubsequentPaddingSize =
+      GlobalsSize + LocalsSlotsPaddingBytes;
+
+  // Align SP if necessary.
+  if (NeedsStackAlignment) {
+    uint32_t StackOffset = PreservedRegsSizeBytes;
+    uint32_t StackSize = applyStackAlignment(StackOffset + SpillAreaSizeBytes);
+    SpillAreaSizeBytes = StackSize - StackOffset;
+  }
+
+  // Generate "sub sp, SpillAreaSizeBytes"
+  if (SpillAreaSizeBytes) {
+    // Use the IP inter-procedural scratch register if needed to legalize
+    // the immediate.
+    Operand *SubAmount = legalize(Ctx->getConstantInt32(SpillAreaSizeBytes),
+                                  Legal_Reg | Legal_Flex, RegARM32::Reg_ip);
+    Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
+    _sub(SP, SP, SubAmount);
+  }
+  Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);
+
+  resetStackAdjustment();
+
+  // Fill in stack offsets for stack args, and copy args into registers
+  // for those that were register-allocated.  Args are pushed right to
+  // left, so Arg[0] is closest to the stack/frame pointer.
+  Variable *FramePtr = getPhysicalRegister(getFrameOrStackReg());
+  size_t BasicFrameOffset = PreservedRegsSizeBytes;
+  if (!UsesFramePointer)
+    BasicFrameOffset += SpillAreaSizeBytes;
+
+  const VarList &Args = Func->getArgs();
+  size_t InArgsSizeBytes = 0;
+  unsigned NumGPRArgs = 0;
+  for (Variable *Arg : Args) {
+    Type Ty = Arg->getType();
+    // Skip arguments passed in registers.
+    if (isVectorType(Ty)) {
+      UnimplementedError(Func->getContext()->getFlags());
+      continue;
+    } else if (isFloatingType(Ty)) {
+      UnimplementedError(Func->getContext()->getFlags());
+      continue;
+    } else if (Ty == IceType_i64 && NumGPRArgs < ARM32_MAX_GPR_ARG) {
+      // Start at an even register.
+      if (NumGPRArgs % 2 == 1) {
+        ++NumGPRArgs;
+      }
+      NumGPRArgs += 2;
+      if (NumGPRArgs <= ARM32_MAX_GPR_ARG)
+        continue;
+    } else if (NumGPRArgs < ARM32_MAX_GPR_ARG) {
+      ++NumGPRArgs;
+      continue;
+    }
+    finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, InArgsSizeBytes);
+  }
+
+  // Fill in stack offsets for locals.
+  assignVarStackSlots(SortedSpilledVariables, SpillAreaPaddingBytes,
+                      SpillAreaSizeBytes, GlobalsAndSubsequentPaddingSize,
+                      UsesFramePointer);
+  this->HasComputedFrame = true;
+
+  if (ALLOW_DUMP && Func->isVerbose(IceV_Frame)) {
+    OstreamLocker L(Func->getContext());
+    Ostream &Str = Func->getContext()->getStrDump();
+
+    Str << "Stack layout:\n";
+    uint32_t SPAdjustmentPaddingSize =
+        SpillAreaSizeBytes - LocalsSpillAreaSize -
+        GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes;
+    Str << " in-args = " << InArgsSizeBytes << " bytes\n"
+        << " preserved registers = " << PreservedRegsSizeBytes << " bytes\n"
+        << " spill area padding = " << SpillAreaPaddingBytes << " bytes\n"
+        << " globals spill area = " << GlobalsSize << " bytes\n"
+        << " globals-locals spill areas intermediate padding = "
+        << GlobalsAndSubsequentPaddingSize - GlobalsSize << " bytes\n"
+        << " locals spill area = " << LocalsSpillAreaSize << " bytes\n"
+        << " SP alignment padding = " << SPAdjustmentPaddingSize << " bytes\n";
+
+    Str << "Stack details:\n"
+        << " SP adjustment = " << SpillAreaSizeBytes << " bytes\n"
+        << " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n"
+        << " locals spill area alignment = " << LocalsSlotsAlignmentBytes
+        << " bytes\n"
+        << " is FP based = " << UsesFramePointer << "\n";
+  }
 }
 
 void TargetARM32::addEpilog(CfgNode *Node) {
-  (void)Node;
-  UnimplementedError(Func->getContext()->getFlags());
+  InstList &Insts = Node->getInsts();
+  InstList::reverse_iterator RI, E;
+  for (RI = Insts.rbegin(), E = Insts.rend(); RI != E; ++RI) {
+    if (llvm::isa<InstARM32Ret>(*RI))
+      break;
+  }
+  if (RI == E)
+    return;
+
+  // Convert the reverse_iterator position into its corresponding
+  // (forward) iterator position.
+  InstList::iterator InsertPoint = RI.base();
+  --InsertPoint;
+  Context.init(Node);
+  Context.setInsertPoint(InsertPoint);
+
+  Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
+  if (UsesFramePointer) {
+    Variable *FP = getPhysicalRegister(RegARM32::Reg_fp);
+    // For late-stage liveness analysis (e.g. asm-verbose mode),
+    // adding a fake use of SP before the assignment of SP=FP keeps
+    // previous SP adjustments from being dead-code eliminated.
+    Context.insert(InstFakeUse::create(Func, SP));
+    _mov(SP, FP);
+  } else {
+    // add SP, SpillAreaSizeBytes
+    if (SpillAreaSizeBytes) {
+      // Use the IP inter-procedural scratch register if needed to legalize
+      // the immediate. It shouldn't be live at this point.
+      Operand *AddAmount = legalize(Ctx->getConstantInt32(SpillAreaSizeBytes),
+                                    Legal_Reg | Legal_Flex, RegARM32::Reg_ip);
+      _add(SP, SP, AddAmount);
+    }
+  }
+
+  // Add pop instructions for preserved registers.
+  llvm::SmallBitVector CalleeSaves =
+      getRegisterSet(RegSet_CalleeSave, RegSet_None);
+  VarList GPRsToRestore;
+  GPRsToRestore.reserve(CalleeSaves.size());
+  // Consider FP and LR as callee-save / used as needed.
+  if (UsesFramePointer) {
+    CalleeSaves[RegARM32::Reg_fp] = true;
+  }
+  if (!MaybeLeafFunc) {
+    CalleeSaves[RegARM32::Reg_lr] = true;
+  }
+  // Pop registers in ascending order just like push
+  // (instead of in reverse order).
+  for (SizeT i = 0; i < CalleeSaves.size(); ++i) {
+    if (CalleeSaves[i] && RegsUsed[i]) {
+      GPRsToRestore.push_back(getPhysicalRegister(i));
+    }
+  }
+  if (!GPRsToRestore.empty())
+    _pop(GPRsToRestore);
+
+  if (!Ctx->getFlags().getUseSandboxing())
+    return;
+
+  // Change the original ret instruction into a sandboxed return sequence.
+  // bundle_lock
+  // bic lr, #0xc000000f
+  // bx lr
+  // bundle_unlock
+  // This isn't just aligning to the getBundleAlignLog2Bytes(). It needs to
+  // restrict to the lower 1GB as well.
+  Operand *RetMask =
+      legalize(Ctx->getConstantInt32(0xc000000f), Legal_Reg | Legal_Flex);
+  Variable *LR = makeReg(IceType_i32, RegARM32::Reg_lr);
+  Variable *RetValue = nullptr;
+  if (RI->getSrcSize())
+    RetValue = llvm::cast<Variable>(RI->getSrc(0));
+  _bundle_lock();
+  _bic(LR, LR, RetMask);
+  _ret(LR, RetValue);
+  _bundle_unlock();
+  RI->setDeleted();
 }
 
 void TargetARM32::split64(Variable *Var) {
@@ -881,6 +1216,8 @@
 }
 
 void TargetARM32::lowerCall(const InstCall *Instr) {
+  MaybeLeafFunc = false;
+
   // TODO(jvoung): assign arguments to registers and stack. Also reserve stack.
   if (Instr->getNumArgs()) {
     UnimplementedError(Func->getContext()->getFlags());
@@ -1567,12 +1904,12 @@
 
 void TargetARM32::alignRegisterPow2(Variable *Reg, uint32_t Align) {
   assert(llvm::isPowerOf2_32(Align));
-  uint32_t RotateAmt = 0;
+  uint32_t RotateAmt;
   uint32_t Immed_8;
   Operand *Mask;
   // Use AND or BIC to mask off the bits, depending on which immediate fits
   // (if it fits at all). Assume Align is usually small, in which case BIC
-  // works better.
+  // works better. Thus, this rounds down to the alignment.
   if (OperandARM32FlexImm::canHoldImm(Align - 1, &RotateAmt, &Immed_8)) {
     Mask = legalize(Ctx->getConstantInt32(Align - 1), Legal_Reg | Legal_Flex);
     _bic(Reg, Reg, Mask);
diff --git a/src/IceTargetLoweringARM32.h b/src/IceTargetLoweringARM32.h
index 019a3e0..b31ff98 100644
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -52,6 +52,7 @@
     // i8, and i16 are rounded up to 4 bytes.
     return (typeWidthInBytes(Ty) + 3) & ~3;
   }
+
   void emitVariable(const Variable *Var) const override;
 
   const char *getConstantPrefix() const final { return "#"; }
@@ -71,6 +72,8 @@
   void split64(Variable *Var);
   Operand *loOperand(Operand *Operand);
   Operand *hiOperand(Operand *Operand);
+  void finishArgumentLowering(Variable *Arg, Variable *FramePtr,
+                              size_t BasicFrameOffset, size_t &InArgsSizeBytes);
 
 protected:
   explicit TargetARM32(Cfg *Func);
@@ -219,6 +222,15 @@
             CondARM32::Cond Pred = CondARM32::AL) {
     Context.insert(InstARM32Orr::create(Func, Dest, Src0, Src1, Pred));
   }
+  void _push(const VarList &Sources) {
+    Context.insert(InstARM32Push::create(Func, Sources));
+  }
+  void _pop(const VarList &Dests) {
+    Context.insert(InstARM32Pop::create(Func, Dests));
+    // Mark dests as modified.
+    for (Variable *Dest : Dests)
+      Context.insert(InstFakeDef::create(Func, Dest));
+  }
   void _sbc(Variable *Dest, Variable *Src0, Operand *Src1,
             CondARM32::Cond Pred = CondARM32::AL) {
     Context.insert(InstARM32Sbc::create(Func, Dest, Src0, Src1, Pred));
@@ -253,6 +265,8 @@
 
   bool UsesFramePointer;
   bool NeedsStackAlignment;
+  bool MaybeLeafFunc;
+  size_t SpillAreaSizeBytes;
   llvm::SmallBitVector TypeToRegisterSet[IceType_NUM];
   llvm::SmallBitVector ScratchRegs;
   llvm::SmallBitVector RegsUsed;
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index a1ba9d6..e0334ca 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -131,10 +131,6 @@
 const uint32_t X86_STACK_ALIGNMENT_BYTES = 16;
 // Size of the return address on the stack
 const uint32_t X86_RET_IP_SIZE_BYTES = 4;
-// The base 2 logarithm of the width in bytes of the smallest stack slot
-const uint32_t X86_LOG2_OF_MIN_STACK_SLOT_SIZE = 2;
-// The base 2 logarithm of the width in bytes of the largest stack slot
-const uint32_t X86_LOG2_OF_MAX_STACK_SLOT_SIZE = 4;
 // The number of different NOP instructions
 const uint32_t X86_NUM_NOP_VARIANTS = 5;
 
@@ -700,28 +696,6 @@
   }
 }
 
-void TargetX8632::sortByAlignment(VarList &Dest, const VarList &Source) const {
-  // Sort the variables into buckets according to the log of their width
-  // in bytes.
-  const SizeT NumBuckets =
-      X86_LOG2_OF_MAX_STACK_SLOT_SIZE - X86_LOG2_OF_MIN_STACK_SLOT_SIZE + 1;
-  VarList Buckets[NumBuckets];
-
-  for (Variable *Var : Source) {
-    uint32_t NaturalAlignment = typeWidthInBytesOnStack(Var->getType());
-    SizeT LogNaturalAlignment = llvm::findFirstSet(NaturalAlignment);
-    assert(LogNaturalAlignment >= X86_LOG2_OF_MIN_STACK_SLOT_SIZE);
-    assert(LogNaturalAlignment <= X86_LOG2_OF_MAX_STACK_SLOT_SIZE);
-    SizeT BucketIndex = LogNaturalAlignment - X86_LOG2_OF_MIN_STACK_SLOT_SIZE;
-    Buckets[BucketIndex].push_back(Var);
-  }
-
-  for (SizeT I = 0, E = NumBuckets; I < E; ++I) {
-    VarList &List = Buckets[NumBuckets - I - 1];
-    Dest.insert(Dest.end(), List.begin(), List.end());
-  }
-}
-
 // Helper function for addProlog().
 //
 // This assumes Arg is an argument passed on the stack.  This sets the
@@ -798,45 +772,6 @@
   //  * LocalsSpillAreaSize:    area 6
   //  * SpillAreaSizeBytes:     areas 3 - 7
 
-  // Make a final pass over the Cfg to determine which variables need
-  // stack slots.
-  llvm::BitVector IsVarReferenced(Func->getNumVariables());
-  for (CfgNode *Node : Func->getNodes()) {
-    for (Inst &Inst : Node->getInsts()) {
-      if (Inst.isDeleted())
-        continue;
-      if (const Variable *Var = Inst.getDest())
-        IsVarReferenced[Var->getIndex()] = true;
-      for (SizeT I = 0; I < Inst.getSrcSize(); ++I) {
-        Operand *Src = Inst.getSrc(I);
-        SizeT NumVars = Src->getNumVars();
-        for (SizeT J = 0; J < NumVars; ++J) {
-          const Variable *Var = Src->getVar(J);
-          IsVarReferenced[Var->getIndex()] = true;
-        }
-      }
-    }
-  }
-
-  // If SimpleCoalescing is false, each variable without a register
-  // gets its own unique stack slot, which leads to large stack
-  // frames.  If SimpleCoalescing is true, then each "global" variable
-  // without a register gets its own slot, but "local" variable slots
-  // are reused across basic blocks.  E.g., if A and B are local to
-  // block 1 and C is local to block 2, then C may share a slot with A or B.
-  //
-  // We cannot coalesce stack slots if this function calls a "returns twice"
-  // function. In that case, basic blocks may be revisited, and variables
-  // local to those basic blocks are actually live until after the
-  // called function returns a second time.
-  const bool SimpleCoalescing = !callsReturnsTwice();
-  size_t InArgsSizeBytes = 0;
-  size_t PreservedRegsSizeBytes = 0;
-  SpillAreaSizeBytes = 0;
-  const VariablesMetadata *VMetadata = Func->getVMetadata();
-  Context.init(Node);
-  Context.setInsertPoint(Context.getCur());
-
   // Determine stack frame offsets for each Variable without a
   // register assignment.  This can be done as one variable per stack
   // slot.  Or, do coalescing by running the register allocator again
@@ -848,76 +783,47 @@
   // multi-block lifetime), and one block to share for locals
   // (single-block lifetime).
 
+  Context.init(Node);
+  Context.setInsertPoint(Context.getCur());
+
   llvm::SmallBitVector CalleeSaves =
       getRegisterSet(RegSet_CalleeSave, RegSet_None);
-
-  size_t GlobalsSize = 0;
-  std::vector<size_t> LocalsSize(Func->getNumNodes());
-
-  // Prepass.  Compute RegsUsed, PreservedRegsSizeBytes, and
-  // SpillAreaSizeBytes.
   RegsUsed = llvm::SmallBitVector(CalleeSaves.size());
-  const VarList &Variables = Func->getVariables();
-  const VarList &Args = Func->getArgs();
-  VarList SpilledVariables, SortedSpilledVariables, VariablesLinkedToSpillSlots;
-
+  VarList SortedSpilledVariables, VariablesLinkedToSpillSlots;
+  size_t GlobalsSize = 0;
+  // If there is a separate locals area, this represents that area.
+  // Otherwise it counts any variable not counted by GlobalsSize.
+  SpillAreaSizeBytes = 0;
   // If there is a separate locals area, this specifies the alignment
   // for it.
   uint32_t LocalsSlotsAlignmentBytes = 0;
   // The entire spill locations area gets aligned to largest natural
   // alignment of the variables that have a spill slot.
   uint32_t SpillAreaAlignmentBytes = 0;
-  for (Variable *Var : Variables) {
-    if (Var->hasReg()) {
-      RegsUsed[Var->getRegNum()] = true;
-      continue;
-    }
-    // An argument either does not need a stack slot (if passed in a
-    // register) or already has one (if passed on the stack).
-    if (Var->getIsArg())
-      continue;
-    // An unreferenced variable doesn't need a stack slot.
-    if (!IsVarReferenced[Var->getIndex()])
-      continue;
-    // A spill slot linked to a variable with a stack slot should reuse
-    // that stack slot.
+  // A spill slot linked to a variable with a stack slot should reuse
+  // that stack slot.
+  std::function<bool(Variable *)> TargetVarHook =
+      [&VariablesLinkedToSpillSlots](Variable *Var) {
     if (SpillVariable *SpillVar = llvm::dyn_cast<SpillVariable>(Var)) {
       assert(Var->getWeight().isZero());
       if (SpillVar->getLinkedTo() && !SpillVar->getLinkedTo()->hasReg()) {
         VariablesLinkedToSpillSlots.push_back(Var);
-        continue;
+        return true;
       }
     }
-    SpilledVariables.push_back(Var);
-  }
+    return false;
+  };
 
-  SortedSpilledVariables.reserve(SpilledVariables.size());
-  sortByAlignment(SortedSpilledVariables, SpilledVariables);
-  for (Variable *Var : SortedSpilledVariables) {
-    size_t Increment = typeWidthInBytesOnStack(Var->getType());
-    if (!SpillAreaAlignmentBytes)
-      SpillAreaAlignmentBytes = Increment;
-    if (SimpleCoalescing && VMetadata->isTracked(Var)) {
-      if (VMetadata->isMultiBlock(Var)) {
-        GlobalsSize += Increment;
-      } else {
-        SizeT NodeIndex = VMetadata->getLocalUseNode(Var)->getIndex();
-        LocalsSize[NodeIndex] += Increment;
-        if (LocalsSize[NodeIndex] > SpillAreaSizeBytes)
-          SpillAreaSizeBytes = LocalsSize[NodeIndex];
-        if (!LocalsSlotsAlignmentBytes)
-          LocalsSlotsAlignmentBytes = Increment;
-      }
-    } else {
-      SpillAreaSizeBytes += Increment;
-    }
-  }
+  // Compute the list of spilled variables and bounds for GlobalsSize, etc.
+  getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize,
+                        &SpillAreaSizeBytes, &SpillAreaAlignmentBytes,
+                        &LocalsSlotsAlignmentBytes, TargetVarHook);
   uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes;
-
   SpillAreaSizeBytes += GlobalsSize;
 
   // Add push instructions for preserved registers.
   uint32_t NumCallee = 0;
+  size_t PreservedRegsSizeBytes = 0;
   for (SizeT i = 0; i < CalleeSaves.size(); ++i) {
     if (CalleeSaves[i] && RegsUsed[i]) {
       ++NumCallee;
@@ -942,27 +848,20 @@
   }
 
   // Align the variables area. SpillAreaPaddingBytes is the size of
-  // the region after the preserved registers and before the spill
-  // areas.
+  // the region after the preserved registers and before the spill areas.
+  // LocalsSlotsPaddingBytes is the amount of padding between the globals
+  // and locals area if they are separate.
+  assert(SpillAreaAlignmentBytes <= X86_STACK_ALIGNMENT_BYTES);
+  assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
   uint32_t SpillAreaPaddingBytes = 0;
-  if (SpillAreaAlignmentBytes) {
-    assert(SpillAreaAlignmentBytes <= X86_STACK_ALIGNMENT_BYTES);
-    uint32_t PaddingStart = X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
-    uint32_t SpillAreaStart =
-        Utils::applyAlignment(PaddingStart, SpillAreaAlignmentBytes);
-    SpillAreaPaddingBytes = SpillAreaStart - PaddingStart;
-    SpillAreaSizeBytes += SpillAreaPaddingBytes;
-  }
-
-  // If there are separate globals and locals areas, make sure the
-  // locals area is aligned by padding the end of the globals area.
-  uint32_t GlobalsAndSubsequentPaddingSize = GlobalsSize;
-  if (LocalsSlotsAlignmentBytes) {
-    assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
-    GlobalsAndSubsequentPaddingSize =
-        Utils::applyAlignment(GlobalsSize, LocalsSlotsAlignmentBytes);
-    SpillAreaSizeBytes += GlobalsAndSubsequentPaddingSize - GlobalsSize;
-  }
+  uint32_t LocalsSlotsPaddingBytes = 0;
+  alignStackSpillAreas(X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes,
+                       SpillAreaAlignmentBytes, GlobalsSize,
+                       LocalsSlotsAlignmentBytes, &SpillAreaPaddingBytes,
+                       &LocalsSlotsPaddingBytes);
+  SpillAreaSizeBytes += SpillAreaPaddingBytes + LocalsSlotsPaddingBytes;
+  uint32_t GlobalsAndSubsequentPaddingSize =
+      GlobalsSize + LocalsSlotsPaddingBytes;
 
   // Align esp if necessary.
   if (NeedsStackAlignment) {
@@ -987,9 +886,10 @@
   if (!IsEbpBasedFrame)
     BasicFrameOffset += SpillAreaSizeBytes;
 
+  const VarList &Args = Func->getArgs();
+  size_t InArgsSizeBytes = 0;
   unsigned NumXmmArgs = 0;
-  for (SizeT i = 0; i < Args.size(); ++i) {
-    Variable *Arg = Args[i];
+  for (Variable *Arg : Args) {
     // Skip arguments passed in registers.
     if (isVectorType(Arg->getType()) && NumXmmArgs < X86_MAX_XMM_ARGS) {
       ++NumXmmArgs;
@@ -999,38 +899,16 @@
   }
 
   // Fill in stack offsets for locals.
-  size_t GlobalsSpaceUsed = SpillAreaPaddingBytes;
-  LocalsSize.assign(LocalsSize.size(), 0);
-  size_t NextStackOffset = GlobalsSpaceUsed;
-  for (Variable *Var : SortedSpilledVariables) {
-    size_t Increment = typeWidthInBytesOnStack(Var->getType());
-    if (SimpleCoalescing && VMetadata->isTracked(Var)) {
-      if (VMetadata->isMultiBlock(Var)) {
-        GlobalsSpaceUsed += Increment;
-        NextStackOffset = GlobalsSpaceUsed;
-      } else {
-        SizeT NodeIndex = VMetadata->getLocalUseNode(Var)->getIndex();
-        LocalsSize[NodeIndex] += Increment;
-        NextStackOffset = SpillAreaPaddingBytes +
-                          GlobalsAndSubsequentPaddingSize +
-                          LocalsSize[NodeIndex];
-      }
-    } else {
-      NextStackOffset += Increment;
-    }
-    if (IsEbpBasedFrame)
-      Var->setStackOffset(-NextStackOffset);
-    else
-      Var->setStackOffset(SpillAreaSizeBytes - NextStackOffset);
-  }
-  this->HasComputedFrame = true;
-
+  assignVarStackSlots(SortedSpilledVariables, SpillAreaPaddingBytes,
+                      SpillAreaSizeBytes, GlobalsAndSubsequentPaddingSize,
+                      IsEbpBasedFrame);
   // Assign stack offsets to variables that have been linked to spilled
   // variables.
   for (Variable *Var : VariablesLinkedToSpillSlots) {
     Variable *Linked = (llvm::cast<SpillVariable>(Var))->getLinkedTo();
     Var->setStackOffset(Linked->getStackOffset());
   }
+  this->HasComputedFrame = true;
 
   if (ALLOW_DUMP && Func->isVerbose(IceV_Frame)) {
     OstreamLocker L(Func->getContext());
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index 4213a59..cc3ab18 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -120,6 +120,7 @@
     // i8, and i16 are rounded up to 4 bytes.
     return (typeWidthInBytes(Ty) + 3) & ~3;
   }
+
   void emitVariable(const Variable *Var) const override;
 
   const char *getConstantPrefix() const final { return "$"; }
@@ -139,10 +140,10 @@
   // function calls using the 32-bit push instruction (though the
   // latter could be done by directly writing to the stack).
   void split64(Variable *Var);
-  void finishArgumentLowering(Variable *Arg, Variable *FramePtr,
-                              size_t BasicFrameOffset, size_t &InArgsSizeBytes);
   Operand *loOperand(Operand *Operand);
   Operand *hiOperand(Operand *Operand);
+  void finishArgumentLowering(Variable *Arg, Variable *FramePtr,
+                              size_t BasicFrameOffset, size_t &InArgsSizeBytes);
   X8632::Address stackVarToAsmOperand(const Variable *Var) const;
 
   enum X86InstructionSet {
@@ -205,8 +206,6 @@
   void scalarizeArithmetic(InstArithmetic::OpKind K, Variable *Dest,
                            Operand *Src0, Operand *Src1);
 
-  void sortByAlignment(VarList &Dest, const VarList &Source) const;
-
   // Operand legalization helpers.  To deal with address mode
   // constraints, the helpers will create a new Operand and emit
   // instructions that guarantee that the Operand kind is one of those
@@ -303,11 +302,6 @@
   void _bswap(Variable *SrcDest) {
     Context.insert(InstX8632Bswap::create(Func, SrcDest));
   }
-  void
-  _bundle_lock(InstBundleLock::Option BundleOption = InstBundleLock::Opt_None) {
-    Context.insert(InstBundleLock::create(Func, BundleOption));
-  }
-  void _bundle_unlock() { Context.insert(InstBundleUnlock::create(Func)); }
   void _cbwdq(Variable *Dest, Operand *Src0) {
     Context.insert(InstX8632Cbwdq::create(Func, Dest, Src0));
   }