Merge fixed alloca stack adjustments into the prolog

Also removes reliance on lowerAlloca entirely for the fixed allocations.

BUG=
R=jpp@chromium.org, stichnot@chromium.org

Review URL: https://codereview.chromium.org/1435363002 .
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index 6ae12bb..65844b9 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -494,53 +494,44 @@
   uint32_t TotalSize = Utils::applyAlignment(CurrentOffset, CombinedAlignment);
   // Ensure every alloca was assigned an offset.
   assert(Allocas.size() == Offsets.size());
-  Variable *BaseVariable = makeVariable(IceType_i32);
-  Variable *AllocaDest = BaseVariable;
-  // Emit one addition for each alloca after the first.
-  for (size_t i = 0; i < Allocas.size(); ++i) {
-    auto *Alloca = llvm::cast<InstAlloca>(Allocas[i]);
-    switch (BaseVariableType) {
-    case BVT_FramePointer:
-    case BVT_UserPointer: {
+
+  switch (BaseVariableType) {
+  case BVT_UserPointer: {
+    Variable *BaseVariable = makeVariable(IceType_i32);
+    for (SizeT i = 0; i < Allocas.size(); ++i) {
+      auto *Alloca = llvm::cast<InstAlloca>(Allocas[i]);
       // Emit a new addition operation to replace the alloca.
       Operand *AllocaOffset = Ctx->getConstantInt32(Offsets[i]);
       InstArithmetic *Add =
           InstArithmetic::create(this, InstArithmetic::Add, Alloca->getDest(),
                                  BaseVariable, AllocaOffset);
       Insts.push_front(Add);
-    } break;
-    case BVT_StackPointer: {
+      Alloca->setDeleted();
+    }
+    Operand *AllocaSize = Ctx->getConstantInt32(TotalSize);
+    InstAlloca *CombinedAlloca =
+        InstAlloca::create(this, BaseVariable, AllocaSize, CombinedAlignment);
+    CombinedAlloca->setKnownFrameOffset();
+    Insts.push_front(CombinedAlloca);
+  } break;
+  case BVT_StackPointer:
+  case BVT_FramePointer: {
+    for (SizeT i = 0; i < Allocas.size(); ++i) {
+      auto *Alloca = llvm::cast<InstAlloca>(Allocas[i]);
       // Emit a fake definition of the rematerializable variable.
       Variable *Dest = Alloca->getDest();
       InstFakeDef *Def = InstFakeDef::create(this, Dest);
-      Dest->setRematerializable(getTarget()->getStackReg(), Offsets[i]);
+      if (BaseVariableType == BVT_StackPointer)
+        Dest->setRematerializable(getTarget()->getStackReg(), Offsets[i]);
+      else
+        Dest->setRematerializable(getTarget()->getFrameReg(), Offsets[i]);
       Insts.push_front(Def);
-    } break;
+      Alloca->setDeleted();
     }
-    Alloca->setDeleted();
-  }
-  Operand *AllocaSize = Ctx->getConstantInt32(TotalSize);
-  switch (BaseVariableType) {
-  case BVT_FramePointer: {
-    // Adjust the return of the alloca to the top of the returned region.
-    AllocaDest = makeVariable(IceType_i32);
-    InstArithmetic *Add = InstArithmetic::create(
-        this, InstArithmetic::Add, BaseVariable, AllocaDest, AllocaSize);
-    Insts.push_front(Add);
+    // Allocate the fixed area in the function prolog.
+    getTarget()->reserveFixedAllocaArea(TotalSize, CombinedAlignment);
   } break;
-  case BVT_StackPointer: {
-    // Emit a fake use to keep the Alloca live.
-    InstFakeUse *Use = InstFakeUse::create(this, AllocaDest);
-    Insts.push_front(Use);
-  } break;
-  case BVT_UserPointer:
-    break;
   }
-  // And insert the fused alloca.
-  InstAlloca *CombinedAlloca =
-      InstAlloca::create(this, AllocaSize, CombinedAlignment, AllocaDest);
-  CombinedAlloca->setKnownFrameOffset();
-  Insts.push_front(CombinedAlloca);
 }
 
 void Cfg::processAllocas(bool SortAndCombine) {
@@ -595,7 +586,7 @@
   // Allocas in the entry block that have constant size and alignment greater
   // than the function's stack alignment.
   CfgVector<Inst *> AlignedAllocas;
-  // Maximum alignment used for the dynamic/aligned allocas.
+  // Maximum alignment used by any alloca.
   uint32_t MaxAlignment = StackAlignment;
   for (Inst &Instr : EntryNode->getInsts()) {
     if (auto *Alloca = llvm::dyn_cast<InstAlloca>(&Instr)) {
@@ -623,14 +614,16 @@
     // do not have a known offset from either the stack or frame pointer.
     // They grow up from a user pointer from an alloca.
     sortAndCombineAllocas(AlignedAllocas, MaxAlignment, Insts, BVT_UserPointer);
+    // Fixed size allocas are addressed relative to the frame pointer.
+    sortAndCombineAllocas(FixedAllocas, StackAlignment, Insts,
+                          BVT_FramePointer);
+  } else {
+    // Otherwise, fixed size allocas are addressed relative to the stack
+    // pointer, or the frame pointer if there are dynamic allocas.
+    const AllocaBaseVariableType BasePointerType =
+        (HasDynamicAllocation ? BVT_FramePointer : BVT_StackPointer);
+    sortAndCombineAllocas(FixedAllocas, MaxAlignment, Insts, BasePointerType);
   }
-  // Otherwise, fixed size allocas are always addressed relative to the stack
-  // unless there are dynamic allocas.
-  // TODO(sehr): re-enable frame pointer and decrementing addressing.
-  AllocaBaseVariableType BasePointerType =
-      (HasDynamicAllocation ? BVT_UserPointer : BVT_StackPointer);
-  sortAndCombineAllocas(FixedAllocas, MaxAlignment, Insts, BasePointerType);
-
   if (!FixedAllocas.empty() || !AlignedAllocas.empty())
     // No use calling findRematerializable() unless there is some
     // rematerializable alloca instruction to seed it.
diff --git a/src/IceConverter.cpp b/src/IceConverter.cpp
index 89fa2e3..34bdcb9 100644
--- a/src/IceConverter.cpp
+++ b/src/IceConverter.cpp
@@ -586,7 +586,7 @@
     uint32_t Align = Inst->getAlignment();
     Ice::Variable *Dest = mapValueToIceVar(Inst, Ice::getPointerType());
 
-    return Ice::InstAlloca::create(Func.get(), ByteCount, Align, Dest);
+    return Ice::InstAlloca::create(Func.get(), Dest, ByteCount, Align);
   }
 
   Ice::Inst *convertUnreachableInstruction(const UnreachableInst * /*Inst*/) {
diff --git a/src/IceInst.cpp b/src/IceInst.cpp
index 96755eb..bd68e68 100644
--- a/src/IceInst.cpp
+++ b/src/IceInst.cpp
@@ -218,8 +218,8 @@
   return true;
 }
 
-InstAlloca::InstAlloca(Cfg *Func, Operand *ByteCount, uint32_t AlignInBytes,
-                       Variable *Dest)
+InstAlloca::InstAlloca(Cfg *Func, Variable *Dest, Operand *ByteCount,
+                       uint32_t AlignInBytes)
     : InstHighLevel(Func, Inst::Alloca, 1, Dest), AlignInBytes(AlignInBytes) {
   // Verify AlignInBytes is 0 or a power of 2.
   assert(AlignInBytes == 0 || llvm::isPowerOf2_32(AlignInBytes));
diff --git a/src/IceInst.h b/src/IceInst.h
index 5e32904..7505924 100644
--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -246,10 +246,10 @@
   InstAlloca &operator=(const InstAlloca &) = delete;
 
 public:
-  static InstAlloca *create(Cfg *Func, Operand *ByteCount,
-                            uint32_t AlignInBytes, Variable *Dest) {
+  static InstAlloca *create(Cfg *Func, Variable *Dest, Operand *ByteCount,
+                            uint32_t AlignInBytes) {
     return new (Func->allocate<InstAlloca>())
-        InstAlloca(Func, ByteCount, AlignInBytes, Dest);
+        InstAlloca(Func, Dest, ByteCount, AlignInBytes);
   }
   uint32_t getAlignInBytes() const { return AlignInBytes; }
   Operand *getSizeInBytes() const { return getSrc(0); }
@@ -259,8 +259,8 @@
   static bool classof(const Inst *Inst) { return Inst->getKind() == Alloca; }
 
 private:
-  InstAlloca(Cfg *Func, Operand *ByteCount, uint32_t AlignInBytes,
-             Variable *Dest);
+  InstAlloca(Cfg *Func, Variable *Dest, Operand *ByteCount,
+             uint32_t AlignInBytes);
 
   const uint32_t AlignInBytes;
   bool KnownFrameOffset = false;
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index 2c2f09c..07d7e54 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -98,17 +98,34 @@
   }
 }
 
+namespace {
+static int32_t GetRematerializableOffset(Variable *Var, bool IgnoreStackAdjust,
+                                         const Ice::TargetX8632 *Target) {
+  int32_t Disp = 0;
+  Disp += Var->getStackOffset();
+  SizeT RegNum = static_cast<SizeT>(Var->getRegNum());
+  if (RegNum == Target->getStackReg()) {
+    if (!IgnoreStackAdjust)
+      Disp += Target->getStackAdjustment();
+  } else if (RegNum == Target->getFrameReg()) {
+    Disp += Target->getFrameFixedAllocaOffset();
+  } else {
+    llvm::report_fatal_error("Unexpected rematerializable register type");
+  }
+  return Disp;
+}
+} // end of anonymous namespace
+
 void MachineTraits<TargetX8632>::X86OperandMem::emit(const Cfg *Func) const {
   if (!BuildDefs::dump())
     return;
-  const ::Ice::TargetLowering *Target = Func->getTarget();
+  const auto *Target = static_cast<const Ice::TargetX8632 *>(Func->getTarget());
   // If the base is rematerializable, we need to replace it with the correct
   // physical register (esp or ebp), and update the Offset.
   int32_t Disp = 0;
   if (getBase() && getBase()->isRematerializable()) {
-    Disp += getBase()->getStackOffset();
-    if (!getIgnoreStackAdjust())
-      Disp += Target->getStackAdjustment();
+    Disp +=
+        GetRematerializableOffset(getBase(), getIgnoreStackAdjust(), Target);
   }
   // The index should never be rematerializable.  But if we ever allow it, then
   // we should make sure the rematerialization offset is shifted by the Shift
@@ -135,7 +152,7 @@
     // TODO(sehr): ConstantRelocatable still needs updating for
     // rematerializable base/index and Disp.
     assert(Disp == 0);
-    CR->emitWithoutPrefix(Func->getTarget());
+    CR->emitWithoutPrefix(Target);
   } else {
     llvm_unreachable("Invalid offset type for x86 mem operand");
   }
@@ -165,10 +182,10 @@
   bool Dumped = false;
   Str << "[";
   int32_t Disp = 0;
+  const auto *Target = static_cast<const Ice::TargetX8632 *>(Func->getTarget());
   if (getBase() && getBase()->isRematerializable()) {
-    Disp += getBase()->getStackOffset();
-    if (!getIgnoreStackAdjust())
-      Disp += Func->getTarget()->getStackAdjustment();
+    Disp +=
+        GetRematerializableOffset(getBase(), getIgnoreStackAdjust(), Target);
   }
   if (getBase()) {
     if (Func)
@@ -230,13 +247,12 @@
 MachineTraits<TargetX8632>::Address
 MachineTraits<TargetX8632>::X86OperandMem::toAsmAddress(
     MachineTraits<TargetX8632>::Assembler *Asm,
-    const Ice::TargetLowering *Target) const {
+    const Ice::TargetLowering *TargetLowering) const {
   int32_t Disp = 0;
+  const auto *Target = static_cast<const Ice::TargetX8632 *>(TargetLowering);
   if (getBase() && getBase()->isRematerializable()) {
-    Disp += getBase()->getStackOffset();
-    if (!getIgnoreStackAdjust()) {
-      Disp += Target->getStackAdjustment();
-    }
+    Disp +=
+        GetRematerializableOffset(getBase(), getIgnoreStackAdjust(), Target);
   }
   // The index should never be rematerializable.  But if we ever allow it, then
   // we should make sure the rematerialization offset is shifted by the Shift
diff --git a/src/IceOperand.cpp b/src/IceOperand.cpp
index e66a995..bb16967 100644
--- a/src/IceOperand.cpp
+++ b/src/IceOperand.cpp
@@ -487,7 +487,8 @@
   Target->emit(this);
 }
 
-void ConstantRelocatable::emitWithoutPrefix(TargetLowering *Target) const {
+void ConstantRelocatable::emitWithoutPrefix(
+    const TargetLowering *Target) const {
   Target->emitWithoutPrefix(this);
 }
 
diff --git a/src/IceOperand.h b/src/IceOperand.h
index 9d9f497..0cde5b2 100644
--- a/src/IceOperand.h
+++ b/src/IceOperand.h
@@ -290,7 +290,7 @@
   bool getSuppressMangling() const { return SuppressMangling; }
   using Constant::emit;
   void emit(TargetLowering *Target) const final;
-  void emitWithoutPrefix(TargetLowering *Target) const;
+  void emitWithoutPrefix(const TargetLowering *Target) const;
   using Constant::dump;
   void dump(const Cfg *Func, Ostream &Str) const override;
 
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index 94821ce..92223c9 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -368,7 +368,13 @@
   VarList SpilledVariables;
   for (Variable *Var : Variables) {
     if (Var->hasReg()) {
-      RegsUsed[Var->getRegNum()] = true;
+      // Don't consider a rematerializable variable to be an actual register use
+      // (specifically of the frame pointer).  Otherwise, the prolog may decide
+      // to save the frame pointer twice - once because of the explicit need for
+      // a frame pointer, and once because of an active use of a callee-save
+      // register.
+      if (!Var->isRematerializable())
+        RegsUsed[Var->getRegNum()] = true;
       continue;
     }
     // An argument either does not need a stack slot (if passed in a register)
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index aff423c..c613d99 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -203,9 +203,12 @@
   virtual bool hasFramePointer() const { return false; }
   virtual void setHasFramePointer() = 0;
   virtual SizeT getStackReg() const = 0;
+  virtual SizeT getFrameReg() const = 0;
   virtual SizeT getFrameOrStackReg() const = 0;
   virtual size_t typeWidthInBytesOnStack(Type Ty) const = 0;
   virtual uint32_t getStackAlignment() const = 0;
+  virtual void reserveFixedAllocaArea(size_t Size, size_t Align) = 0;
+  virtual int32_t getFrameFixedAllocaOffset() const = 0;
 
   /// Return whether a 64-bit Variable should be split into a Variable64On32.
   virtual bool shouldSplitToVariable64On32(Type Ty) const = 0;
diff --git a/src/IceTargetLoweringARM32.h b/src/IceTargetLoweringARM32.h
index 969e17a..20f3597 100644
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -86,8 +86,9 @@
   bool hasFramePointer() const override { return UsesFramePointer; }
   void setHasFramePointer() override { UsesFramePointer = true; }
   SizeT getStackReg() const override { return RegARM32::Reg_sp; }
+  SizeT getFrameReg() const override { return RegARM32::Reg_fp; }
   SizeT getFrameOrStackReg() const override {
-    return UsesFramePointer ? RegARM32::Reg_fp : RegARM32::Reg_sp;
+    return UsesFramePointer ? getFrameReg() : getStackReg();
   }
   SizeT getReservedTmpReg() const { return RegARM32::Reg_ip; }
 
@@ -97,6 +98,17 @@
     return (typeWidthInBytes(Ty) + 3) & ~3;
   }
   uint32_t getStackAlignment() const override;
+  void reserveFixedAllocaArea(size_t Size, size_t Align) override {
+    // TODO(sehr,jpp): Implement fixed stack layout.
+    (void)Size;
+    (void)Align;
+    llvm::report_fatal_error("Not yet implemented");
+  }
+  int32_t getFrameFixedAllocaOffset() const override {
+    // TODO(sehr,jpp): Implement fixed stack layout.
+    llvm::report_fatal_error("Not yet implemented");
+    return 0;
+  }
 
   bool shouldSplitToVariable64On32(Type Ty) const override {
     return Ty == IceType_i64;
diff --git a/src/IceTargetLoweringMIPS32.h b/src/IceTargetLoweringMIPS32.h
index c01c6c2..1f91eee 100644
--- a/src/IceTargetLoweringMIPS32.h
+++ b/src/IceTargetLoweringMIPS32.h
@@ -54,8 +54,9 @@
   bool hasFramePointer() const override { return UsesFramePointer; }
   void setHasFramePointer() override { UsesFramePointer = true; }
   SizeT getStackReg() const override { return RegMIPS32::Reg_SP; }
+  SizeT getFrameReg() const override { return RegMIPS32::Reg_FP; }
   SizeT getFrameOrStackReg() const override {
-    return UsesFramePointer ? RegMIPS32::Reg_FP : RegMIPS32::Reg_SP;
+    return UsesFramePointer ? getFrameReg() : getStackReg();
   }
   size_t typeWidthInBytesOnStack(Type Ty) const override {
     // Round up to the next multiple of 4 bytes. In particular, i1, i8, and i16
@@ -66,6 +67,17 @@
     // TODO(sehr): what is the stack alignment?
     return 1;
   }
+  void reserveFixedAllocaArea(size_t Size, size_t Align) override {
+    // TODO(sehr): Implement fixed stack layout.
+    (void)Size;
+    (void)Align;
+    llvm::report_fatal_error("Not yet implemented");
+  }
+  int32_t getFrameFixedAllocaOffset() const override {
+    // TODO(sehr): Implement fixed stack layout.
+    llvm::report_fatal_error("Not yet implemented");
+    return 0;
+  }
 
   bool shouldSplitToVariable64On32(Type Ty) const override {
     return Ty == IceType_i64;
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 27f9ae0..e8a86d8 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -506,20 +506,36 @@
     SpillAreaSizeBytes = StackSize - StackOffset;
   }
 
-  // Generate "sub esp, SpillAreaSizeBytes"
-  if (SpillAreaSizeBytes)
+  // Combine fixed allocations into SpillAreaSizeBytes if we are emitting the
+  // fixed allocations in the prolog.
+  if (PrologEmitsFixedAllocas)
+    SpillAreaSizeBytes += FixedAllocaSizeBytes;
+  if (SpillAreaSizeBytes) {
+    // Generate "sub esp, SpillAreaSizeBytes"
     _sub(getPhysicalRegister(Traits::RegisterSet::Reg_esp),
          Ctx->getConstantInt32(SpillAreaSizeBytes));
+    // If the fixed allocas are aligned more than the stack frame, align the
+    // stack pointer accordingly.
+    if (PrologEmitsFixedAllocas &&
+        FixedAllocaAlignBytes > Traits::X86_STACK_ALIGNMENT_BYTES) {
+      assert(IsEbpBasedFrame);
+      _and(getPhysicalRegister(Traits::RegisterSet::Reg_esp),
+           Ctx->getConstantInt32(-FixedAllocaAlignBytes));
+    }
+  }
 
-  // Account for alloca instructions with known frame offsets.
-  SpillAreaSizeBytes += FixedAllocaSizeBytes;
+  // Account for known-frame-offset alloca instructions that were not already
+  // combined into the prolog.
+  if (!PrologEmitsFixedAllocas)
+    SpillAreaSizeBytes += FixedAllocaSizeBytes;
 
   Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);
 
   // Initialize the stack adjustment so that after all the known-frame-offset
   // alloca instructions are emitted, the stack adjustment will reach zero.
   resetStackAdjustment();
-  updateStackAdjustment(-FixedAllocaSizeBytes);
+  if (!PrologEmitsFixedAllocas)
+    updateStackAdjustment(-FixedAllocaSizeBytes);
 
   // Fill in stack offsets for stack args, and copy args into registers for
   // those that were register-allocated. Args are pushed right to left, so
@@ -539,11 +555,14 @@
       ++NumXmmArgs;
       continue;
     }
-    // For esp-based frames, the esp value may not stabilize to its home value
-    // until after all the fixed-size alloca instructions have executed.  In
-    // this case, a stack adjustment is needed when accessing in-args in order
-    // to copy them into registers.
-    size_t StackAdjBytes = IsEbpBasedFrame ? 0 : -FixedAllocaSizeBytes;
+    // For esp-based frames where the allocas are done outside the prolog, the
+    // esp value may not stabilize to its home value until after all the
+    // fixed-size alloca instructions have executed.  In this case, a stack
+    // adjustment is needed when accessing in-args in order to copy them into
+    // registers.
+    size_t StackAdjBytes = 0;
+    if (!IsEbpBasedFrame && !PrologEmitsFixedAllocas)
+      StackAdjBytes -= FixedAllocaSizeBytes;
     finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, StackAdjBytes,
                            InArgsSizeBytes);
   }
diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h
index d23e35b..90bbed2 100644
--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -89,9 +89,9 @@
   bool hasFramePointer() const override { return IsEbpBasedFrame; }
   void setHasFramePointer() override { IsEbpBasedFrame = true; }
   SizeT getStackReg() const override { return Traits::RegisterSet::Reg_esp; }
+  SizeT getFrameReg() const override { return Traits::RegisterSet::Reg_ebp; }
   SizeT getFrameOrStackReg() const override {
-    return IsEbpBasedFrame ? Traits::RegisterSet::Reg_ebp
-                           : Traits::RegisterSet::Reg_esp;
+    return IsEbpBasedFrame ? getFrameReg() : getStackReg();
   }
   size_t typeWidthInBytesOnStack(Type Ty) const override {
     // Round up to the next multiple of WordType bytes.
@@ -101,6 +101,16 @@
   uint32_t getStackAlignment() const override {
     return Traits::X86_STACK_ALIGNMENT_BYTES;
   }
+  void reserveFixedAllocaArea(size_t Size, size_t Align) override {
+    FixedAllocaSizeBytes = Size;
+    assert(llvm::isPowerOf2_32(Align));
+    FixedAllocaAlignBytes = Align;
+    PrologEmitsFixedAllocas = true;
+  }
+  /// Returns the (negative) offset from ebp/rbp where the fixed Allocas start.
+  int32_t getFrameFixedAllocaOffset() const override {
+    return FixedAllocaSizeBytes - SpillAreaSizeBytes;
+  }
 
   bool shouldSplitToVariable64On32(Type Ty) const override {
     return Traits::Is64Bit ? false : Ty == IceType_i64;
@@ -691,6 +701,8 @@
   bool NeedsStackAlignment = false;
   size_t SpillAreaSizeBytes = 0;
   size_t FixedAllocaSizeBytes = 0;
+  size_t FixedAllocaAlignBytes = 0;
+  bool PrologEmitsFixedAllocas = false;
   static std::array<llvm::SmallBitVector, RCX86_NUM> TypeToRegisterSet;
   static std::array<llvm::SmallBitVector, Traits::RegisterSet::Reg_NUM>
       RegisterAliases;
diff --git a/src/PNaClTranslator.cpp b/src/PNaClTranslator.cpp
index 755971d..cff46e7 100644
--- a/src/PNaClTranslator.cpp
+++ b/src/PNaClTranslator.cpp
@@ -2626,7 +2626,7 @@
       return;
     }
     CurrentNode->appendInst(Ice::InstAlloca::create(
-        Func.get(), ByteCount, Alignment, getNextInstVar(PtrTy)));
+        Func.get(), getNextInstVar(PtrTy), ByteCount, Alignment));
     return;
   }
   case naclbitc::FUNC_CODE_INST_LOAD: {
diff --git a/tests_lit/llvm2ice_tests/alloc.ll b/tests_lit/llvm2ice_tests/alloc.ll
index 53e1842..7b08afb 100644
--- a/tests_lit/llvm2ice_tests/alloc.ll
+++ b/tests_lit/llvm2ice_tests/alloc.ll
@@ -6,7 +6,8 @@
 
 ; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
 ; RUN:   --target x8632 -i %s --args -Om1 -allow-externally-defined-symbols \
-; RUN:   | %if --need=target_X8632 --command FileCheck %s
+; RUN:   | %if --need=target_X8632 --command FileCheck \
+; RUN:   --check-prefix CHECK-OPTM1 %s
 
 ; TODO(jvoung): Stop skipping unimplemented parts (via --skip-unimplemented)
 ; once enough infrastructure is in. Also, switch to --filetype=obj
@@ -33,11 +34,18 @@
   ret void
 }
 ; CHECK-LABEL: fixed_416_align_16
-; CHECK:      sub     esp,0x1a0
+; CHECK:      sub     esp,0x1ac
 ; CHECK:      sub     esp,0x10
 ; CHECK:      mov     DWORD PTR [esp],eax
 ; CHECK:      call {{.*}} R_{{.*}}    f1
 
+; CHECK-OPTM1-LABEL: fixed_416_align_16
+; CHECK-OPTM1:      sub     esp,0xc
+; CHECK-OPTM1:      sub     esp,0x1a0
+; CHECK-OPTM1:      sub     esp,0x10
+; CHECK-OPTM1:      mov     DWORD PTR [esp],eax
+; CHECK-OPTM1:      call {{.*}} R_{{.*}}    f1
+
 ; ARM32-LABEL: fixed_416_align_16
 ; ARM32:      sub sp, sp, #416
 ; ARM32:      bl {{.*}} R_{{.*}}    f1
@@ -52,8 +60,8 @@
 ; CHECK-LABEL: fixed_416_align_32
 ; CHECK:      push    ebp
 ; CHECK-NEXT: mov     ebp,esp
+; CHECK:      sub     esp,0x1a8
 ; CHECK:      and     esp,0xffffffe0
-; CHECK:      sub     esp,0x1a0
 ; CHECK:      sub     esp,0x10
 ; CHECK:      mov     DWORD PTR [esp],eax
 ; CHECK:      call {{.*}} R_{{.*}}    f1
@@ -72,11 +80,16 @@
   ret void
 }
 ; CHECK-LABEL: fixed_351_align_16
-; CHECK:      sub     esp,0x160
-; CHECK:      sub     esp,0x10
+; CHECK:      sub     esp,0x16c
 ; CHECK:      mov     DWORD PTR [esp],eax
 ; CHECK:      call {{.*}} R_{{.*}}    f1
 
+; CHECK-OPTM1-LABEL: fixed_351_align_16
+; CHECK-OPTM1:      sub     esp,0xc
+; CHECK-OPTM1:      sub     esp,0x160
+; CHECK-OPTM1:      mov     DWORD PTR [esp],eax
+; CHECK-OPTM1:      call {{.*}} R_{{.*}}    f1
+
 ; ARM32-LABEL: fixed_351_align_16
 ; ARM32:      sub sp, sp, #352
 ; ARM32:      bl {{.*}} R_{{.*}}    f1
@@ -91,8 +104,8 @@
 ; CHECK-LABEL: fixed_351_align_32
 ; CHECK:      push    ebp
 ; CHECK-NEXT: mov     ebp,esp
+; CHECK:      sub     esp,0x168
 ; CHECK:      and     esp,0xffffffe0
-; CHECK:      sub     esp,0x160
 ; CHECK:      sub     esp,0x10
 ; CHECK:      mov     DWORD PTR [esp],eax
 ; CHECK:      call {{.*}} R_{{.*}}    f1
diff --git a/tests_lit/llvm2ice_tests/fused-alloca-arg.ll b/tests_lit/llvm2ice_tests/fused-alloca-arg.ll
index f1e061c..5d9ec94 100644
--- a/tests_lit/llvm2ice_tests/fused-alloca-arg.ll
+++ b/tests_lit/llvm2ice_tests/fused-alloca-arg.ll
@@ -17,10 +17,8 @@
 }
 
 ; CHECK-LABEL:  caller1
-; CHECK-NEXT:   sub    esp,0xc
-; CHECK-NEXT:   mov    eax,DWORD PTR [esp+0x10]
-; CHECK-NEXT:   sub    esp,0x20
-; CHECK-NEXT:   mov    ecx,esp
+; CHECK-NEXT:   sub    esp,0x2c
+; CHECK-NEXT:   mov    eax,DWORD PTR [esp+0x30]
 ; CHECK-NEXT:   mov    DWORD PTR [esp],eax
 ; CHECK-NEXT:   sub    esp,0x20
 ; CHECK-NEXT:   mov    DWORD PTR [esp],eax
@@ -51,10 +49,8 @@
 }
 
 ; CHECK-LABEL:  caller2
-; CHECK-NEXT:   sub    esp,0xc
-; CHECK-NEXT:   mov    eax,DWORD PTR [esp+0x10]
-; CHECK-NEXT:   sub    esp,0x40
-; CHECK-NEXT:   mov    ecx,esp
+; CHECK-NEXT:   sub    esp,0x4c
+; CHECK-NEXT:   mov    eax,DWORD PTR [esp+0x50]
 ; CHECK-NEXT:   mov    DWORD PTR [esp],eax
 ; CHECK-NEXT:   mov    DWORD PTR [esp+0x20],eax
 ; CHECK-NEXT:   sub    esp,0x20
diff --git a/tests_lit/llvm2ice_tests/fused-alloca.ll b/tests_lit/llvm2ice_tests/fused-alloca.ll
index bf80fcd..8daec9c 100644
--- a/tests_lit/llvm2ice_tests/fused-alloca.ll
+++ b/tests_lit/llvm2ice_tests/fused-alloca.ll
@@ -19,14 +19,12 @@
   ret void
 }
 ; CHECK-LABEL: fused_small_align
-; CHECK-NEXT: sub    esp,0xc
-; CHECK-NEXT: mov    eax,DWORD PTR [esp+0x10]
 ; CHECK-NEXT: sub    esp,0x30
-; CHECK-NEXT: mov    {{.*}},esp
+; CHECK-NEXT: mov    eax,DWORD PTR [esp+0x34]
 ; CHECK-NEXT: mov    DWORD PTR [esp+0x10],eax
 ; CHECK-NEXT: mov    DWORD PTR [esp+0x18],eax
 ; CHECK-NEXT: mov    DWORD PTR [esp],eax
-; CHECK-NEXT: add    esp,0x3c
+; CHECK-NEXT: add    esp,0x30
 
 ; Test that a sequence of allocas with greater than stack alignment get fused.
 define internal void @fused_large_align(i32 %arg) {
@@ -45,11 +43,9 @@
 ; CHECK-LABEL: fused_large_align
 ; CHECK-NEXT: push   ebp
 ; CHECK-NEXT: mov    ebp,esp
-; CHECK-NEXT: sub    esp,0x8
-; CHECK-NEXT: mov    eax,DWORD PTR [ebp+0x8]
-; CHECK-NEXT: and    esp,0xffffffc0
 ; CHECK-NEXT: sub    esp,0x80
-; CHECK-NEXT: mov    ecx,esp
+; CHECK-NEXT: and    esp,0xffffffc0
+; CHECK-NEXT: mov    eax,DWORD PTR [ebp+0x8]
 ; CHECK-NEXT: mov    DWORD PTR [esp+0x40],eax
 ; CHECK-NEXT: mov    DWORD PTR [esp],eax
 ; CHECK-NEXT: mov    DWORD PTR [esp+0x60],eax
@@ -80,13 +76,88 @@
   br label %block1
 }
 ; CHECK-LABEL: fused_derived
-; CHECK-NEXT: sub    esp,0xc
-; CHECK-NEXT: mov    [[ARG:e..]],DWORD PTR [esp+0x10]
 ; CHECK-NEXT: sub    esp,0x180
-; CHECK-NEXT: mov    {{.*}},esp
+; CHECK-NEXT: mov    [[ARG:e..]],DWORD PTR [esp+0x184]
 ; CHECK-NEXT: jmp
 ; CHECK-NEXT: mov    DWORD PTR [esp+0x80],[[ARG]]
 ; CHECK-NEXT: mov    DWORD PTR [esp+0x8c],[[ARG]]
 ; CHECK-NEXT: lea    eax,[esp+0x81]
-; CHECK-NEXT: add    esp,0x18c
+; CHECK-NEXT: add    esp,0x180
 ; CHECK-NEXT: ret
+
+; Test that a fixed alloca gets referenced by the frame pointer.
+define internal void @fused_small_align_with_dynamic(i32 %arg) {
+entry:
+  %a1 = alloca i8, i32 8, align 16
+  br label %next
+next:
+  %a2 = alloca i8, i32 12, align 1
+  %a3 = alloca i8, i32 16, align 1
+  %p1 = bitcast i8* %a1 to i32*
+  %p2 = bitcast i8* %a2 to i32*
+  %p3 = bitcast i8* %a3 to i32*
+  store i32 %arg, i32* %p1, align 1
+  store i32 %arg, i32* %p2, align 1
+  store i32 %arg, i32* %p3, align 1
+  ret void
+}
+; CHECK-LABEL: fused_small_align_with_dynamic
+; CHECK-NEXT: push   ebp
+; CHECK-NEXT: mov    ebp,esp
+; CHECK-NEXT: sub    esp,0x18
+; CHECK-NEXT: mov    eax,DWORD PTR [ebp+0x8]
+; CHECK-NEXT: sub    esp,0x10
+; CHECK-NEXT: mov    ecx,esp
+; CHECK-NEXT: sub    esp,0x10
+; CHECK-NEXT: mov    edx,esp
+; CHECK-NEXT: mov    DWORD PTR [ebp-0x18],eax
+; CHECK-NEXT: mov    DWORD PTR [ecx],eax
+; CHECK-NEXT: mov    DWORD PTR [edx],eax
+; CHECK-NEXT: mov    esp,ebp
+; CHECK-NEXT: pop    ebp
+
+; Test that a sequence with greater than stack alignment and dynamic size
+; gets folded and referenced correctly.
+
+define internal void @fused_large_align_with_dynamic(i32 %arg) {
+entry:
+  %a1 = alloca i8, i32 8, align 32
+  %a2 = alloca i8, i32 12, align 32
+  %a3 = alloca i8, i32 16, align 1
+  %a4 = alloca i8, i32 16, align 1
+  br label %next
+next:
+  %a5 = alloca i8, i32 16, align 1
+  %p1 = bitcast i8* %a1 to i32*
+  %p2 = bitcast i8* %a2 to i32*
+  %p3 = bitcast i8* %a3 to i32*
+  %p4 = bitcast i8* %a4 to i32*
+  %p5 = bitcast i8* %a5 to i32*
+  store i32 %arg, i32* %p1, align 1
+  store i32 %arg, i32* %p2, align 1
+  store i32 %arg, i32* %p3, align 1
+  store i32 %arg, i32* %p4, align 1
+  store i32 %arg, i32* %p5, align 1
+  ret void
+}
+; CHECK-LABEL: fused_large_align_with_dynamic
+; CHECK-NEXT: push   ebx
+; CHECK-NEXT: push   ebp
+; CHECK-NEXT: mov    ebp,esp
+; CHECK-NEXT: sub    esp,0x64
+; CHECK-NEXT: mov    eax,DWORD PTR [ebp+0xc]
+; CHECK-NEXT: and    esp,0xffffffe0
+; CHECK-NEXT: sub    esp,0x40
+; CHECK-NEXT: mov    ecx,esp
+; CHECK-NEXT: mov    edx,ecx
+; CHECK-NEXT: add    edx,0x20
+; CHECK-NEXT: add    ecx,0x0
+; CHECK-NEXT: sub    esp,0x10
+; CHECK-NEXT: mov    ebx,esp
+; CHECK-NEXT: mov    DWORD PTR [ecx],eax
+; CHECK-NEXT: mov    DWORD PTR [edx],eax
+; CHECK-NEXT: mov    DWORD PTR [ebp-0x14],eax
+; CHECK-NEXT: mov    DWORD PTR [ebp-0x24],eax
+; CHECK-NEXT: mov    DWORD PTR [ebx],eax
+; CHECK-NEXT: mov    esp,ebp
+; CHECK-NEXT: pop    ebp