Sort allocas, compute frame pointer in Cfg pass

Split allocas in the entry block into two categories.  The first has alignment <= stack alignment and constant size.  The second violates one or both of those conditions.  Sort both of these lists in descending alignment order and emit.  Also, compute the need for a frame pointer during the pass.

BUG=
R=jpp@chromium.org, stichnot@chromium.org

Review URL: https://codereview.chromium.org/1414343010 .
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index 57b1e73..9eb934b 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -201,26 +201,7 @@
     if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Var))
       Var64On32->initHiLo(this);
 
-  // Figure out which alloca instructions result in storage at known stack frame
-  // offsets.  If this is true for all alloca instructions, then a stack pointer
-  // can still be used instead of a frame pointer, freeing up the frame pointer
-  // for normal register allocation.  Additionally, for each such alloca, its
-  // address could be rematerialized at each use in terms of the stack/frame
-  // pointer, saving a stack slot and a load from that stack slot.
-  //
-  // This simple implementation is limited to alloca instructions at the start
-  // of the entry node.
-  for (Inst &Instr : getEntryNode()->getInsts()) {
-    if (auto *Alloca = llvm::dyn_cast<InstAlloca>(&Instr)) {
-      if (llvm::isa<Constant>(Alloca->getSizeInBytes())) {
-        Alloca->setKnownFrameOffset();
-        continue;
-      }
-    }
-    // The first instruction that is not an alloca with a constant size stops
-    // the search.
-    break;
-  }
+  processAllocas();
 
   // The set of translation passes and their order are determined by the
   // target.
@@ -470,6 +451,91 @@
   getTarget()->lowerArguments();
 }
 
+void Cfg::sortAllocas(CfgVector<Inst *> &Allocas, InstList &Insts,
+                      bool IsKnownFrameOffset) {
+  if (Allocas.empty())
+    return;
+  // Sort by decreasing alignment.  This does not really matter at the moment,
+  // but will allow compacting stack allocation when we fuse to one alloca.
+  std::sort(Allocas.begin(), Allocas.end(),
+            [](Inst *I1, Inst *I2) {
+              auto *A1 = llvm::dyn_cast<InstAlloca>(I1);
+              auto *A2 = llvm::dyn_cast<InstAlloca>(I2);
+              return A1->getAlignInBytes() > A2->getAlignInBytes();
+            });
+  for (Inst *Instr: Allocas) {
+    auto *Alloca = llvm::cast<InstAlloca>(Instr);
+    // Move the alloca to its sorted position.
+    InstAlloca *NewAlloca = InstAlloca::create(this,
+                                               Alloca->getSizeInBytes(),
+                                               Alloca->getAlignInBytes(),
+                                               Alloca->getDest());
+    if (IsKnownFrameOffset)
+      NewAlloca->setKnownFrameOffset();
+    Insts.push_front(NewAlloca);
+    Alloca->setDeleted();
+  }
+}
+
+void Cfg::processAllocas() {
+  const uint32_t StackAlignment = getTarget()->getStackAlignment();
+  CfgNode *EntryNode = getEntryNode();
+  // Allocas in the entry block that have constant size and alignment less
+  // than or equal to the function's stack alignment.
+  CfgVector<Inst *> FixedAllocas;
+  // Allocas in the entry block that have constant size and alignment greater
+  // than the function's stack alignment.
+  CfgVector<Inst *> AlignedAllocas;
+  // LLVM enforces power of 2 alignment.
+  assert(llvm::isPowerOf2_32(StackAlignment));
+  // Collect the Allocas into the two vectors.
+  bool RequiresFramePointer = false;
+  for (Inst &Instr : EntryNode->getInsts()) {
+    if (auto *Alloca = llvm::dyn_cast<InstAlloca>(&Instr)) {
+      if (!llvm::isa<Constant>(Alloca->getSizeInBytes())) {
+        // Variable-sized allocations require a frame pointer.
+        RequiresFramePointer = true;
+        continue;
+      }
+      uint32_t AlignmentParam = Alloca->getAlignInBytes();
+      // For default align=0, set it to the real value 1, to avoid any
+      // bit-manipulation problems below.
+      AlignmentParam = std::max(AlignmentParam, 1u);
+      assert(llvm::isPowerOf2_32(AlignmentParam));
+      if (AlignmentParam > StackAlignment) {
+        // Allocations aligned more than the stack require a frame pointer.
+        RequiresFramePointer = true;
+        AlignedAllocas.push_back(Alloca);
+      }
+      else
+        FixedAllocas.push_back(Alloca);
+    }
+  }
+  // Look for alloca instructions in other blocks
+  for (CfgNode *Node : Nodes) {
+    if (Node == EntryNode)
+      continue;
+    for (Inst &Instr : Node->getInsts()) {
+      if (llvm::isa<InstAlloca>(&Instr)) {
+        // Allocations outside the entry block require a frame pointer.
+        RequiresFramePointer = true;
+        break;
+      }
+    }
+    if (RequiresFramePointer)
+      break;
+  }
+  // Mark the target as requiring a frame pointer.
+  if (RequiresFramePointer)
+    getTarget()->setHasFramePointer();
+  // Add instructions to the head of the entry block in reverse order.
+  InstList &Insts = getEntryNode()->getInsts();
+  // Fixed, large alignment alloca addresses do not have known offset.
+  sortAllocas(AlignedAllocas, Insts, false);
+  // Fixed, small alignment alloca addresses have known offset.
+  sortAllocas(FixedAllocas, Insts, true);
+}
+
 void Cfg::doAddressOpt() {
   TimerMarker T(TimerStack::TT_doAddressOpt, this);
   for (CfgNode *Node : Nodes)
diff --git a/src/IceCfg.h b/src/IceCfg.h
index 6a1a1c8..37c6eb7 100644
--- a/src/IceCfg.h
+++ b/src/IceCfg.h
@@ -185,6 +185,10 @@
   void advancedPhiLowering();
   void reorderNodes();
   void shuffleNodes();
+  void sortAllocas(CfgVector<Inst *> &Allocas, InstList &Insts,
+                   bool IsKnownFrameOffset);
+  /// Merge all the fixed-size allocas in the entry block.
+  void processAllocas();
   void doAddressOpt();
   void doArgLowering();
   void doNopInsertion();
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index 605562d..f518225 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -201,9 +201,11 @@
   virtual IceString getRegName(SizeT RegNum, Type Ty) const = 0;
 
   virtual bool hasFramePointer() const { return false; }
+  virtual void setHasFramePointer() = 0;
   virtual SizeT getStackReg() const = 0;
   virtual SizeT getFrameOrStackReg() const = 0;
   virtual size_t typeWidthInBytesOnStack(Type Ty) const = 0;
+  virtual uint32_t getStackAlignment() const = 0;
 
   /// Return whether a 64-bit Variable should be split into a Variable64On32.
   virtual bool shouldSplitToVariable64On32(Type Ty) const = 0;
diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp
index 0761f02..31ef0c0 100644
--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -380,6 +380,10 @@
   }
 }
 
+uint32_t TargetARM32::getStackAlignment() const {
+  return ARM32_STACK_ALIGNMENT_BYTES;
+}
+
 bool TargetARM32::doBranchOpt(Inst *I, const CfgNode *NextNode) {
   if (InstARM32Br *Br = llvm::dyn_cast<InstARM32Br>(I)) {
     return Br->optimizeBranch(NextNode);
diff --git a/src/IceTargetLoweringARM32.h b/src/IceTargetLoweringARM32.h
index 922f68c..62a903d 100644
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -81,6 +81,7 @@
     return RegisterAliases[Reg];
   }
   bool hasFramePointer() const override { return UsesFramePointer; }
+  void setHasFramePointer() override { UsesFramePointer = true; }
   SizeT getStackReg() const override { return RegARM32::Reg_sp; }
   SizeT getFrameOrStackReg() const override {
     return UsesFramePointer ? RegARM32::Reg_fp : RegARM32::Reg_sp;
@@ -92,6 +93,7 @@
     // are rounded up to 4 bytes.
     return (typeWidthInBytes(Ty) + 3) & ~3;
   }
+  uint32_t getStackAlignment() const override;
 
   bool shouldSplitToVariable64On32(Type Ty) const override {
     return Ty == IceType_i64;
diff --git a/src/IceTargetLoweringMIPS32.h b/src/IceTargetLoweringMIPS32.h
index 5ac0976..3cd1687 100644
--- a/src/IceTargetLoweringMIPS32.h
+++ b/src/IceTargetLoweringMIPS32.h
@@ -49,6 +49,7 @@
     return RegisterAliases[Reg];
   }
   bool hasFramePointer() const override { return UsesFramePointer; }
+  void setHasFramePointer() override { UsesFramePointer = true; }
   SizeT getStackReg() const override { return RegMIPS32::Reg_SP; }
   SizeT getFrameOrStackReg() const override {
     return UsesFramePointer ? RegMIPS32::Reg_FP : RegMIPS32::Reg_SP;
@@ -58,6 +59,10 @@
     // are rounded up to 4 bytes.
     return (typeWidthInBytes(Ty) + 3) & ~3;
   }
+  uint32_t getStackAlignment() const override {
+    // TODO(sehr): what is the stack alignment?
+    return 1;
+  }
 
   bool shouldSplitToVariable64On32(Type Ty) const override {
     return Ty == IceType_i64;
diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h
index 7380d56..1b52aad 100644
--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -83,6 +83,7 @@
   }
 
   bool hasFramePointer() const override { return IsEbpBasedFrame; }
+  void setHasFramePointer() override { IsEbpBasedFrame = true; }
   SizeT getStackReg() const override { return Traits::RegisterSet::Reg_esp; }
   SizeT getFrameOrStackReg() const override {
     return IsEbpBasedFrame ? Traits::RegisterSet::Reg_ebp
@@ -93,6 +94,9 @@
     const uint32_t WordSizeInBytes = typeWidthInBytes(Traits::WordType);
     return Utils::applyAlignment(typeWidthInBytes(Ty), WordSizeInBytes);
   }
+  uint32_t getStackAlignment() const override {
+    return Traits::X86_STACK_ALIGNMENT_BYTES;
+  }
 
   bool shouldSplitToVariable64On32(Type Ty) const override {
     return Traits::Is64Bit ? false : Ty == IceType_i64;
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index 2ecb7d3..b22ec64 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -957,7 +957,6 @@
   uint32_t Alignment =
       std::max(AlignmentParam, Traits::X86_STACK_ALIGNMENT_BYTES);
   if (Alignment > Traits::X86_STACK_ALIGNMENT_BYTES) {
-    IsEbpBasedFrame = true;
     _and(esp, Ctx->getConstantInt32(-Alignment));
   }
   if (const auto *ConstantTotalSize =