Live Range Splitting after initial Register Allocation

After register allocation is done once, this pass targets
the variables that do not get registers, break them into
multiple variables with shorter (at most spanning a basic
block) live ranges. After discarding the new variables with
too few uses, the register allocator is run again and
the new variables that manage to get registers are inserted.

BUG=None
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/2172313002 .
diff --git a/src/IceCfgNode.cpp b/src/IceCfgNode.cpp
index c92c9eb..c7dac53 100644
--- a/src/IceCfgNode.cpp
+++ b/src/IceCfgNode.cpp
@@ -459,8 +459,11 @@
       // Undo the effect of the phi instruction on this node's live-in set by
       // marking the phi dest variable as live on entry.
       SizeT VarNum = Func->getLiveness()->getLiveIndex(Dest->getIndex());
-      assert(!Func->getLiveness()->getLiveIn(this)[VarNum]);
-      Func->getLiveness()->getLiveIn(this)[VarNum] = true;
+      auto &LiveIn = Func->getLiveness()->getLiveIn(this);
+      if (VarNum < LiveIn.size()) {
+        assert(!LiveIn[VarNum]);
+        LiveIn[VarNum] = true;
+      }
       Phi->setDeleted();
     }
   }
@@ -867,15 +870,15 @@
 
     Variable *Var = Liveness->getVariable(i, this);
     if (LB > LE) {
-      Var->addLiveRange(FirstInstNum, LE);
-      Var->addLiveRange(LB, LastInstNum + 1);
+      Var->addLiveRange(FirstInstNum, LE, this);
+      Var->addLiveRange(LB, LastInstNum + 1, this);
       // Assert that Var is a global variable by checking that its liveness
       // index is less than the number of globals. This ensures that the
       // LiveInAndOut[] access is valid.
       assert(i < Liveness->getNumGlobalVars());
       LiveInAndOut[i] = false;
     } else {
-      Var->addLiveRange(LB, LE);
+      Var->addLiveRange(LB, LE, this);
     }
     if (i == i1)
       ++IBB;
@@ -887,7 +890,7 @@
        i = LiveInAndOut.find_next(i)) {
     Variable *Var = Liveness->getVariable(i, this);
     if (Liveness->getRangeMask(Var->getIndex()))
-      Var->addLiveRange(FirstInstNum, LastInstNum + 1);
+      Var->addLiveRange(FirstInstNum, LastInstNum + 1, this);
   }
 }
 
diff --git a/src/IceClFlags.def b/src/IceClFlags.def
index 0281e7d..d5e1a26 100644
--- a/src/IceClFlags.def
+++ b/src/IceClFlags.def
@@ -175,6 +175,10 @@
              "this executable."),                                              \
     cl::init(false))                                                           \
                                                                                \
+  X(SplitGlobalVars, bool, dev_opt_flag, "split-global-vars",                  \
+    cl::desc("Global live range splitting"),                                   \
+    cl::init(false))                                                           \
+                                                                               \
   X(InputFileFormat, llvm::NaClFileFormat, dev_opt_flag, "bitcode-format",     \
     cl::desc("Define format of input file:"),                                  \
     cl::values(clEnumValN(llvm::LLVMFormat, "llvm", "LLVM file (default)"),    \
diff --git a/src/IceDefs.h b/src/IceDefs.h
index c78af0f..81f0a89 100644
--- a/src/IceDefs.h
+++ b/src/IceDefs.h
@@ -39,6 +39,7 @@
 #include <map>
 #include <memory>
 #include <mutex>
+#include <set>
 #include <string>
 #include <system_error>
 #include <unordered_map>
@@ -129,6 +130,8 @@
 template <typename T> using CfgList = std::list<T, CfgLocalAllocator<T>>;
 template <typename T, typename H = std::hash<T>, typename Eq = std::equal_to<T>>
 using CfgUnorderedSet = std::unordered_set<T, H, Eq, CfgLocalAllocator<T>>;
+template <typename T, typename Cmp = std::less<T>>
+using CfgSet = std::set<T, Cmp, CfgLocalAllocator<T>>;
 template <typename T, typename U, typename H = std::hash<T>,
           typename Eq = std::equal_to<T>>
 using CfgUnorderedMap =
diff --git a/src/IceInst.h b/src/IceInst.h
index 2f35b52..492df48 100644
--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -191,6 +191,7 @@
   virtual bool isRedundantAssign() const { return false; }
 
   virtual ~Inst() = default;
+  void replaceDest(Variable *Var) { Dest = Var; }
 
 protected:
   Inst(Cfg *Func, InstKind Kind, SizeT MaxSrcs, Variable *Dest);
diff --git a/src/IceOperand.cpp b/src/IceOperand.cpp
index 3e31c60..5d7aa42 100644
--- a/src/IceOperand.cpp
+++ b/src/IceOperand.cpp
@@ -101,14 +101,21 @@
   return !(B < A) && !(A < B);
 }
 
-void LiveRange::addSegment(InstNumberT Start, InstNumberT End) {
-  if (!Range.empty()) {
-    // Check for merge opportunity.
-    InstNumberT CurrentEnd = Range.back().second;
-    assert(Start >= CurrentEnd);
-    if (Start == CurrentEnd) {
-      Range.back().second = End;
-      return;
+void LiveRange::addSegment(InstNumberT Start, InstNumberT End, CfgNode *Node) {
+  if (getFlags().getSplitGlobalVars()) {
+    // Disable merging to make sure a live range 'segment' has a single node.
+    // Might be possible to enable when the target segment has the same node.
+    assert(NodeMap.find(Start) == NodeMap.end());
+    NodeMap[Start] = Node;
+  } else {
+    if (!Range.empty()) {
+      // Check for merge opportunity.
+      InstNumberT CurrentEnd = Range.back().second;
+      assert(Start >= CurrentEnd);
+      if (Start == CurrentEnd) {
+        Range.back().second = End;
+        return;
+      }
     }
   }
   Range.push_back(RangeElementType(Start, End));
diff --git a/src/IceOperand.h b/src/IceOperand.h
index 5033592..a2e9f9e 100644
--- a/src/IceOperand.h
+++ b/src/IceOperand.h
@@ -603,6 +603,9 @@
 /// interval.
 class LiveRange {
 public:
+  using RangeElementType = std::pair<InstNumberT, InstNumberT>;
+  /// RangeType is arena-allocated from the Cfg's allocator.
+  using RangeType = CfgVector<RangeElementType>;
   LiveRange() = default;
   /// Special constructor for building a kill set. The advantage is that we can
   /// reserve the right amount of space in advance.
@@ -618,7 +621,10 @@
     Range.clear();
     untrim();
   }
-  void addSegment(InstNumberT Start, InstNumberT End);
+  void addSegment(InstNumberT Start, InstNumberT End, CfgNode *Node = nullptr);
+  void addSegment(RangeElementType Segment, CfgNode *Node = nullptr) {
+    addSegment(Segment.first, Segment.second, Node);
+  }
 
   bool endsBefore(const LiveRange &Other) const;
   bool overlaps(const LiveRange &Other, bool UseTrimmed = false) const;
@@ -637,11 +643,18 @@
 
   void dump(Ostream &Str) const;
 
+  SizeT getNumSegments() const { return Range.size(); }
+
+  const RangeType &getSegments() const { return Range; }
+  CfgNode *getNodeForSegment(InstNumberT Begin) {
+    auto Iter = NodeMap.find(Begin);
+    assert(Iter != NodeMap.end());
+    return Iter->second;
+  }
+
 private:
-  using RangeElementType = std::pair<InstNumberT, InstNumberT>;
-  /// RangeType is arena-allocated from the Cfg's allocator.
-  using RangeType = CfgVector<RangeElementType>;
   RangeType Range;
+  CfgUnorderedMap<InstNumberT, CfgNode *> NodeMap;
   /// TrimmedBegin is an optimization for the overlaps() computation. Since the
   /// linear-scan algorithm always calls it as overlaps(Cur) and Cur advances
   /// monotonically according to live range start, we can optimize overlaps() by
@@ -760,9 +773,10 @@
   const LiveRange &getLiveRange() const { return Live; }
   void setLiveRange(const LiveRange &Range) { Live = Range; }
   void resetLiveRange() { Live.reset(); }
-  void addLiveRange(InstNumberT Start, InstNumberT End) {
+  void addLiveRange(InstNumberT Start, InstNumberT End,
+                    CfgNode *Node = nullptr) {
     assert(!getIgnoreLiveness());
-    Live.addSegment(Start, End);
+    Live.addSegment(Start, End, Node);
   }
   void trimLiveRange(InstNumberT Start) { Live.trim(Start); }
   void untrimLiveRange() { Live.untrim(); }
diff --git a/src/IceRegAlloc.cpp b/src/IceRegAlloc.cpp
index 06631bf..282a544 100644
--- a/src/IceRegAlloc.cpp
+++ b/src/IceRegAlloc.cpp
@@ -113,7 +113,6 @@
   // register allocation since no overlap opportunities should be available and
   // it's more expensive to look for opportunities.
   FindOverlap = (Kind != RAK_Phi);
-  const VarList &Vars = Func->getVariables();
   Unhandled.reserve(Vars.size());
   UnhandledPrecolored.reserve(Vars.size());
   // Gather the live ranges of all variables and add them to the Unhandled set.
@@ -168,7 +167,6 @@
   if (!BuildDefs::dump())
     return false;
 
-  const VarList &Vars = Func->getVariables();
   OstreamLocker L(Ctx);
   Ostream &Str = Ctx->getStrDump();
   for (SizeT VarNum : DefsWithoutUses) {
@@ -212,7 +210,6 @@
   FindPreference = false;
   FindOverlap = false;
   SizeT NumVars = 0;
-  const VarList &Vars = Func->getVariables();
 
   // Iterate across all instructions and record the begin and end of the live
   // range for each variable that is pre-colored or infinite weight.
@@ -325,13 +322,19 @@
   }
 }
 
-void LinearScan::init(RegAllocKind Kind) {
+void LinearScan::init(RegAllocKind Kind, CfgSet<Variable *> ExcludeVars) {
   this->Kind = Kind;
   Unhandled.clear();
   UnhandledPrecolored.clear();
   Handled.clear();
   Inactive.clear();
   Active.clear();
+  Vars.clear();
+  Vars.reserve(Func->getVariables().size() - ExcludeVars.size());
+  for (auto *Var : Func->getVariables()) {
+    if (ExcludeVars.find(Var) == ExcludeVars.end())
+      Vars.emplace_back(Var);
+  }
 
   SizeT NumRegs = Target->getNumRegisters();
   RegAliases.resize(NumRegs);
diff --git a/src/IceRegAlloc.h b/src/IceRegAlloc.h
index fb1a0d8..e9f8752 100644
--- a/src/IceRegAlloc.h
+++ b/src/IceRegAlloc.h
@@ -32,7 +32,7 @@
 
 public:
   explicit LinearScan(Cfg *Func);
-  void init(RegAllocKind Kind);
+  void init(RegAllocKind Kind, CfgSet<Variable *> ExcludeVars = {});
   void scan(const SmallBitVector &RegMask, bool Randomized);
   // Returns the number of times some variable has been assigned a register but
   // later evicted because of a higher-priority allocation.  The idea is that we
@@ -131,9 +131,9 @@
   llvm::SmallVector<const SmallBitVector *, REGS_SIZE> RegAliases;
   bool FindPreference = false;
   bool FindOverlap = false;
-
   const bool Verbose;
   const bool UseReserve;
+  CfgVector<Variable *> Vars;
 };
 
 } // end of namespace Ice
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index a2203dc..6a13ccb 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -24,6 +24,7 @@
 #include "IceGlobalContext.h"
 #include "IceGlobalInits.h"
 #include "IceInstVarIter.h"
+#include "IceLiveness.h"
 #include "IceOperand.h"
 #include "IceRegAlloc.h"
 
@@ -511,6 +512,182 @@
   // set of Variables that have no register and a non-empty live range, and
   // model an infinite number of registers.  Maybe use the register aliasing
   // mechanism to get better packing of narrower slots.
+  if (getFlags().getSplitGlobalVars())
+    postRegallocSplitting(RegMask);
+}
+
+namespace {
+CfgVector<Inst *> getInstructionsInRange(CfgNode *Node, InstNumberT Start,
+                                         InstNumberT End) {
+  CfgVector<Inst *> Result;
+  bool Started = false;
+  auto Process = [&](InstList &Insts) {
+    for (auto &Instr : Insts) {
+      if (Instr.isDeleted()) {
+        continue;
+      }
+      if (Instr.getNumber() == Start) {
+        Started = true;
+      }
+      if (Started) {
+        Result.emplace_back(&Instr);
+      }
+      if (Instr.getNumber() == End) {
+        break;
+      }
+    }
+  };
+  Process(Node->getPhis());
+  Process(Node->getInsts());
+  // TODO(manasijm): Investigate why checking >= End significantly changes
+  // output. Should not happen when renumbering produces monotonically
+  // increasing instruction numbers and live ranges begin and end on non-deleted
+  // instructions.
+  return Result;
+}
+}
+
+void TargetLowering::postRegallocSplitting(const SmallBitVector &RegMask) {
+  // Splits the live ranges of global(/multi block) variables and runs the
+  // register allocator to find registers for as many of the new variables as
+  // possible.
+  // TODO(manasijm): Merge the small liveranges back into multi-block ones when
+  // the variables get the same register. This will reduce the amount of new
+  // instructions inserted. This might involve a full dataflow analysis.
+  // Also, modify the preference mechanism in the register allocator to match.
+
+  TimerMarker _(TimerStack::TT_splitGlobalVars, Func);
+  CfgSet<Variable *> SplitCandidates;
+
+  // Find variables that do not have registers but are allowed to. Also skip
+  // variables with single segment live ranges as they are not split further in
+  // this function.
+  for (Variable *Var : Func->getVariables()) {
+    if (!Var->mustNotHaveReg() && !Var->hasReg()) {
+      if (Var->getLiveRange().getNumSegments() > 1)
+        SplitCandidates.insert(Var);
+    }
+  }
+  if (SplitCandidates.empty())
+    return;
+
+  CfgSet<Variable *> ExtraVars;
+
+  struct UseInfo {
+    Variable *Replacing = nullptr;
+    Inst *FirstUse = nullptr;
+    Inst *LastDef = nullptr;
+    SizeT UseCount = 0;
+  };
+  CfgUnorderedMap<Variable *, UseInfo> VarInfo;
+  // Split the live ranges of the viable variables by node.
+  // Compute metadata (UseInfo) for each of the resulting variables.
+  for (auto *Var : SplitCandidates) {
+    for (auto &Segment : Var->getLiveRange().getSegments()) {
+      UseInfo Info;
+      Info.Replacing = Var;
+      auto *Node = Var->getLiveRange().getNodeForSegment(Segment.first);
+
+      for (auto *Instr :
+           getInstructionsInRange(Node, Segment.first, Segment.second)) {
+        for (SizeT i = 0; i < Instr->getSrcSize(); ++i) {
+          // It's safe to iterate over the top-level src operands rather than
+          // using FOREACH_VAR_IN_INST(), because any variables inside e.g.
+          // mem operands should already have registers.
+          if (auto *Var = llvm::dyn_cast<Variable>(Instr->getSrc(i))) {
+            if (Var == Info.Replacing) {
+              if (Info.FirstUse == nullptr && !llvm::isa<InstPhi>(Instr)) {
+                Info.FirstUse = Instr;
+              }
+              Info.UseCount++;
+            }
+          }
+        }
+        if (Instr->getDest() == Info.Replacing && !llvm::isa<InstPhi>(Instr)) {
+          Info.LastDef = Instr;
+        }
+      }
+
+      static constexpr SizeT MinUseThreshold = 3;
+      // Skip if variable has less than `MinUseThreshold` uses in the segment.
+      if (Info.UseCount < MinUseThreshold)
+        continue;
+
+      if (!Info.FirstUse && !Info.LastDef) {
+        continue;
+      }
+
+      LiveRange LR;
+      LR.addSegment(Segment);
+      Variable *NewVar = Func->makeVariable(Var->getType());
+
+      NewVar->setLiveRange(LR);
+
+      VarInfo[NewVar] = Info;
+
+      ExtraVars.insert(NewVar);
+    }
+  }
+  // Run the register allocator with all these new variables included
+  LinearScan RegAlloc(Func);
+  RegAlloc.init(RAK_Global, SplitCandidates);
+  RegAlloc.scan(RegMask, getFlags().getRandomizeRegisterAllocation());
+
+  // Modify the Cfg to use the new variables that now have registers.
+  for (auto *ExtraVar : ExtraVars) {
+    if (!ExtraVar->hasReg()) {
+      continue;
+    }
+
+    auto &Info = VarInfo[ExtraVar];
+
+    assert(ExtraVar->getLiveRange().getSegments().size() == 1);
+    auto Segment = ExtraVar->getLiveRange().getSegments()[0];
+
+    auto *Node =
+        Info.Replacing->getLiveRange().getNodeForSegment(Segment.first);
+
+    auto RelevantInsts =
+        getInstructionsInRange(Node, Segment.first, Segment.second);
+
+    if (RelevantInsts.empty())
+      continue;
+
+    // Replace old variables
+    for (auto *Instr : RelevantInsts) {
+      if (llvm::isa<InstPhi>(Instr))
+        continue;
+      // TODO(manasijm): Figure out how to safely enable replacing phi dest
+      // variables. The issue is that we can not insert low level mov
+      // instructions into the PhiList.
+      for (SizeT i = 0; i < Instr->getSrcSize(); ++i) {
+        // FOREACH_VAR_IN_INST() not needed. Same logic as above.
+        if (auto *Var = llvm::dyn_cast<Variable>(Instr->getSrc(i))) {
+          if (Var == Info.Replacing) {
+            Instr->replaceSource(i, ExtraVar);
+          }
+        }
+      }
+      if (Instr->getDest() == Info.Replacing) {
+        Instr->replaceDest(ExtraVar);
+      }
+    }
+
+    assert(Info.FirstUse != Info.LastDef);
+    assert(Info.FirstUse || Info.LastDef);
+
+    // Insert spill code
+    if (Info.FirstUse != nullptr) {
+      auto *NewInst =
+          Func->getTarget()->createLoweredMove(ExtraVar, Info.Replacing);
+      Node->getInsts().insert(instToIterator(Info.FirstUse), NewInst);
+    }
+    if (Info.LastDef != nullptr) {
+      auto *NewInst =
+          Func->getTarget()->createLoweredMove(Info.Replacing, ExtraVar);
+      Node->getInsts().insertAfter(instToIterator(Info.LastDef), NewInst);
+    }
+  }
 }
 
 void TargetLowering::markRedefinitions() {
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index ad2a63e..278b7a8 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -23,11 +23,12 @@
 #ifndef SUBZERO_SRC_ICETARGETLOWERING_H
 #define SUBZERO_SRC_ICETARGETLOWERING_H
 
-#include "IceDefs.h"
 #include "IceBitVector.h"
 #include "IceCfgNode.h"
+#include "IceDefs.h"
 #include "IceInst.h" // for the names of the Inst subtypes
 #include "IceOperand.h"
+#include "IceRegAlloc.h"
 #include "IceTypes.h"
 
 #include <utility>
@@ -290,6 +291,7 @@
   virtual const SmallBitVector &getAliasesForRegister(RegNumT) const = 0;
 
   void regAlloc(RegAllocKind Kind);
+  void postRegallocSplitting(const SmallBitVector &RegMask);
 
   virtual void
   makeRandomRegisterPermutation(llvm::SmallVectorImpl<RegNumT> &Permutation,
diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h
index a5f1b2e..eaecb40 100644
--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -93,6 +93,17 @@
   SizeT getNumRegisters() const override {
     return Traits::RegisterSet::Reg_NUM;
   }
+
+  Inst *createLoweredMove(Variable *Dest, Variable *SrcVar) override {
+    if (isVectorType(Dest->getType())) {
+      return Traits::Insts::Movp::create(Func, Dest, SrcVar);
+    }
+    return Traits::Insts::Mov::create(Func, Dest, SrcVar);
+    (void)Dest;
+    (void)SrcVar;
+    return nullptr;
+  }
+
   Variable *getPhysicalRegister(RegNumT RegNum,
                                 Type Ty = IceType_void) override;
   const char *getRegName(RegNumT RegNum, Type Ty) const override;
@@ -220,8 +231,6 @@
   InstructionSetEnum getInstructionSet() const { return InstructionSet; }
   Operand *legalizeUndef(Operand *From, RegNumT RegNum = RegNumT());
 
-  Inst *createLoweredMove(Variable *Dest, Variable *SrcVar) override;
-
 protected:
   const bool NeedSandboxing;
 
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index 948dd19..c1643ef 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -1353,15 +1353,6 @@
   RI->setDeleted();
 }
 
-template <typename TraitsType>
-Inst *TargetX86Base<TraitsType>::createLoweredMove(Variable *Dest,
-                                                   Variable *SrcVar) {
-  if (isVectorType(Dest->getType())) {
-    return Traits::Insts::Movp::create(Func, Dest, SrcVar);
-  }
-  return Traits::Insts::Mov::create(Func, Dest, SrcVar);
-}
-
 template <typename TraitsType> Type TargetX86Base<TraitsType>::stackSlotType() {
   return Traits::WordType;
 }
diff --git a/src/IceTimerTree.def b/src/IceTimerTree.def
index 9cd6b8e..3697206 100644
--- a/src/IceTimerTree.def
+++ b/src/IceTimerTree.def
@@ -63,6 +63,7 @@
   X(regAlloc)                                                                  \
   X(renumberInstructions)                                                      \
   X(shortCircuit)                                                              \
+  X(splitGlobalVars)                                                           \
   X(splitLocalVars)                                                            \
   X(szmain)                                                                    \
   X(translate)                                                                 \