Subzero: Local variable splitting.

The linear-scan register allocator takes an all-or-nothing approach -- either the variable's entire live range gets a register, or none of it does.

To help with this, we add a pass that splits successive uses of a variable within a basic block into a chain of linked variables.  This gives the register allocator the chance to allocate registers to subsets of the original live range.

The split variables are linked to each other so that if they don't get a register, they share a stack slot with the original variable, and redundant writes to that stack slot are recognized and elided.

This pass is executed after target lowering and right before register allocation.  As such, it has to deal with some idiosyncrasies of target lowering, specifically the possibility of intra-block control flow.  We experimented with doing this as a pre-lowering pass.  However, the transformations interfered with some of the target lowering's pattern matching, such as bool folding, so we concluded that post-lowering was a better place for it.

Note: Some of the lit tests are overly specific about registers, and in these cases it was the path of least resistance to just disable local variable splitting.

BUG= none
R=eholk@chromium.org, jpp@chromium.org

Review URL: https://codereview.chromium.org/2177033002 .
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index 23c363f..28ae017 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -1661,7 +1661,7 @@
   emitTextHeader(FunctionName, Ctx, Asm);
   if (getFlags().getDecorateAsm()) {
     for (Variable *Var : getVariables()) {
-      if (Var->hasStackOffset() && !Var->isRematerializable()) {
+      if (Var->hasKnownStackOffset() && !Var->isRematerializable()) {
         Str << "\t" << Var->getSymbolicStackOffset() << " = "
             << Var->getStackOffset() << "\n";
       }
diff --git a/src/IceCfgNode.h b/src/IceCfgNode.h
index 40c7d85..cf20e13 100644
--- a/src/IceCfgNode.h
+++ b/src/IceCfgNode.h
@@ -76,6 +76,8 @@
   /// @{
   InstList &getInsts() { return Insts; }
   PhiList &getPhis() { return Phis; }
+  const InstList &getInsts() const { return Insts; }
+  const PhiList &getPhis() const { return Phis; }
   void appendInst(Inst *Instr);
   void renumberInstructions();
   /// Rough and generally conservative estimate of the number of instructions in
diff --git a/src/IceClFlags.cpp b/src/IceClFlags.cpp
index 4982cbf..516713c 100644
--- a/src/IceClFlags.cpp
+++ b/src/IceClFlags.cpp
@@ -204,6 +204,7 @@
   OutFlags.setDisableHybridAssembly(DisableHybridAssemblyObj ||
                                     (OutFileTypeObj != Ice::FT_Iasm));
   OutFlags.ForceO2.init(OutFlags.getForceO2String());
+  OutFlags.SplitInsts.init(OutFlags.getSplitInstString());
   OutFlags.TestStatus.init(OutFlags.getTestStatusString());
   OutFlags.TimingFocus.init(OutFlags.getTimingFocusOnString());
   OutFlags.TranslateOnly.init(OutFlags.getTranslateOnlyString());
diff --git a/src/IceClFlags.def b/src/IceClFlags.def
index 0fabb9c..2af1ee4 100644
--- a/src/IceClFlags.def
+++ b/src/IceClFlags.def
@@ -159,6 +159,9 @@
   X(ForceO2String, std::string, dev_opt_flag, "force-O2",                      \
     cl::desc("Force -O2 for certain functions (assumes -Om1)"), cl::init(""))  \
                                                                                \
+  X(SplitInstString, std::string, dev_opt_flag, "split-inst",                  \
+    cl::desc("Restrict local var splitting to specific insts"), cl::init(":")) \
+                                                                               \
   X(FunctionSections, bool, dev_opt_flag, "ffunction-sections",                \
     cl::desc("Emit functions into separate sections"))                         \
                                                                                \
@@ -233,6 +236,9 @@
   X(RandomizeRegisterAllocation, bool, dev_opt_flag, "randomize-regalloc",     \
     cl::desc("Randomize register allocation"), cl::init(false))                \
                                                                                \
+  X(SplitLocalVars, bool, dev_opt_flag, "split-local-vars", cl::init(true),    \
+    cl::desc("Block-local variable splitting (O2 only)"))                      \
+                                                                               \
   X(RandomSeed, unsigned long long, dev_opt_flag, "sz-seed",                   \
     cl::desc("Seed the random number generator"), cl::init(1))                 \
                                                                                \
diff --git a/src/IceClFlags.h b/src/IceClFlags.h
index 8c43471..a1018e5 100644
--- a/src/IceClFlags.h
+++ b/src/IceClFlags.h
@@ -168,6 +168,9 @@
   bool matchForceO2(GlobalString Name, uint32_t Number) const {
     return ForceO2.match(Name, Number);
   }
+  bool matchSplitInsts(const std::string &Name, uint32_t Number) const {
+    return SplitInsts.match(Name, Number);
+  }
   bool matchTestStatus(GlobalString Name, uint32_t Number) const {
     return TestStatus.match(Name, Number);
   }
@@ -191,6 +194,7 @@
   bool GenerateUnitTestMessages;
 
   RangeSpec ForceO2;
+  RangeSpec SplitInsts;
   RangeSpec TestStatus;
   RangeSpec TimingFocus;
   RangeSpec TranslateOnly;
diff --git a/src/IceInst.cpp b/src/IceInst.cpp
index 2d67f37..2ffb0cc 100644
--- a/src/IceInst.cpp
+++ b/src/IceInst.cpp
@@ -1090,9 +1090,10 @@
   Condition = InstIcmpAttributes[Condition].Reverse;
   std::swap(Srcs[0], Srcs[1]);
 }
+
 bool checkForRedundantAssign(const Variable *Dest, const Operand *Source) {
   const auto *SrcVar = llvm::dyn_cast<const Variable>(Source);
-  if (!SrcVar)
+  if (SrcVar == nullptr)
     return false;
   if (Dest->hasReg() && Dest->getRegNum() == SrcVar->getRegNum()) {
     // TODO: On x86-64, instructions like "mov eax, eax" are used to clear the
@@ -1101,6 +1102,8 @@
   }
   if (!Dest->hasReg() && !SrcVar->hasReg()) {
     if (!Dest->hasStackOffset() || !SrcVar->hasStackOffset()) {
+      // If called before stack slots have been assigned (i.e. as part of the
+      // dump() routine), conservatively return false.
       return false;
     }
     if (Dest->getStackOffset() != SrcVar->getStackOffset()) {
@@ -1108,6 +1111,15 @@
     }
     return true;
   }
+  // For a "v=t" assignment where t has a register, v has a stack slot, and v
+  // has a LinkedTo stack root, and v and t share the same LinkedTo root, return
+  // true.  This is because this assignment is effectively reassigning the same
+  // value to the original LinkedTo stack root.
+  if (SrcVar->hasReg() && Dest->hasStackOffset() &&
+      Dest->getLinkedToStackRoot() != nullptr &&
+      Dest->getLinkedToRoot() == SrcVar->getLinkedToRoot()) {
+    return true;
+  }
   return false;
 }
 
diff --git a/src/IceInst.h b/src/IceInst.h
index b8e1b61..2f35b52 100644
--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -111,8 +111,6 @@
   void replaceSource(SizeT Index, Operand *Replacement) {
     assert(Index < getSrcSize());
     assert(!isDeleted());
-    assert(LiveRangesEnded == 0);
-    // Invalidates liveness info because the use Srcs[Index] is removed.
     Srcs[Index] = Replacement;
   }
 
@@ -151,6 +149,15 @@
   /// report_fatal_error().
   virtual bool isMemoryWrite() const;
 
+  /// Returns true if the (target-specific) instruction represents an
+  /// intra-block label, i.e. branch target.  This is meant primarily for
+  /// Cfg::splitLocalVars().
+  virtual bool isLabel() const { return false; }
+  /// If the (target-specific) instruction represents an intra-block branch to
+  /// some Label instruction, return that Label branch target instruction;
+  /// otherwise return nullptr.
+  virtual const Inst *getIntraBlockBranchTarget() const { return nullptr; }
+
   void livenessLightweight(Cfg *Func, LivenessBV &Live);
   /// Calculates liveness for this instruction. Returns true if this instruction
   /// is (tentatively) still live and should be retained, and false if this
diff --git a/src/IceInstX86Base.h b/src/IceInstX86Base.h
index 655b38d..de7d2f7 100644
--- a/src/IceInstX86Base.h
+++ b/src/IceInstX86Base.h
@@ -337,6 +337,7 @@
     uint32_t getEmitInstCount() const override { return 0; }
     GlobalString getLabelName() const { return Name; }
     SizeT getLabelNumber() const { return LabelNumber; }
+    bool isLabel() const override { return true; }
     void emit(const Cfg *Func) const override;
     void emitIAS(const Cfg *Func) const override;
     void dump(const Cfg *Func) const override;
@@ -412,6 +413,7 @@
     bool isUnconditionalBranch() const override {
       return !Label && Condition == Cond::Br_None;
     }
+    const Inst *getIntraBlockBranchTarget() const override { return Label; }
     bool repointEdges(CfgNode *OldNode, CfgNode *NewNode) override;
     void emit(const Cfg *Func) const override;
     void emitIAS(const Cfg *Func) const override;
diff --git a/src/IceOperand.cpp b/src/IceOperand.cpp
index 540252b..3e31c60 100644
--- a/src/IceOperand.cpp
+++ b/src/IceOperand.cpp
@@ -208,10 +208,11 @@
 }
 
 RegWeight Variable::getWeight(const Cfg *Func) const {
-  VariablesMetadata *VMetadata = Func->getVMetadata();
-  return mustHaveReg() ? RegWeight(RegWeight::Inf)
-                       : mustNotHaveReg() ? RegWeight(RegWeight::Zero)
-                                          : VMetadata->getUseWeight(this);
+  if (mustHaveReg())
+    return RegWeight(RegWeight::Inf);
+  if (mustNotHaveReg())
+    return RegWeight(RegWeight::Zero);
+  return Func->getVMetadata()->getUseWeight(this);
 }
 
 void VariableTracking::markUse(MetadataKind TrackingKind, const Inst *Instr,
@@ -540,8 +541,10 @@
   if (Func->isVerbose(IceV_RegOrigins) ||
       (!hasReg() && !Func->getTarget()->hasComputedFrame())) {
     Str << "%" << getName();
-    if (getLinkedTo() != nullptr)
-      Str << ":%" << getLinkedTo()->getName();
+    for (Variable *Link = getLinkedTo(); Link != nullptr;
+         Link = Link->getLinkedTo()) {
+      Str << ":%" << Link->getName();
+    }
   }
   if (hasReg()) {
     if (Func->isVerbose(IceV_RegOrigins))
@@ -554,7 +557,7 @@
         hasReg() ? getBaseRegNum() : Func->getTarget()->getFrameOrStackReg();
     Str << "["
         << Func->getTarget()->getRegName(BaseRegisterNumber, IceType_i32);
-    if (hasStackOffset()) {
+    if (hasKnownStackOffset()) {
       int32_t Offset = getStackOffset();
       if (Offset) {
         if (Offset > 0)
diff --git a/src/IceOperand.h b/src/IceOperand.h
index 041adfd..5033592 100644
--- a/src/IceOperand.h
+++ b/src/IceOperand.h
@@ -696,12 +696,27 @@
     return IgnoreLiveness || IsRematerializable;
   }
 
+  /// Returns true if the variable either has a definite stack offset, or has
+  /// the UndeterminedStackOffset such that it is guaranteed to have a definite
+  /// stack offset at emission time.
   bool hasStackOffset() const { return StackOffset != InvalidStackOffset; }
+  /// Returns true if the variable has a stack offset that is known at this
+  /// time.
+  bool hasKnownStackOffset() const {
+    return StackOffset != InvalidStackOffset &&
+           StackOffset != UndeterminedStackOffset;
+  }
   int32_t getStackOffset() const {
-    assert(hasStackOffset());
+    assert(hasKnownStackOffset());
     return StackOffset;
   }
   void setStackOffset(int32_t Offset) { StackOffset = Offset; }
+  /// Set a "placeholder" stack offset before its actual offset has been
+  /// determined.
+  void setHasStackOffset() {
+    if (!hasStackOffset())
+      StackOffset = UndeterminedStackOffset;
+  }
   /// Returns the variable's stack offset in symbolic form, to improve
   /// readability in DecorateAsm mode.
   std::string getSymbolicStackOffset() const {
@@ -729,6 +744,7 @@
   bool mustNotHaveReg() const {
     return RegRequirement == RR_MustNotHaveRegister;
   }
+  bool mayHaveReg() const { return RegRequirement == RR_MayHaveRegister; }
   void setRematerializable(RegNumT NewRegNum, int32_t NewOffset) {
     IsRematerializable = true;
     setRegNum(NewRegNum);
@@ -789,6 +805,18 @@
       Root = Root->LinkedTo;
     return Root;
   }
+  /// Follow the LinkedTo chain up to the furthest stack-allocated ancestor.
+  /// This is only certain to be accurate after register allocation and stack
+  /// slot assignment have completed.
+  Variable *getLinkedToStackRoot() const {
+    Variable *FurthestStackVar = nullptr;
+    for (Variable *Root = LinkedTo; Root != nullptr; Root = Root->LinkedTo) {
+      if (!Root->hasReg() && Root->hasStackOffset()) {
+        FurthestStackVar = Root;
+      }
+    }
+    return FurthestStackVar;
+  }
 
   static bool classof(const Operand *Operand) {
     OperandKind Kind = Operand->getKind();
@@ -825,7 +853,10 @@
   RegNumT RegNum;
   /// RegNumTmp is the tentative assignment during register allocation.
   RegNumT RegNumTmp;
-  static constexpr int32_t InvalidStackOffset = -1;
+  static constexpr int32_t InvalidStackOffset =
+      std::numeric_limits<int32_t>::min();
+  static constexpr int32_t UndeterminedStackOffset =
+      1 + std::numeric_limits<int32_t>::min();
   /// StackOffset is the canonical location on stack (only if
   /// RegNum.hasNoValue() || IsArgument).
   int32_t StackOffset = InvalidStackOffset;
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index 1b9ca23..a2203dc 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -565,16 +565,6 @@
             });
 }
 
-namespace {
-bool mightHaveStackSlot(const Variable *Var, const BitVector &IsVarReferenced) {
-  if (!IsVarReferenced[Var->getIndex()])
-    return false;
-  if (Var->hasReg())
-    return false;
-  return true;
-}
-} // end of anonymous namespace
-
 void TargetLowering::getVarStackSlotParams(
     VarList &SortedSpilledVariables, SmallBitVector &RegsUsed,
     size_t *GlobalsSize, size_t *SpillAreaSizeBytes,
@@ -594,30 +584,6 @@
     }
   }
 
-  // Find each variable Var where:
-  //  - Var is actively referenced
-  //  - Var does not have a register
-  //  - Var's furthest ancestor through LinkedTo: Root
-  //  - Root has no active references, or has a register
-  //
-  // When any such Var is found, rotate the LinkedTo tree by swapping
-  // Var->LinkedTo and Root->LinkedTo.  This ensures that when Var needs a stack
-  // slot, either its LinkedTo field is nullptr, or Var->getLinkedToRoot()
-  // returns a variable with a stack slot.
-  for (Variable *Var : Func->getVariables()) {
-    if (!mightHaveStackSlot(Var, IsVarReferenced))
-      continue;
-    if (Variable *Root = Var->getLinkedToRoot()) {
-      assert(Root->getLinkedTo() == nullptr);
-      if (mightHaveStackSlot(Root, IsVarReferenced)) {
-        // Found a "safe" root, no need to rotate the tree.
-        continue;
-      }
-      Var->setLinkedTo(nullptr);
-      Root->setLinkedTo(Var);
-    }
-  }
-
   // If SimpleCoalescing is false, each variable without a register gets its
   // own unique stack slot, which leads to large stack frames. If
   // SimpleCoalescing is true, then each "global" variable without a register
@@ -647,8 +613,13 @@
     }
     // An argument either does not need a stack slot (if passed in a register)
     // or already has one (if passed on the stack).
-    if (Var->getIsArg())
+    if (Var->getIsArg()) {
+      if (!Var->hasReg()) {
+        assert(!Var->hasStackOffset());
+        Var->setHasStackOffset();
+      }
       continue;
+    }
     // An unreferenced variable doesn't need a stack slot.
     if (!IsVarReferenced[Var->getIndex()])
       continue;
@@ -656,6 +627,8 @@
     // not need accounting here.
     if (TargetVarHook(Var))
       continue;
+    assert(!Var->hasStackOffset());
+    Var->setHasStackOffset();
     SpilledVariables.push_back(Var);
   }
 
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index 43f5e01..ad2a63e 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -321,6 +321,16 @@
   virtual void addProlog(CfgNode *Node) = 0;
   virtual void addEpilog(CfgNode *Node) = 0;
 
+  /// Create a properly-typed "mov" instruction.  This is primarily for local
+  /// variable splitting.
+  virtual Inst *createLoweredMove(Variable *Dest, Variable *SrcVar) {
+    // TODO(stichnot): make pure virtual by implementing for all targets
+    (void)Dest;
+    (void)SrcVar;
+    llvm::report_fatal_error("createLoweredMove() unimplemented");
+    return nullptr;
+  }
+
   virtual ~TargetLowering() = default;
 
 private:
diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h
index 2ff54e9..a5f1b2e 100644
--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -220,6 +220,8 @@
   InstructionSetEnum getInstructionSet() const { return InstructionSet; }
   Operand *legalizeUndef(Operand *From, RegNumT RegNum = RegNumT());
 
+  Inst *createLoweredMove(Variable *Dest, Variable *SrcVar) override;
+
 protected:
   const bool NeedSandboxing;
 
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index 2acd331..ecb4ae6 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -23,11 +23,13 @@
 #include "IceELFObjectWriter.h"
 #include "IceGlobalInits.h"
 #include "IceInstVarIter.h"
+#include "IceInstX86Base.h"
 #include "IceLiveness.h"
 #include "IceOperand.h"
 #include "IcePhiLoweringImpl.h"
 #include "IceUtils.h"
-#include "IceInstX86Base.h"
+#include "IceVariableSplitting.h"
+
 #include "llvm/Support/MathExtras.h"
 
 #include <stack>
@@ -521,6 +523,7 @@
     initSandbox();
   }
   Func->dump("After x86 codegen");
+  splitBlockLocalVariables(Func);
 
   // Register allocation. This requires instruction renumbering and full
   // liveness analysis. Loops must be identified before liveness so variable
@@ -1042,11 +1045,11 @@
   // stack slot.
   std::function<bool(Variable *)> TargetVarHook =
       [&VariablesLinkedToSpillSlots](Variable *Var) {
-        if (Var->getLinkedTo() != nullptr) {
-          // TODO(stichnot): This assert won't necessarily be true in the
-          // future.
-          assert(Var->mustNotHaveReg());
-          if (!Var->getLinkedTo()->hasReg()) {
+        // TODO(stichnot): Refactor this into the base class.
+        Variable *Root = Var->getLinkedToStackRoot();
+        if (Root != nullptr) {
+          assert(!Root->hasReg());
+          if (!Root->hasReg()) {
             VariablesLinkedToSpillSlots.push_back(Var);
             return true;
           }
@@ -1210,7 +1213,7 @@
   // Assign stack offsets to variables that have been linked to spilled
   // variables.
   for (Variable *Var : VariablesLinkedToSpillSlots) {
-    const Variable *Root = Var->getLinkedToRoot();
+    const Variable *Root = Var->getLinkedToStackRoot();
     assert(Root != nullptr);
     Var->setStackOffset(Root->getStackOffset());
   }
@@ -1350,6 +1353,15 @@
   RI->setDeleted();
 }
 
+template <typename TraitsType>
+Inst *TargetX86Base<TraitsType>::createLoweredMove(Variable *Dest,
+                                                   Variable *SrcVar) {
+  if (isVectorType(Dest->getType())) {
+    return Traits::Insts::Movp::create(Func, Dest, SrcVar);
+  }
+  return Traits::Insts::Mov::create(Func, Dest, SrcVar);
+}
+
 template <typename TraitsType> Type TargetX86Base<TraitsType>::stackSlotType() {
   return Traits::WordType;
 }
@@ -3124,7 +3136,7 @@
       } else {
         Src0 = legalize(Src0);
         if (llvm::isa<X86OperandMem>(Src0)) {
-          Variable *T = Func->makeVariable(DestTy);
+          Variable *T = makeReg(DestTy);
           _movq(T, Src0);
           _movq(Dest, T);
           break;
diff --git a/src/IceTimerTree.def b/src/IceTimerTree.def
index 0ebcfd3..9cd6b8e 100644
--- a/src/IceTimerTree.def
+++ b/src/IceTimerTree.def
@@ -63,6 +63,7 @@
   X(regAlloc)                                                                  \
   X(renumberInstructions)                                                      \
   X(shortCircuit)                                                              \
+  X(splitLocalVars)                                                            \
   X(szmain)                                                                    \
   X(translate)                                                                 \
   X(translateFunctions)                                                        \
diff --git a/src/IceVariableSplitting.cpp b/src/IceVariableSplitting.cpp
new file mode 100644
index 0000000..8680ada
--- /dev/null
+++ b/src/IceVariableSplitting.cpp
@@ -0,0 +1,608 @@
+//===- subzero/src/IceVariableSplitting.cpp - Local variable splitting ----===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Aggressive block-local variable splitting to improve linear-scan
+/// register allocation.
+///
+//===----------------------------------------------------------------------===//
+
+#include "IceVariableSplitting.h"
+
+#include "IceCfg.h"
+#include "IceCfgNode.h"
+#include "IceClFlags.h"
+#include "IceInst.h"
+#include "IceOperand.h"
+#include "IceTargetLowering.h"
+
+namespace Ice {
+
+namespace {
+
+/// A Variable is "allocable" if it is a register allocation candidate but
+/// doesn't already have a register.
+bool isAllocable(const Variable *Var) {
+  if (Var == nullptr)
+    return false;
+  return !Var->hasReg() && Var->mayHaveReg();
+}
+
+/// A Variable is "inf" if it already has a register or is infinite-weight.
+bool isInf(const Variable *Var) {
+  if (Var == nullptr)
+    return false;
+  return Var->hasReg() || Var->mustHaveReg();
+}
+
+/// VariableMap is a simple helper class that keeps track of the latest split
+/// version of the original Variables, as well as the instruction containing the
+/// last use of the Variable within the current block.  For each entry, the
+/// Variable is tagged with the CfgNode that it is valid in, so that we don't
+/// need to clear the entire Map[] vector for each block.
+class VariableMap {
+private:
+  VariableMap() = delete;
+  VariableMap(const VariableMap &) = delete;
+  VariableMap &operator=(const VariableMap &) = delete;
+
+  struct VarInfo {
+    /// MappedVar is the latest mapped/split version of the Variable.
+    Variable *MappedVar = nullptr;
+    /// MappedVarNode is the block in which MappedVar is valid.
+    const CfgNode *MappedVarNode = nullptr;
+    /// LastUseInst is the last instruction in the block that uses the Variable
+    /// as a source operand.
+    const Inst *LastUseInst = nullptr;
+    /// LastUseNode is the block in which LastUseInst is valid.
+    const CfgNode *LastUseNode = nullptr;
+    VarInfo() = default;
+
+  private:
+    VarInfo(const VarInfo &) = delete;
+    VarInfo &operator=(const VarInfo &) = delete;
+  };
+
+public:
+  explicit VariableMap(Cfg *Func)
+      : Func(Func), NumVars(Func->getNumVariables()), Map(NumVars) {}
+  /// Reset the mappings at the start of a block.
+  void reset(const CfgNode *CurNode) {
+    Node = CurNode;
+    // Do a prepass through all the instructions, marking which instruction is
+    // the last use of each Variable within the block.
+    for (const Inst &Instr : Node->getInsts()) {
+      if (Instr.isDeleted())
+        continue;
+      for (SizeT i = 0; i < Instr.getSrcSize(); ++i) {
+        if (auto *SrcVar = llvm::dyn_cast<Variable>(Instr.getSrc(i))) {
+          const SizeT VarNum = getVarNum(SrcVar);
+          Map[VarNum].LastUseInst = &Instr;
+          Map[VarNum].LastUseNode = Node;
+        }
+      }
+    }
+  }
+  /// Get Var's current mapping (or Var itself if it has no mapping yet).
+  Variable *get(Variable *Var) const {
+    const SizeT VarNum = getVarNum(Var);
+    Variable *MappedVar = Map[VarNum].MappedVar;
+    if (MappedVar == nullptr)
+      return Var;
+    if (Map[VarNum].MappedVarNode != Node)
+      return Var;
+    return MappedVar;
+  }
+  /// Create a new linked Variable in the LinkedTo chain, and set it as Var's
+  /// latest mapping.
+  Variable *makeLinked(Variable *Var) {
+    Variable *NewVar = Func->makeVariable(Var->getType());
+    NewVar->setRegClass(Var->getRegClass());
+    NewVar->setLinkedTo(get(Var));
+    const SizeT VarNum = getVarNum(Var);
+    Map[VarNum].MappedVar = NewVar;
+    Map[VarNum].MappedVarNode = Node;
+    return NewVar;
+  }
+  /// Given Var that is LinkedTo some other variable, re-splice it into the
+  /// LinkedTo chain so that the chain is ordered by Variable::getIndex().
+  void spliceBlockLocalLinkedToChain(Variable *Var) {
+    Variable *LinkedTo = Var->getLinkedTo();
+    assert(LinkedTo != nullptr);
+    assert(Var->getIndex() > LinkedTo->getIndex());
+    const SizeT VarNum = getVarNum(LinkedTo);
+    Variable *Link = Map[VarNum].MappedVar;
+    if (Link == nullptr || Map[VarNum].MappedVarNode != Node)
+      return;
+    Variable *LinkParent = Link->getLinkedTo();
+    while (LinkParent != nullptr && LinkParent->getIndex() >= Var->getIndex()) {
+      Link = LinkParent;
+      LinkParent = Link->getLinkedTo();
+    }
+    Var->setLinkedTo(LinkParent);
+    Link->setLinkedTo(Var);
+  }
+  /// Return whether the given Variable has any uses as a source operand within
+  /// the current block.  If it has no source operand uses, but is assigned as a
+  /// dest variable in some instruction in the block, then we needn't bother
+  /// splitting it.
+  bool isDestUsedInBlock(const Variable *Dest) const {
+    return Map[getVarNum(Dest)].LastUseNode == Node;
+  }
+  /// Return whether the given instruction is the last use of the given Variable
+  /// within the current block.  If it is, then we needn't bother splitting the
+  /// Variable at this instruction.
+  bool isInstLastUseOfVar(const Variable *Var, const Inst *Instr) {
+    return Map[getVarNum(Var)].LastUseInst == Instr;
+  }
+
+private:
+  Cfg *const Func;
+  // NumVars is for the size of the Map array.  It can be const because any new
+  // Variables created during the splitting pass don't need to be mapped.
+  const SizeT NumVars;
+  CfgVector<VarInfo> Map;
+  const CfgNode *Node = nullptr;
+  /// Get Var's VarNum, and do some validation.
+  SizeT getVarNum(const Variable *Var) const {
+    const SizeT VarNum = Var->getIndex();
+    assert(VarNum < NumVars);
+    return VarNum;
+  }
+};
+
+/// LocalVariableSplitter tracks the necessary splitting state across
+/// instructions.
+class LocalVariableSplitter {
+  LocalVariableSplitter() = delete;
+  LocalVariableSplitter(const LocalVariableSplitter &) = delete;
+  LocalVariableSplitter &operator=(const LocalVariableSplitter &) = delete;
+
+public:
+  explicit LocalVariableSplitter(Cfg *Func)
+      : Target(Func->getTarget()), VarMap(Func) {}
+  /// setNode() is called before processing the instructions of a block.
+  void setNode(CfgNode *CurNode) {
+    Node = CurNode;
+    VarMap.reset(Node);
+    LinkedToFixups.clear();
+  }
+  /// finalizeNode() is called after all instructions in the block are
+  /// processed.
+  void finalizeNode() {
+    // Splice in any preexisting LinkedTo links into the single chain.  These
+    // are the ones that were recorded during setInst().
+    for (Variable *Var : LinkedToFixups) {
+      VarMap.spliceBlockLocalLinkedToChain(Var);
+    }
+  }
+  /// setInst() is called before processing the next instruction.  The iterators
+  /// are the insertion points for new instructions, depending on whether the
+  /// new instruction should be inserted before or after the current
+  /// instruction.
+  void setInst(Inst *CurInst, InstList::iterator Cur, InstList::iterator Next) {
+    Instr = CurInst;
+    Dest = Instr->getDest();
+    IterCur = Cur;
+    IterNext = Next;
+    ShouldSkipRemainingInstructions = false;
+    // Note any preexisting LinkedTo relationships that were created during
+    // target lowering.  Record them in LinkedToFixups which is then processed
+    // in finalizeNode().
+    if (Dest != nullptr && Dest->getLinkedTo() != nullptr) {
+      LinkedToFixups.emplace_back(Dest);
+    }
+  }
+  bool shouldSkipRemainingInstructions() const {
+    return ShouldSkipRemainingInstructions;
+  }
+  bool isUnconditionallyExecuted() const { return WaitingForLabel == nullptr; }
+
+  /// Note: the handle*() functions return true to indicate that the instruction
+  /// has now been handled and that the instruction loop should continue to the
+  /// next instruction in the block (and return false otherwise).  In addition,
+  /// they set the ShouldSkipRemainingInstructions flag to indicate that no more
+  /// instructions in the block should be processed.
+
+  /// Handle an "unwanted" instruction by returning true.
+  bool handleUnwantedInstruction() {
+    // We can limit the splitting to an arbitrary subset of the instructions,
+    // and still expect correct code.  As such, we can do instruction-subset
+    // bisection to help debug any problems in this pass.
+    static constexpr char AnInstructionHasNoName[] = "";
+    if (!BuildDefs::minimal() &&
+        !getFlags().matchSplitInsts(AnInstructionHasNoName,
+                                    Instr->getNumber())) {
+      return true;
+    }
+    if (!llvm::isa<InstTarget>(Instr)) {
+      // Ignore non-lowered instructions like FakeDef/FakeUse.
+      return true;
+    }
+    return false;
+  }
+
+  /// Process a potential label instruction.
+  bool handleLabel() {
+    if (!Instr->isLabel())
+      return false;
+    // A Label instruction shouldn't have any operands, so it can be handled
+    // right here and then move on.
+    assert(Dest == nullptr);
+    assert(Instr->getSrcSize() == 0);
+    if (Instr == WaitingForLabel) {
+      // If we found the forward-branch-target Label instruction we're waiting
+      // for, then clear the WaitingForLabel state.
+      WaitingForLabel = nullptr;
+    } else if (WaitingForLabel == nullptr && WaitingForBranchTo == nullptr) {
+      // If we found a new Label instruction while the WaitingFor* state is
+      // clear, then set things up for this being a backward branch target.
+      WaitingForBranchTo = Instr;
+    } else {
+      // We see something we don't understand, so skip to the next block.
+      ShouldSkipRemainingInstructions = true;
+    }
+    return true;
+  }
+
+  /// Process a potential intra-block branch instruction.
+  bool handleIntraBlockBranch() {
+    const Inst *Label = Instr->getIntraBlockBranchTarget();
+    if (Label == nullptr)
+      return false;
+    // An intra-block branch instruction shouldn't have any operands, so it can
+    // be handled right here and then move on.
+    assert(Dest == nullptr);
+    assert(Instr->getSrcSize() == 0);
+    if (WaitingForBranchTo == Label && WaitingForLabel == nullptr) {
+      WaitingForBranchTo = nullptr;
+    } else if (WaitingForBranchTo == nullptr &&
+               (WaitingForLabel == nullptr || WaitingForLabel == Label)) {
+      WaitingForLabel = Label;
+    } else {
+      // We see something we don't understand, so skip to the next block.
+      ShouldSkipRemainingInstructions = true;
+    }
+    return true;
+  }
+
+  /// Specially process a potential "Variable=Variable" assignment instruction,
+  /// when it conforms to certain patterns.  Returns true if the instruction
+  /// was handled here (or needed no transformation); returns false to let the
+  /// caller fall through to the general-instruction handling.
+  bool handleSimpleVarAssign() {
+    if (!Instr->isVarAssign())
+      return false;
+    const bool DestIsInf = isInf(Dest);
+    const bool DestIsAllocable = isAllocable(Dest);
+    // A var-assign's single source operand is a Variable; cast<> would assert
+    // otherwise.
+    auto *SrcVar = llvm::cast<Variable>(Instr->getSrc(0));
+    const bool SrcIsInf = isInf(SrcVar);
+    const bool SrcIsAllocable = isAllocable(SrcVar);
+    if (DestIsInf && SrcIsInf) {
+      // The instruction:
+      //   t:inf = u:inf
+      // No transformation is needed.
+      return true;
+    }
+    if (DestIsInf && SrcIsAllocable && Dest->getType() == SrcVar->getType()) {
+      // The instruction:
+      //   t:inf = v
+      // gets transformed to:
+      //   t:inf = v1
+      //   v2 = t:inf
+      // where:
+      //   v1 := map[v]
+      //   v2 := linkTo(v)
+      //   map[v] := v2
+      //
+      // If both v2 and its linkedToStackRoot get a stack slot, then "v2=t:inf"
+      // is recognized as a redundant assignment and elided.
+      //
+      // Note that if the dest and src types are different, then this is
+      // actually a truncation operation, which would make "v2=t:inf" an invalid
+      // instruction.  In this case, the type test will make it fall through to
+      // the general case below.
+      //
+      // The current mapping must be fetched before makeLinked() updates it.
+      Variable *OldMapped = VarMap.get(SrcVar);
+      Instr->replaceSource(0, OldMapped);
+      if (isUnconditionallyExecuted()) {
+        // Only create new mapping state if the instruction is unconditionally
+        // executed.
+        if (!VarMap.isInstLastUseOfVar(SrcVar, Instr)) {
+          Variable *NewMapped = VarMap.makeLinked(SrcVar);
+          // Insert "v2 = t:inf" immediately after the current instruction.
+          Inst *Mov = Target->createLoweredMove(NewMapped, Dest);
+          Node->getInsts().insert(IterNext, Mov);
+        }
+      }
+      return true;
+    }
+    if (DestIsAllocable && SrcIsInf) {
+      // If Dest is never used again within this block, there is no need to
+      // create a linked copy for it.
+      if (!VarMap.isDestUsedInBlock(Dest)) {
+        return true;
+      }
+      // The instruction:
+      //   v = t:inf
+      // gets transformed to:
+      //   v = t:inf
+      //   v2 = t:inf
+      // where:
+      //   v2 := linkTo(v)
+      //   map[v] := v2
+      //
+      // If both v2 and v get a stack slot, then "v2=t:inf" is recognized as a
+      // redundant assignment and elided.
+      if (isUnconditionallyExecuted()) {
+        // Only create new mapping state if the instruction is unconditionally
+        // executed.
+        Variable *NewMapped = VarMap.makeLinked(Dest);
+        Inst *Mov = Target->createLoweredMove(NewMapped, SrcVar);
+        Node->getInsts().insert(IterNext, Mov);
+      } else {
+        // For a conditionally executed instruction, add a redefinition of the
+        // original Dest mapping, without creating a new linked variable.
+        Variable *OldMapped = VarMap.get(Dest);
+        Inst *Mov = Target->createLoweredMove(OldMapped, SrcVar);
+        Mov->setDestRedefined();
+        Node->getInsts().insert(IterNext, Mov);
+      }
+      return true;
+    }
+    // No special pattern matched; fall through to handleGeneralInst().  The
+    // skip flag is never set on this path.
+    assert(!ShouldSkipRemainingInstructions);
+    return false;
+  }
+
+  /// Process the dest Variable of a Phi instruction.  Always returns true,
+  /// since a Phi never needs general-instruction handling.
+  bool handlePhi() {
+    assert(llvm::isa<InstPhi>(Instr));
+    // Only a register-allocatable dest that is actually used again within this
+    // block needs a linked copy; otherwise there is nothing to do.
+    if (!isAllocable(Dest) || !VarMap.isDestUsedInBlock(Dest))
+      return true;
+    Variable *LinkedCopy = VarMap.makeLinked(Dest);
+    Node->getInsts().insert(IterCur,
+                            Target->createLoweredMove(LinkedCopy, Dest));
+    return true;
+  }
+
+  /// Process an arbitrary instruction.  Always returns true; the instruction
+  /// is rewritten in place and any new moves are inserted around it.
+  bool handleGeneralInst() {
+    const bool DestIsAllocable = isAllocable(Dest);
+    // The (non-variable-assignment) instruction:
+    //   ... = F(v)
+    // where v is not infinite-weight, gets transformed to:
+    //   v2 = v1
+    //   ... = F(v1)
+    // where:
+    //   v1 := map[v]
+    //   v2 := linkTo(v)
+    //   map[v] := v2
+    // After that, if the "..." dest=u is not infinite-weight, append:
+    //   u2 = u
+    // where:
+    //   u2 := linkTo(u)
+    //   map[u] := u2
+    for (SizeT i = 0; i < Instr->getSrcSize(); ++i) {
+      // Iterate over the top-level src vars.  Don't bother to dig into
+      // e.g. MemOperands because their vars should all be infinite-weight.
+      // (This assumption would need to change if the pass were done
+      // pre-lowering.)
+      if (auto *SrcVar = llvm::dyn_cast<Variable>(Instr->getSrc(i))) {
+        const bool SrcIsAllocable = isAllocable(SrcVar);
+        if (SrcIsAllocable) {
+          // Fetch the current mapping before makeLinked() updates it, so the
+          // rewritten src refers to v1, not v2.
+          Variable *OldMapped = VarMap.get(SrcVar);
+          if (isUnconditionallyExecuted()) {
+            // Only create new mapping state ("v2 = v1") if the instruction is
+            // unconditionally executed, and only if v is used again later in
+            // the block.
+            if (!VarMap.isInstLastUseOfVar(SrcVar, Instr)) {
+              Variable *NewMapped = VarMap.makeLinked(SrcVar);
+              Inst *Mov = Target->createLoweredMove(NewMapped, OldMapped);
+              Node->getInsts().insert(IterCur, Mov);
+            }
+          }
+          // Rewrite the src to the prior mapping regardless of whether a new
+          // linked variable was created.
+          Instr->replaceSource(i, OldMapped);
+        }
+      }
+    }
+    // Transformation of Dest is the same as the "v=t:inf" case above.
+    if (DestIsAllocable && VarMap.isDestUsedInBlock(Dest)) {
+      if (isUnconditionallyExecuted()) {
+        Variable *NewMapped = VarMap.makeLinked(Dest);
+        Inst *Mov = Target->createLoweredMove(NewMapped, Dest);
+        Node->getInsts().insert(IterNext, Mov);
+      } else {
+        // Conditionally executed: redefine the existing mapping instead of
+        // creating a new linked variable, mirroring handleSimpleVarAssign().
+        Variable *OldMapped = VarMap.get(Dest);
+        Inst *Mov = Target->createLoweredMove(OldMapped, Dest);
+        Mov->setDestRedefined();
+        Node->getInsts().insert(IterNext, Mov);
+      }
+    }
+    return true;
+  }
+
+private:
+  /// Used to create target-specific lowered move instructions.
+  TargetLowering *Target;
+  /// The basic block currently being processed.
+  CfgNode *Node = nullptr;
+  /// The instruction currently being processed (set via setInst()).
+  Inst *Instr = nullptr;
+  /// Instr's dest Variable; may be nullptr (e.g. for branch instructions).
+  Variable *Dest = nullptr;
+  /// Insertion point for new instructions placed before Instr.
+  InstList::iterator IterCur;
+  /// Insertion point for new instructions placed after Instr.
+  InstList::iterator IterNext;
+  /// When set, the remaining instructions of the current block are left
+  /// untransformed (e.g. after unrecognized intra-block control flow).
+  bool ShouldSkipRemainingInstructions = false;
+  /// Maps each original Variable to its most recent linked copy in the block.
+  VariableMap VarMap;
+  /// Preexisting LinkedTo relationships recorded during the block, to be
+  /// spliced into a per-root chain when the block is finalized.
+  CfgVector<Variable *> LinkedToFixups;
+  /// WaitingForLabel and WaitingForBranchTo are for tracking intra-block
+  /// control flow.
+  const Inst *WaitingForLabel = nullptr;
+  const Inst *WaitingForBranchTo = nullptr;
+};
+
+} // end of anonymous namespace
+
+/// Within each basic block, rewrite Variable references in terms of chained
+/// copies of the original Variable.  For example:
+///   A = B + C
+/// might be rewritten as:
+///   B1 = B
+///   C1 = C
+///   A = B + C
+///   A1 = A
+/// and then:
+///   D = A + B
+/// might be rewritten as:
+///   A2 = A1
+///   B2 = B1
+///   D = A1 + B1
+///   D1 = D
+///
+/// The purpose is to present the linear-scan register allocator with smaller
+/// live ranges, to help mitigate its "all or nothing" allocation strategy,
+/// while counting on its preference mechanism to keep the split versions in the
+/// same register when possible.
+///
+/// When creating new Variables, A2 is linked to A1 which is linked to A, and
+/// similar for the other Variable linked-to chains.  Rewrites apply only to
+/// Variables where mayHaveReg() is true.
+///
+/// At code emission time, redundant linked-to stack assignments will be
+/// recognized and elided.  To illustrate using the above example, if A1 gets a
+/// register but A and A2 are on the stack, the "A2=A1" store instruction is
+/// redundant since A and A2 share the same stack slot and A1 originated from A.
+///
+/// Simple assignment instructions are rewritten slightly differently, to take
+/// maximal advantage of Variables known to have registers.
+///
+/// In general, there may be several valid ways to rewrite an instruction: add
+/// the new assignment instruction either before or after the original
+/// instruction, and rewrite the original instruction with either the old or the
+/// new variable mapping.  We try to pick a strategy most likely to avoid
+/// potential performance problems.  For example, try to avoid storing to the
+/// stack and then immediately reloading from the same location.  One
+/// consequence is that code might be generated that loads a register from a
+/// stack location, followed almost immediately by another use of the same stack
+/// location, despite its value already being available in a register as a
+/// result of the first instruction.  However, the performance impact here is
+/// likely to be negligible, and a simple availability peephole optimization
+/// could clean it up.
+///
+/// This pass potentially adds a lot of new instructions and variables, and as
+/// such there are compile-time performance concerns, particularly with liveness
+/// analysis and register allocation.  Note that for liveness analysis, the new
+/// variables have single-block liveness, so they don't increase the size of the
+/// liveness bit vectors that need to be merged across blocks.  As a result, the
+/// performance impact is likely to be linearly related to the number of new
+/// instructions, rather than number of new variables times number of blocks
+/// which would be the case if they were multi-block variables.
+void splitBlockLocalVariables(Cfg *Func) {
+  if (!getFlags().getSplitLocalVars())
+    return;
+  TimerMarker _(TimerStack::TT_splitLocalVars, Func);
+  LocalVariableSplitter Splitter(Func);
+  // TODO(stichnot): Fix this mechanism for LinkedTo variables and stack slot
+  // assignment.
+  //
+  // To work around shortcomings with stack frame mapping, we want to arrange
+  // LinkedTo structure such that within one block, the LinkedTo structure
+  // leading to a root forms a list, not a tree.  A LinkedTo root can have
+  // multiple children linking to it, but only one per block.  Furthermore,
+  // because stack slot mapping processes variables in numerical order, the
+  // LinkedTo chain needs to be ordered such that when A->getLinkedTo() == B,
+  // then A->getIndex() > B->getIndex().
+  //
+  // To effect this, while processing a block we keep track of preexisting
+  // LinkedTo relationships via the LinkedToFixups vector, and at the end of the
+  // block we splice them in such that the block has a single chain for each
+  // root, ordered by getIndex() value.
+  for (CfgNode *Node : Func->getNodes()) {
+    // Establish the current block.  Per-block state (the variable mapping and
+    // LinkedTo fixups) lives inside the Splitter.  NOTE(review): this assumes
+    // setNode() resets that state for each new block -- confirm in the class
+    // implementation.  (A previously-declared local LinkedToFixups vector was
+    // never read or written after being cleared, so it has been removed.)
+    Splitter.setNode(Node);
+    auto &Insts = Node->getInsts();
+    auto Iter = Insts.begin();
+    auto IterEnd = Insts.end();
+    // TODO(stichnot): Figure out why Phi processing usually degrades
+    // performance.  Disable for now.
+    static constexpr bool ProcessPhis = false;
+    if (ProcessPhis) {
+      for (Inst &Instr : Node->getPhis()) {
+        if (Instr.isDeleted())
+          continue;
+        // For Phi processing, new moves go at the start of the block, so both
+        // the "before" and "after" insertion points are Insts.begin().
+        Splitter.setInst(&Instr, Iter, Iter);
+        Splitter.handlePhi();
+      }
+    }
+    InstList::iterator NextIter;
+    for (; Iter != IterEnd && !Splitter.shouldSkipRemainingInstructions();
+         Iter = NextIter) {
+      NextIter = Iter;
+      ++NextIter;
+      Inst *Instr = iteratorToInst(Iter);
+      if (Instr->isDeleted())
+        continue;
+      Splitter.setInst(Instr, Iter, NextIter);
+
+      // Before doing any transformations, take care of the bookkeeping for
+      // intra-block branching.
+      //
+      // This is tricky because the transformation for one instruction may
+      // depend on a transformation for a previous instruction, but if that
+      // previous instruction is not dynamically executed due to intra-block
+      // control flow, it may lead to an inconsistent state and incorrect code.
+      //
+      // We want to handle some simple cases, and reject some others:
+      //
+      // 1. For something like a select instruction, we could have:
+      //   test cond
+      //   dest = src_false
+      //   branch conditionally to label
+      //   dest = src_true
+      //   label:
+      //
+      // Between the conditional branch and the label, we need to treat dest and
+      // src variables specially, specifically not creating any new state.
+      //
+      // 2. Some 64-bit atomic instructions may be lowered to a loop:
+      //   label:
+      //   ...
+      //   branch conditionally to label
+      //
+      // No special treatment is needed, but it's worth tracking so that case #1
+      // above can also be handled.
+      //
+      // 3. Advanced switch lowering can create really complex intra-block
+      // control flow, so when we recognize this, we should just stop splitting
+      // for the remainder of the block (which isn't much since a switch
+      // instruction is a terminator).
+      //
+      // 4. Other complex lowering, e.g. an i64 icmp on a 32-bit architecture,
+      // can result in an if/then/else like structure with two labels.  One
+      // possibility would be to suspend splitting for the remainder of the
+      // lowered instruction, and then resume for the remainder of the block,
+      // but since we don't have high-level instruction markers, we might as
+      // well just stop splitting for the remainder of the block.
+      if (Splitter.handleLabel())
+        continue;
+      if (Splitter.handleIntraBlockBranch())
+        continue;
+      if (Splitter.handleUnwantedInstruction())
+        continue;
+
+      // Intra-block bookkeeping is complete, now do the transformations.
+
+      // Determine the transformation based on the kind of instruction, and
+      // whether its Variables are infinite-weight.  New instructions can be
+      // inserted before the current instruction via Iter, or after the current
+      // instruction via NextIter.
+      if (Splitter.handleSimpleVarAssign())
+        continue;
+      if (Splitter.handleGeneralInst())
+        continue;
+    }
+    Splitter.finalizeNode();
+  }
+
+  Func->dump("After splitting local variables");
+}
+
+} // end of namespace Ice
diff --git a/src/IceVariableSplitting.h b/src/IceVariableSplitting.h
new file mode 100644
index 0000000..0da8d1d
--- /dev/null
+++ b/src/IceVariableSplitting.h
@@ -0,0 +1,25 @@
+//===- subzero/src/IceVariableSplitting.h - Local var splitting -*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Aggressive block-local variable splitting to improve linear-scan
+/// register allocation.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef SUBZERO_SRC_ICEVARIABLESPLITTING_H
+#define SUBZERO_SRC_ICEVARIABLESPLITTING_H
+
+namespace Ice {
+
+/// Splits successive block-local uses of a Variable into a chain of linked
+/// copies, presenting the linear-scan register allocator with smaller live
+/// ranges.  Intended to run after target lowering and before register
+/// allocation; a no-op unless the split-local-vars flag is enabled.
+void splitBlockLocalVariables(class Cfg *Func);
+
+} // end of namespace Ice
+
+#endif // SUBZERO_SRC_ICEVARIABLESPLITTING_H