Subzero: Implementation of "advanced Phi lowering".

Delays Phi lowering until after register allocation.  This lets the Phi assignment order take register allocation into account and avoid creating false dependencies.

All edges that lead to Phi instructions are split, and the new node gets mov instructions in the correct topological order, using available physical registers as needed.

This lowering style is controllable under -O2 using -phi-edge-split (enabled by default).

The result is faster translation time (due to fewer temporaries leading to faster liveness analysis and register allocation) as well as better code quality (due to better register allocation and fewer phi-based assignments).

BUG= none
R=jvoung@chromium.org

Review URL: https://codereview.chromium.org/680733002
diff --git a/pydir/szbuild_spec2k.py b/pydir/szbuild_spec2k.py
index fb4321f..092a835 100755
--- a/pydir/szbuild_spec2k.py
+++ b/pydir/szbuild_spec2k.py
@@ -16,10 +16,11 @@
     './run_all.sh RunBenchmarks SetupGccX8632Opt {train|ref} ...'
     """
     nacl_root = FindBaseNaCl()
-    components = [ '164.gzip', '175.vpr', '176.gcc', '177.mesa', '179.art', 
-                   '181.mcf', '183.equake', '186.crafty', '188.ammp',
-                   '197.parser', '252.eon', '253.perlbmk', '254.gap',
-                   '255.vortex', '256.bzip2', '300.twolf' ]
+    # Use the same default ordering as spec2k/run_all.sh.
+    components = [ '177.mesa', '179.art', '183.equake', '188.ammp', '164.gzip',
+                   '175.vpr', '176.gcc', '181.mcf', '186.crafty', '197.parser',
+                   '253.perlbmk', '254.gap', '255.vortex', '256.bzip2',
+                   '300.twolf', '252.eon' ]
     
     argparser = argparse.ArgumentParser(description=main.__doc__)
     szbuild.AddOptionalArgs(argparser)
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index 04e7acd..84aff16 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -116,6 +116,87 @@
     Node->deletePhis();
 }
 
+void Cfg::advancedPhiLowering() {
+  TimerMarker T(TimerStack::TT_advancedPhiLowering, this);
+  // This splits edges and appends new nodes to the end of the node
+  // list.  This can invalidate iterators, so don't use an iterator.
+  SizeT NumNodes = getNumNodes();
+  for (SizeT I = 0; I < NumNodes; ++I)
+    Nodes[I]->advancedPhiLowering();
+}
+
+// Find a reasonable placement for nodes that have not yet been
+// placed, while maintaining the same relative ordering among already
+// placed nodes.
+void Cfg::reorderNodes() {
+  typedef std::list<CfgNode *> PlacedList;
+  PlacedList Placed;      // Nodes with relative placement locked down
+  PlacedList Unreachable; // Unreachable nodes
+  PlacedList::iterator NoPlace = Placed.end();
+  // Keep track of where each node has been tentatively placed so that
+  // we can manage insertions into the middle.
+  std::vector<PlacedList::iterator> PlaceIndex(Nodes.size(), NoPlace);
+  for (CfgNode *Node : Nodes) {
+    // The "do ... while(0);" construct is to factor out the
+    // --PlaceIndex and assert() statements before moving to the next
+    // node.
+    do {
+      if (!Node->needsPlacement()) {
+        // Add to the end of the Placed list.
+        Placed.push_back(Node);
+        PlaceIndex[Node->getIndex()] = Placed.end();
+        continue;
+      }
+      Node->setNeedsPlacement(false);
+      if (Node != getEntryNode() && Node->getInEdges().size() == 0) {
+        // The node has essentially been deleted since it is not a
+        // successor of any other node.
+        Unreachable.push_back(Node);
+        PlaceIndex[Node->getIndex()] = Unreachable.end();
+        continue;
+      }
+      // Assume for now that the unplaced node is from edge-splitting
+      // and therefore has 1 in-edge and 1 out-edge (actually, possibly
+      // more than 1 in-edge if the predecessor node was contracted).
+      // If this changes in the future, rethink the strategy.
+      assert(Node->getInEdges().size() >= 1);
+      assert(Node->getOutEdges().size() == 1);
+
+      // If it's a (non-critical) edge where the successor has a single
+      // in-edge, then place it before the successor.
+      CfgNode *Succ = Node->getOutEdges()[0];
+      if (Succ->getInEdges().size() == 1 &&
+          PlaceIndex[Succ->getIndex()] != NoPlace) {
+        Placed.insert(PlaceIndex[Succ->getIndex()], Node);
+        PlaceIndex[Node->getIndex()] = PlaceIndex[Succ->getIndex()];
+        continue;
+      }
+
+      // Otherwise, place it after the (first) predecessor.
+      CfgNode *Pred = Node->getInEdges()[0];
+      auto PredPosition = PlaceIndex[Pred->getIndex()];
+      // It shouldn't be the case that PredPosition==NoPlace, but if
+      // that somehow turns out to be true, we just insert Node before
+      // PredPosition=NoPlace=Placed.end() .
+      if (PredPosition != NoPlace)
+        ++PredPosition;
+      Placed.insert(PredPosition, Node);
+      PlaceIndex[Node->getIndex()] = PredPosition;
+    } while (0);
+
+    --PlaceIndex[Node->getIndex()];
+    assert(*PlaceIndex[Node->getIndex()] == Node);
+  }
+
+  // Reorder Nodes according to the built-up lists.
+  SizeT Cur = 0;
+  for (CfgNode *Node : Placed)
+    Nodes[Cur++] = Node;
+  for (CfgNode *Node : Unreachable)
+    Nodes[Cur++] = Node;
+  assert(Cur == Nodes.size());
+}
+
 void Cfg::doArgLowering() {
   TimerMarker T(TimerStack::TT_doArgLowering, this);
   getTarget()->lowerArguments();
@@ -297,6 +378,12 @@
   }
 }
 
+void Cfg::contractEmptyNodes() {
+  for (CfgNode *Node : Nodes) {
+    Node->contractIfEmpty();
+  }
+}
+
 void Cfg::doBranchOpt() {
   TimerMarker T(TimerStack::TT_doBranchOpt, this);
   for (auto I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
diff --git a/src/IceCfg.h b/src/IceCfg.h
index 93aa159..ad02831 100644
--- a/src/IceCfg.h
+++ b/src/IceCfg.h
@@ -111,6 +111,8 @@
   void placePhiLoads();
   void placePhiStores();
   void deletePhis();
+  void advancedPhiLowering();
+  void reorderNodes();
   void doAddressOpt();
   void doArgLowering();
   void doNopInsertion();
@@ -120,6 +122,7 @@
   void liveness(LivenessMode Mode);
   bool validateLiveness() const;
   void deleteRedundantAssignments();
+  void contractEmptyNodes();
   void doBranchOpt();
 
   // Manage the CurrentNode field, which is used for validating the
diff --git a/src/IceCfgNode.cpp b/src/IceCfgNode.cpp
index 4243648..661dd73 100644
--- a/src/IceCfgNode.cpp
+++ b/src/IceCfgNode.cpp
@@ -24,7 +24,7 @@
 
 CfgNode::CfgNode(Cfg *Func, SizeT LabelNumber, IceString Name)
     : Func(Func), Number(LabelNumber), Name(Name), HasReturn(false),
-      InstCountEstimate(0) {}
+      NeedsPlacement(false), InstCountEstimate(0) {}
 
 // Returns the name the node was created with.  If no name was given,
 // it synthesizes a (hopefully) unique name.
@@ -204,6 +204,276 @@
     I->setDeleted();
 }
 
+// Splits the edge from Pred to this node by creating a new node and
+// hooking up the in and out edges appropriately.  (The EdgeIndex
+// parameter is only used to make the new node's name unique when
+// there are multiple edges between the same pair of nodes.)  The new
+// node's instruction list is initialized to the empty list, with no
+// terminator instruction.  If there are multiple edges from Pred to
+// this node, only one edge is split, and the particular choice of
+// edge is undefined.  This could happen with a switch instruction, or
+// a conditional branch that weirdly has both branches to the same
+// place.  TODO(stichnot,kschimpf): Figure out whether this is legal
+// in the LLVM IR or the PNaCl bitcode, and if so, we need to
+// establish a strong relationship among the ordering of Pred's
+// out-edge list, this node's in-edge list, and the Phi instruction's
+// operand list.
+CfgNode *CfgNode::splitIncomingEdge(CfgNode *Pred, SizeT EdgeIndex) {
+  CfgNode *NewNode =
+      Func->makeNode("split_" + Pred->getName() + "_" + getName() + "_" +
+                     std::to_string(EdgeIndex));
+  // The new node is added to the end of the node list, and will later
+  // need to be sorted into a reasonable topological order.
+  NewNode->setNeedsPlacement(true);
+  // Repoint Pred's out-edge.
+  bool Found = false;
+  for (auto I = Pred->OutEdges.begin(), E = Pred->OutEdges.end();
+       !Found && I != E; ++I) {
+    if (*I == this) {
+      *I = NewNode;
+      NewNode->InEdges.push_back(Pred);
+      Found = true;
+    }
+  }
+  assert(Found);
+  // Repoint this node's in-edge.
+  Found = false;
+  for (auto I = InEdges.begin(), E = InEdges.end(); !Found && I != E; ++I) {
+    if (*I == Pred) {
+      *I = NewNode;
+      NewNode->OutEdges.push_back(this);
+      Found = true;
+    }
+  }
+  assert(Found);
+  // Repoint a suitable branch instruction's target.
+  Found = false;
+  for (auto I = Pred->getInsts().rbegin(), E = Pred->getInsts().rend();
+       !Found && I != E; ++I) {
+    if (!(*I)->isDeleted()) {
+      Found = (*I)->repointEdge(this, NewNode);
+    }
+  }
+  assert(Found);
+  return NewNode;
+}
+
+namespace {
+
+// Helper function used by advancedPhiLowering().
+bool sameVarOrReg(const Variable *Var, const Operand *Opnd) {
+  if (Var == Opnd)
+    return true;
+  if (const auto Var2 = llvm::dyn_cast<Variable>(Opnd)) {
+    if (Var->hasReg() && Var->getRegNum() == Var2->getRegNum())
+      return true;
+  }
+  return false;
+}
+
+} // end of anonymous namespace
+
+// This is the "advanced" version of Phi lowering for a basic block, in
+// contrast to the simple version that lowers through assignments
+// involving temporaries.
+//
+// All Phi instructions in a basic block are conceptually executed in
+// parallel.  However, if we lower Phis early and commit to a
+// sequential ordering, we may end up creating unnecessary
+// interferences which lead to worse register allocation.  Delaying
+// Phi scheduling until after register allocation can help unless
+// there are no free registers for shuffling registers or stack slots
+// and spilling becomes necessary.
+//
+// The advanced Phi lowering starts by finding a topological sort of
+// the Phi instructions, where "A=B" comes before "B=C" due to the
+// anti-dependence on B.  If a topological sort is not possible due to
+// a cycle, the cycle is broken by introducing a non-parallel
+// temporary.  For example, a cycle arising from a permutation like
+// "A=B;B=C;C=A" can become "T=A;A=B;B=C;C=T".  All else being equal,
+// prefer to schedule assignments with register-allocated Src operands
+// earlier, in case that register becomes free afterwards, and prefer
+// to schedule assignments with register-allocated Dest variables
+// later, to keep that register free for longer.
+//
+// Once the ordering is determined, the Cfg edge is split and the
+// assignment list is lowered by the target lowering layer.  The
+// specific placement of the new node within the Cfg node list is
+// deferred until later, including after empty node contraction.
+void CfgNode::advancedPhiLowering() {
+  if (getPhis().empty())
+    return;
+
+  // Count the number of non-deleted Phi instructions.
+  struct {
+    InstPhi *Phi;
+    Variable *Dest;
+    Operand *Src;
+    bool Processed;
+    size_t NumPred; // number of entries whose Src is this Dest
+    int32_t Weight; // preference for topological order
+  } Desc[getPhis().size()];
+
+  size_t NumPhis = 0;
+  for (InstPhi *Inst : getPhis()) {
+    if (!Inst->isDeleted()) {
+      Desc[NumPhis].Phi = Inst;
+      Desc[NumPhis].Dest = Inst->getDest();
+      ++NumPhis;
+    }
+  }
+  if (NumPhis == 0)
+    return;
+
+  SizeT InEdgeIndex = 0;
+  for (CfgNode *Pred : InEdges) {
+    CfgNode *Split = splitIncomingEdge(Pred, InEdgeIndex++);
+    AssignList Assignments;
+    SizeT Remaining = NumPhis;
+
+    // First pass computes Src and initializes NumPred.
+    for (size_t I = 0; I < NumPhis; ++I) {
+      Variable *Dest = Desc[I].Dest;
+      Operand *Src = Desc[I].Phi->getOperandForTarget(Pred);
+      Desc[I].Src = Src;
+      Desc[I].Processed = false;
+      Desc[I].NumPred = 0;
+      // Cherry-pick any trivial assignments, so that they don't
+      // contribute to the running complexity of the topological sort.
+      if (sameVarOrReg(Dest, Src)) {
+        Desc[I].Processed = true;
+        --Remaining;
+        if (Dest != Src)
+          // If Dest and Src are syntactically the same, don't bother
+          // adding the assignment, because in all respects it would
+          // be redundant, and if Dest/Src are on the stack, the
+          // target lowering may naively decide to lower it using a
+          // temporary register.
+          Assignments.push_back(InstAssign::create(Func, Dest, Src));
+      }
+    }
+    // Second pass computes NumPred by comparing every pair of Phi
+    // instructions.
+    for (size_t I = 0; I < NumPhis; ++I) {
+      if (Desc[I].Processed)
+        continue;
+      const Variable *Dest = Desc[I].Dest;
+      for (size_t J = 0; J < NumPhis; ++J) {
+        if (Desc[J].Processed)
+          continue;
+        if (I != J) {
+          // There shouldn't be two Phis with the same Dest variable
+          // or register.
+          assert(!sameVarOrReg(Dest, Desc[J].Dest));
+        }
+        const Operand *Src = Desc[J].Src;
+        if (sameVarOrReg(Dest, Src))
+          ++Desc[I].NumPred;
+      }
+    }
+
+    // Another pass to compute initial Weight values.
+
+    // Always pick NumPred=0 over NumPred>0.
+    const int32_t WeightNoPreds = 4;
+    // Prefer Src as a register because the register might free up.
+    const int32_t WeightSrcIsReg = 2;
+    // Prefer Dest not as a register because the register stays free
+    // longer.
+    const int32_t WeightDestNotReg = 1;
+
+    for (size_t I = 0; I < NumPhis; ++I) {
+      if (Desc[I].Processed)
+        continue;
+      int32_t Weight = 0;
+      if (Desc[I].NumPred == 0)
+        Weight += WeightNoPreds;
+      if (auto Var = llvm::dyn_cast<Variable>(Desc[I].Src))
+        if (Var->hasReg())
+          Weight += WeightSrcIsReg;
+      if (!Desc[I].Dest->hasReg())
+        Weight += WeightDestNotReg;
+      Desc[I].Weight = Weight;
+    }
+
+    // Repeatedly choose and process the best candidate in the
+    // topological sort, until no candidates remain.  This
+    // implementation is O(N^2) where N is the number of Phi
+    // instructions, but with a small constant factor compared to a
+    // likely implementation of O(N) topological sort.
+    for (; Remaining; --Remaining) {
+      size_t BestIndex = 0;
+      int32_t BestWeight = -1;
+      // Find the best candidate.
+      for (size_t I = 0; I < NumPhis; ++I) {
+        if (Desc[I].Processed)
+          continue;
+        int32_t Weight = 0;
+        Weight = Desc[I].Weight;
+        if (Weight > BestWeight) {
+          BestIndex = I;
+          BestWeight = Weight;
+        }
+      }
+      assert(BestWeight >= 0);
+      assert(Desc[BestIndex].NumPred <= 1);
+      Variable *Dest = Desc[BestIndex].Dest;
+      Operand *Src = Desc[BestIndex].Src;
+      assert(!sameVarOrReg(Dest, Src));
+      // Break a cycle by introducing a temporary.
+      if (Desc[BestIndex].NumPred) {
+        bool Found = false;
+        // If the target instruction "A=B" is part of a cycle, find
+        // the "X=A" assignment in the cycle because it will have to
+        // be rewritten as "X=tmp".
+        for (size_t J = 0; !Found && J < NumPhis; ++J) {
+          if (Desc[J].Processed)
+            continue;
+          Operand *OtherSrc = Desc[J].Src;
+          if (Desc[J].NumPred && sameVarOrReg(Dest, OtherSrc)) {
+            SizeT VarNum = Func->getNumVariables();
+            Variable *Tmp = Func->makeVariable(
+                OtherSrc->getType(), "__split_" + std::to_string(VarNum));
+            Tmp->setNeedsStackSlot();
+            Assignments.push_back(InstAssign::create(Func, Tmp, OtherSrc));
+            Desc[J].Src = Tmp;
+            Found = true;
+          }
+        }
+        assert(Found);
+      }
+      // Now that a cycle (if any) has been broken, create the actual
+      // assignment.
+      Assignments.push_back(InstAssign::create(Func, Dest, Src));
+      // Update NumPred for all Phi assignments using this Phi's Src
+      // as their Dest variable.  Also update Weight if NumPred
+      // dropped from 1 to 0.
+      if (auto Var = llvm::dyn_cast<Variable>(Src)) {
+        for (size_t I = 0; I < NumPhis; ++I) {
+          if (Desc[I].Processed)
+            continue;
+          if (sameVarOrReg(Var, Desc[I].Dest)) {
+            if (--Desc[I].NumPred == 0)
+              Desc[I].Weight += WeightNoPreds;
+          }
+        }
+      }
+      Desc[BestIndex].Processed = true;
+    }
+
+    Func->getTarget()->lowerPhiAssignments(Split, Assignments);
+
+    // Renumber the instructions to be monotonically increasing so
+    // that addNode() doesn't assert when multi-definitions are added
+    // out of order.
+    Split->renumberInstructions();
+    Func->getVMetadata()->addNode(Split);
+  }
+
+  for (InstPhi *Inst : getPhis())
+    Inst->setDeleted();
+}
+
 // Does address mode optimization.  Pass each instruction to the
 // TargetLowering object.  If it returns a new instruction
 // (representing the optimized address mode), then insert the new
@@ -240,7 +510,7 @@
 void CfgNode::genCode() {
   TargetLowering *Target = Func->getTarget();
   LoweringContext &Context = Target->getContext();
-  // Lower only the regular instructions.  Defer the Phi instructions.
+  // Lower the regular instructions.
   Context.init(this);
   while (!Context.atEnd()) {
     InstList::iterator Orig = Context.getCur();
@@ -250,6 +520,8 @@
     // Ensure target lowering actually moved the cursor.
     assert(Context.getCur() != Orig);
   }
+  // Do preliminary lowering of the Phi instructions.
+  Target->prelowerPhis();
 }
 
 void CfgNode::livenessLightweight() {
@@ -300,7 +572,6 @@
   Liveness->getLiveOut(this) = Live;
 
   // Process regular instructions in reverse order.
-  // TODO(stichnot): Use llvm::make_range with LLVM 3.5.
   for (auto I = Insts.rbegin(), E = Insts.rend(); I != E; ++I) {
     if ((*I)->isDeleted())
       continue;
@@ -309,13 +580,15 @@
   // Process phis in forward order so that we can override the
   // instruction number to be that of the earliest phi instruction in
   // the block.
+  SizeT NumNonDeadPhis = 0;
   InstNumberT FirstPhiNumber = Inst::NumberSentinel;
   for (InstPhi *I : Phis) {
     if (I->isDeleted())
       continue;
     if (FirstPhiNumber == Inst::NumberSentinel)
       FirstPhiNumber = I->getNumber();
-    I->liveness(FirstPhiNumber, Live, Liveness, LiveBegin, LiveEnd);
+    if (I->liveness(FirstPhiNumber, Live, Liveness, LiveBegin, LiveEnd))
+      ++NumNonDeadPhis;
   }
 
   // When using the sparse representation, after traversing the
@@ -350,9 +623,12 @@
   // Add in current LiveIn
   Live |= LiveIn;
   // Check result, set LiveIn=Live
-  Changed = (Live != LiveIn);
-  if (Changed)
+  SizeT &PrevNumNonDeadPhis = Liveness->getNumNonDeadPhis(this);
+  bool LiveInChanged = (Live != LiveIn);
+  Changed = (NumNonDeadPhis != PrevNumNonDeadPhis || LiveInChanged);
+  if (LiveInChanged)
     LiveIn = Live;
+  PrevNumNonDeadPhis = NumNonDeadPhis;
   return Changed;
 }
 
@@ -472,6 +748,40 @@
   }
 }
 
+// If this node contains only deleted instructions, and ends in an
+// unconditional branch, contract the node by repointing all its
+// in-edges to its successor.
+void CfgNode::contractIfEmpty() {
+  if (InEdges.size() == 0)
+    return;
+  Inst *Branch = NULL;
+  for (Inst *I : Insts) {
+    if (!I->isDeleted() && !I->isUnconditionalBranch())
+      return;
+    Branch = I;
+  }
+  Branch->setDeleted();
+  assert(OutEdges.size() == 1);
+  // Repoint all this node's in-edges to this node's successor.
+  for (CfgNode *Pred : InEdges) {
+    for (auto I = Pred->OutEdges.begin(), E = Pred->OutEdges.end(); I != E;
+         ++I) {
+      if (*I == this) {
+        *I = OutEdges[0];
+        OutEdges[0]->InEdges.push_back(Pred);
+      }
+    }
+    for (Inst *I : Pred->getInsts()) {
+      if (!I->isDeleted())
+        I->repointEdge(this, OutEdges[0]);
+    }
+  }
+  InEdges.clear();
+  // Don't bother removing the single out-edge, which would also
+  // require finding the corresponding in-edge in the successor and
+  // removing it.
+}
+
 void CfgNode::doBranchOpt(const CfgNode *NextNode) {
   TargetLowering *Target = Func->getTarget();
   // Check every instruction for a branch optimization opportunity.
@@ -479,8 +789,11 @@
   // first opportunity, unless there is some target lowering where we
   // have the possibility of multiple such optimizations per block
   // (currently not the case for x86 lowering).
-  for (Inst *I : Insts)
-    Target->doBranchOpt(I, NextNode);
+  for (Inst *I : Insts) {
+    if (!I->isDeleted()) {
+      Target->doBranchOpt(I, NextNode);
+    }
+  }
 }
 
 // ======================== Dump routines ======================== //
diff --git a/src/IceCfgNode.h b/src/IceCfgNode.h
index 031c76c..aa836c0 100644
--- a/src/IceCfgNode.h
+++ b/src/IceCfgNode.h
@@ -46,6 +46,9 @@
   void setHasReturn() { HasReturn = true; }
   bool getHasReturn() const { return HasReturn; }
 
+  void setNeedsPlacement(bool Value) { NeedsPlacement = Value; }
+  bool needsPlacement() const { return NeedsPlacement; }
+
   // Access predecessor and successor edge lists.
   const NodeList &getInEdges() const { return InEdges; }
   const NodeList &getOutEdges() const { return OutEdges; }
@@ -64,16 +67,19 @@
   // Add a predecessor edge to the InEdges list for each of this
   // node's successors.
   void computePredecessors();
+  CfgNode *splitIncomingEdge(CfgNode *Pred, SizeT InEdgeIndex);
 
   void placePhiLoads();
   void placePhiStores();
   void deletePhis();
+  void advancedPhiLowering();
   void doAddressOpt();
   void doNopInsertion();
   void genCode();
   void livenessLightweight();
   bool liveness(Liveness *Liveness);
   void livenessPostprocess(LivenessMode Mode, Liveness *Liveness);
+  void contractIfEmpty();
   void doBranchOpt(const CfgNode *NextNode);
   void emit(Cfg *Func) const;
   void dump(Cfg *Func) const;
@@ -84,6 +90,7 @@
   const SizeT Number; // label index
   IceString Name;     // for dumping only
   bool HasReturn;     // does this block need an epilog?
+  bool NeedsPlacement;
   InstNumberT InstCountEstimate; // rough instruction count estimate
   NodeList InEdges;   // in no particular order
   NodeList OutEdges;  // in no particular order
diff --git a/src/IceDefs.h b/src/IceDefs.h
index 8974bae..0beb3a3 100644
--- a/src/IceDefs.h
+++ b/src/IceDefs.h
@@ -42,6 +42,7 @@
 class GlobalContext;
 class GlobalDeclaration;
 class Inst;
+class InstAssign;
 class InstPhi;
 class InstTarget;
 class LiveRange;
@@ -56,6 +57,7 @@
 // http://llvm.org/docs/ProgrammersManual.html#picking-the-right-data-structure-for-a-task
 typedef std::string IceString;
 typedef std::list<Inst *> InstList;
+typedef std::list<InstAssign *> AssignList;
 typedef std::list<InstPhi *> PhiList;
 typedef std::vector<Variable *> VarList;
 typedef std::vector<Operand *> OperandList;
diff --git a/src/IceInst.cpp b/src/IceInst.cpp
index c9eb95e..70b54b6 100644
--- a/src/IceInst.cpp
+++ b/src/IceInst.cpp
@@ -135,12 +135,12 @@
   }
 }
 
-void Inst::liveness(InstNumberT InstNumber, LivenessBV &Live,
+bool Inst::liveness(InstNumberT InstNumber, LivenessBV &Live,
                     Liveness *Liveness, LiveBeginEndMap *LiveBegin,
                     LiveBeginEndMap *LiveEnd) {
   assert(!isDeleted());
   if (llvm::isa<InstFakeKill>(this))
-    return;
+    return true;
 
   Dead = false;
   if (Dest) {
@@ -158,7 +158,7 @@
     }
   }
   if (Dead)
-    return;
+    return false;
   // Phi arguments only get added to Live in the predecessor node, but
   // we still need to update LiveRangesEnded.
   bool IsPhi = llvm::isa<InstPhi>(this);
@@ -202,6 +202,7 @@
       }
     }
   }
+  return true;
 }
 
 InstAlloca::InstAlloca(Cfg *Func, Operand *ByteCount, uint32_t AlignInBytes,
@@ -261,6 +262,17 @@
   return OutEdges;
 }
 
+bool InstBr::repointEdge(CfgNode *OldNode, CfgNode *NewNode) {
+  if (TargetFalse == OldNode) {
+    TargetFalse = NewNode;
+    return true;
+  } else if (TargetTrue == OldNode) {
+    TargetTrue = NewNode;
+    return true;
+  }
+  return false;
+}
+
 InstCast::InstCast(Cfg *Func, OpKind CastKind, Variable *Dest, Operand *Source)
     : InstHighLevel(Func, Inst::Cast, 1, Dest), CastKind(CastKind) {
   addSource(Source);
@@ -410,6 +422,20 @@
   return OutEdges;
 }
 
+bool InstSwitch::repointEdge(CfgNode *OldNode, CfgNode *NewNode) {
+  if (LabelDefault == OldNode) {
+    LabelDefault = NewNode;
+    return true;
+  }
+  for (SizeT I = 0; I < NumCases; ++I) {
+    if (Labels[I] == OldNode) {
+      Labels[I] = NewNode;
+      return true;
+    }
+  }
+  return false;
+}
+
 InstUnreachable::InstUnreachable(Cfg *Func)
     : InstHighLevel(Func, Inst::Unreachable, 0, NULL) {}
 
diff --git a/src/IceInst.h b/src/IceInst.h
index 0a47e1a..e6edb4c 100644
--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -100,11 +100,27 @@
         "getTerminatorEdges() called on a non-terminator instruction");
     return NodeList();
   }
+  virtual bool isUnconditionalBranch() const { return false; }
+  // If the instruction is a branch-type instruction with OldNode as a
+  // target, repoint it to NewNode and return true, otherwise return
+  // false.  Only repoint one instance, even if the instruction has
+  // multiple instances of OldNode as a target.
+  virtual bool repointEdge(CfgNode *OldNode, CfgNode *NewNode) {
+    (void)OldNode;
+    (void)NewNode;
+    return false;
+  }
 
   virtual bool isSimpleAssign() const { return false; }
 
   void livenessLightweight(Cfg *Func, LivenessBV &Live);
-  void liveness(InstNumberT InstNumber, LivenessBV &Live, Liveness *Liveness,
+  // Calculates liveness for this instruction.  Returns true if this
+  // instruction is (tentatively) still live and should be retained,
+  // and false if this instruction is (tentatively) dead and should be
+  // deleted.  The decision is tentative until the liveness dataflow
+  // algorithm has converged, and then a separate pass permanently
+  // deletes dead instructions.
+  bool liveness(InstNumberT InstNumber, LivenessBV &Live, Liveness *Liveness,
                 LiveBeginEndMap *LiveBegin, LiveBeginEndMap *LiveEnd);
 
   // Get the number of native instructions that this instruction
@@ -304,6 +320,8 @@
     return getTargetFalse();
   }
   NodeList getTerminatorEdges() const override;
+  bool isUnconditionalBranch() const override { return isUnconditional(); }
+  bool repointEdge(CfgNode *OldNode, CfgNode *NewNode) override;
   void dump(const Cfg *Func) const override;
   static bool classof(const Inst *Inst) { return Inst->getKind() == Br; }
 
@@ -314,8 +332,8 @@
   InstBr(Cfg *Func, CfgNode *Target);
   ~InstBr() override {}
 
-  CfgNode *const TargetFalse; // Doubles as unconditional branch target
-  CfgNode *const TargetTrue;  // NULL if unconditional branch
+  CfgNode *TargetFalse; // Doubles as unconditional branch target
+  CfgNode *TargetTrue;  // NULL if unconditional branch
 };
 
 // Call instruction.  The call target is captured as getSrc(0), and
@@ -671,6 +689,7 @@
   }
   void addBranch(SizeT CaseIndex, uint64_t Value, CfgNode *Label);
   NodeList getTerminatorEdges() const override;
+  bool repointEdge(CfgNode *OldNode, CfgNode *NewNode) override;
   void dump(const Cfg *Func) const override;
   static bool classof(const Inst *Inst) { return Inst->getKind() == Switch; }
 
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index 23948e4..5b77a37 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -184,6 +184,17 @@
   return false;
 }
 
+bool InstX8632Br::repointEdge(CfgNode *OldNode, CfgNode *NewNode) {
+  if (TargetFalse == OldNode) {
+    TargetFalse = NewNode;
+    return true;
+  } else if (TargetTrue == OldNode) {
+    TargetTrue = NewNode;
+    return true;
+  }
+  return false;
+}
+
 InstX8632Call::InstX8632Call(Cfg *Func, Variable *Dest, Operand *CallTarget)
     : InstX8632(Func, InstX8632::Call, 1, Dest) {
   HasSideEffects = true;
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index 21a95ba..b9868c0 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -387,6 +387,10 @@
       ++Sum;
     return Sum;
   }
+  bool isUnconditionalBranch() const override {
+    return !Label && Condition == CondX86::Br_None;
+  }
+  bool repointEdge(CfgNode *OldNode, CfgNode *NewNode) override;
   void emit(const Cfg *Func) const override;
   void emitIAS(const Cfg *Func) const override;
   void dump(const Cfg *Func) const override;
diff --git a/src/IceLiveness.cpp b/src/IceLiveness.cpp
index f16eb84..2f7b20a 100644
--- a/src/IceLiveness.cpp
+++ b/src/IceLiveness.cpp
@@ -53,6 +53,7 @@
   for (SizeT i = 0; i < NumNodes; ++i) {
     Nodes[i].LiveToVarMap.assign(Nodes[i].NumLocals, NULL);
     Nodes[i].NumLocals = 0;
+    Nodes[i].NumNonDeadPhis = 0;
   }
   LiveToVarMap.assign(NumGlobals, NULL);
 
diff --git a/src/IceLiveness.h b/src/IceLiveness.h
index f6af0e1..a542f3e 100644
--- a/src/IceLiveness.h
+++ b/src/IceLiveness.h
@@ -32,9 +32,14 @@
   // LivenessNode &operator=(const LivenessNode &) = delete;
 
 public:
-  LivenessNode() : NumLocals(0) {}
+  LivenessNode() : NumLocals(0), NumNonDeadPhis(0) {}
   // NumLocals is the number of Variables local to this block.
   SizeT NumLocals;
+  // NumNonDeadPhis tracks the number of Phi instructions that
+  // Inst::liveness() identified as tentatively live.  If
+  // NumNonDeadPhis changes from the last liveness pass, then liveness
+  // has not yet converged.
+  SizeT NumNonDeadPhis;
   // LiveToVarMap maps a liveness bitvector index to a Variable.  This
   // is generally just for printing/dumping.  The index should be less
   // than NumLocals + Liveness::NumGlobals.
@@ -66,20 +71,36 @@
   SizeT getNumVarsInNode(const CfgNode *Node) const {
     return NumGlobals + Nodes[Node->getIndex()].NumLocals;
   }
+  SizeT &getNumNonDeadPhis(const CfgNode *Node) {
+    return Nodes[Node->getIndex()].NumNonDeadPhis;
+  }
   LivenessBV &getLiveIn(const CfgNode *Node) {
-    return Nodes[Node->getIndex()].LiveIn;
+    SizeT Index = Node->getIndex();
+    resize(Index);
+    return Nodes[Index].LiveIn;
   }
   LivenessBV &getLiveOut(const CfgNode *Node) {
-    return Nodes[Node->getIndex()].LiveOut;
+    SizeT Index = Node->getIndex();
+    resize(Index);
+    return Nodes[Index].LiveOut;
   }
   LiveBeginEndMap *getLiveBegin(const CfgNode *Node) {
-    return &Nodes[Node->getIndex()].LiveBegin;
+    SizeT Index = Node->getIndex();
+    resize(Index);
+    return &Nodes[Index].LiveBegin;
   }
   LiveBeginEndMap *getLiveEnd(const CfgNode *Node) {
-    return &Nodes[Node->getIndex()].LiveEnd;
+    SizeT Index = Node->getIndex();
+    resize(Index);
+    return &Nodes[Index].LiveEnd;
   }
 
 private:
+  // Resize Nodes so that Nodes[Index] is valid.
+  void resize(SizeT Index) {
+    if (Index >= Nodes.size())
+      Nodes.resize(Index + 1);
+  }
   Cfg *Func;
   LivenessMode Mode;
   SizeT NumGlobals;
diff --git a/src/IceOperand.cpp b/src/IceOperand.cpp
index a2ee8f4..4b55737 100644
--- a/src/IceOperand.cpp
+++ b/src/IceOperand.cpp
@@ -320,38 +320,63 @@
         .markUse(Kind, NoInst, EntryNode, IsFromDef, IsImplicit);
   }
 
-  for (CfgNode *Node : Func->getNodes()) {
-    for (Inst *I : Node->getInsts()) {
-      if (I->isDeleted())
-        continue;
-      if (InstFakeKill *Kill = llvm::dyn_cast<InstFakeKill>(I)) {
-        // A FakeKill instruction indicates certain Variables (usually
-        // physical scratch registers) are redefined, so we register
-        // them as defs.
-        for (SizeT SrcNum = 0; SrcNum < I->getSrcSize(); ++SrcNum) {
-          Variable *Var = llvm::cast<Variable>(I->getSrc(SrcNum));
-          SizeT VarNum = Var->getIndex();
-          assert(VarNum < Metadata.size());
-          Metadata[VarNum].markDef(Kind, Kill, Node);
-        }
-        continue; // no point in executing the rest
+  for (CfgNode *Node : Func->getNodes())
+    addNode(Node);
+}
+
+void VariablesMetadata::addNode(CfgNode *Node) {
+  if (Func->getNumVariables() >= Metadata.size())
+    Metadata.resize(Func->getNumVariables());
+
+  for (InstPhi *I : Node->getPhis()) {
+    if (I->isDeleted())
+      continue;
+    if (Variable *Dest = I->getDest()) {
+      SizeT DestNum = Dest->getIndex();
+      assert(DestNum < Metadata.size());
+      Metadata[DestNum].markDef(Kind, I, Node);
+    }
+    for (SizeT SrcNum = 0; SrcNum < I->getSrcSize(); ++SrcNum) {
+      if (const Variable *Var = llvm::dyn_cast<Variable>(I->getSrc(SrcNum))) {
+        SizeT VarNum = Var->getIndex();
+        assert(VarNum < Metadata.size());
+        const bool IsFromDef = false;
+        const bool IsImplicit = false;
+        Metadata[VarNum].markUse(Kind, I, Node, IsFromDef, IsImplicit);
       }
-      if (Variable *Dest = I->getDest()) {
-        SizeT DestNum = Dest->getIndex();
-        assert(DestNum < Metadata.size());
-        Metadata[DestNum].markDef(Kind, I, Node);
-      }
+    }
+  }
+
+  for (Inst *I : Node->getInsts()) {
+    if (I->isDeleted())
+      continue;
+    if (InstFakeKill *Kill = llvm::dyn_cast<InstFakeKill>(I)) {
+      // A FakeKill instruction indicates certain Variables (usually
+      // physical scratch registers) are redefined, so we register
+      // them as defs.
       for (SizeT SrcNum = 0; SrcNum < I->getSrcSize(); ++SrcNum) {
-        Operand *Src = I->getSrc(SrcNum);
-        SizeT NumVars = Src->getNumVars();
-        for (SizeT J = 0; J < NumVars; ++J) {
-          const Variable *Var = Src->getVar(J);
-          SizeT VarNum = Var->getIndex();
-          assert(VarNum < Metadata.size());
-          const bool IsFromDef = false;
-          const bool IsImplicit = false;
-          Metadata[VarNum].markUse(Kind, I, Node, IsFromDef, IsImplicit);
-        }
+        Variable *Var = llvm::cast<Variable>(I->getSrc(SrcNum));
+        SizeT VarNum = Var->getIndex();
+        assert(VarNum < Metadata.size());
+        Metadata[VarNum].markDef(Kind, Kill, Node);
+      }
+      continue; // no point in executing the rest
+    }
+    if (Variable *Dest = I->getDest()) {
+      SizeT DestNum = Dest->getIndex();
+      assert(DestNum < Metadata.size());
+      Metadata[DestNum].markDef(Kind, I, Node);
+    }
+    for (SizeT SrcNum = 0; SrcNum < I->getSrcSize(); ++SrcNum) {
+      Operand *Src = I->getSrc(SrcNum);
+      SizeT NumVars = Src->getNumVars();
+      for (SizeT J = 0; J < NumVars; ++J) {
+        const Variable *Var = Src->getVar(J);
+        SizeT VarNum = Var->getIndex();
+        assert(VarNum < Metadata.size());
+        const bool IsFromDef = false;
+        const bool IsImplicit = false;
+        Metadata[VarNum].markUse(Kind, I, Node, IsFromDef, IsImplicit);
       }
     }
   }
diff --git a/src/IceOperand.h b/src/IceOperand.h
index 6d31127..76a0181 100644
--- a/src/IceOperand.h
+++ b/src/IceOperand.h
@@ -398,6 +398,8 @@
   void setIgnoreLiveness() { IgnoreLiveness = true; }
   bool getIgnoreLiveness() const { return IgnoreLiveness; }
 
+  bool needsStackSlot() const { return NeedsStackSlot; }
+  void setNeedsStackSlot() { NeedsStackSlot = true; }
   int32_t getStackOffset() const { return StackOffset; }
   void setStackOffset(int32_t Offset) { StackOffset = Offset; }
 
@@ -474,9 +476,9 @@
 protected:
   Variable(OperandKind K, Type Ty, SizeT Index, const IceString &Name)
       : Operand(K, Ty), Number(Index), Name(Name), IsArgument(false),
-        IsImplicitArgument(false), IgnoreLiveness(false), StackOffset(0),
-        RegNum(NoRegister), RegNumTmp(NoRegister), Weight(1), LoVar(NULL),
-        HiVar(NULL) {
+        IsImplicitArgument(false), IgnoreLiveness(false), NeedsStackSlot(false),
+        StackOffset(0), RegNum(NoRegister), RegNumTmp(NoRegister), Weight(1),
+        LoVar(NULL), HiVar(NULL) {
     Vars = VarsReal;
     Vars[0] = this;
     NumVars = 1;
@@ -492,6 +494,9 @@
   // constructing and validating live ranges.  This is usually
   // reserved for the stack pointer.
   bool IgnoreLiveness;
+  // NeedsStackSlot starts out false, and is set to true once we know
+  // for sure that the variable needs a stack slot.
+  bool NeedsStackSlot;
   // StackOffset is the canonical location on stack (only if
   // RegNum==NoRegister || IsArgument).
   int32_t StackOffset;
@@ -578,6 +583,9 @@
   // Initialize the state by traversing all instructions/variables in
   // the CFG.
   void init(MetadataKind TrackingKind);
+  // Add a single node.  This is called by init(), and can be called
+  // incrementally from elsewhere, e.g. after edge-splitting.
+  void addNode(CfgNode *Node);
   // Returns whether the given Variable is tracked in this object.  It
   // should only return false if changes were made to the CFG after
   // running init(), in which case the state is stale and the results
diff --git a/src/IceRegAlloc.cpp b/src/IceRegAlloc.cpp
index 1b4d0c2..d3fae45 100644
--- a/src/IceRegAlloc.cpp
+++ b/src/IceRegAlloc.cpp
@@ -105,12 +105,15 @@
     for (Variable *Var : Vars) {
       // Explicitly don't consider zero-weight variables, which are
       // meant to be spill slots.
-      if (Var->getWeight() == RegWeight::Zero)
+      if (Var->getWeight() == RegWeight::Zero) {
+        Var->setNeedsStackSlot();
         continue;
+      }
       // Don't bother if the variable has a null live range, which means
       // it was never referenced.
       if (Var->getLiveRange().isEmpty())
         continue;
+      Var->setNeedsStackSlot();
       Var->untrimLiveRange();
       Unhandled.push_back(Var);
       if (Var->hasReg()) {
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index b18999f..350605b 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -44,12 +44,16 @@
 
 void LoweringContext::init(CfgNode *N) {
   Node = N;
+  End = getNode()->getInsts().end();
+  rewind();
+  advanceForward(Next);
+}
+
+void LoweringContext::rewind() {
   Begin = getNode()->getInsts().begin();
   Cur = Begin;
-  End = getNode()->getInsts().end();
   skipDeleted(Cur);
   Next = Cur;
-  advanceForward(Next);
 }
 
 void LoweringContext::insert(Inst *Inst) {
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index e46399d..7aa2483 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -64,6 +64,7 @@
   Inst *getLastInserted() const;
   void advanceCur() { Cur = Next; }
   void advanceNext() { advanceForward(Next); }
+  void rewind();
   void setInsertPoint(const InstList::iterator &Position) { Next = Position; }
 
 private:
@@ -134,8 +135,18 @@
   void doAddressOpt();
   // Randomly insert NOPs.
   void doNopInsertion();
-  // Lowers a single instruction.
+  // Lowers a single non-Phi instruction.
   void lower();
+  // Does preliminary lowering of the set of Phi instructions in the
+  // current node.  The main intention is to do what's needed to keep
+  // the unlowered Phi instructions consistent with the lowered
+  // non-Phi instructions, e.g. to lower 64-bit operands on a 32-bit
+  // target.
+  virtual void prelowerPhis() {}
+  // Lowers a list of "parallel" assignment instructions representing
+  // a topological sort of the Phi instructions.
+  virtual void lowerPhiAssignments(CfgNode *Node,
+                                   const AssignList &Assignments) = 0;
   // Tries to do branch optimization on a single instruction.  Returns
   // true if some optimization was done.
   virtual bool doBranchOpt(Inst * /*I*/, const CfgNode * /*NextNode*/) {
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 8082c52..df7d04f 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -25,6 +25,7 @@
 #include "IceDefs.h"
 #include "IceGlobalInits.h"
 #include "IceInstX8632.h"
+#include "IceLiveness.h"
 #include "IceOperand.h"
 #include "IceRegistersX8632.h"
 #include "IceTargetLoweringX8632.def"
@@ -276,8 +277,7 @@
 TargetX8632::TargetX8632(Cfg *Func)
     : TargetLowering(Func), InstructionSet(CLInstructionSet),
       IsEbpBasedFrame(false), NeedsStackAlignment(false), FrameSizeLocals(0),
-      SpillAreaSizeBytes(0), NextLabelNumber(0), ComputedLiveRanges(false),
-      PhysicalRegisters(VarList(RegX8632::Reg_NUM)) {
+      SpillAreaSizeBytes(0), NextLabelNumber(0), ComputedLiveRanges(false) {
   // TODO: Don't initialize IntegerRegisters and friends every time.
   // Instead, initialize in some sort of static initializer for the
   // class.
@@ -316,17 +316,19 @@
 void TargetX8632::translateO2() {
   TimerMarker T(TimerStack::TT_O2, Func);
 
-  // Lower Phi instructions.
-  Func->placePhiLoads();
-  if (Func->hasError())
-    return;
-  Func->placePhiStores();
-  if (Func->hasError())
-    return;
-  Func->deletePhis();
-  if (Func->hasError())
-    return;
-  Func->dump("After Phi lowering");
+  if (!Ctx->getFlags().PhiEdgeSplit) {
+    // Lower Phi instructions.
+    Func->placePhiLoads();
+    if (Func->hasError())
+      return;
+    Func->placePhiStores();
+    if (Func->hasError())
+      return;
+    Func->deletePhis();
+    if (Func->hasError())
+      return;
+    Func->dump("After Phi lowering");
+  }
 
   // Address mode optimization.
   Func->getVMetadata()->init(VMK_SingleDefs);
@@ -356,6 +358,7 @@
   Func->genCode();
   if (Func->hasError())
     return;
+  Func->dump("After x86 codegen");
 
   // Register allocation.  This requires instruction renumbering and
   // full liveness analysis.
@@ -378,6 +381,11 @@
     return;
   Func->dump("After linear scan regalloc");
 
+  if (Ctx->getFlags().PhiEdgeSplit) {
+    Func->advancedPhiLowering();
+    Func->dump("After advanced Phi lowering");
+  }
+
   // Stack frame mapping.
   Func->genFrame();
   if (Func->hasError())
@@ -385,6 +393,8 @@
   Func->dump("After stack frame mapping");
 
   Func->deleteRedundantAssignments();
+  Func->contractEmptyNodes();
+  Func->reorderNodes();
 
   // Branch optimization.  This needs to be done just before code
   // emission.  In particular, no transformations that insert or
@@ -451,12 +461,14 @@
 Variable *TargetX8632::getPhysicalRegister(SizeT RegNum, Type Ty) {
   if (Ty == IceType_void)
     Ty = IceType_i32;
-  assert(RegNum < PhysicalRegisters.size());
-  Variable *Reg = PhysicalRegisters[RegNum];
+  if (PhysicalRegisters[Ty].empty())
+    PhysicalRegisters[Ty].resize(RegX8632::Reg_NUM);
+  assert(RegNum < PhysicalRegisters[Ty].size());
+  Variable *Reg = PhysicalRegisters[Ty][RegNum];
   if (Reg == NULL) {
     Reg = Func->makeVariable(Ty);
     Reg->setRegNum(RegNum);
-    PhysicalRegisters[RegNum] = Reg;
+    PhysicalRegisters[Ty][RegNum] = Reg;
     // Specially mark esp as an "argument" so that it is considered
     // live upon function entry.
     if (RegNum == RegX8632::Reg_esp) {
@@ -711,7 +723,7 @@
     if (Var->getIsArg())
       continue;
     // An unreferenced variable doesn't need a stack slot.
-    if (ComputedLiveRanges && Var->getLiveRange().isEmpty())
+    if (ComputedLiveRanges && !Var->needsStackSlot())
       continue;
     // A spill slot linked to a variable with a stack slot should reuse
     // that stack slot.
@@ -1695,8 +1707,10 @@
     _mov(T_Hi, Src0Hi);
     _mov(DestHi, T_Hi);
   } else {
-    // RI is either a physical register or an immediate.
-    Operand *RI = legalize(Src0, Legal_Reg | Legal_Imm);
+    // If Dest is in memory, then RI is either a physical register or
+    // an immediate; otherwise, RI can be anything.
+    Operand *RI =
+        legalize(Src0, Dest->hasReg() ? Legal_All : Legal_Reg | Legal_Imm);
     if (isVectorType(Dest->getType()))
       _movp(Dest, RI);
     else
@@ -4098,6 +4112,168 @@
   lowerCall(Call);
 }
 
+// Turn an i64 Phi instruction into a pair of i32 Phi instructions, to
+// preserve integrity of liveness analysis.  Undef values are also
+// turned into zeroes, since loOperand() and hiOperand() don't expect
+// Undef input.
+void TargetX8632::prelowerPhis() {
+  CfgNode *Node = Context.getNode();
+  for (InstPhi *Phi : Node->getPhis()) {
+    if (Phi->isDeleted())
+      continue;
+    Variable *Dest = Phi->getDest();
+    if (Dest->getType() == IceType_i64) {
+      Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
+      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+      InstPhi *PhiLo = InstPhi::create(Func, Phi->getSrcSize(), DestLo);
+      InstPhi *PhiHi = InstPhi::create(Func, Phi->getSrcSize(), DestHi);
+      for (SizeT I = 0; I < Phi->getSrcSize(); ++I) {
+        Operand *Src = Phi->getSrc(I);
+        CfgNode *Label = Phi->getLabel(I);
+        if (llvm::isa<ConstantUndef>(Src))
+          Src = Ctx->getConstantZero(Dest->getType());
+        PhiLo->addArgument(loOperand(Src), Label);
+        PhiHi->addArgument(hiOperand(Src), Label);
+      }
+      Node->getPhis().push_back(PhiLo);
+      Node->getPhis().push_back(PhiHi);
+      Phi->setDeleted();
+    }
+  }
+}
+
+namespace {
+
+bool isMemoryOperand(const Operand *Opnd) {
+  if (const auto Var = llvm::dyn_cast<Variable>(Opnd))
+    return !Var->hasReg();
+  if (llvm::isa<Constant>(Opnd))
+    return isScalarFloatingType(Opnd->getType());
+  return true;
+}
+
+} // end of anonymous namespace
+
+// Lower the pre-ordered list of assignments into mov instructions.
+// Also has to do some ad-hoc register allocation as necessary.
+void TargetX8632::lowerPhiAssignments(CfgNode *Node,
+                                      const AssignList &Assignments) {
+  // Check that this is a properly initialized shell of a node.
+  assert(Node->getOutEdges().size() == 1);
+  assert(Node->getInsts().empty());
+  assert(Node->getPhis().empty());
+  CfgNode *Succ = Node->getOutEdges()[0];
+  getContext().init(Node);
+  // Register set setup similar to regAlloc() and postLower().
+  RegSetMask RegInclude = RegSet_All;
+  RegSetMask RegExclude = RegSet_StackPointer;
+  if (hasFramePointer())
+    RegExclude |= RegSet_FramePointer;
+  llvm::SmallBitVector Available = getRegisterSet(RegInclude, RegExclude);
+  bool NeedsRegs = false;
+  // Initialize the set of available registers to the set of what is
+  // available (not live) at the beginning of the successor block,
+  // minus all registers used as Dest operands in the Assignments.  To
+  // do this, we start off assuming all registers are available, then
+  // iterate through the Assignments and remove Dest registers.
+  // During this iteration, we also determine whether we will actually
+  // need any extra registers for memory-to-memory copies.  If so, we
+  // do the actual work of removing the live-in registers from the
+  // set.  TODO(stichnot): This work is being repeated for every split
+  // edge to the successor, so consider updating LiveIn just once
+  // after all the edges are split.
+  for (InstAssign *Assign : Assignments) {
+    Variable *Dest = Assign->getDest();
+    if (Dest->hasReg()) {
+      Available[Dest->getRegNum()] = false;
+    } else if (isMemoryOperand(Assign->getSrc(0))) {
+      NeedsRegs = true; // Src and Dest are both in memory
+    }
+  }
+  if (NeedsRegs) {
+    LivenessBV &LiveIn = Func->getLiveness()->getLiveIn(Succ);
+    for (int i = LiveIn.find_first(); i != -1; i = LiveIn.find_next(i)) {
+      Variable *Var = Func->getLiveness()->getVariable(i, Succ);
+      if (Var->hasReg())
+        Available[Var->getRegNum()] = false;
+    }
+  }
+  // Iterate backwards through the Assignments.  After lowering each
+  // assignment, add Dest to the set of available registers, and
+  // remove Src from the set of available registers.  Iteration is
+  // done backwards to enable incremental updates of the available
+  // register set, and the lowered instruction numbers may be out of
+  // order, but that can be worked around by renumbering the block
+  // afterwards if necessary.
+  for (auto I = Assignments.rbegin(), E = Assignments.rend(); I != E; ++I) {
+    Context.rewind();
+    InstAssign *Assign = *I;
+    Variable *Dest = Assign->getDest();
+    Operand *Src = Assign->getSrc(0);
+    Variable *SrcVar = llvm::dyn_cast<Variable>(Src);
+    // Use normal assignment lowering, except lower mem=mem specially
+    // so we can register-allocate at the same time.
+    if (!isMemoryOperand(Dest) || !isMemoryOperand(Src)) {
+      lowerAssign(Assign);
+    } else {
+      assert(Dest->getType() == Src->getType());
+      const llvm::SmallBitVector &RegsForType =
+          getRegisterSetForType(Dest->getType());
+      llvm::SmallBitVector AvailRegsForType = RegsForType & Available;
+      Variable *SpillLoc = NULL;
+      Variable *Preg = NULL;
+      // TODO(stichnot): Opportunity for register randomization.
+      int32_t RegNum = AvailRegsForType.find_first();
+      bool IsVector = isVectorType(Dest->getType());
+      bool NeedSpill = (RegNum == -1);
+      if (NeedSpill) {
+        // Pick some register to spill and update RegNum.
+        // TODO(stichnot): Opportunity for register randomization.
+        RegNum = RegsForType.find_first();
+        Preg = getPhysicalRegister(RegNum, Dest->getType());
+        SpillLoc = Func->makeVariable(Dest->getType());
+        SpillLoc->setNeedsStackSlot();
+        if (IsVector)
+          _movp(SpillLoc, Preg);
+        else
+          _mov(SpillLoc, Preg);
+      }
+      assert(RegNum >= 0);
+      if (llvm::isa<ConstantUndef>(Src))
+        // Materialize an actual constant instead of undef.  RegNum is
+        // passed in for vector types because undef vectors are
+        // lowered to a vector register of zeroes.
+        Src =
+            legalize(Src, Legal_All, IsVector ? RegNum : Variable::NoRegister);
+      Variable *Tmp = makeReg(Dest->getType(), RegNum);
+      if (IsVector) {
+        _movp(Tmp, Src);
+        _movp(Dest, Tmp);
+      } else {
+        _mov(Tmp, Src);
+        _mov(Dest, Tmp);
+      }
+      if (NeedSpill) {
+        // Restore the spilled register.
+        if (IsVector)
+          _movp(Preg, SpillLoc);
+        else
+          _mov(Preg, SpillLoc);
+      }
+    }
+    // Update register availability before moving to the previous
+    // instruction on the Assignments list.
+    if (Dest->hasReg())
+      Available[Dest->getRegNum()] = true;
+    if (SrcVar && SrcVar->hasReg())
+      Available[SrcVar->getRegNum()] = false;
+  }
+
+  // Add the terminator branch instruction to the end.
+  Context.setInsertPoint(Context.end());
+  _br(Succ);
+}
+
 // There is no support for loading or emitting vector constants, so the
 // vector values returned from makeVectorOfZeros, makeVectorOfOnes,
 // etc. are initialized with register operations.
@@ -4229,7 +4405,7 @@
       // then the result should be split and the lo and hi components will
       // need to go in uninitialized registers.
       if (isVectorType(From->getType()))
-        return makeVectorOfZeros(From->getType());
+        return makeVectorOfZeros(From->getType(), RegNum);
       From = Ctx->getConstantZero(From->getType());
     }
     // There should be no constants of vector type (other than undef).
diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index 88b75ac..2ba6a15 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -105,6 +105,9 @@
   void lowerStore(const InstStore *Inst) override;
   void lowerSwitch(const InstSwitch *Inst) override;
   void lowerUnreachable(const InstUnreachable *Inst) override;
+  void prelowerPhis() override;
+  void lowerPhiAssignments(CfgNode *Node,
+                           const AssignList &Assignments) override;
   void doAddressOptLoad() override;
   void doAddressOptStore() override;
   void randomlyInsertNop(float Probability) override;
@@ -482,7 +485,7 @@
   llvm::SmallBitVector RegsUsed;
   SizeT NextLabelNumber;
   bool ComputedLiveRanges;
-  VarList PhysicalRegisters;
+  VarList PhysicalRegisters[IceType_NUM];
   static IceString RegNames[];
 
 private:
diff --git a/src/IceTimerTree.def b/src/IceTimerTree.def
index 59d6f12..6de5055 100644
--- a/src/IceTimerTree.def
+++ b/src/IceTimerTree.def
@@ -18,6 +18,7 @@
   /* enum value */         \
   X(O2)                    \
   X(Om1)                   \
+  X(advancedPhiLowering)   \
   X(convertToIce)          \
   X(deletePhis)            \
   X(doAddressOpt)          \
diff --git a/src/IceTypes.cpp b/src/IceTypes.cpp
index 7155baa..975001d 100644
--- a/src/IceTypes.cpp
+++ b/src/IceTypes.cpp
@@ -19,7 +19,7 @@
 namespace {
 
 const char *TargetArchName[] = {
-#define X(tag, str)  str ,
+#define X(tag, str) str,
   TARGETARCH_TABLE
 #undef X
 };
diff --git a/src/IceTypes.h b/src/IceTypes.h
index 1619523..58f0aa4 100644
--- a/src/IceTypes.h
+++ b/src/IceTypes.h
@@ -31,7 +31,7 @@
 #define X(tag, str) tag,
   TARGETARCH_TABLE
 #undef X
-  TargetArch_NUM
+      TargetArch_NUM
 };
 
 const char *targetArchString(TargetArch Arch);
diff --git a/src/llvm2ice.cpp b/src/llvm2ice.cpp
index d117b58..1d7d193 100644
--- a/src/llvm2ice.cpp
+++ b/src/llvm2ice.cpp
@@ -169,10 +169,11 @@
     "exit-success", cl::desc("Exit with success status, even if errors found"),
     cl::init(false));
 
-static cl::opt<bool> GenerateBuildAtts(
-    "build-atts", cl::desc("Generate list of build attributes associated with "
+static cl::opt<bool>
+GenerateBuildAtts("build-atts",
+                  cl::desc("Generate list of build attributes associated with "
                            "this executable."),
-    cl::init(false));
+                  cl::init(false));
 
 static int GetReturnValue(int Val) {
   if (AlwaysExitSuccess)
@@ -183,13 +184,12 @@
 static struct {
   const char *FlagName;
   int FlagValue;
-} ConditionalBuildAttributes[] = {
-  { "text_asm", ALLOW_TEXT_ASM },
-  { "dump", ALLOW_DUMP },
-  { "llvm_cl", ALLOW_LLVM_CL },
-  { "llvm_ir", ALLOW_LLVM_IR },
-  { "llvm_ir_as_input", ALLOW_LLVM_IR_AS_INPUT }
-};
+} ConditionalBuildAttributes[] = { { "text_asm", ALLOW_TEXT_ASM },
+                                   { "dump", ALLOW_DUMP },
+                                   { "llvm_cl", ALLOW_LLVM_CL },
+                                   { "llvm_ir", ALLOW_LLVM_IR },
+                                   { "llvm_ir_as_input",
+                                     ALLOW_LLVM_IR_AS_INPUT } };
 
 // Validates values of build attributes. Prints them to Stream if
 // Stream is non-null.
diff --git a/tests_lit/llvm2ice_tests/nacl-atomic-cmpxchg-optimization.ll b/tests_lit/llvm2ice_tests/nacl-atomic-cmpxchg-optimization.ll
index d347dc1..258f294 100644
--- a/tests_lit/llvm2ice_tests/nacl-atomic-cmpxchg-optimization.ll
+++ b/tests_lit/llvm2ice_tests/nacl-atomic-cmpxchg-optimization.ll
@@ -39,11 +39,7 @@
 ; O2-LABEL: test_atomic_cmpxchg_loop
 ; O2: lock
 ; O2-NEXT: cmpxchg dword ptr [e{{[^a].}}], e{{[^a]}}
-; O2-NOT: cmp
-; Make sure the phi assignment for succeeded_first_try is still there.
-; O2: mov {{.*}}, 2
-; O2-NOT: cmp
-; O2: jne
+; O2-NEXT: j{{e|ne}}
 ; Make sure the call isn't accidentally deleted.
 ; O2: call
 ;
@@ -97,8 +93,7 @@
 ; O2-LABEL: test_atomic_cmpxchg_loop_const
 ; O2: lock
 ; O2-NEXT: cmpxchg dword ptr [e{{[^a].}}], e{{[^a]}}
-; O2-NOT: cmp
-; O2: jne
+; O2-NEXT: j{{e|ne}}
 
 ; This is a case where the flags cannot be reused (compare is for some
 ; other condition).
@@ -120,7 +115,6 @@
 ; O2-LABEL: test_atomic_cmpxchg_no_opt
 ; O2: lock
 ; O2-NEXT: cmpxchg dword ptr [e{{[^a].}}], e{{[^a]}}
-; O2: mov {{.*}}
 ; O2: cmp
 ; O2: jle
 
diff --git a/tests_lit/llvm2ice_tests/simple-loop.ll b/tests_lit/llvm2ice_tests/simple-loop.ll
index 02b78ab..39337b5 100644
--- a/tests_lit/llvm2ice_tests/simple-loop.ll
+++ b/tests_lit/llvm2ice_tests/simple-loop.ll
@@ -35,14 +35,15 @@
 ; CHECK-LABEL: simple_loop
 ; CHECK:      mov ecx, dword ptr [esp{{.*}}+{{.*}}{{[0-9]+}}]
 ; CHECK:      cmp ecx, 0
-; CHECK-NEXT: jle {{[0-9]}}
+; CHECK-NEXT: j{{le|g}} {{[0-9]}}
 
-; TODO: the mov from ebx to esi seems redundant here - so this may need to be
-; modified later
-
-; CHECK:      add [[IREG:[a-z]+]], 1
-; CHECK-NEXT: mov [[ICMPREG:[a-z]+]], [[IREG]]
-; CHECK:      cmp [[ICMPREG]], ecx
+; Check for the combination of address mode inference, register
+; allocation, and load/arithmetic fusing.
+; CHECK:      add e{{..}}, dword ptr [e{{..}} + 4*[[IREG:e..]]]
+; Check for incrementing of the register-allocated induction variable.
+; CHECK-NEXT: add [[IREG]], 1
+; Check for comparing the induction variable against the loop size.
+; CHECK-NEXT: cmp [[IREG]],
 ; CHECK-NEXT: jl -{{[0-9]}}
 ;
 ; There's nothing remarkable under Om1 to test for, since Om1 generates